load libraries
librarian::shelf(
dplyr,
tidyr,
here,
DT
)The allData.csv file is in “long” format, with “analyte name” as a column. This document transforms the data to “wide” “tidy” format, with “analyte name” on the columns. This is done by pivoting on the analyte name column.
Long format dimensions:
Rows: 1265123
Columns: 10
The transformation uses tidyr::pivot_wider() to convert the dataset from long format (where each row is one measurement of one analyte) to wide format (where each row represents all measurements at a specific location and time).
library(dplyr)
library(tidyr)
# Pivot to wide format using the actual column names from allData.csv
# Columns: source, site, datetime, analyte, value, units, latitude, longitude, sample_depth
df_wide <- df_long %>%
pivot_wider(
id_cols = c(source, site, datetime, units, latitude, longitude, sample_depth),
names_from = analyte,
values_from = value,
values_fn = mean # aggregate duplicates by taking mean
)
# Show dimensions of wide format data
cat("Wide format dimensions:\n")Wide format dimensions:
Rows: 354239
Columns: 27
Number of analyte columns created: 20
Summary of numeric analyte columns:
| Name | df_wide[, numeric_cols] |
| Number of rows | 354239 |
| Number of columns | 23 |
| _______________________ | |
| Column type frequency: | |
| numeric | 23 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| latitude | 157882 | 0.55 | 25.63 | 0.51 | 24.40 | 25.29 | 25.74 | 25.88 | 28.78 | ▃▇▁▁▁ |
| longitude | 157882 | 0.55 | -80.49 | 0.52 | -85.02 | -80.62 | -80.28 | -80.16 | -80.02 | ▁▁▁▁▇ |
| sample_depth | 198581 | 0.44 | 0.44 | 0.20 | 0.00 | 0.50 | 0.50 | 0.50 | 1.00 | ▃▁▇▂▁ |
| Water Temperature | 263170 | 0.26 | 19.55 | 11.63 | 0.00 | 2.10 | 24.60 | 28.45 | 74.65 | ▃▇▂▁▁ |
| Salinity | 266376 | 0.25 | 18.52 | 16.10 | 0.00 | 1.16 | 22.72 | 34.57 | 933.00 | ▇▁▁▁▁ |
| Dissolved Oxygen | 276439 | 0.22 | 4.45 | 11.33 | 0.00 | 1.51 | 5.05 | 6.45 | 2697.44 | ▇▁▁▁▁ |
| Ammonium, Filtered (NH4) | 319411 | 0.10 | 0.53 | 1.86 | 0.00 | 0.01 | 0.02 | 0.25 | 76.39 | ▇▁▁▁▁ |
| Nitrite (NO2) | 307591 | 0.13 | 0.03 | 0.14 | 0.00 | 0.00 | 0.00 | 0.01 | 14.10 | ▇▁▁▁▁ |
| Nitrate (NO3) | 322088 | 0.09 | 0.35 | 3.51 | -0.01 | 0.00 | 0.01 | 0.04 | 216.06 | ▇▁▁▁▁ |
| NO2+3, Filtered | 292945 | 0.17 | 0.27 | 2.63 | 0.00 | 0.00 | 0.01 | 0.06 | 218.00 | ▇▁▁▁▁ |
| Phosphate, Filtered (PO4) | 311180 | 0.12 | 0.15 | 0.28 | 0.00 | 0.00 | 0.01 | 0.10 | 2.60 | ▇▁▁▁▁ |
| pH | 290791 | 0.18 | 4.05 | 4.67 | 0.00 | 1.00 | 1.78 | 7.68 | 820.00 | ▇▁▁▁▁ |
| Specific Conductivity | 351520 | 0.01 | 2600.53 | 9322.27 | 0.00 | 45.30 | 49.56 | 54.89 | 53465.00 | ▇▁▁▁▁ |
| Turbidity | 285975 | 0.19 | 2.54 | 5.47 | 0.00 | 0.60 | 1.00 | 2.69 | 254.00 | ▇▁▁▁▁ |
| Total Kjeldahl Nitrogen | 300334 | 0.15 | 0.46 | 0.40 | 0.00 | 0.18 | 0.36 | 0.62 | 9.20 | ▇▁▁▁▁ |
| Total Phosphorus | 303465 | 0.14 | 0.03 | 0.08 | 0.00 | 0.01 | 0.01 | 0.03 | 3.38 | ▇▁▁▁▁ |
| Total Nitrogen | 322127 | 0.09 | 0.69 | 3.90 | 0.00 | 0.20 | 0.34 | 0.55 | 296.21 | ▇▁▁▁▁ |
| Chlorophyll a, Uncorrected for Pheophytin | 350110 | 0.01 | 0.62 | 2.09 | 0.04 | 0.22 | 0.32 | 0.49 | 54.60 | ▇▁▁▁▁ |
| Dissolved Oxygen Saturation | 353751 | 0.00 | 93.37 | 16.77 | 36.23 | 84.46 | 94.53 | 100.72 | 147.89 | ▁▂▇▂▁ |
| Total Suspended Solids | 349939 | 0.01 | 4.46 | 6.88 | 0.02 | 1.10 | 2.03 | 4.40 | 83.00 | ▇▁▁▁▁ |
| Nitrogen, organic | 353999 | 0.00 | 32.90 | 26.56 | 0.00 | 17.32 | 27.23 | 45.02 | 292.07 | ▇▁▁▁▁ |
| Chlorophyll a, Corrected for Pheophytin | 334548 | 0.06 | 3.70 | 9.42 | 0.00 | 0.39 | 0.97 | 3.22 | 747.00 | ▇▁▁▁▁ |
| Ammonia, Un-ionized (NH3) | 341599 | 0.04 | 0.81 | 0.30 | 0.00 | 0.55 | 1.00 | 1.00 | 1.00 | ▁▁▂▁▇ |
To make analysis easier, an unique ID is given to each row to identify the spatial location. This identifier is built from {source}-{site}-{depth}.
df_wide$site_id <- paste(
df_wide$source,
df_wide$site,
ifelse(is.na(df_wide$sample_depth), "NA", df_wide$sample_depth),
sep = "-"
)
# Display first 100 rows with scrolling
DT::datatable(
head(df_wide, 100),
options = list(
scrollX = TRUE,
scrollY = "400px",
pageLength = 10
),
caption = "Wide format data with site_id column (scroll to see all columns)"
)# desired remaining columns: analytes, datetime, site_id, depth
# Get analyte column names (all columns except the id columns)
id_columns <- c("source", "site", "units", "latitude", "longitude", "sample_depth", "site_id")
analyte_columns <- setdiff(names(df_wide), id_columns)
# Keep only datetime, site_id, sample_depth, and all analyte columns
df_wide <- df_wide %>%
select(site_id, datetime, sample_depth, all_of(analyte_columns))
# Display first 100 rows with scrolling
DT::datatable(
head(df_wide, 100),
options = list(
scrollX = TRUE,
scrollY = "400px",
pageLength = 10
),
caption = "Cleaned wide format data with selected columns (scroll to see all columns)"
)Summary of numeric analyte columns:
| Name | df_wide[, numeric_cols] |
| Number of rows | 354239 |
| Number of columns | 21 |
| _______________________ | |
| Column type frequency: | |
| numeric | 21 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| sample_depth | 198581 | 0.44 | 0.44 | 0.20 | 0.00 | 0.50 | 0.50 | 0.50 | 1.00 | ▃▁▇▂▁ |
| Water Temperature | 263170 | 0.26 | 19.55 | 11.63 | 0.00 | 2.10 | 24.60 | 28.45 | 74.65 | ▃▇▂▁▁ |
| Salinity | 266376 | 0.25 | 18.52 | 16.10 | 0.00 | 1.16 | 22.72 | 34.57 | 933.00 | ▇▁▁▁▁ |
| Dissolved Oxygen | 276439 | 0.22 | 4.45 | 11.33 | 0.00 | 1.51 | 5.05 | 6.45 | 2697.44 | ▇▁▁▁▁ |
| Ammonium, Filtered (NH4) | 319411 | 0.10 | 0.53 | 1.86 | 0.00 | 0.01 | 0.02 | 0.25 | 76.39 | ▇▁▁▁▁ |
| Nitrite (NO2) | 307591 | 0.13 | 0.03 | 0.14 | 0.00 | 0.00 | 0.00 | 0.01 | 14.10 | ▇▁▁▁▁ |
| Nitrate (NO3) | 322088 | 0.09 | 0.35 | 3.51 | -0.01 | 0.00 | 0.01 | 0.04 | 216.06 | ▇▁▁▁▁ |
| NO2+3, Filtered | 292945 | 0.17 | 0.27 | 2.63 | 0.00 | 0.00 | 0.01 | 0.06 | 218.00 | ▇▁▁▁▁ |
| Phosphate, Filtered (PO4) | 311180 | 0.12 | 0.15 | 0.28 | 0.00 | 0.00 | 0.01 | 0.10 | 2.60 | ▇▁▁▁▁ |
| pH | 290791 | 0.18 | 4.05 | 4.67 | 0.00 | 1.00 | 1.78 | 7.68 | 820.00 | ▇▁▁▁▁ |
| Specific Conductivity | 351520 | 0.01 | 2600.53 | 9322.27 | 0.00 | 45.30 | 49.56 | 54.89 | 53465.00 | ▇▁▁▁▁ |
| Turbidity | 285975 | 0.19 | 2.54 | 5.47 | 0.00 | 0.60 | 1.00 | 2.69 | 254.00 | ▇▁▁▁▁ |
| Total Kjeldahl Nitrogen | 300334 | 0.15 | 0.46 | 0.40 | 0.00 | 0.18 | 0.36 | 0.62 | 9.20 | ▇▁▁▁▁ |
| Total Phosphorus | 303465 | 0.14 | 0.03 | 0.08 | 0.00 | 0.01 | 0.01 | 0.03 | 3.38 | ▇▁▁▁▁ |
| Total Nitrogen | 322127 | 0.09 | 0.69 | 3.90 | 0.00 | 0.20 | 0.34 | 0.55 | 296.21 | ▇▁▁▁▁ |
| Chlorophyll a, Uncorrected for Pheophytin | 350110 | 0.01 | 0.62 | 2.09 | 0.04 | 0.22 | 0.32 | 0.49 | 54.60 | ▇▁▁▁▁ |
| Dissolved Oxygen Saturation | 353751 | 0.00 | 93.37 | 16.77 | 36.23 | 84.46 | 94.53 | 100.72 | 147.89 | ▁▂▇▂▁ |
| Total Suspended Solids | 349939 | 0.01 | 4.46 | 6.88 | 0.02 | 1.10 | 2.03 | 4.40 | 83.00 | ▇▁▁▁▁ |
| Nitrogen, organic | 353999 | 0.00 | 32.90 | 26.56 | 0.00 | 17.32 | 27.23 | 45.02 | 292.07 | ▇▁▁▁▁ |
| Chlorophyll a, Corrected for Pheophytin | 334548 | 0.06 | 3.70 | 9.42 | 0.00 | 0.39 | 0.97 | 3.22 | 747.00 | ▇▁▁▁▁ |
| Ammonia, Un-ionized (NH3) | 341599 | 0.04 | 0.81 | 0.30 | 0.00 | 0.55 | 1.00 | 1.00 | 1.00 | ▁▁▂▁▇ |
Note: Ideas for next steps: - Multivariate analysis where you need multiple parameters as separate variables - Creating correlation matrices between different analytes - Machine learning tasks that expect features as columns - Quick comparison of multiple parameters at the same observation point