In 2026 management of this data is being transferred to SEACAR In this file we prepare a transformation of the dataset for SEACAR ingestion, and perform some checks.

The data here should align with standards described in SEACAR_Metadata_UnifiedWQ_ID10006.xlsx.

Code
librarian::shelf(
  dplyr,
  glue,
  here,
  tidyr
)
Code
library("here")
df_all <- read.csv(here("data/exports/allDataRaw.csv"))
source(here("R/dropNonStandardAnalytes.R"))
df <- dropNonStandardAnalytes(df_all)

source(here("R/mutateWINToSEACAR.R"))
df <- mutateWINToSEACAR(df)

# write SEACAR .csv
write.csv(df, here("data", "exports", "allDataSEACAR.csv"))
show percent of missing programID
library(dplyr)
library(scales)
# show percentage of missing or na rows 
# 1. Get the total number of rows for each program
total_counts <- df %>%
  count(ProgramName, name = "total_rows")

# 2. Get the number of NA rows for each program
na_counts <- df %>%
  filter(is.na(ProgramID)) %>%
  count(ProgramName, name = "missing_rows")

# 3. Combine total and NA counts, calculate percentage
percentage_table <- total_counts %>%
  left_join(na_counts, by = "ProgramName") %>%
  mutate(
    # If a program had no NAs, na_counts will be NA. Replace with 0.
    missing_rows = ifelse(is.na(missing_rows), 0, missing_rows),
    
    # Calculate the percentage
    percentage_raw = missing_rows / total_rows,
    
    # Format the percentage for clean table output
    percentage = scales::percent(percentage_raw, accuracy = 0.1)
  ) %>%
  
  # Select and order the columns for the final table
  select(ProgramName, missing_rows, total_rows, percentage) %>%
  
  # Sort by program name
  arrange(ProgramName)

# 4. Print the final data frame (which R displays as a table)
print(percentage_table)
           ProgramName missing_rows total_rows percentage
1            AOML_FBBB            0      43401       0.0%
2                 BBAP            0      10594       0.0%
3                 BBWW            0        910       0.0%
4              BROWARD            0      15142       0.0%
5       BROWARD_STORET            0      24290       0.0%
6                  DEP            0      99792       0.0%
7            DERM_BBWQ            0     144946       0.0%
8     DERM_BBWQ_STORET            0     442825       0.0%
9        FIU_Estuaries            0     388128       0.0%
10            FIU_WQMP            0      62621       0.0%
11 FIU_WQMP_HISTORICAL            0     484218       0.0%
12     FIU_WQMP_RECENT            0       4020       0.0%
13          MiamiBeach            0      27484       0.0%
14           PALMBEACH            0       7658       0.0%
15    PALMBEACH_STORET            0      13145       0.0%
16                SFER            0      76230       0.0%
show percent of missing SampleDate
library(dplyr)
library(scales)
# show percentage of missing or na rows 
# 1. Get the total number of rows for each program
total_counts <- df %>%
  count(ProgramName, name = "total_rows")

# 2. Get the number of NA rows for each program
na_counts <- df %>%
  filter(is.na(SampleDate)) %>%
  count(ProgramName, name = "missing_rows")

# 3. Combine total and NA counts, calculate percentage
percentage_table <- total_counts %>%
  left_join(na_counts, by = "ProgramName") %>%
  mutate(
    # If a program had no NAs, na_counts will be NA. Replace with 0.
    missing_rows = ifelse(is.na(missing_rows), 0, missing_rows),
    
    # Calculate the percentage
    percentage_raw = missing_rows / total_rows,
    
    # Format the percentage for clean table output
    percentage = scales::percent(percentage_raw, accuracy = 0.1)
  ) %>%
  
  # Select and order the columns for the final table
  select(ProgramName, missing_rows, total_rows, percentage) %>%
  
  # Sort by program name
  arrange(ProgramName)

# 4. Print the final data frame (which R displays as a table)
print(percentage_table)
           ProgramName missing_rows total_rows percentage
1            AOML_FBBB            0      43401       0.0%
2                 BBAP            0      10594       0.0%
3                 BBWW            0        910       0.0%
4              BROWARD            0      15142       0.0%
5       BROWARD_STORET            0      24290       0.0%
6                  DEP            0      99792       0.0%
7            DERM_BBWQ            0     144946       0.0%
8     DERM_BBWQ_STORET            0     442825       0.0%
9        FIU_Estuaries            0     388128       0.0%
10            FIU_WQMP            0      62621       0.0%
11 FIU_WQMP_HISTORICAL            0     484218       0.0%
12     FIU_WQMP_RECENT            0       4020       0.0%
13          MiamiBeach            0      27484       0.0%
14           PALMBEACH            0       7658       0.0%
15    PALMBEACH_STORET            0      13145       0.0%
16                SFER           18      76230       0.0%
show percent of missing ResultValue
library(dplyr)
library(scales)
# show percentage of missing or na rows 
# 1. Get the total number of rows for each program
total_counts <- df %>%
  count(ProgramName, name = "total_rows")

# 2. Get the number of NA rows for each program
na_counts <- df %>%
  filter(is.na(ResultValue)) %>%
  count(ProgramName, name = "missing_rows")

# 3. Combine total and NA counts, calculate percentage
percentage_table <- total_counts %>%
  left_join(na_counts, by = "ProgramName") %>%
  mutate(
    # If a program had no NAs, na_counts will be NA. Replace with 0.
    missing_rows = ifelse(is.na(missing_rows), 0, missing_rows),
    
    # Calculate the percentage
    percentage_raw = missing_rows / total_rows,
    
    # Format the percentage for clean table output
    percentage = scales::percent(percentage_raw, accuracy = 0.1)
  ) %>%
  
  # Select and order the columns for the final table
  select(ProgramName, missing_rows, total_rows, percentage) %>%
  
  # Sort by program name
  arrange(ProgramName)

# 4. Print the final data frame (which R displays as a table)
print(percentage_table)
           ProgramName missing_rows total_rows percentage
1            AOML_FBBB            0      43401       0.0%
2                 BBAP            3      10594       0.0%
3                 BBWW            0        910       0.0%
4              BROWARD            0      15142       0.0%
5       BROWARD_STORET         1410      24290       5.8%
6                  DEP            0      99792       0.0%
7            DERM_BBWQ           15     144946       0.0%
8     DERM_BBWQ_STORET         2850     442825       0.6%
9        FIU_Estuaries        21510     388128       5.5%
10            FIU_WQMP         2773      62621       4.4%
11 FIU_WQMP_HISTORICAL       121735     484218      25.1%
12     FIU_WQMP_RECENT          773       4020      19.2%
13          MiamiBeach          347      27484       1.3%
14           PALMBEACH           44       7658       0.6%
15    PALMBEACH_STORET          329      13145       2.5%
16                SFER        14482      76230      19.0%
show percent of missing coordinates
library(dplyr)
library(scales)
# show percentage of missing or na rows in df$Monitoring.Location.ID grouped by df$program
# 1. Get the total number of rows for each program
total_counts <- df %>%
  count(ProgramName, name = "total_rows")

# 2. Get the number of NA rows for each program
na_counts <- df %>%
  filter(is.na(OriginalLatitude) | is.na(OriginalLongitude)) %>%
  count(ProgramName, name = "missing_rows")

# 3. Combine total and NA counts, calculate percentage
percentage_table <- total_counts %>%
  left_join(na_counts, by = "ProgramName") %>%
  mutate(
    # If a program had no NAs, na_counts will be NA. Replace with 0.
    missing_rows = ifelse(is.na(missing_rows), 0, missing_rows),
    
    # Calculate the percentage
    percentage_raw = missing_rows / total_rows,
    
    # Format the percentage for clean table output
    percentage = scales::percent(percentage_raw, accuracy = 0.1)
  ) %>%
  
  # Select and order the columns for the final table
  select(ProgramName, missing_rows, total_rows, percentage) %>%
  
  # Sort by program name
  arrange(ProgramName)

# 4. Print the final data frame (which R displays as a table)
print(percentage_table)
           ProgramName missing_rows total_rows percentage
1            AOML_FBBB            0      43401       0.0%
2                 BBAP            0      10594       0.0%
3                 BBWW            0        910       0.0%
4              BROWARD            0      15142       0.0%
5       BROWARD_STORET        24290      24290     100.0%
6                  DEP          261      99792       0.3%
7            DERM_BBWQ            0     144946       0.0%
8     DERM_BBWQ_STORET       442825     442825     100.0%
9        FIU_Estuaries            0     388128       0.0%
10            FIU_WQMP            0      62621       0.0%
11 FIU_WQMP_HISTORICAL           54     484218       0.0%
12     FIU_WQMP_RECENT         4020       4020     100.0%
13          MiamiBeach         4443      27484      16.2%
14           PALMBEACH         7658       7658     100.0%
15    PALMBEACH_STORET        13145      13145     100.0%
16                SFER           18      76230       0.0%
show percent of missing ProgramLocationID
library(dplyr)
library(scales)
# show percentage of missing or na rows 
# 1. Get the total number of rows for each program
total_counts <- df %>%
  count(ProgramName, name = "total_rows")

# 2. Get the number of NA rows for each program
na_counts <- df %>%
  filter(is.na(ProgramLocationID)) %>%
  count(ProgramName, name = "missing_rows")

# 3. Combine total and NA counts, calculate percentage
percentage_table <- total_counts %>%
  left_join(na_counts, by = "ProgramName") %>%
  mutate(
    # If a program had no NAs, na_counts will be NA. Replace with 0.
    missing_rows = ifelse(is.na(missing_rows), 0, missing_rows),
    
    # Calculate the percentage
    percentage_raw = missing_rows / total_rows,
    
    # Format the percentage for clean table output
    percentage = scales::percent(percentage_raw, accuracy = 0.1)
  ) %>%
  
  # Select and order the columns for the final table
  select(ProgramName, missing_rows, total_rows, percentage) %>%
  
  # Sort by program name
  arrange(ProgramName)

# 4. Print the final data frame (which R displays as a table)
print(percentage_table)
           ProgramName missing_rows total_rows percentage
1            AOML_FBBB            0      43401       0.0%
2                 BBAP            0      10594       0.0%
3                 BBWW            0        910       0.0%
4              BROWARD            0      15142       0.0%
5       BROWARD_STORET            0      24290       0.0%
6                  DEP            0      99792       0.0%
7            DERM_BBWQ            0     144946       0.0%
8     DERM_BBWQ_STORET            0     442825       0.0%
9        FIU_Estuaries            0     388128       0.0%
10            FIU_WQMP            0      62621       0.0%
11 FIU_WQMP_HISTORICAL            0     484218       0.0%
12     FIU_WQMP_RECENT            0       4020       0.0%
13          MiamiBeach            0      27484       0.0%
14           PALMBEACH            0       7658       0.0%
15    PALMBEACH_STORET            0      13145       0.0%
16                SFER           18      76230       0.0%
show percent of missing ParameterUnits
library(dplyr)
library(scales)
# show percentage of missing or na rows 
# 1. Get the total number of rows for each program
total_counts <- df %>%
  count(ProgramName, name = "total_rows")

# 2. Get the number of NA rows for each program
na_counts <- df %>%
  filter(is.na(ParameterUnits)) %>%
  count(ProgramName, name = "missing_rows")

# 3. Combine total and NA counts, calculate percentage
percentage_table <- total_counts %>%
  left_join(na_counts, by = "ProgramName") %>%
  mutate(
    # If a program had no NAs, na_counts will be NA. Replace with 0.
    missing_rows = ifelse(is.na(missing_rows), 0, missing_rows),
    
    # Calculate the percentage
    percentage_raw = missing_rows / total_rows,
    
    # Format the percentage for clean table output
    percentage = scales::percent(percentage_raw, accuracy = 0.1)
  ) %>%
  
  # Select and order the columns for the final table
  select(ProgramName, missing_rows, total_rows, percentage) %>%
  
  # Sort by program name
  arrange(ProgramName)

# 4. Print the final data frame (which R displays as a table)
print(percentage_table)
           ProgramName missing_rows total_rows percentage
1            AOML_FBBB            0      43401       0.0%
2                 BBAP            0      10594       0.0%
3                 BBWW            0        910       0.0%
4              BROWARD            0      15142       0.0%
5       BROWARD_STORET            0      24290       0.0%
6                  DEP            0      99792       0.0%
7            DERM_BBWQ            0     144946       0.0%
8     DERM_BBWQ_STORET            0     442825       0.0%
9        FIU_Estuaries            0     388128       0.0%
10            FIU_WQMP            0      62621       0.0%
11 FIU_WQMP_HISTORICAL            0     484218       0.0%
12     FIU_WQMP_RECENT         4020       4020     100.0%
13          MiamiBeach            0      27484       0.0%
14           PALMBEACH            0       7658       0.0%
15    PALMBEACH_STORET            0      13145       0.0%
16                SFER        76230      76230     100.0%
Code
#show table from df with a row for each unique of 
# (ProgramLocationID and ProgramName) with columns 
# programLocationID, ProgramName, and N_locations, 
# where N_locations is the number of unique OriginalLatitude and
# OriginalLongitude pairs for the programLocationID+ProgramName.

library(dplyr)

summary_table <- df %>%
  group_by(ProgramLocationID, ProgramName) %>%
  summarise(
    N_locations = n_distinct(paste(OriginalLatitude, OriginalLongitude)),
    .groups = "drop"
  ) %>%
  filter(N_locations > 1)

summary_table
# A tibble: 292 × 3
   ProgramLocationID ProgramName N_locations
   <chr>             <chr>             <int>
 1 1                 AOML_FBBB           133
 2 1                 SFER                 51
 3 10                AOML_FBBB           113
 4 10                SFER                 53
 5 11                AOML_FBBB           117
 6 11                SFER                 55
 7 12                AOML_FBBB           116
 8 12                SFER                 57
 9 13                AOML_FBBB           119
10 13                SFER                 54
# ℹ 282 more rows

Summary of column completeness

Code
skimr::skim(df)
Data summary
Name df
Number of rows 1845404
Number of columns 35
_______________________
Column type frequency:
character 18
logical 1
numeric 16
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Habitat 1480700 0.20 12 21 0 3 0
IndicatorName 1821453 0.01 9 13 0 3 0
ManagedAreaName 1821453 0.01 29 38 0 2 0
RelativeDepth 890081 0.52 0 7 17934 4 0
DetectionUnit 1843238 0.00 4 4 0 1 0
SEACAR_QAQCFlagCode 1821453 0.01 2 13 0 26 0
SEACAR_QAQC_Description 1821453 0.01 31 195 0 26 0
ExportVersion 1821453 0.01 23 23 0 3 0
Region 1778072 0.04 2 19 0 22 0
ProgramName 0 1.00 3 19 0 16 0
ParameterName 0 1.00 2 41 0 20 0
ParameterUnits 80250 0.96 0 10 14862 20 0
ProgramLocationID 18 1.00 1 25 0 2133 0
ActivityType 1503741 0.19 5 15 0 4 0
SampleDate 18 1.00 10 10 0 3702 0
ValueQualifier 1305542 0.29 0 4 221265 95 0
ResultComments 1504579 0.18 0 874 308298 756 9
SEACAR_EventID 1822363 0.01 36 36 0 2394 0

Variable type: logical

skim_variable n_missing complete_rate mean count
ValueQualifierSource 1845404 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
RowID 1822363 0.01 1009723.34 798651.12 486217.00 514368.00 534968.00 1551534.00 3870318.00 ▇▂▁▁▁
ProgramID 0 1.00 2165.93 2360.54 3.00 297.00 509.00 4018.00 10013.00 ▇▁▆▁▁
IndicatorID 1821453 0.01 6.73 0.63 6.00 6.00 7.00 7.00 8.00 ▆▁▇▁▂
ParameterID 1821453 0.01 8.60 6.56 1.00 3.00 6.00 16.00 19.00 ▇▃▁▂▅
AreaID 1821453 0.01 6.09 0.89 6.00 6.00 6.00 6.00 15.00 ▇▁▁▁▁
Year 972959 0.47 2010.98 9.77 1989.48 2002.08 2016.00 2020.00 2024.00 ▁▃▃▁▇
Month 972941 0.47 6.59 3.44 0.00 4.00 7.00 10.00 12.00 ▅▅▇▆▇
TotalDepth_m 1845341 0.00 62.22 105.84 1.10 2.10 2.85 8.50 250.00 ▇▁▁▁▂
MDL 1657062 0.10 0.06 0.49 0.00 0.00 0.01 0.08 50.50 ▇▁▁▁▁
PQL 1657062 0.10 0.18 2.41 0.00 0.01 0.06 0.20 250.00 ▇▁▁▁▁
Include 1821453 0.01 1.00 0.03 0.00 1.00 1.00 1.00 1.00 ▁▁▁▁▇
MADup 1821453 0.01 1.00 0.00 1.00 1.00 1.00 1.00 1.00 ▁▁▇▁▁
ResultValue 166271 0.91 11.01 389.59 -0.03 0.03 0.61 6.00 53465.00 ▇▁▁▁▁
ActivityDepth_m 991216 0.46 6.28 14.51 0.00 0.50 3.20 8.49 2494.00 ▇▁▁▁▁
OriginalLatitude 496660 0.73 25.35 0.62 24.38 24.83 25.28 25.78 28.78 ▇▇▁▁▁
OriginalLongitude 496714 0.73 -80.98 0.74 -85.02 -81.52 -80.93 -80.29 -80.02 ▁▁▁▆▇
Plot providers row count grouped by analyte
library(ggplot2)

ggplot(df, aes(x = ProgramName, fill = ParameterName)) +
  geom_bar() +
  scale_y_log10() +
  labs(
    title = "Count of Observations by Program and Parameter",
    x = "Program Name",
    y = "Count (Log Scale)",
    fill = "Parameter"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

show n samples from bottom & surface
library(ggplot2)

ggplot(df, aes(x = RelativeDepth)) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Number of Samples by Relative Depth",
    x = "Relative Depth",
    y = "Number of Samples"
  ) +
  theme_minimal()

Code
source(here("R/mutateWINToSEACAR.R"))
df <- df_all %>% mutateWINToSEACAR(.keep = "unused")
SEACAR_COLUMNS <- c(
 "ExportVersion","RowID","ProgramID","ProgramName","Habitat",
 "IndicatorID","IndicatorName","ParameterID","ParameterName",
 "ParameterUnits","ProgramLocationID","AreaID","ManagedAreaName",
 "Region","ActivityType","SampleDate","ResultValue","Year","Month",
 "ActivityDepth_m","RelativeDepth","TotalDepth_m","MDL","PQL",
 "DetectionUnit","ValueQualifier","ValueQualifierSource",
 "SampleFraction","ResultComments","OriginalLatitude",
 "OriginalLongitude","SEACAR_QAQCFlagCode",
 "SEACAR_QAQC_Description","Include","SEACAR_EventID","MADup"
)

# figure out which are actually present
present_cols <- intersect(SEACAR_COLUMNS, names(df))
missing_cols <- setdiff(SEACAR_COLUMNS, names(df))

if (length(missing_cols) > 0) {
 warning("\nThese SEACAR_COLUMNS were not found in df:\n  ",
         paste(missing_cols, collapse = ", "))
}

# Any additional columns in df that are not mapped to SEACAR
extra_columns <- setdiff(names(df), SEACAR_COLUMNS)
cat("\nColumns from source data that are not (directly) mapped to SEACAR columns:\n")

Columns from source data that are not (directly) mapped to SEACAR columns:
Code
print(extra_columns)
character(0)