seacar-check – FCRWQDC Ingest Reports

In 2026 management of this data is being transferred to SEACAR In this file we prepare a transformation of the dataset for SEACAR ingestion, and perform some checks.

The data here should align with standards described in SEACAR_Metadata_UnifiedWQ_ID10006.xlsx.

Code

librarian::shelf(
  dplyr,
  glue,
  here,
  tidyr
)

Code

library("here")
df_all <- read.csv(here("data/exports/allDataRaw.csv"))
source(here("R/dropNonStandardAnalytes.R"))
df <- dropNonStandardAnalytes(df_all)

source(here("R/mutateWINToSEACAR.R"))
df <- mutateWINToSEACAR(df)

# write SEACAR .csv
write.csv(df, here("data", "exports", "allDataSEACAR.csv"))

show percent of missing programID

library(dplyr)
library(scales)
# show percentage of missing or na rows 
# 1. Get the total number of rows for each program
total_counts <- df %>%
  count(ProgramName, name = "total_rows")

# 2. Get the number of NA rows for each program
na_counts <- df %>%
  filter(is.na(ProgramID)) %>%
  count(ProgramName, name = "missing_rows")

# 3. Combine total and NA counts, calculate percentage
percentage_table <- total_counts %>%
  left_join(na_counts, by = "ProgramName") %>%
  mutate(
    # If a program had no NAs, na_counts will be NA. Replace with 0.
    missing_rows = ifelse(is.na(missing_rows), 0, missing_rows),
    
    # Calculate the percentage
    percentage_raw = missing_rows / total_rows,
    
    # Format the percentage for clean table output
    percentage = scales::percent(percentage_raw, accuracy = 0.1)
  ) %>%
  
  # Select and order the columns for the final table
  select(ProgramName, missing_rows, total_rows, percentage) %>%
  
  # Sort by program name
  arrange(ProgramName)

# 4. Print the final data frame (which R displays as a table)
print(percentage_table)

           ProgramName missing_rows total_rows percentage
1            AOML_FBBB            0      43401       0.0%
2                 BBAP            0      10594       0.0%
3                 BBWW            0        910       0.0%
4              BROWARD            0      15142       0.0%
5       BROWARD_STORET            0      24290       0.0%
6                  DEP            0      99792       0.0%
7            DERM_BBWQ            0     144946       0.0%
8     DERM_BBWQ_STORET            0     442825       0.0%
9        FIU_Estuaries            0     388128       0.0%
10            FIU_WQMP            0      62621       0.0%
11 FIU_WQMP_HISTORICAL            0     484218       0.0%
12     FIU_WQMP_RECENT            0       4020       0.0%
13          MiamiBeach            0      27484       0.0%
14           PALMBEACH            0       7658       0.0%
15    PALMBEACH_STORET            0      13145       0.0%
16                SFER            0      76230       0.0%

show percent of missing SampleDate

library(dplyr)
library(scales)
# show percentage of missing or na rows 
# 1. Get the total number of rows for each program
total_counts <- df %>%
  count(ProgramName, name = "total_rows")

# 2. Get the number of NA rows for each program
na_counts <- df %>%
  filter(is.na(SampleDate)) %>%
  count(ProgramName, name = "missing_rows")

# 3. Combine total and NA counts, calculate percentage
percentage_table <- total_counts %>%
  left_join(na_counts, by = "ProgramName") %>%
  mutate(
    # If a program had no NAs, na_counts will be NA. Replace with 0.
    missing_rows = ifelse(is.na(missing_rows), 0, missing_rows),
    
    # Calculate the percentage
    percentage_raw = missing_rows / total_rows,
    
    # Format the percentage for clean table output
    percentage = scales::percent(percentage_raw, accuracy = 0.1)
  ) %>%
  
  # Select and order the columns for the final table
  select(ProgramName, missing_rows, total_rows, percentage) %>%
  
  # Sort by program name
  arrange(ProgramName)

# 4. Print the final data frame (which R displays as a table)
print(percentage_table)

           ProgramName missing_rows total_rows percentage
1            AOML_FBBB            0      43401       0.0%
2                 BBAP            0      10594       0.0%
3                 BBWW            0        910       0.0%
4              BROWARD            0      15142       0.0%
5       BROWARD_STORET            0      24290       0.0%
6                  DEP            0      99792       0.0%
7            DERM_BBWQ            0     144946       0.0%
8     DERM_BBWQ_STORET            0     442825       0.0%
9        FIU_Estuaries            0     388128       0.0%
10            FIU_WQMP            0      62621       0.0%
11 FIU_WQMP_HISTORICAL            0     484218       0.0%
12     FIU_WQMP_RECENT            0       4020       0.0%
13          MiamiBeach            0      27484       0.0%
14           PALMBEACH            0       7658       0.0%
15    PALMBEACH_STORET            0      13145       0.0%
16                SFER           18      76230       0.0%

show percent of missing ResultValue

library(dplyr)
library(scales)
# show percentage of missing or na rows 
# 1. Get the total number of rows for each program
total_counts <- df %>%
  count(ProgramName, name = "total_rows")

# 2. Get the number of NA rows for each program
na_counts <- df %>%
  filter(is.na(ResultValue)) %>%
  count(ProgramName, name = "missing_rows")

# 3. Combine total and NA counts, calculate percentage
percentage_table <- total_counts %>%
  left_join(na_counts, by = "ProgramName") %>%
  mutate(
    # If a program had no NAs, na_counts will be NA. Replace with 0.
    missing_rows = ifelse(is.na(missing_rows), 0, missing_rows),
    
    # Calculate the percentage
    percentage_raw = missing_rows / total_rows,
    
    # Format the percentage for clean table output
    percentage = scales::percent(percentage_raw, accuracy = 0.1)
  ) %>%
  
  # Select and order the columns for the final table
  select(ProgramName, missing_rows, total_rows, percentage) %>%
  
  # Sort by program name
  arrange(ProgramName)

# 4. Print the final data frame (which R displays as a table)
print(percentage_table)

           ProgramName missing_rows total_rows percentage
1            AOML_FBBB            0      43401       0.0%
2                 BBAP            3      10594       0.0%
3                 BBWW            0        910       0.0%
4              BROWARD            0      15142       0.0%
5       BROWARD_STORET         1410      24290       5.8%
6                  DEP            0      99792       0.0%
7            DERM_BBWQ           15     144946       0.0%
8     DERM_BBWQ_STORET         2850     442825       0.6%
9        FIU_Estuaries        21510     388128       5.5%
10            FIU_WQMP         2773      62621       4.4%
11 FIU_WQMP_HISTORICAL       121735     484218      25.1%
12     FIU_WQMP_RECENT          773       4020      19.2%
13          MiamiBeach          347      27484       1.3%
14           PALMBEACH           44       7658       0.6%
15    PALMBEACH_STORET          329      13145       2.5%
16                SFER        14482      76230      19.0%

show percent of missing coordinates

library(dplyr)
library(scales)
# show percentage of missing or na rows in df$Monitoring.Location.ID grouped by df$program
# 1. Get the total number of rows for each program
total_counts <- df %>%
  count(ProgramName, name = "total_rows")

# 2. Get the number of NA rows for each program
na_counts <- df %>%
  filter(is.na(OriginalLatitude) | is.na(OriginalLongitude)) %>%
  count(ProgramName, name = "missing_rows")

# 3. Combine total and NA counts, calculate percentage
percentage_table <- total_counts %>%
  left_join(na_counts, by = "ProgramName") %>%
  mutate(
    # If a program had no NAs, na_counts will be NA. Replace with 0.
    missing_rows = ifelse(is.na(missing_rows), 0, missing_rows),
    
    # Calculate the percentage
    percentage_raw = missing_rows / total_rows,
    
    # Format the percentage for clean table output
    percentage = scales::percent(percentage_raw, accuracy = 0.1)
  ) %>%
  
  # Select and order the columns for the final table
  select(ProgramName, missing_rows, total_rows, percentage) %>%
  
  # Sort by program name
  arrange(ProgramName)

# 4. Print the final data frame (which R displays as a table)
print(percentage_table)

           ProgramName missing_rows total_rows percentage
1            AOML_FBBB            0      43401       0.0%
2                 BBAP            0      10594       0.0%
3                 BBWW            0        910       0.0%
4              BROWARD            0      15142       0.0%
5       BROWARD_STORET        24290      24290     100.0%
6                  DEP          261      99792       0.3%
7            DERM_BBWQ            0     144946       0.0%
8     DERM_BBWQ_STORET       442825     442825     100.0%
9        FIU_Estuaries            0     388128       0.0%
10            FIU_WQMP            0      62621       0.0%
11 FIU_WQMP_HISTORICAL           54     484218       0.0%
12     FIU_WQMP_RECENT         4020       4020     100.0%
13          MiamiBeach         4443      27484      16.2%
14           PALMBEACH         7658       7658     100.0%
15    PALMBEACH_STORET        13145      13145     100.0%
16                SFER           18      76230       0.0%

show percent of missing ProgramLocationID

library(dplyr)
library(scales)
# show percentage of missing or na rows 
# 1. Get the total number of rows for each program
total_counts <- df %>%
  count(ProgramName, name = "total_rows")

# 2. Get the number of NA rows for each program
na_counts <- df %>%
  filter(is.na(ProgramLocationID)) %>%
  count(ProgramName, name = "missing_rows")

# 3. Combine total and NA counts, calculate percentage
percentage_table <- total_counts %>%
  left_join(na_counts, by = "ProgramName") %>%
  mutate(
    # If a program had no NAs, na_counts will be NA. Replace with 0.
    missing_rows = ifelse(is.na(missing_rows), 0, missing_rows),
    
    # Calculate the percentage
    percentage_raw = missing_rows / total_rows,
    
    # Format the percentage for clean table output
    percentage = scales::percent(percentage_raw, accuracy = 0.1)
  ) %>%
  
  # Select and order the columns for the final table
  select(ProgramName, missing_rows, total_rows, percentage) %>%
  
  # Sort by program name
  arrange(ProgramName)

# 4. Print the final data frame (which R displays as a table)
print(percentage_table)

           ProgramName missing_rows total_rows percentage
1            AOML_FBBB            0      43401       0.0%
2                 BBAP            0      10594       0.0%
3                 BBWW            0        910       0.0%
4              BROWARD            0      15142       0.0%
5       BROWARD_STORET            0      24290       0.0%
6                  DEP            0      99792       0.0%
7            DERM_BBWQ            0     144946       0.0%
8     DERM_BBWQ_STORET            0     442825       0.0%
9        FIU_Estuaries            0     388128       0.0%
10            FIU_WQMP            0      62621       0.0%
11 FIU_WQMP_HISTORICAL            0     484218       0.0%
12     FIU_WQMP_RECENT            0       4020       0.0%
13          MiamiBeach            0      27484       0.0%
14           PALMBEACH            0       7658       0.0%
15    PALMBEACH_STORET            0      13145       0.0%
16                SFER           18      76230       0.0%

show percent of missing ParameterUnits

library(dplyr)
library(scales)
# show percentage of missing or na rows 
# 1. Get the total number of rows for each program
total_counts <- df %>%
  count(ProgramName, name = "total_rows")

# 2. Get the number of NA rows for each program
na_counts <- df %>%
  filter(is.na(ParameterUnits)) %>%
  count(ProgramName, name = "missing_rows")

# 3. Combine total and NA counts, calculate percentage
percentage_table <- total_counts %>%
  left_join(na_counts, by = "ProgramName") %>%
  mutate(
    # If a program had no NAs, na_counts will be NA. Replace with 0.
    missing_rows = ifelse(is.na(missing_rows), 0, missing_rows),
    
    # Calculate the percentage
    percentage_raw = missing_rows / total_rows,
    
    # Format the percentage for clean table output
    percentage = scales::percent(percentage_raw, accuracy = 0.1)
  ) %>%
  
  # Select and order the columns for the final table
  select(ProgramName, missing_rows, total_rows, percentage) %>%
  
  # Sort by program name
  arrange(ProgramName)

# 4. Print the final data frame (which R displays as a table)
print(percentage_table)

           ProgramName missing_rows total_rows percentage
1            AOML_FBBB            0      43401       0.0%
2                 BBAP            0      10594       0.0%
3                 BBWW            0        910       0.0%
4              BROWARD            0      15142       0.0%
5       BROWARD_STORET            0      24290       0.0%
6                  DEP            0      99792       0.0%
7            DERM_BBWQ            0     144946       0.0%
8     DERM_BBWQ_STORET            0     442825       0.0%
9        FIU_Estuaries            0     388128       0.0%
10            FIU_WQMP            0      62621       0.0%
11 FIU_WQMP_HISTORICAL            0     484218       0.0%
12     FIU_WQMP_RECENT         4020       4020     100.0%
13          MiamiBeach            0      27484       0.0%
14           PALMBEACH            0       7658       0.0%
15    PALMBEACH_STORET            0      13145       0.0%
16                SFER        76230      76230     100.0%

Code

#show table from df with a row for each unique of 
# (ProgramLocationID and ProgramName) with columns 
# programLocationID, ProgramName, and N_locations, 
# where N_locations is the number of unique OriginalLatitude and
# OriginalLongitude pairs for the programLocationID+ProgramName.

library(dplyr)

summary_table <- df %>%
  group_by(ProgramLocationID, ProgramName) %>%
  summarise(
    N_locations = n_distinct(paste(OriginalLatitude, OriginalLongitude)),
    .groups = "drop"
  ) %>%
  filter(N_locations > 1)

summary_table

# A tibble: 292 × 3
   ProgramLocationID ProgramName N_locations
   <chr>             <chr>             <int>
 1 1                 AOML_FBBB           133
 2 1                 SFER                 51
 3 10                AOML_FBBB           113
 4 10                SFER                 53
 5 11                AOML_FBBB           117
 6 11                SFER                 55
 7 12                AOML_FBBB           116
 8 12                SFER                 57
 9 13                AOML_FBBB           119
10 13                SFER                 54
# ℹ 282 more rows

Summary of column completeness

Code

skimr::skim(df)

Data summary
Name	df
Number of rows	1845404
Number of columns	35
_______________________
Column type frequency:
character	18
logical	1
numeric	16
________________________
Group variables	None

Variable type: character

skim_variable	n_missing	complete_rate	min	max	empty	n_unique	whitespace
Habitat	1480700	0.20	12	21	0	3	0
IndicatorName	1821453	0.01	9	13	0	3	0
ManagedAreaName	1821453	0.01	29	38	0	2	0
RelativeDepth	890081	0.52	0	7	17934	4	0
DetectionUnit	1843238	0.00	4	4	0	1	0
SEACAR_QAQCFlagCode	1821453	0.01	2	13	0	26	0
SEACAR_QAQC_Description	1821453	0.01	31	195	0	26	0
ExportVersion	1821453	0.01	23	23	0	3	0
Region	1778072	0.04	2	19	0	22	0
ProgramName	0	1.00	3	19	0	16	0
ParameterName	0	1.00	2	41	0	20	0
ParameterUnits	80250	0.96	0	10	14862	20	0
ProgramLocationID	18	1.00	1	25	0	2133	0
ActivityType	1503741	0.19	5	15	0	4	0
SampleDate	18	1.00	10	10	0	3702	0
ValueQualifier	1305542	0.29	0	4	221265	95	0
ResultComments	1504579	0.18	0	874	308298	756	9
SEACAR_EventID	1822363	0.01	36	36	0	2394	0

Variable type: logical

skim_variable	n_missing	complete_rate	mean	count
ValueQualifierSource	1845404	0	NaN	:

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
RowID	1822363	0.01	1009723.34	798651.12	486217.00	514368.00	534968.00	1551534.00	3870318.00	▇▂▁▁▁
ProgramID	0	1.00	2165.93	2360.54	3.00	297.00	509.00	4018.00	10013.00	▇▁▆▁▁
IndicatorID	1821453	0.01	6.73	0.63	6.00	6.00	7.00	7.00	8.00	▆▁▇▁▂
ParameterID	1821453	0.01	8.60	6.56	1.00	3.00	6.00	16.00	19.00	▇▃▁▂▅
AreaID	1821453	0.01	6.09	0.89	6.00	6.00	6.00	6.00	15.00	▇▁▁▁▁
Year	972959	0.47	2010.98	9.77	1989.48	2002.08	2016.00	2020.00	2024.00	▁▃▃▁▇
Month	972941	0.47	6.59	3.44	0.00	4.00	7.00	10.00	12.00	▅▅▇▆▇
TotalDepth_m	1845341	0.00	62.22	105.84	1.10	2.10	2.85	8.50	250.00	▇▁▁▁▂
MDL	1657062	0.10	0.06	0.49	0.00	0.00	0.01	0.08	50.50	▇▁▁▁▁
PQL	1657062	0.10	0.18	2.41	0.00	0.01	0.06	0.20	250.00	▇▁▁▁▁
Include	1821453	0.01	1.00	0.03	0.00	1.00	1.00	1.00	1.00	▁▁▁▁▇
MADup	1821453	0.01	1.00	0.00	1.00	1.00	1.00	1.00	1.00	▁▁▇▁▁
ResultValue	166271	0.91	11.01	389.59	-0.03	0.03	0.61	6.00	53465.00	▇▁▁▁▁
ActivityDepth_m	991216	0.46	6.28	14.51	0.00	0.50	3.20	8.49	2494.00	▇▁▁▁▁
OriginalLatitude	496660	0.73	25.35	0.62	24.38	24.83	25.28	25.78	28.78	▇▇▁▁▁
OriginalLongitude	496714	0.73	-80.98	0.74	-85.02	-81.52	-80.93	-80.29	-80.02	▁▁▁▆▇

Plot providers row count grouped by analyte

library(ggplot2)

ggplot(df, aes(x = ProgramName, fill = ParameterName)) +
  geom_bar() +
  scale_y_log10() +
  labs(
    title = "Count of Observations by Program and Parameter",
    x = "Program Name",
    y = "Count (Log Scale)",
    fill = "Parameter"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

show n samples from bottom & surface

library(ggplot2)

ggplot(df, aes(x = RelativeDepth)) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Number of Samples by Relative Depth",
    x = "Relative Depth",
    y = "Number of Samples"
  ) +
  theme_minimal()

Code

source(here("R/mutateWINToSEACAR.R"))
df <- df_all %>% mutateWINToSEACAR(.keep = "unused")
SEACAR_COLUMNS <- c(
 "ExportVersion","RowID","ProgramID","ProgramName","Habitat",
 "IndicatorID","IndicatorName","ParameterID","ParameterName",
 "ParameterUnits","ProgramLocationID","AreaID","ManagedAreaName",
 "Region","ActivityType","SampleDate","ResultValue","Year","Month",
 "ActivityDepth_m","RelativeDepth","TotalDepth_m","MDL","PQL",
 "DetectionUnit","ValueQualifier","ValueQualifierSource",
 "SampleFraction","ResultComments","OriginalLatitude",
 "OriginalLongitude","SEACAR_QAQCFlagCode",
 "SEACAR_QAQC_Description","Include","SEACAR_EventID","MADup"
)

# figure out which are actually present
present_cols <- intersect(SEACAR_COLUMNS, names(df))
missing_cols <- setdiff(SEACAR_COLUMNS, names(df))

if (length(missing_cols) > 0) {
 warning("\nThese SEACAR_COLUMNS were not found in df:\n  ",
         paste(missing_cols, collapse = ", "))
}

# Any additional columns in df that are not mapped to SEACAR
extra_columns <- setdiff(names(df), SEACAR_COLUMNS)
cat("\nColumns from source data that are not (directly) mapped to SEACAR columns:\n")


Columns from source data that are not (directly) mapped to SEACAR columns:

Code

print(extra_columns)

character(0)