In 2026 management of this data is being transferred to SEACAR In this file we prepare a transformation of the dataset for SEACAR ingestion, and perform some checks.

The data here should align with standards described in SEACAR_Metadata_UnifiedWQ_ID10006.xlsx.

Code
librarian::shelf(
  dplyr,
  glue,
  here,
  tidyr
)
Code
source(here("R/getAllData.R"))
df_all <- getAllData()
=== LOADING PROVIDER : SFER...
[1] "WARN - rows found with no location ID"
=== LOADING PROVIDER : MiamiBeach...
=== LOADING PROVIDER : BBWW...
=== LOADING PROVIDER : FIU_Estuaries...
=== LOADING PROVIDER : AOML_FBBB...
=== LOADING PROVIDER : BBAP...
=== LOADING PROVIDER : BROWARD...
=== LOADING PROVIDER : DEP...
=== LOADING PROVIDER : DERM_BBWQ...
=== LOADING PROVIDER : FIU_WQMP...
=== LOADING PROVIDER : PALMBEACH...
Code
source(here("R/mutateWINToSEACAR.R"))
df <- mutateWINToSEACAR(df_all)

# write SEACAR .csv
write.csv(df, here("data", "exports", "allDataSEACAR.csv"))

Summary of column completeness

Code
skimr::skim(df)
Data summary
Name df
Number of rows 1422618
Number of columns 35
_______________________
Column type frequency:
character 28
Date 1
numeric 6
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
RowID 1399577 0.02 6 7 0 23041 0
ProgramID 1398667 0.02 4 4 0 2 0
Habitat 1078890 0.24 12 21 0 5 0
IndicatorID 1398667 0.02 1 1 0 3 0
IndicatorName 1398667 0.02 9 13 0 3 0
ParameterID 1398667 0.02 1 2 0 10 0
AreaID 1398667 0.02 1 2 0 2 0
ManagedAreaName 1398667 0.02 29 38 0 2 0
RelativeDepth 1305519 0.08 3 7 0 3 0
TotalDepth_m 1422555 0.00 6 8 0 11 0
MDL 1102841 0.22 0 11 110146 241 0
PQL 1102841 0.22 0 11 110146 180 0
DetectionUnit 1420452 0.00 4 4 0 1 0
ValueQualifierSource 1422618 0.00 NA NA 0 0 0
SEACAR_QAQCFlagCode 1398667 0.02 2 13 0 26 0
SEACAR_QAQC_Description 1398667 0.02 31 195 0 26 0
Include 1398667 0.02 1 1 0 2 0
MADup 1398667 0.02 1 1 0 1 0
ExportVersion 1398667 0.02 23 23 0 3 0
Region 1302832 0.08 2 19 0 22 0
ProgramName 0 1.00 3 13 0 11 0
ParameterName 0 1.00 2 23 0 21 0
ParameterUnits 172638 0.88 0 10 18892 21 0
ProgramLocationID 22 1.00 0 25 23721 2003 0
ActivityType 1101931 0.23 5 35 0 12 0
ValueQualifier 894177 0.37 0 5 184005 127 0
ResultComments 1102753 0.22 0 874 280333 939 11
SEACAR_EventID 1399577 0.02 36 36 0 2394 0

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
SampleDate 623217 0.56 23-09-01 2024-11-17 2019-11-20 2395

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Year 631822 0.56 2011.40 9.90 1989.48 2002.18 2018.00 2020.00 2024 ▁▃▃▁▇
Month 631800 0.56 6.57 3.45 0.00 4.00 7.00 10.00 12 ▅▅▇▆▇
ResultValue 103767 0.93 1258.27 5457.01 -0.83 0.10 1.00 10.64 178550 ▇▁▁▁▁
ActivityDepth_m 1080237 0.24 0.48 0.17 0.00 0.50 0.50 0.50 1 ▂▁▇▂▁
OriginalLatitude 560180 0.61 25.54 0.60 24.00 25.12 25.54 25.87 28 ▁▇▆▁▁
OriginalLongitude 560180 0.61 -80.77 0.65 -85.00 -81.12 -80.61 -80.21 -80 ▁▁▁▃▇

List unique analyte names

Code
unique(df$ParameterName)
 [1] "Temperature"             "Salinity"               
 [3] "Dissolved_Oxygen"        "Ammonium"               
 [5] "Nitrite"                 "Nitrate"                
 [7] "Nitrate+Nitrite"         "Orthophosphate"         
 [9] "Silicate"                "Chlorophyll_a"          
[11] "Pheophytin"              "pH"                     
[13] "Specific_Conductivity"   "Turbidity"              
[15] "Total_Kjeldahl_Nitrogen" "Phosphorus"             
[17] "Total_Nitrogen"          "Fecal_Coliforms"        
[19] "Enterococci"             "Ammonia"                
[21] "Ammonia+Ammonium"       
Code
source(here("R/mutateWINToSEACAR.R"))
df <- df_all %>% mutateWINToSEACAR(.keep = "unused")
SEACAR_COLUMNS <- c(
 "ExportVersion","RowID","ProgramID","ProgramName","Habitat",
 "IndicatorID","IndicatorName","ParameterID","ParameterName",
 "ParameterUnits","ProgramLocationID","AreaID","ManagedAreaName",
 "Region","ActivityType","SampleDate","ResultValue","Year","Month",
 "ActivityDepth_m","RelativeDepth","TotalDepth_m","MDL","PQL",
 "DetectionUnit","ValueQualifier","ValueQualifierSource",
 "SampleFraction","ResultComments","OriginalLatitude",
 "OriginalLongitude","SEACAR_QAQCFlagCode",
 "SEACAR_QAQC_Description","Include","SEACAR_EventID","MADup"
)

# figure out which are actually present
present_cols <- intersect(SEACAR_COLUMNS, names(df))
missing_cols <- setdiff(SEACAR_COLUMNS, names(df))

if (length(missing_cols) > 0) {
 warning("\nThese SEACAR_COLUMNS were not found in df:\n  ",
         paste(missing_cols, collapse = ", "))
}

# compute percent‐complete only over the present ones
completeness <- df %>%
 summarise(
   across(
     any_of(present_cols),
     ~ mean(!is.na(.)) * 100,
     .names = "pct_complete_{col}"
   )
 ) %>%
 pivot_longer(
   cols      = everything(),
   names_to  = "column",
   values_to = "pct_complete"
 ) %>%
 mutate(column = sub("^pct_complete_", "", column))

print(completeness)
# A tibble: 36 × 2
   column         pct_complete
   <chr>                 <dbl>
 1 ExportVersion          1.68
 2 RowID                  1.62
 3 ProgramID              1.68
 4 ProgramName          100   
 5 Habitat               24.2 
 6 IndicatorID            1.68
 7 IndicatorName          1.68
 8 ParameterID            1.68
 9 ParameterName        100   
10 ParameterUnits        87.9 
# ℹ 26 more rows
Code
# 2) Any additional columns in df that are not mapped to SEACAR
extra_columns <- setdiff(names(df), SEACAR_COLUMNS)
cat("\nColumns from source data that are not (directly) mapped to SEACAR columns:\n")

Columns from source data that are not (directly) mapped to SEACAR columns:
Code
print(extra_columns)
 [1] "keyfield"                       "Activity.ID"                   
 [3] "year"                           "month"                         
 [5] "day"                            "time"                          
 [7] "lat_min"                        "lat_dec"                       
 [9] "lon_min"                        "lon_dec"                       
[11] "station_type"                   "depth_class"                   
[13] "depth_order"                    "cast"                          
[15] "nisk_start"                     "nisk_end"                      
[17] "notes"                          "original.analyte.name"         
[19] "data_source"                    "CLIENT SAMPLE ID"              
[21] "LAB SAMPLE ID"                  "MATRIX"                        
[23] "COLLECTED"                      "ANALYTE"                       
[25] "SAMPLE RESULT"                  "REPORTING LIMIT"               
[27] "UNITS"                          "METHOD"                        
[29] "DILUTION"                       "ANALYZED"                      
[31] "PREPARED"                       "source_file"                   
[33] "SAMPLE COLLECTION DATE"         "DEP.Result.ID"                 
[35] "SURV"                           "TIME"                          
[37] "STA"                            "BASIN"                         
[39] "CLUSTER"                        "ZSI"                           
[41] "ZONE"                           "YEAR"                          
[43] "DETECTION LIMITS"               "NOX DL"                        
[45] "NO3 DL"                         "NO2 DL"                        
[47] "NH4 DL"                         "TN DL"                         
[49] "DIN DL"                         "TON DL"                        
[51] "TP DL"                          "SRP DL"                        
[53] "APA DL"                         "CHLA DL"                       
[55] "TOC DL"                         "SiO2 DL"                       
[57] "Long Deg"                       "Long Min"                      
[59] "Lat Deg"                        "Lat Min"                       
[61] "Organization.ID"                "Org.Latitude..DD.MM.SS.SSSS."  
[63] "Org.Longitude..DD.MM.SS.SSSS."  "WBID"                          
[65] "Sample.Collection.Type"         "Sampling.Agency.Name"          
[67] "Activity.Depth.Unit"            "Activity.Top.Depth"            
[69] "Activity.Bottom.Depth"          "Activity.Depth.Top.Bottom.Unit"
[71] "DEP.Result.Value.Text"          "Sample.Fraction"               
[73] "Lab.ID"                         "Audit.Censored.Decisions"      
[75] "source"                         "Value.1"                       
[77] "Station"                        "Date"