In 2026 management of this data is being transferred to SEACAR In this file we prepare a transformation of the dataset for SEACAR ingestion, and perform some checks.

The data here should align with standards described in SEACAR_Metadata_UnifiedWQ_ID10006.xlsx.

Code
librarian::shelf(
  dplyr,
  glue,
  here,
  tidyr
)
Code
library("here")
df_all <- read.csv(here("data/exports/allDataRaw.csv"))
source(here("R/dropNonStandardAnalytes.R"))
df <- dropNonStandardAnalytes(df_all)

source(here("R/mutateWINToSEACAR.R"))
df <- mutateWINToSEACAR(df)

# write SEACAR .csv
write.csv(df, here("data", "exports", "allDataSEACAR.csv"))

Summary of column completeness

Code
skimr::skim(df)
Data summary
Name df
Number of rows 1235477
Number of columns 35
_______________________
Column type frequency:
character 18
logical 1
numeric 16
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Habitat 963802 0.22 12 21 0 5 0
IndicatorName 1211526 0.02 9 13 0 3 0
ManagedAreaName 1211526 0.02 29 38 0 2 0
RelativeDepth 1135314 0.08 3 7 0 3 0
DetectionUnit 1233311 0.00 4 4 0 1 0
SEACAR_QAQCFlagCode 1211526 0.02 2 13 0 26 0
SEACAR_QAQC_Description 1211526 0.02 31 195 0 26 0
ExportVersion 1211526 0.02 23 23 0 3 0
Region 1134858 0.08 2 19 0 22 0
ProgramName 0 1.00 3 19 0 16 0
ParameterName 0 1.00 2 41 0 20 0
ParameterUnits 79580 0.94 0 10 26408 20 0
ProgramLocationID 18 1.00 0 25 16624 2013 0
ActivityType 986843 0.20 5 35 0 12 0
SampleDate 564685 0.54 8 10 0 2610 0
ValueQualifier 788644 0.36 0 4 163070 93 0
ResultComments 987681 0.20 0 874 221332 768 9
SEACAR_EventID 1212436 0.02 36 36 0 2394 0

Variable type: logical

skim_variable n_missing complete_rate mean count
ValueQualifierSource 1235477 0 NaN :

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
RowID 1212436 0.02 1009723.34 798651.12 486217.00 514368.00 534968.00 1551534.00 3870318.00 ▇▂▁▁▁
ProgramID 1211526 0.02 4057.96 0.19 4057.00 4058.00 4058.00 4058.00 4058.00 ▁▁▁▁▇
IndicatorID 1211526 0.02 6.73 0.63 6.00 6.00 7.00 7.00 8.00 ▆▁▇▁▂
ParameterID 1211526 0.02 8.60 6.56 1.00 3.00 6.00 16.00 19.00 ▇▃▁▂▅
AreaID 1211526 0.02 6.09 0.89 6.00 6.00 6.00 6.00 15.00 ▇▁▁▁▁
Year 579870 0.53 2011.00 9.94 1989.48 2001.86 2016.00 2020.00 2024.00 ▂▃▃▁▇
Month 579852 0.53 6.57 3.45 0.00 4.00 7.00 10.00 12.00 ▅▅▇▅▇
TotalDepth_m 1235414 0.00 62.22 105.84 1.10 2.10 2.85 8.50 250.00 ▇▁▁▁▂
MDL 1096047 0.11 0.06 0.57 0.00 0.00 0.03 0.08 50.50 ▇▁▁▁▁
PQL 1096047 0.11 0.21 2.80 0.00 0.01 0.06 0.25 250.00 ▇▁▁▁▁
Include 1211526 0.02 1.00 0.03 0.00 1.00 1.00 1.00 1.00 ▁▁▁▁▇
MADup 1211526 0.02 1.00 0.00 1.00 1.00 1.00 1.00 1.00 ▁▁▇▁▁
ResultValue 78726 0.94 10.74 469.20 -0.03 0.09 0.69 3.40 53465.00 ▇▁▁▁▁
ActivityDepth_m 967966 0.22 0.48 0.20 0.00 0.50 0.50 0.50 1.00 ▂▁▇▂▁
OriginalLatitude 512995 0.58 25.58 0.57 24.40 25.16 25.55 25.87 28.78 ▅▇▂▁▁
OriginalLongitude 512995 0.58 -80.83 0.70 -85.02 -81.28 -80.68 -80.24 -80.02 ▁▁▁▃▇
Plot providers row count grouped by analyte
library(ggplot2)

ggplot(df, aes(x = ProgramName, fill = ParameterName)) +
  geom_bar() +
  scale_y_log10() +
  labs(
    title = "Count of Observations by Program and Parameter",
    x = "Program Name",
    y = "Count (Log Scale)",
    fill = "Parameter"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

Code
source(here("R/mutateWINToSEACAR.R"))
df <- df_all %>% mutateWINToSEACAR(.keep = "unused")
SEACAR_COLUMNS <- c(
 "ExportVersion","RowID","ProgramID","ProgramName","Habitat",
 "IndicatorID","IndicatorName","ParameterID","ParameterName",
 "ParameterUnits","ProgramLocationID","AreaID","ManagedAreaName",
 "Region","ActivityType","SampleDate","ResultValue","Year","Month",
 "ActivityDepth_m","RelativeDepth","TotalDepth_m","MDL","PQL",
 "DetectionUnit","ValueQualifier","ValueQualifierSource",
 "SampleFraction","ResultComments","OriginalLatitude",
 "OriginalLongitude","SEACAR_QAQCFlagCode",
 "SEACAR_QAQC_Description","Include","SEACAR_EventID","MADup"
)

# figure out which are actually present
present_cols <- intersect(SEACAR_COLUMNS, names(df))
missing_cols <- setdiff(SEACAR_COLUMNS, names(df))

if (length(missing_cols) > 0) {
 warning("\nThese SEACAR_COLUMNS were not found in df:\n  ",
         paste(missing_cols, collapse = ", "))
}

# Any additional columns in df that are not mapped to SEACAR
extra_columns <- setdiff(names(df), SEACAR_COLUMNS)
cat("\nColumns from source data that are not (directly) mapped to SEACAR columns:\n")

Columns from source data that are not (directly) mapped to SEACAR columns:
Code
print(extra_columns)
 [1] "X"                              "keyfield"                      
 [3] "Activity.ID"                    "year"                          
 [5] "month"                          "day"                           
 [7] "time"                           "lat_deg"                       
 [9] "lat_min"                        "lon_deg"                       
[11] "lon_min"                        "station_type"                  
[13] "depth_class"                    "depth_order"                   
[15] "cast"                           "nisk_start"                    
[17] "nisk_end"                       "notes"                         
[19] "data_source"                    "CLIENT.SAMPLE.ID"              
[21] "LAB.SAMPLE.ID"                  "MATRIX"                        
[23] "COLLECTED"                      "ANALYTE"                       
[25] "SAMPLE.RESULT"                  "REPORTING.LIMIT"               
[27] "UNITS"                          "METHOD"                        
[29] "DILUTION"                       "ANALYZED"                      
[31] "PREPARED"                       "source_file"                   
[33] "SAMPLE.COLLECTION.DATE"         "DEP.Result.ID"                 
[35] "SURV"                           "TIME"                          
[37] "STA"                            "BASIN"                         
[39] "CLUSTER"                        "ZSI"                           
[41] "ZONE"                           "YEAR"                          
[43] "DETECTION.LIMITS"               "NOX.DL"                        
[45] "NO3.DL"                         "NO2.DL"                        
[47] "NH4.DL"                         "TN.DL"                         
[49] "DIN.DL"                         "TON.DL"                        
[51] "TP.DL"                          "SRP.DL"                        
[53] "APA.DL"                         "CHLA.DL"                       
[55] "TOC.DL"                         "SiO2.DL"                       
[57] "Station"                        "Date"                          
[59] "Organization.ID"                "SEGMENT"                       
[61] "DATE"                           "STATION"                       
[63] "SITE"                           "LATDEC"                        
[65] "LONDEC"                         "DEPTH"                         
[67] "Long.Deg"                       "Long.Min"                      
[69] "Lat.Deg"                        "Lat.Min"                       
[71] "Sampling.Agency.Name"           "Value.1"                       
[73] "Org.Latitude..DD.MM.SS.SSSS."   "Org.Longitude..DD.MM.SS.SSSS." 
[75] "WBID"                           "Sample.Collection.Type"        
[77] "Activity.Depth.Unit"            "Activity.Top.Depth"            
[79] "Activity.Bottom.Depth"          "Activity.Depth.Top.Bottom.Unit"
[81] "DEP.Result.Value.Text"          "Sample.Fraction"               
[83] "Lab.ID"                         "Audit.Censored.Decisions"      
[85] "source"