Code
::shelf(
librarian
dplyr,
glue,
here,
tidyr )
In 2026 management of this data is being transferred to SEACAR In this file we prepare a transformation of the dataset for SEACAR ingestion, and perform some checks.
The data here should align with standards described in SEACAR_Metadata_UnifiedWQ_ID10006.xlsx
.
Name | df |
Number of rows | 1235477 |
Number of columns | 35 |
_______________________ | |
Column type frequency: | |
character | 18 |
logical | 1 |
numeric | 16 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
Habitat | 963802 | 0.22 | 12 | 21 | 0 | 5 | 0 |
IndicatorName | 1211526 | 0.02 | 9 | 13 | 0 | 3 | 0 |
ManagedAreaName | 1211526 | 0.02 | 29 | 38 | 0 | 2 | 0 |
RelativeDepth | 1135314 | 0.08 | 3 | 7 | 0 | 3 | 0 |
DetectionUnit | 1233311 | 0.00 | 4 | 4 | 0 | 1 | 0 |
SEACAR_QAQCFlagCode | 1211526 | 0.02 | 2 | 13 | 0 | 26 | 0 |
SEACAR_QAQC_Description | 1211526 | 0.02 | 31 | 195 | 0 | 26 | 0 |
ExportVersion | 1211526 | 0.02 | 23 | 23 | 0 | 3 | 0 |
Region | 1134858 | 0.08 | 2 | 19 | 0 | 22 | 0 |
ProgramName | 0 | 1.00 | 3 | 19 | 0 | 16 | 0 |
ParameterName | 0 | 1.00 | 2 | 41 | 0 | 20 | 0 |
ParameterUnits | 79580 | 0.94 | 0 | 10 | 26408 | 20 | 0 |
ProgramLocationID | 18 | 1.00 | 0 | 25 | 16624 | 2013 | 0 |
ActivityType | 986843 | 0.20 | 5 | 35 | 0 | 12 | 0 |
SampleDate | 564685 | 0.54 | 8 | 10 | 0 | 2610 | 0 |
ValueQualifier | 788644 | 0.36 | 0 | 4 | 163070 | 93 | 0 |
ResultComments | 987681 | 0.20 | 0 | 874 | 221332 | 768 | 9 |
SEACAR_EventID | 1212436 | 0.02 | 36 | 36 | 0 | 2394 | 0 |
Variable type: logical
skim_variable | n_missing | complete_rate | mean | count |
---|---|---|---|---|
ValueQualifierSource | 1235477 | 0 | NaN | : |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
RowID | 1212436 | 0.02 | 1009723.34 | 798651.12 | 486217.00 | 514368.00 | 534968.00 | 1551534.00 | 3870318.00 | ▇▂▁▁▁ |
ProgramID | 1211526 | 0.02 | 4057.96 | 0.19 | 4057.00 | 4058.00 | 4058.00 | 4058.00 | 4058.00 | ▁▁▁▁▇ |
IndicatorID | 1211526 | 0.02 | 6.73 | 0.63 | 6.00 | 6.00 | 7.00 | 7.00 | 8.00 | ▆▁▇▁▂ |
ParameterID | 1211526 | 0.02 | 8.60 | 6.56 | 1.00 | 3.00 | 6.00 | 16.00 | 19.00 | ▇▃▁▂▅ |
AreaID | 1211526 | 0.02 | 6.09 | 0.89 | 6.00 | 6.00 | 6.00 | 6.00 | 15.00 | ▇▁▁▁▁ |
Year | 579870 | 0.53 | 2011.00 | 9.94 | 1989.48 | 2001.86 | 2016.00 | 2020.00 | 2024.00 | ▂▃▃▁▇ |
Month | 579852 | 0.53 | 6.57 | 3.45 | 0.00 | 4.00 | 7.00 | 10.00 | 12.00 | ▅▅▇▅▇ |
TotalDepth_m | 1235414 | 0.00 | 62.22 | 105.84 | 1.10 | 2.10 | 2.85 | 8.50 | 250.00 | ▇▁▁▁▂ |
MDL | 1096047 | 0.11 | 0.06 | 0.57 | 0.00 | 0.00 | 0.03 | 0.08 | 50.50 | ▇▁▁▁▁ |
PQL | 1096047 | 0.11 | 0.21 | 2.80 | 0.00 | 0.01 | 0.06 | 0.25 | 250.00 | ▇▁▁▁▁ |
Include | 1211526 | 0.02 | 1.00 | 0.03 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▁▁▇ |
MADup | 1211526 | 0.02 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▇▁▁ |
ResultValue | 78726 | 0.94 | 10.74 | 469.20 | -0.03 | 0.09 | 0.69 | 3.40 | 53465.00 | ▇▁▁▁▁ |
ActivityDepth_m | 967966 | 0.22 | 0.48 | 0.20 | 0.00 | 0.50 | 0.50 | 0.50 | 1.00 | ▂▁▇▂▁ |
OriginalLatitude | 512995 | 0.58 | 25.58 | 0.57 | 24.40 | 25.16 | 25.55 | 25.87 | 28.78 | ▅▇▂▁▁ |
OriginalLongitude | 512995 | 0.58 | -80.83 | 0.70 | -85.02 | -81.28 | -80.68 | -80.24 | -80.02 | ▁▁▁▃▇ |
library(ggplot2)
ggplot(df, aes(x = ProgramName, fill = ParameterName)) +
geom_bar() +
scale_y_log10() +
labs(
title = "Count of Observations by Program and Parameter",
x = "Program Name",
y = "Count (Log Scale)",
fill = "Parameter"
) +
theme_minimal() +
theme(
legend.position = "bottom",
axis.text.x = element_text(angle = 45, hjust = 1)
)
source(here("R/mutateWINToSEACAR.R"))
df <- df_all %>% mutateWINToSEACAR(.keep = "unused")
SEACAR_COLUMNS <- c(
"ExportVersion","RowID","ProgramID","ProgramName","Habitat",
"IndicatorID","IndicatorName","ParameterID","ParameterName",
"ParameterUnits","ProgramLocationID","AreaID","ManagedAreaName",
"Region","ActivityType","SampleDate","ResultValue","Year","Month",
"ActivityDepth_m","RelativeDepth","TotalDepth_m","MDL","PQL",
"DetectionUnit","ValueQualifier","ValueQualifierSource",
"SampleFraction","ResultComments","OriginalLatitude",
"OriginalLongitude","SEACAR_QAQCFlagCode",
"SEACAR_QAQC_Description","Include","SEACAR_EventID","MADup"
)
# figure out which are actually present
present_cols <- intersect(SEACAR_COLUMNS, names(df))
missing_cols <- setdiff(SEACAR_COLUMNS, names(df))
if (length(missing_cols) > 0) {
warning("\nThese SEACAR_COLUMNS were not found in df:\n ",
paste(missing_cols, collapse = ", "))
}
# Any additional columns in df that are not mapped to SEACAR
extra_columns <- setdiff(names(df), SEACAR_COLUMNS)
cat("\nColumns from source data that are not (directly) mapped to SEACAR columns:\n")
Columns from source data that are not (directly) mapped to SEACAR columns:
[1] "X" "keyfield"
[3] "Activity.ID" "year"
[5] "month" "day"
[7] "time" "lat_deg"
[9] "lat_min" "lon_deg"
[11] "lon_min" "station_type"
[13] "depth_class" "depth_order"
[15] "cast" "nisk_start"
[17] "nisk_end" "notes"
[19] "data_source" "CLIENT.SAMPLE.ID"
[21] "LAB.SAMPLE.ID" "MATRIX"
[23] "COLLECTED" "ANALYTE"
[25] "SAMPLE.RESULT" "REPORTING.LIMIT"
[27] "UNITS" "METHOD"
[29] "DILUTION" "ANALYZED"
[31] "PREPARED" "source_file"
[33] "SAMPLE.COLLECTION.DATE" "DEP.Result.ID"
[35] "SURV" "TIME"
[37] "STA" "BASIN"
[39] "CLUSTER" "ZSI"
[41] "ZONE" "YEAR"
[43] "DETECTION.LIMITS" "NOX.DL"
[45] "NO3.DL" "NO2.DL"
[47] "NH4.DL" "TN.DL"
[49] "DIN.DL" "TON.DL"
[51] "TP.DL" "SRP.DL"
[53] "APA.DL" "CHLA.DL"
[55] "TOC.DL" "SiO2.DL"
[57] "Station" "Date"
[59] "Organization.ID" "SEGMENT"
[61] "DATE" "STATION"
[63] "SITE" "LATDEC"
[65] "LONDEC" "DEPTH"
[67] "Long.Deg" "Long.Min"
[69] "Lat.Deg" "Lat.Min"
[71] "Sampling.Agency.Name" "Value.1"
[73] "Org.Latitude..DD.MM.SS.SSSS." "Org.Longitude..DD.MM.SS.SSSS."
[75] "WBID" "Sample.Collection.Type"
[77] "Activity.Depth.Unit" "Activity.Top.Depth"
[79] "Activity.Bottom.Depth" "Activity.Depth.Top.Bottom.Unit"
[81] "DEP.Result.Value.Text" "Sample.Fraction"
[83] "Lab.ID" "Audit.Censored.Decisions"
[85] "source"