Code
::shelf(
librarian
dplyr,
glue,
here,
tidyr )
In 2026 management of this data is being transferred to SEACAR In this file we prepare a transformation of the dataset for SEACAR ingestion, and perform some checks.
The data here should align with standards described in SEACAR_Metadata_UnifiedWQ_ID10006.xlsx
.
=== LOADING PROVIDER : SFER...
[1] "WARN - rows found with no location ID"
=== LOADING PROVIDER : MiamiBeach...
=== LOADING PROVIDER : BBWW...
=== LOADING PROVIDER : FIU_Estuaries...
=== LOADING PROVIDER : AOML_FBBB...
=== LOADING PROVIDER : BBAP...
=== LOADING PROVIDER : BROWARD...
=== LOADING PROVIDER : DEP...
=== LOADING PROVIDER : DERM_BBWQ...
=== LOADING PROVIDER : FIU_WQMP...
=== LOADING PROVIDER : PALMBEACH...
Name | df |
Number of rows | 1422618 |
Number of columns | 35 |
_______________________ | |
Column type frequency: | |
character | 28 |
Date | 1 |
numeric | 6 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
RowID | 1399577 | 0.02 | 6 | 7 | 0 | 23041 | 0 |
ProgramID | 1398667 | 0.02 | 4 | 4 | 0 | 2 | 0 |
Habitat | 1078890 | 0.24 | 12 | 21 | 0 | 5 | 0 |
IndicatorID | 1398667 | 0.02 | 1 | 1 | 0 | 3 | 0 |
IndicatorName | 1398667 | 0.02 | 9 | 13 | 0 | 3 | 0 |
ParameterID | 1398667 | 0.02 | 1 | 2 | 0 | 10 | 0 |
AreaID | 1398667 | 0.02 | 1 | 2 | 0 | 2 | 0 |
ManagedAreaName | 1398667 | 0.02 | 29 | 38 | 0 | 2 | 0 |
RelativeDepth | 1305519 | 0.08 | 3 | 7 | 0 | 3 | 0 |
TotalDepth_m | 1422555 | 0.00 | 6 | 8 | 0 | 11 | 0 |
MDL | 1102841 | 0.22 | 0 | 11 | 110146 | 241 | 0 |
PQL | 1102841 | 0.22 | 0 | 11 | 110146 | 180 | 0 |
DetectionUnit | 1420452 | 0.00 | 4 | 4 | 0 | 1 | 0 |
ValueQualifierSource | 1422618 | 0.00 | NA | NA | 0 | 0 | 0 |
SEACAR_QAQCFlagCode | 1398667 | 0.02 | 2 | 13 | 0 | 26 | 0 |
SEACAR_QAQC_Description | 1398667 | 0.02 | 31 | 195 | 0 | 26 | 0 |
Include | 1398667 | 0.02 | 1 | 1 | 0 | 2 | 0 |
MADup | 1398667 | 0.02 | 1 | 1 | 0 | 1 | 0 |
ExportVersion | 1398667 | 0.02 | 23 | 23 | 0 | 3 | 0 |
Region | 1302832 | 0.08 | 2 | 19 | 0 | 22 | 0 |
ProgramName | 0 | 1.00 | 3 | 13 | 0 | 11 | 0 |
ParameterName | 0 | 1.00 | 2 | 23 | 0 | 21 | 0 |
ParameterUnits | 172638 | 0.88 | 0 | 10 | 18892 | 21 | 0 |
ProgramLocationID | 22 | 1.00 | 0 | 25 | 23721 | 2003 | 0 |
ActivityType | 1101931 | 0.23 | 5 | 35 | 0 | 12 | 0 |
ValueQualifier | 894177 | 0.37 | 0 | 5 | 184005 | 127 | 0 |
ResultComments | 1102753 | 0.22 | 0 | 874 | 280333 | 939 | 11 |
SEACAR_EventID | 1399577 | 0.02 | 36 | 36 | 0 | 2394 | 0 |
Variable type: Date
skim_variable | n_missing | complete_rate | min | max | median | n_unique |
---|---|---|---|---|---|---|
SampleDate | 623217 | 0.56 | 23-09-01 | 2024-11-17 | 2019-11-20 | 2395 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
Year | 631822 | 0.56 | 2011.40 | 9.90 | 1989.48 | 2002.18 | 2018.00 | 2020.00 | 2024 | ▁▃▃▁▇ |
Month | 631800 | 0.56 | 6.57 | 3.45 | 0.00 | 4.00 | 7.00 | 10.00 | 12 | ▅▅▇▆▇ |
ResultValue | 103767 | 0.93 | 1258.27 | 5457.01 | -0.83 | 0.10 | 1.00 | 10.64 | 178550 | ▇▁▁▁▁ |
ActivityDepth_m | 1080237 | 0.24 | 0.48 | 0.17 | 0.00 | 0.50 | 0.50 | 0.50 | 1 | ▂▁▇▂▁ |
OriginalLatitude | 560180 | 0.61 | 25.54 | 0.60 | 24.00 | 25.12 | 25.54 | 25.87 | 28 | ▁▇▆▁▁ |
OriginalLongitude | 560180 | 0.61 | -80.77 | 0.65 | -85.00 | -81.12 | -80.61 | -80.21 | -80 | ▁▁▁▃▇ |
[1] "Temperature" "Salinity"
[3] "Dissolved_Oxygen" "Ammonium"
[5] "Nitrite" "Nitrate"
[7] "Nitrate+Nitrite" "Orthophosphate"
[9] "Silicate" "Chlorophyll_a"
[11] "Pheophytin" "pH"
[13] "Specific_Conductivity" "Turbidity"
[15] "Total_Kjeldahl_Nitrogen" "Phosphorus"
[17] "Total_Nitrogen" "Fecal_Coliforms"
[19] "Enterococci" "Ammonia"
[21] "Ammonia+Ammonium"
source(here("R/mutateWINToSEACAR.R"))
df <- df_all %>% mutateWINToSEACAR(.keep = "unused")
SEACAR_COLUMNS <- c(
"ExportVersion","RowID","ProgramID","ProgramName","Habitat",
"IndicatorID","IndicatorName","ParameterID","ParameterName",
"ParameterUnits","ProgramLocationID","AreaID","ManagedAreaName",
"Region","ActivityType","SampleDate","ResultValue","Year","Month",
"ActivityDepth_m","RelativeDepth","TotalDepth_m","MDL","PQL",
"DetectionUnit","ValueQualifier","ValueQualifierSource",
"SampleFraction","ResultComments","OriginalLatitude",
"OriginalLongitude","SEACAR_QAQCFlagCode",
"SEACAR_QAQC_Description","Include","SEACAR_EventID","MADup"
)
# figure out which are actually present
present_cols <- intersect(SEACAR_COLUMNS, names(df))
missing_cols <- setdiff(SEACAR_COLUMNS, names(df))
if (length(missing_cols) > 0) {
warning("\nThese SEACAR_COLUMNS were not found in df:\n ",
paste(missing_cols, collapse = ", "))
}
# compute percent‐complete only over the present ones
completeness <- df %>%
summarise(
across(
any_of(present_cols),
~ mean(!is.na(.)) * 100,
.names = "pct_complete_{col}"
)
) %>%
pivot_longer(
cols = everything(),
names_to = "column",
values_to = "pct_complete"
) %>%
mutate(column = sub("^pct_complete_", "", column))
print(completeness)
# A tibble: 36 × 2
column pct_complete
<chr> <dbl>
1 ExportVersion 1.68
2 RowID 1.62
3 ProgramID 1.68
4 ProgramName 100
5 Habitat 24.2
6 IndicatorID 1.68
7 IndicatorName 1.68
8 ParameterID 1.68
9 ParameterName 100
10 ParameterUnits 87.9
# ℹ 26 more rows
Columns from source data that are not (directly) mapped to SEACAR columns:
[1] "keyfield" "Activity.ID"
[3] "year" "month"
[5] "day" "time"
[7] "lat_min" "lat_dec"
[9] "lon_min" "lon_dec"
[11] "station_type" "depth_class"
[13] "depth_order" "cast"
[15] "nisk_start" "nisk_end"
[17] "notes" "original.analyte.name"
[19] "data_source" "CLIENT SAMPLE ID"
[21] "LAB SAMPLE ID" "MATRIX"
[23] "COLLECTED" "ANALYTE"
[25] "SAMPLE RESULT" "REPORTING LIMIT"
[27] "UNITS" "METHOD"
[29] "DILUTION" "ANALYZED"
[31] "PREPARED" "source_file"
[33] "SAMPLE COLLECTION DATE" "DEP.Result.ID"
[35] "SURV" "TIME"
[37] "STA" "BASIN"
[39] "CLUSTER" "ZSI"
[41] "ZONE" "YEAR"
[43] "DETECTION LIMITS" "NOX DL"
[45] "NO3 DL" "NO2 DL"
[47] "NH4 DL" "TN DL"
[49] "DIN DL" "TON DL"
[51] "TP DL" "SRP DL"
[53] "APA DL" "CHLA DL"
[55] "TOC DL" "SiO2 DL"
[57] "Long Deg" "Long Min"
[59] "Lat Deg" "Lat Min"
[61] "Organization.ID" "Org.Latitude..DD.MM.SS.SSSS."
[63] "Org.Longitude..DD.MM.SS.SSSS." "WBID"
[65] "Sample.Collection.Type" "Sampling.Agency.Name"
[67] "Activity.Depth.Unit" "Activity.Top.Depth"
[69] "Activity.Bottom.Depth" "Activity.Depth.Top.Bottom.Unit"
[71] "DEP.Result.Value.Text" "Sample.Fraction"
[73] "Lab.ID" "Audit.Censored.Decisions"
[75] "source" "Value.1"
[77] "Station" "Date"