Station Name Alignment Check

This document checks alignment between station names in Station_Mean_Coords.csv and stations found in the raw CTD data files.

setup

library(tidyverse)
library(DT)
library(here)

Load Station Coordinates & Standard Names

load-coords

# Read the reference table of station names and mean coordinates.
# The path is built from the project root with here::here(), the same way the
# helper scripts in this document are located, and includes the separator
# between the directory and the file name.
# NOTE(review): assumes the CSV lives in the project's data/ directory — confirm.
station_coords <- read_csv(
  here::here("data", "Station_Mean_Coords.csv"),
  show_col_types = FALSE
)

Error: 'dataStation_Mean_Coords.csv' does not exist in current working directory ('/home/tylar/repos/seus-mbon-cruise-ctd-processing').

load-coords

# Fail fast if the coordinate table lacks the join column or has no rows;
# the names on the left are used as the error messages.
stopifnot(
  "station column missing" = is.element("station", names(station_coords)),
  "coords empty" = nrow(station_coords) >= 1
)

Error: object 'station_coords' not found

load-coords

# # Display the station coordinates
# station_coords %>%
#   datatable(
#     caption = "Expected Stations from Station_Mean_Coords.csv",
#     options = list(
#       pageLength = 10,
#       scrollY = "400px",
#       scrollCollapse = TRUE
#     )
#   ) %>%
#   formatRound(columns = c("lat_mean", "lon_mean"), digits = 4)

Extract Station Names from Raw Data Files

extract stations from raw file names

# Recursively collect the path of every file ending in .csv under data/01_raw
# (paths are returned with the directory prefix because full.names = TRUE)
raw_files <- list.files(
  "data/01_raw",
  full.names = TRUE,
  recursive = TRUE,
  pattern = "\\.csv$"
)

# Derive cruise and station for each raw file from its file name.
# The file name minus ".csv" is treated as the cast id and handed to
# get_metadata_from_cast_id(), whose result supplies cruise_id and station_id.
source(here::here("R/get_metadata_from_cast_id.R"))

file_stations <- tibble(filepath = raw_files) %>%
  mutate(
    filename = basename(filepath),
    # One metadata object per file, keyed by the extension-less file name
    metadata = map(
      str_remove(filename, "\\.csv$"),
      get_metadata_from_cast_id
    ),
    cruise = map_chr(metadata, function(md) md$cruise_id),
    station = map_chr(metadata, function(md) md$station_id)
  ) %>%
  select(cruise, station, filename, filepath)

# # Show unique stations found in files
# file_stations %>%
#   count(station, name = "n_files") %>%
#   arrange(desc(n_files)) %>%
#   datatable(
#     caption = "Stations Found in Raw Data Files",
#     colnames = c("Station Name", "Number of Files"),
#     options = list(
#       pageLength = 10,
#       scrollY = "400px",
#       scrollCollapse = TRUE
#     )
#   )

Align Filenames

align filenames to standard

source(here::here("R/align_raw_ctd_filename.R"))

# Destination folder for the standardized copies.
# NOTE(review): "data02_renamed" has no path separator, unlike "data/01_raw" —
# confirm whether "data/02_renamed" was intended before changing it.
output_dir <- "data02_renamed"

# Always start from an empty destination: remove a previous run's folder
# (and everything in it), then create it again.
if (dir.exists(output_dir)) {
  unlink(output_dir, recursive = TRUE)
}
dir.create(output_dir, recursive = TRUE)

# For every raw file: the standardized file name, where its copy will live,
# and whether the standardized name differs from the current one.
file_stations_aligned <- file_stations %>%
  mutate(
    filename_standard = map_chr(filename, align_raw_ctd_filename)
  ) %>%
  mutate(
    filepath_standard = file.path(output_dir, filename_standard),
    needs_rename = filename != filename_standard
  )

# ---- VALIDATE ----
# Every file must have received a standardized name; stop otherwise.
stopifnot(
  "alignment produced NA" = all(!is.na(file_stations_aligned$filename_standard))
)

# One-row tally: all files, files whose name changes, files whose name stays
rename_summary <- tibble(
  total_files = nrow(file_stations_aligned),
  files_needing_rename = sum(file_stations_aligned$needs_rename),
  files_already_aligned = sum(!file_stations_aligned$needs_rename)
)

cat("Total files:", rename_summary[["total_files"]], "\n")

Total files: 3555

align filenames to standard

# Report how many files have a standardized name that differs from the current one
cat("Files needing rename:", rename_summary[["files_needing_rename"]], "\n")

Files needing rename: 0

align filenames to standard

# Report how many files already carry their standardized name
cat("Files already aligned:", rename_summary[["files_already_aligned"]], "\n\n")

Files already aligned: 3555

align filenames to standard

# List the files whose standardized name differs from the current one, or
# print a confirmation when there are none.
if (rename_summary$files_needing_rename > 0) {
  file_stations_aligned %>%
    filter(needs_rename) %>%
    mutate(
      # Station as encoded in the standardized file name, derived the same way
      # as the current station: strip ".csv" and parse the cast id.
      station_standard = map_chr(
        str_remove(filename_standard, "\\.csv$"),
        function(cast_id) get_metadata_from_cast_id(cast_id)$station_id
      )
    ) %>%
    # Five distinct columns, in the same order as the five header labels below
    select(cruise, station, station_standard, filename, filename_standard) %>%
    datatable(
      caption = "Files that need renaming to align with standard station names",
      colnames = c(
        "Cruise",
        "Current Station",
        "Standard Station",
        "Current Filename",
        "Standard Filename"
      ),
      options = list(
        pageLength = 10,
        scrollY = "400px",
        scrollCollapse = TRUE
      )
    ) %>%
    formatStyle(columns = 1:5, backgroundColor = "lightyellow")
} else {
  cat("✅ All files are already aligned with standard station names!\n")
}

✅ All files are already aligned with standard station names!

align filenames to standard

# Announce the copy step; the destination is taken from output_dir so the
# message always names the directory that is actually written to.
cat("\nCopying ALL files to ", output_dir, "...\n", sep = "")


Copying ALL files to data02_renamed...

align filenames to standard

# Copy every raw file to its standardized path and report what really happened.

# Several source files can map to the same destination. Copies run in row
# order with overwrite = TRUE, so only the last one would survive — surface
# that instead of losing files silently.
dup_targets <- unique(
  file_stations_aligned$filepath_standard[
    duplicated(file_stations_aligned$filepath_standard)
  ]
)
if (length(dup_targets) > 0) {
  warning(
    length(dup_targets),
    " destination file(s) receive more than one source file; ",
    "later copies overwrite earlier ones: ",
    paste(basename(dup_targets), collapse = ", "),
    call. = FALSE
  )
}

# file.copy() returns TRUE/FALSE per file; keep the results so failures are
# visible and the reported total is the number of successful copies.
copy_ok <- map2_lgl(
  file_stations_aligned$filepath,
  file_stations_aligned$filepath_standard,
  function(from, to) file.copy(from, to, overwrite = TRUE)
)

if (any(!copy_ok)) {
  warning(
    sum(!copy_ok), " file(s) could not be copied: ",
    paste(basename(file_stations_aligned$filepath[!copy_ok]), collapse = ", "),
    call. = FALSE
  )
}
cat("\nCopying complete! Total files copied:", sum(copy_ok), "\n")


Copying complete! Total files copied: 3555

align filenames to standard

# From here on, file_stations describes the standardized copies:
# station is re-read from the standardized file name, and filename/filepath
# point at the copied files.
file_stations <- file_stations_aligned %>%
  mutate(
    metadata_standard = map(
      str_remove(filename_standard, "\\.csv$"),
      get_metadata_from_cast_id
    ),
    station = map_chr(metadata_standard, function(md) md$station_id),
    filename = filename_standard,
    filepath = filepath_standard
  ) %>%
  select(all_of(c("cruise", "station", "filename", "filepath")))

Comparison Analysis

compare-stations

# Distinct station names on each side of the comparison
file_stations_unique <- unique(file_stations$station)
coords_stations <- unique(station_coords$station)

# Present in both sources (aligned)
aligned_stations <- intersect(coords_stations, file_stations_unique)

# Have CTD files but no coordinate entry
missing_from_coords <- setdiff(file_stations_unique, coords_stations)

# Have a coordinate entry but no CTD files
missing_from_files <- setdiff(coords_stations, file_stations_unique)

Summary Statistics

summary-stats

# Five-row overview of how the two station lists compare.
tibble(
  Category = c(
    "Total stations in Station_Mean_Coords.csv",
    "Total stations in raw data files",
    "Stations aligned (in both)",
    "Stations missing from raw files",
    "Stations missing from coordinates"
  ),
  Count = c(
    length(coords_stations),
    length(file_stations_unique),
    length(aligned_stations),
    length(missing_from_files),
    length(missing_from_coords)
  )
) %>%
  datatable(
    caption = "Station Alignment Summary",
    # dom = "t" shows the table only; column sorting is switched off
    options = list(
      dom = "t",
      ordering = FALSE
    )
  ) %>%
  # Shade a whole row light green when its Count is 0. A single level/value
  # pair is enough; listing the same level twice adds nothing.
  # NOTE(review): this also shades the "Total"/"aligned" rows if they are 0,
  # where zero would not be good news — confirm that is acceptable.
  formatStyle(
    "Count",
    target = "row",
    backgroundColor = styleEqual(0, "lightgreen")
  )

Stations Missing from Raw Data Files

missing-from-files

# Either confirm that every coordinate station has data, or list the ones
# that have no raw file.
if (length(missing_from_files) == 0) {
  cat("✅ All stations from Station_Mean_Coords.csv have corresponding raw data files!\n")
} else {
  datatable(
    tibble(Station = missing_from_files),
    caption = "Stations in Coordinates but NOT in Raw Files - These stations have coordinates but no CTD data files",
    options = list(
      pageLength = 10,
      scrollY = "300px",
      scrollCollapse = TRUE
    )
  ) %>%
    formatStyle(columns = 1, backgroundColor = 'lightyellow')
}

Stations Missing from Coordinate File

missing-from-coords

# Either confirm that every station found in the files has coordinates, or
# list the uncovered stations with the cruises they occur in and file counts.
if (length(missing_from_coords) == 0) {
  cat("✅ All stations from raw data files have entries in Station_Mean_Coords.csv!\n")
} else {
  stations_without_coords <- file_stations %>%
    filter(station %in% missing_from_coords) %>%
    count(station, cruise) %>%
    arrange(station, cruise)

  datatable(
    stations_without_coords,
    caption = "Stations in Raw Files but NOT in Coordinates - These stations have CTD data but no coordinate reference",
    colnames = c("Station Name", "Cruise", "Number of Files"),
    options = list(
      pageLength = 10,
      scrollY = "400px",
      scrollCollapse = TRUE
    )
  ) %>%
    formatStyle(columns = 1:3, backgroundColor = 'lightcoral')
}

Files by Cruise and Station

files-by-cruise

# Presence/absence matrix: one row per station, one column per cruise,
# 1 when at least one file exists for that station on that cruise, else 0.
heatmap_data <- file_stations %>%
  count(cruise, station) %>%
  pivot_wider(names_from = cruise, values_from = n, values_fill = 0) %>%
  # Collapse file counts to a 0/1 flag
  mutate(across(-station, ~ as.integer(. > 0))) %>%
  arrange(station)

# Total number of columns, including the leading station column
n_cols <- ncol(heatmap_data)

# Positions of the cruise columns (everything after station). Row names are
# shown, so a data column's position is also its index for columnDefs, and
# the same indices can drive both the centering and the colouring below.
cruise_cols <- seq_len(n_cols - 1) + 1

heatmap_data %>%
  datatable(
    caption = "CTD Files per Station by Cruise (Presence/Absence)",
    options = list(
      pageLength = 20,
      scrollY = "500px",
      scrollCollapse = TRUE,
      scrollX = TRUE,
      dom = "tip",
      columnDefs = list(
        # Center exactly the cruise columns (not station, and including the last)
        list(className = "dt-center", targets = cruise_cols)
      )
    ),
    class = "compact stripe"
  ) %>%
  # Absent = grey text on white, present = white text on steel blue
  formatStyle(
    columns = cruise_cols,
    backgroundColor = styleEqual(c(0, 1), c("white", "#4682b4")),
    color = styleEqual(c(0, 1), c("#cccccc", "white"))
  )