Sites QC

Quality control steps taken to filter out low-quality sites.
Code
if (!requireNamespace("librarian", quietly = TRUE)) {
  # If not installed, install the package
  install.packages("librarian")
}

librarian::shelf(
  dplyr,
  glue,
  ggplot2,
  here,
  skimr,
)

source(here("R/getData.R"))
load data & skim
df <- getRawData()
New names:
• `` -> `...1`
Warning in getRawData(): NAs introduced by coercion
load data & skim
print(skimr::skim(df))
── Data Summary ────────────────────────
                           Values
Name                       df    
Number of rows             612830
Number of columns          17    
_______________________          
Column type frequency:           
  character                10    
  numeric                  7     
________________________         
Group variables            None  

── Variable type: character ────────────────────────────────────────────────────
   skim_variable  n_missing complete_rate min max empty n_unique whitespace
 1 Source                 0         1       3  11     0        9          0
 2 Site                   0         1       1  28     0     1411          0
 3 Latitude            5462         0.991   8   8     0     4739          0
 4 Longitude           5496         0.991   9   9     0     5495          0
 5 Parameter              0         1       6  24     0       11          0
 6 Units                968         0.998   2   9     0       13          0
 7 Trend Analysis       154         1.00    2   3     0        2          0
 8 Continuous           154         1.00    2   3     0        2          0
 9 End Date             154         1.00    4   7     0       15          0
10 Value_orig         29273         0.952   1  13     0    88595          0

── Variable type: numeric ──────────────────────────────────────────────────────
  skim_variable n_missing complete_rate       mean          sd           p0
1 ...1                  0         1      360506.     206864.              1
2 Month                63         1.00        6.54        3.43            1
3 Year                 63         1.00     2012.          7.92         1995
4 Value             34262         0.944 -193965.   46216162.   -11111111110
5 Total Depth      345777         0.436      10.2        10.1             0
6 Sample Depth     132515         0.784       3.62       13.4             0
7 Start Date          154         1.00     1999.         11.1          1988
         p25       p50        p75    p100 hist 
1 182543.    365608.   537934.    717536  ▇▇▇▇▇
2      4          7        10         12  ▇▅▅▅▇
3   2006       2013      2019       2022  ▂▃▅▃▇
4      0.004      0.02      0.233   8800  ▁▁▁▁▇
5      3.75       7.3      12        121. ▇▁▁▁▁
6      0.5        0.5       3.2     2494  ▇▁▁▁▁
7   1988       1998      2005       2022  ▇▃▃▁▃
function for plotting sites histogram
site_histogram <- function(df){
  number_of_unique_sites <- df %>% summarise(unique_sites = dplyr::n_distinct(Site))

  site_counts <- df %>%
    group_by(Site) %>%
    summarise(Count = n())
  
  ggplot(site_counts, aes(x = Count)) +
    geom_histogram(binwidth = 1, fill = "blue", color = "black") +
    labs(title = glue("Histogram of Rows Per Site (total sites: {number_of_unique_sites})"),
         x = "Number of Rows",
         y = "Number of Sites") +
    theme_minimal()
}
show initial sites setup
site_histogram(df)

drop rows without Year, Value, Lat, or Lon
df <- df %>%
  filter(!is.na(Year) & !is.na(Value) & !is.na(Latitude) & !is.na(Longitude))

site_histogram(df)

remove sites with < 20 data points
df <- df %>%
  group_by(Site) %>%
  filter(n() >= 20) %>%
  ungroup()  # It's a good practice to ungroup data after operations like this

site_histogram(df)

remove sites with no data since last two years (2022)
# TODO: should calculate "last two years" from latest date in the data
latest_year <- max(df$Year)

ggplot(df, aes(x = Year)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black") +
  labs(title = "N Rows per year",
       x = "Year",
       y = "N Rows") +
  theme_minimal()

remove sites with no data since last two years (2022)
sites_with_recent_data <- df %>%
  filter(Year > latest_year-2) %>%  # Filter for rows with Year greater than 2022
  distinct(Site)           # Get unique sites that meet the criteria

df <- df %>%
  semi_join(sites_with_recent_data, by = "Site")  # Keep rows where 'Site' matches those in the recent data list

site_histogram(df)

write cleaned DataFrame to a file
write.csv(df, "data/df_cleaned_01.csv", row.names = FALSE)