Code
if (!requireNamespace("librarian", quietly = TRUE)) {
# If not installed, install the package
install.packages("librarian")
}
::shelf(
librarian
dplyr,
glue,
ggplot2,
here,
skimr,
)
source(here("R/getData.R"))
New names:
• `` -> `...1`
Warning in getRawData(): NAs introduced by coercion
── Data Summary ────────────────────────
Values
Name df
Number of rows 612830
Number of columns 17
_______________________
Column type frequency:
character 10
numeric 7
________________________
Group variables None
── Variable type: character ────────────────────────────────────────────────────
skim_variable n_missing complete_rate min max empty n_unique whitespace
1 Source 0 1 3 11 0 9 0
2 Site 0 1 1 28 0 1411 0
3 Latitude 5462 0.991 8 8 0 4739 0
4 Longitude 5496 0.991 9 9 0 5495 0
5 Parameter 0 1 6 24 0 11 0
6 Units 968 0.998 2 9 0 13 0
7 Trend Analysis 154 1.00 2 3 0 2 0
8 Continuous 154 1.00 2 3 0 2 0
9 End Date 154 1.00 4 7 0 15 0
10 Value_orig 29273 0.952 1 13 0 88595 0
── Variable type: numeric ──────────────────────────────────────────────────────
skim_variable n_missing complete_rate mean sd p0
1 ...1 0 1 360506. 206864. 1
2 Month 63 1.00 6.54 3.43 1
3 Year 63 1.00 2012. 7.92 1995
4 Value 34262 0.944 -193965. 46216162. -11111111110
5 Total Depth 345777 0.436 10.2 10.1 0
6 Sample Depth 132515 0.784 3.62 13.4 0
7 Start Date 154 1.00 1999. 11.1 1988
p25 p50 p75 p100 hist
1 182543. 365608. 537934. 717536 ▇▇▇▇▇
2 4 7 10 12 ▇▅▅▅▇
3 2006 2013 2019 2022 ▂▃▅▃▇
4 0.004 0.02 0.233 8800 ▁▁▁▁▇
5 3.75 7.3 12 121. ▇▁▁▁▁
6 0.5 0.5 3.2 2494 ▇▁▁▁▁
7 1988 1998 2005 2022 ▇▃▃▁▃
site_histogram <- function(df){
number_of_unique_sites <- df %>% summarise(unique_sites = dplyr::n_distinct(Site))
site_counts <- df %>%
group_by(Site) %>%
summarise(Count = n())
ggplot(site_counts, aes(x = Count)) +
geom_histogram(binwidth = 1, fill = "blue", color = "black") +
labs(title = glue("Histogram of Rows Per Site (total sites: {number_of_unique_sites})"),
x = "Number of Rows",
y = "Number of Sites") +
theme_minimal()
}
sites_with_recent_data <- df %>%
filter(Year > latest_year-2) %>% # Filter for rows with Year greater than 2022
distinct(Site) # Get unique sites that meet the criteria
df <- df %>%
semi_join(sites_with_recent_data, by = "Site") # Keep rows where 'Site' matches those in the recent data list
site_histogram(df)