# Clear the environment: rm(list = ls())
# Locate the count-data CSV files ----
# List every file under the cleaned-data directory, recursing into
# sub-directories and keeping full paths so the files can be read later.
files <- list.files(
  "/Users/maryamnouri-aiin/Desktop/githubRepos/homework11/CleanedData",
  full.names = TRUE,
  recursive = TRUE
)

# Keep only CSVs whose *file name* contains "countdata". Matching against
# basename() prevents a directory that happens to contain "countdata" in its
# name from pulling in unrelated CSV files.
countdata_files <- files[grepl("countdata.*\\.csv$", basename(files))]

# Uncomment to inspect which files were collected:
# print(countdata_files)

if (length(countdata_files) > 0) {
  # Peek at the first file to confirm which columns are available.
  # read_csv() is namespace-qualified so this works without library(readr),
  # consistent with the readr::read_csv() call in the processing loop.
  example_data <- readr::read_csv(countdata_files[[1]])
  print(colnames(example_data))
} else {
  # Single diagnostic for the empty case (previously reported twice).
  print("No countdata files found. Check the directory path and file pattern.")
}
## Rows: 553 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (17): uid, namedLocation, domainID, siteID, plotID, plotType, eventID, ...
## dbl (4): pointID, pointCountMinute, observerDistance, clusterSize
## lgl (1): identificationHistoryID
## dttm (1): startDate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## [1] "uid" "namedLocation"
## [3] "domainID" "siteID"
## [5] "plotID" "plotType"
## [7] "pointID" "startDate"
## [9] "eventID" "pointCountMinute"
## [11] "targetTaxaPresent" "taxonID"
## [13] "scientificName" "taxonRank"
## [15] "vernacularName" "observerDistance"
## [17] "detectionMethod" "visualConfirmation"
## [19] "sexOrAge" "clusterSize"
## [21] "clusterCode" "identifiedBy"
## [23] "identificationHistoryID"
# Define functions
# Drop incomplete observations.
#
# Removes every row in which `scientificName` or `clusterSize` is NA and
# returns the remaining rows with all columns intact.
#
# Implemented with base R subsetting so the function does not depend on
# dplyr/magrittr being attached: an unqualified filter() resolves to
# stats::filter() when dplyr is not loaded, which fails in a confusing way.
#
# Args:
#   data: a data frame (or tibble) containing the columns `scientificName`
#         and `clusterSize`.
# Returns:
#   `data` restricted to rows where both columns are non-missing.
# Raises:
#   An error naming the absent column(s) if either required column is missing.
clean_data <- function(data) {
  required <- c("scientificName", "clusterSize")
  absent <- setdiff(required, names(data))
  if (length(absent) > 0) {
    stop(
      "clean_data(): missing required column(s): ",
      paste(absent, collapse = ", "),
      call. = FALSE
    )
  }

  keep <- !is.na(data[["scientificName"]]) & !is.na(data[["clusterSize"]])
  cleaned <- data[keep, , drop = FALSE]
  # Renumber rows 1..n so a plain data frame looks the same as it did with
  # dplyr::filter(), which does not keep the original row names.
  rownames(cleaned) <- NULL
  cleaned
}
# clean_data <- function(data) {
# tidyr::drop_na(data) # using tidyr for drop_na()
# }
# Extract the four-digit year from a "YYYY-MM" token in a file name.
#
# The regex matches the first run of four digits that is immediately followed
# by "-" and two more digits (for example the "2017" in "2017-07").
#
# The function is element-wise: each input string gets its own result, and
# strings without a match yield NA. (Indexing the output of regmatches()
# directly is not safe for vectors because non-matching elements are dropped,
# which shifts later matches onto earlier positions.)
#
# Args:
#   filename: character vector of file names or paths.
# Returns:
#   Integer vector the same length as `filename`, NA_integer_ where no year
#   is found. A zero-length input returns a single NA_integer_.
extract_year <- function(filename) {
  if (length(filename) == 0L) {
    return(NA_integer_)
  }

  filename <- as.character(filename)
  hits <- regexpr("\\d{4}(?=-\\d{2})", filename, perl = TRUE)
  # regexpr() gives -1 for "no match" and NA for NA input.
  found <- !is.na(hits) & hits > -1L

  years <- rep(NA_integer_, length(filename))
  # regmatches() returns matches only for the `found` positions, in order.
  years[found] <- as.integer(regmatches(filename, hits))
  years
}
# Sanity check: a typical count-data file name should yield its survey year.
example_file <- "NEON.D18.BARR.DP1.10003.001.brd_countdata.2017-07.basic.20231227T060201Z.csv"
print(extract_year(example_file)) # expected: 2017
## [1] 2017
# Total abundance: the sum of `clusterSize` over all rows.
#
# Args:
#   data: a data frame that may contain a `clusterSize` column.
# Returns:
#   A single numeric value: the sum of `clusterSize` with NAs ignored, or 0
#   when the column does not exist.
# Side effects:
#   Prints a notice when converting the column to numeric turned a
#   non-missing value into NA.
calculate_abundance <- function(data) {
  if (!("clusterSize" %in% names(data))) {
    return(0) # Return 0 if the column doesn't exist
  }

  raw_sizes <- data[["clusterSize"]]
  # as.numeric() on a factor returns the internal level codes, not the
  # labels, so go through character first.
  if (is.factor(raw_sizes)) {
    raw_sizes <- as.character(raw_sizes)
  }
  numeric_cluster_size <- as.numeric(raw_sizes)

  # Report only NAs created by the conversion itself; values that were already
  # missing in the input are not coercion failures.
  coercion_failed <- is.na(numeric_cluster_size) & !is.na(raw_sizes)
  if (any(coercion_failed)) {
    print("NA introduced by coercion when converting clusterSize to numeric.")
  }

  sum(numeric_cluster_size, na.rm = TRUE)
}
# Species richness: the number of distinct non-missing `scientificName`
# values.
#
# NA is excluded from the count because unique() keeps NA as a value of its
# own, which would otherwise add a phantom species whenever the input still
# contains unidentified rows.
#
# Args:
#   data: a data frame that may contain a `scientificName` column.
# Returns:
#   A single integer: the count of distinct species, or 0L when the column
#   does not exist (integer in both branches so the type is stable).
calculate_species_richness <- function(data) {
  if (!("scientificName" %in% names(data))) {
    return(0L) # Return 0 if the column doesn't exist
  }

  species <- data[["scientificName"]]
  length(unique(species[!is.na(species)]))
}
# Helper functions ----
# clean_data(), extract_year(), calculate_species_richness() and
# calculate_abundance() are defined earlier in this file, so nothing has to be
# loaded from disk. source() expects a file path, URL or connection; handing
# it a function object raises an error, so the helpers are only checked here.
helper_names <- c(
  "clean_data",
  "extract_year",
  "calculate_species_richness",
  "calculate_abundance"
)
missing_helpers <- helper_names[
  !vapply(helper_names, exists, logical(1), mode = "function")
]
if (length(missing_helpers) > 0) {
  stop(
    "Helper function(s) not defined: ",
    paste(missing_helpers, collapse = ", "),
    call. = FALSE
  )
}
# Build the per-file summary table ----
# Empty template that fixes the column names, so the result is still a
# well-formed zero-row data frame when no files were found.
summary_df <- data.frame(FileName = character(),
                         Abundance = integer(),
                         SpeciesRichness = integer(),
                         Year = integer())

# Collect one single-row data frame per file and bind them once after the
# loop; calling rbind() on every iteration re-copies the growing table.
summary_rows <- vector("list", length(countdata_files))

for (i in seq_along(countdata_files)) {
  file <- countdata_files[[i]]
  print(paste("Processing:", file))
  data <- readr::read_csv(file, show_col_types = FALSE)
  print("Data read successfully.")

  # Apply cleaning function
  cleaned_data <- clean_data(data)
  print(paste("Data after cleaning:", nrow(cleaned_data), "rows remaining."))

  # Convert clusterSize to numeric if necessary
  cleaned_data$clusterSize <- as.numeric(cleaned_data$clusterSize)
  if (any(is.na(cleaned_data$clusterSize))) {
    print("NA values found in clusterSize after conversion to numeric.")
  }

  # Take the year from the file name only, so a date-like token in a parent
  # directory cannot be matched ahead of the file's own "YYYY-MM" stamp.
  year <- extract_year(basename(file))
  abundance <- calculate_abundance(cleaned_data)
  species_richness <- calculate_species_richness(cleaned_data)

  # NOTE(review): a file with zero rows left after cleaning is recorded with
  # Abundance 0 and SpeciesRichness 0 - confirm that is the intended reading.
  summary_rows[[i]] <- data.frame(FileName = basename(file),
                                  Abundance = abundance,
                                  SpeciesRichness = species_richness,
                                  Year = year)
}

# Single bind; starting from the template keeps the columns when there are no
# rows to add.
summary_df <- do.call(rbind, c(list(summary_df), summary_rows))
## [1] "Processing: /Users/maryamnouri-aiin/Desktop/githubRepos/homework11/CleanedData/NEON.D18.BARR.DP1.10003.001.2017-07.basic.20240127T000425Z.RELEASE-2024/NEON.D18.BARR.DP1.10003.001.brd_countdata.2017-07.basic.20231227T060201Z.csv"
## [1] "Data read successfully."
## [1] "Data after cleaning: 434 rows remaining."
## [1] "Processing: /Users/maryamnouri-aiin/Desktop/githubRepos/homework11/CleanedData/NEON.D18.BARR.DP1.10003.001.2018-07.basic.20240127T000425Z.RELEASE-2024/NEON.D18.BARR.DP1.10003.001.brd_countdata.2018-07.basic.20231228T183224Z.csv"
## [1] "Data read successfully."
## [1] "Data after cleaning: 465 rows remaining."
## [1] "Processing: /Users/maryamnouri-aiin/Desktop/githubRepos/homework11/CleanedData/NEON.D18.BARR.DP1.10003.001.2019-06.basic.20240127T000425Z.RELEASE-2024/NEON.D18.BARR.DP1.10003.001.brd_countdata.2019-06.basic.20231227T174358Z.csv"
## [1] "Data read successfully."
## [1] "Data after cleaning: 774 rows remaining."
## [1] "Processing: /Users/maryamnouri-aiin/Desktop/githubRepos/homework11/CleanedData/NEON.D18.BARR.DP1.10003.001.2021-06.basic.20240127T000425Z.RELEASE-2024/NEON.D18.BARR.DP1.10003.001.brd_countdata.2021-06.basic.20231228T013730Z.csv"
## [1] "Data read successfully."
## [1] "Data after cleaning: 655 rows remaining."
## [1] "Processing: /Users/maryamnouri-aiin/Desktop/githubRepos/homework11/CleanedData/NEON.D18.BARR.DP1.10003.001.2022-06.basic.20240127T000425Z.RELEASE-2024/NEON.D18.BARR.DP1.10003.001.brd_countdata.2022-06.basic.20231229T042310Z.csv"
## [1] "Data read successfully."
## [1] "Data after cleaning: 0 rows remaining."
# Print the final summary data frame: one row per processed file, with the
# columns FileName, Abundance, SpeciesRichness and Year
print(summary_df)
## FileName
## 1 NEON.D18.BARR.DP1.10003.001.brd_countdata.2017-07.basic.20231227T060201Z.csv
## 2 NEON.D18.BARR.DP1.10003.001.brd_countdata.2018-07.basic.20231228T183224Z.csv
## 3 NEON.D18.BARR.DP1.10003.001.brd_countdata.2019-06.basic.20231227T174358Z.csv
## 4 NEON.D18.BARR.DP1.10003.001.brd_countdata.2021-06.basic.20231228T013730Z.csv
## 5 NEON.D18.BARR.DP1.10003.001.brd_countdata.2022-06.basic.20231229T042310Z.csv
## Abundance SpeciesRichness Year
## 1 545 28 2017
## 2 738 23 2018
## 3 1113 31 2019
## 4 1142 30 2021
## 5 0 0 2022
# Abundance per year as a connected line with a marker at each year.
# `group = 1` puts every row in one group so a single line joins all points.
ggplot(
  data = summary_df,
  mapping = aes(x = Year, y = Abundance, group = 1)
) +
  geom_line() +
  geom_point() +
  labs(title = "Abundance Over Years", x = "Year", y = "Abundance") +
  theme_minimal()
# Species richness per year, drawn in blue as a connected line with a marker
# at each year. `group = 1` joins all rows into a single line.
ggplot(
  data = summary_df,
  mapping = aes(x = Year, y = SpeciesRichness, group = 1)
) +
  geom_line(color = "blue") +
  geom_point(color = "blue") +
  labs(
    title = "Species Richness Over Years",
    x = "Year",
    y = "Species Richness"
  ) +
  theme_minimal()
# Reshape to long format: one row per (file, metric) pair. The metric label
# goes in `name` and its value in `value` (pivot_longer()'s default output
# column names, spelled out here because the plot below refers to them).
summary_df_long <- tidyr::pivot_longer(
  summary_df,
  cols = c("Abundance", "SpeciesRichness"),
  names_to = "name",
  values_to = "value"
)

# One panel per metric; `scales = "free_y"` gives each panel its own y-axis
# range.
ggplot(
  data = summary_df_long,
  mapping = aes(x = Year, y = value, group = name, color = name)
) +
  geom_line() +
  geom_point() +
  facet_wrap(~ name, scales = "free_y") +
  labs(
    title = "Abundance and Species Richness Over Years",
    x = "Year",
    y = "Value"
  ) +
  theme_minimal()