#' Clear the environment: rm(list = ls())

Packages

Developing functions to batch-process the BART dataset

# Directory that holds the downloaded data files.
data_dir <- "/Users/maryamnouri-aiin/Desktop/githubRepos/homework11/CleanedData"

# List all files recursively with full paths
files <- list.files(data_dir, full.names = TRUE, recursive = TRUE)

# Keep only CSV files whose *file name* contains "countdata". Matching on
# basename() rather than the full path stops a directory that happens to be
# called e.g. "countdata_raw" from pulling in unrelated CSV files.
countdata_files <- files[grepl("countdata.*\\.csv$", basename(files))]

# Uncomment to inspect which files were collected
# print(countdata_files)

# Peek at the column names of the first file, or report (once) that nothing
# matched so the path/pattern can be corrected.
if (length(countdata_files) > 0) {
  example_data <- readr::read_csv(countdata_files[[1]])
  print(colnames(example_data))
} else {
  print("No countdata files found. Check the directory path and file pattern.")
}
## Rows: 553 Columns: 23
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (17): uid, namedLocation, domainID, siteID, plotID, plotType, eventID, ...
## dbl   (4): pointID, pointCountMinute, observerDistance, clusterSize
## lgl   (1): identificationHistoryID
## dttm  (1): startDate
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
##  [1] "uid"                     "namedLocation"          
##  [3] "domainID"                "siteID"                 
##  [5] "plotID"                  "plotType"               
##  [7] "pointID"                 "startDate"              
##  [9] "eventID"                 "pointCountMinute"       
## [11] "targetTaxaPresent"       "taxonID"                
## [13] "scientificName"          "taxonRank"              
## [15] "vernacularName"          "observerDistance"       
## [17] "detectionMethod"         "visualConfirmation"     
## [19] "sexOrAge"                "clusterSize"            
## [21] "clusterCode"             "identifiedBy"           
## [23] "identificationHistoryID"
# Define functions
clean_data <- function(data) {
  # Keep only the rows where both 'scientificName' and 'clusterSize' are
  # present (not NA). Written in base R so the function does not depend on
  # dplyr being attached when it is called.
  #
  # data:    a data frame (or tibble) with 'scientificName' and 'clusterSize'
  #          columns.
  # returns: `data` restricted to the complete rows, in their original order,
  #          with every column kept.
  # errors:  stops if either required column is absent.
  required_cols <- c("scientificName", "clusterSize")
  missing_cols <- setdiff(required_cols, names(data))
  if (length(missing_cols) > 0) {
    stop(
      "clean_data(): missing required column(s): ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  keep <- !is.na(data[["scientificName"]]) & !is.na(data[["clusterSize"]])
  # drop = FALSE keeps the result a data frame whatever the column count.
  data[keep, , drop = FALSE]
}
# clean_data <- function(data) {
#   tidyr::drop_na(data)  # using tidyr for drop_na()
# }

extract_year <- function(filename) {
  # Extract the four-digit year that precedes "-MM" in a file name.
  #
  # filename: a file name or a full path (character).
  # returns:  the year as an integer, or NA_integer_ if no "YYYY-MM" pattern
  #           is found.
  #
  # The file's own name is searched first, so a dated directory earlier in the
  # path (e.g. ".../2024-01/...") cannot override the year in the file name.
  # The full path is only used as a fallback when the file name has no match.
  filename <- as.character(filename)

  # (?<!\d) rejects the tail of a longer digit run (e.g. "120017-07"), and
  # (?=-\d{2}) requires the "-MM" suffix without consuming it.
  year_pattern <- "(?<!\\d)\\d{4}(?=-\\d{2})"

  first_year <- function(text) {
    hits <- regmatches(text, regexpr(year_pattern, text, perl = TRUE))
    if (length(hits) > 0 && !is.na(hits[1])) {
      as.integer(hits[1])
    } else {
      NA_integer_  # no valid year in this text
    }
  }

  year <- first_year(basename(filename))
  if (is.na(year)) {
    year <- first_year(filename)
  }
  year
}


# Example filename: the year is the four digits immediately before "-07"
example_file <- "NEON.D18.BARR.DP1.10003.001.brd_countdata.2017-07.basic.20231227T060201Z.csv"
extract_year(example_file)  # prints 2017
## [1] 2017
calculate_abundance <- function(data) {
  # Total of the 'clusterSize' column, ignoring missing values.
  #
  # data:    a data frame; the 'clusterSize' column is optional.
  # returns: the numeric sum of clusterSize, or 0 when the column is absent.

  # Nothing to total without the column.
  if (!("clusterSize" %in% names(data))) {
    return(0)
  }

  # Coerce first so sum() also works when the column was read as text.
  counts <- as.numeric(data$clusterSize)
  has_missing <- any(is.na(counts))
  if (has_missing) {
    print("NA introduced by coercion when converting clusterSize to numeric.")
  }

  sum(counts, na.rm = TRUE)
}

calculate_species_richness <- function(data) {
  # Number of distinct values in the 'scientificName' column.
  #
  # data:    a data frame; the 'scientificName' column is optional.
  # returns: the count of distinct names, or 0 when the column is absent.

  # No names to count without the column.
  if (!("scientificName" %in% names(data))) {
    return(0)
  }

  # Each first occurrence marks one distinct name.
  first_occurrence <- !duplicated(data$scientificName)
  sum(first_occurrence)
}

List of the functions

# check helper functions ----

# source() expects a file path or connection, not a function object, so
# calling it on the functions themselves (e.g. source(clean_data)) fails.
# The helpers are defined earlier in this script, so nothing needs to be
# sourced; just confirm they are available before the batch run starts.
stopifnot(
  is.function(clean_data),
  is.function(extract_year),
  is.function(calculate_species_richness),
  is.function(calculate_abundance)
)

Printing the results and visualizing the data

# Initialize summary data frame (this empty template also serves as the result
# when no countdata files were found)
summary_df <- data.frame(FileName = character(),
                         Abundance = integer(),
                         SpeciesRichness = integer(),
                         Year = integer())

# One single-row data frame per file. The rows are collected in a preallocated
# list and bound once after the loop, instead of re-copying summary_df with
# rbind() on every iteration.
summary_rows <- vector("list", length(countdata_files))

for (i in seq_along(countdata_files)) {
  file <- countdata_files[[i]]
  print(paste("Processing:", file))
  data <- readr::read_csv(file, show_col_types = FALSE)
  print("Data read successfully.")

  # Apply cleaning function
  cleaned_data <- clean_data(data)
  print(paste("Data after cleaning:", nrow(cleaned_data), "rows remaining."))

  # A file with no usable rows still gets a summary row of zeros; warn so the
  # zeros are not mistaken for a genuine absence of observations.
  if (nrow(cleaned_data) == 0) {
    warning("No usable rows in ", basename(file),
            "; its Abundance and SpeciesRichness are reported as 0.",
            call. = FALSE)
  }

  # Convert clusterSize to numeric if necessary
  cleaned_data$clusterSize <- as.numeric(cleaned_data$clusterSize)
  if (any(is.na(cleaned_data$clusterSize))) {
    print("NA values found in clusterSize after conversion to numeric.")
  }

  # Extract year, calculate abundance and species richness
  year <- extract_year(file)
  abundance <- calculate_abundance(cleaned_data)
  species_richness <- calculate_species_richness(cleaned_data)

  # Store this file's summary row
  summary_rows[[i]] <- data.frame(FileName = basename(file),
                                  Abundance = abundance,
                                  SpeciesRichness = species_richness,
                                  Year = year)
}

# Bind the template and all per-file rows in a single call
summary_df <- do.call(rbind, c(list(summary_df), summary_rows))
## [1] "Processing: /Users/maryamnouri-aiin/Desktop/githubRepos/homework11/CleanedData/NEON.D18.BARR.DP1.10003.001.2017-07.basic.20240127T000425Z.RELEASE-2024/NEON.D18.BARR.DP1.10003.001.brd_countdata.2017-07.basic.20231227T060201Z.csv"
## [1] "Data read successfully."
## [1] "Data after cleaning: 434 rows remaining."
## [1] "Processing: /Users/maryamnouri-aiin/Desktop/githubRepos/homework11/CleanedData/NEON.D18.BARR.DP1.10003.001.2018-07.basic.20240127T000425Z.RELEASE-2024/NEON.D18.BARR.DP1.10003.001.brd_countdata.2018-07.basic.20231228T183224Z.csv"
## [1] "Data read successfully."
## [1] "Data after cleaning: 465 rows remaining."
## [1] "Processing: /Users/maryamnouri-aiin/Desktop/githubRepos/homework11/CleanedData/NEON.D18.BARR.DP1.10003.001.2019-06.basic.20240127T000425Z.RELEASE-2024/NEON.D18.BARR.DP1.10003.001.brd_countdata.2019-06.basic.20231227T174358Z.csv"
## [1] "Data read successfully."
## [1] "Data after cleaning: 774 rows remaining."
## [1] "Processing: /Users/maryamnouri-aiin/Desktop/githubRepos/homework11/CleanedData/NEON.D18.BARR.DP1.10003.001.2021-06.basic.20240127T000425Z.RELEASE-2024/NEON.D18.BARR.DP1.10003.001.brd_countdata.2021-06.basic.20231228T013730Z.csv"
## [1] "Data read successfully."
## [1] "Data after cleaning: 655 rows remaining."
## [1] "Processing: /Users/maryamnouri-aiin/Desktop/githubRepos/homework11/CleanedData/NEON.D18.BARR.DP1.10003.001.2022-06.basic.20240127T000425Z.RELEASE-2024/NEON.D18.BARR.DP1.10003.001.brd_countdata.2022-06.basic.20231229T042310Z.csv"
## [1] "Data read successfully."
## [1] "Data after cleaning: 0 rows remaining."
# Print the final summary data frame (one row per processed countdata file)
print(summary_df)
##                                                                       FileName
## 1 NEON.D18.BARR.DP1.10003.001.brd_countdata.2017-07.basic.20231227T060201Z.csv
## 2 NEON.D18.BARR.DP1.10003.001.brd_countdata.2018-07.basic.20231228T183224Z.csv
## 3 NEON.D18.BARR.DP1.10003.001.brd_countdata.2019-06.basic.20231227T174358Z.csv
## 4 NEON.D18.BARR.DP1.10003.001.brd_countdata.2021-06.basic.20231228T013730Z.csv
## 5 NEON.D18.BARR.DP1.10003.001.brd_countdata.2022-06.basic.20231229T042310Z.csv
##   Abundance SpeciesRichness Year
## 1       545              28 2017
## 2       738              23 2018
## 3      1113              31 2019
## 4      1142              30 2021
## 5         0               0 2022
# Plot 1: total abundance per year as a line with points.
# group = 1 puts every row in a single group so geom_line() joins them.
ggplot(summary_df, aes(x = Year, y = Abundance, group = 1)) +
  geom_line() +
  geom_point() +
  labs(title = "Abundance Over Years",
       x = "Year",
       y = "Abundance") +
  theme_minimal()

# Plot 2: species richness per year, same layout as plot 1 but drawn in blue.
ggplot(summary_df, aes(x = Year, y = SpeciesRichness, group = 1)) +
  geom_line(color = "blue") +
  geom_point(color = "blue") +
  labs(title = "Species Richness Over Years",
       x = "Year",
       y = "Species Richness") +
  theme_minimal()

# Reshape to long format: one row per file and metric. No names_to/values_to
# is given, so the metric label and its value land in the default `name` and
# `value` columns used by the plot below.
summary_df_long <- tidyr::pivot_longer(summary_df, cols = c("Abundance", "SpeciesRichness"))

# Plot 3: both metrics in one figure, one facet per metric.
# scales = "free_y" gives each facet its own y-axis range.
ggplot(summary_df_long, aes(x = Year, y = value, group = name, color = name)) +
  geom_line() +
  geom_point() +
  facet_wrap(~ name, scales = "free_y") +
  labs(title = "Abundance and Species Richness Over Years",
       x = "Year",
       y = "Value") +
  theme_minimal()