base_dir <- "~/Desktop/UVM/Master's/Spring 2025/Computational Biology/BeanBio6100/count_data"

countdata_files <- list.files(
  path = base_dir,
  pattern = "_count_data\\.[Cc][Ss][Vv]$",  # e.g. 2015_count_data.csv
  full.names = TRUE
)

print(countdata_files)
## [1] "/Users/taylorbean/Desktop/UVM/Master's/Spring 2025/Computational Biology/BeanBio6100/count_data/2015_count_data.csv"
## [2] "/Users/taylorbean/Desktop/UVM/Master's/Spring 2025/Computational Biology/BeanBio6100/count_data/2016_count_data.csv"
## [3] "/Users/taylorbean/Desktop/UVM/Master's/Spring 2025/Computational Biology/BeanBio6100/count_data/2017_count_data.csv"
## [4] "/Users/taylorbean/Desktop/UVM/Master's/Spring 2025/Computational Biology/BeanBio6100/count_data/2018_count_data.csv"
## [5] "/Users/taylorbean/Desktop/UVM/Master's/Spring 2025/Computational Biology/BeanBio6100/count_data/2019_count_data.csv"
## [6] "/Users/taylorbean/Desktop/UVM/Master's/Spring 2025/Computational Biology/BeanBio6100/count_data/2020_count_data.csv"
## [7] "/Users/taylorbean/Desktop/UVM/Master's/Spring 2025/Computational Biology/BeanBio6100/count_data/2021_count_data.csv"
## [8] "/Users/taylorbean/Desktop/UVM/Master's/Spring 2025/Computational Biology/BeanBio6100/count_data/2022_count_data.csv"
## [9] "/Users/taylorbean/Desktop/UVM/Master's/Spring 2025/Computational Biology/BeanBio6100/count_data/2023_count_data.csv"

Define helper functions

clean_data <- function(df) {
  df %>% drop_na(scientificName, clusterSize)
}

extract_year <- function(file_path) {
  str_extract(basename(file_path), "\\d{4}")
}

calculate_abundance <- function(df) {
  sum(df$clusterSize, na.rm = TRUE)
}

calculate_richness <- function(df) {
  n_distinct(df$scientificName)
}

loop through files and build data frames

summary_stats <- data.frame()
all_year_data <- data.frame()

for (file in countdata_files) {
  df <- read_csv(file, show_col_types = FALSE)
  df <- clean_data(df)

  # Skip files with missing species or count data
  if (nrow(df) == 0 || all(is.na(df$clusterSize)) || all(is.na(df$scientificName))) {
    cat("Skipping file (no valid data):", basename(file), "\n")
    next
  }

  # Calculate year, abundance, richness
  year <- as.integer(extract_year(file))
  abundance <- calculate_abundance(df)
  richness <- calculate_richness(df)

  # Skip if either stat is missing
  if (is.na(abundance) || is.na(richness)) {
    cat("Skipping file (NA in abundance/richness):", basename(file), "\n")
    next
  }

  # Debug print
  cat("✅ Included — Year:", year, "| Abundance:", abundance, "| Richness:", richness, "\n")

  summary_stats <- rbind(summary_stats, data.frame(
    file = basename(file),
    year = year,
    abundance = abundance,
    richness = richness
  ))

  all_year_data <- rbind(all_year_data, data.frame(
    year = year,
    abundance = abundance,
    richness = richness
  ))
}
## ✅ Included — Year: 2015 | Abundance: 459 | Richness: 40 
## ✅ Included — Year: 2016 | Abundance: 696 | Richness: 38 
## ✅ Included — Year: 2017 | Abundance: 411 | Richness: 34 
## ✅ Included — Year: 2018 | Abundance: 515 | Richness: 36 
## ✅ Included — Year: 2019 | Abundance: 410 | Richness: 43 
## ✅ Included — Year: 2020 | Abundance: 489 | Richness: 45 
## ✅ Included — Year: 2021 | Abundance: 920 | Richness: 49 
## ✅ Included — Year: 2022 | Abundance: 592 | Richness: 38 
## ✅ Included — Year: 2023 | Abundance: 523 | Richness: 41

Regression analysis

# Ensure numeric
all_year_data$abundance <- as.numeric(all_year_data$abundance)
all_year_data$richness <- as.numeric(all_year_data$richness)

# Remove NAs
cleaned_data <- all_year_data %>% drop_na(abundance, richness)

# Print for verification
cat("Rows in all_year_data:", nrow(all_year_data), "\n")
## Rows in all_year_data: 9
cat("Rows in cleaned_data:", nrow(cleaned_data), "\n")
## Rows in cleaned_data: 9
print(head(cleaned_data))
##   year abundance richness
## 1 2015       459       40
## 2 2016       696       38
## 3 2017       411       34
## 4 2018       515       36
## 5 2019       410       43
## 6 2020       489       45
# Run model if data exists
if (nrow(cleaned_data) > 1) {
  model <- lm(richness ~ abundance, data = cleaned_data)
  summary(model)
} else {
  print("Not enough valid data to run regression.")
}
## 
## Call:
## lm(formula = richness ~ abundance, data = cleaned_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.501 -3.819  1.011  3.180  5.566 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 32.187651   5.354615   6.011 0.000536 ***
## abundance    0.014818   0.009264   1.600 0.153738    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.269 on 7 degrees of freedom
## Multiple R-squared:  0.2677, Adjusted R-squared:  0.163 
## F-statistic: 2.558 on 1 and 7 DF,  p-value: 0.1537

Histogram

# Use cleaned data if available
plot_data <- if (nrow(cleaned_data) > 0) cleaned_data else all_year_data

# Ensure numeric columns
plot_data$abundance <- as.numeric(plot_data$abundance)
plot_data$richness <- as.numeric(plot_data$richness)

cat("Year:", year, "| Abundance:", abundance, "| Richness:", richness, "\n")
## Year: 2023 | Abundance: 523 | Richness: 41
# Filter out NA values
plot_data <- plot_data %>% filter(!is.na(abundance) & !is.na(richness))

# Debug output
cat("Rows available for plotting:", nrow(plot_data), "\n")
## Rows available for plotting: 9
# Plot if data exists
if (nrow(plot_data) > 0) {
  ggplot(plot_data, aes(x = abundance)) +
    geom_histogram(binwidth = 10, fill = "skyblue", color = "black") +
    labs(title = "Histogram of Bird Abundance", x = "Abundance", y = "Frequency")

  ggplot(plot_data, aes(x = richness)) +
    geom_histogram(binwidth = 2, fill = "lightgreen", color = "black") +
    labs(title = "Histogram of Species Richness", x = "Richness", y = "Frequency")
} else {
  print("Still no valid data available to plot histograms.")
}

```