qPCR analysis

.Rmd script

Load libraries

library(ggplot2) ## for plotting
library(tidyverse) ## for data table manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr) ## for reading in tsv files
library(readxl) ## for reading in excel files
library(stringr) ## for data transformation
library(strex) ## for data transformation
library(writexl) ## for excel output
library(purrr) ## for data transformation
library(ggrepel)  ## For geom_text_repel
library(drc) ## for LOD calculations
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## 'drc' has been loaded.
## 
## Please cite R and 'drc' if used for a publication,
## for references type 'citation()' and 'citation('drc')'.
## 
## 
## Attaching package: 'drc'
## 
## The following objects are masked from 'package:stats':
## 
##     gaussian, getInitial
library(ggpubr) ## for stat_regline_equation

Evaluating standard curve

Reading in data

### USER EDITS:
## 1. Replace path of file with your own standard curve
std_curve <- read.csv("example standard curve.csv", skip = 19) %>%
  dplyr::rename(St_Quant = 7) %>% ## column 7 holds the starting quantity

  ## Calculate mean Cq
  dplyr::group_by(Sample) %>%
  mutate(Cq_mean = mean(Cq, na.rm=TRUE))

Calculating standard curve metrics

# Fit a linear model to calculate slope and intercept
linear_model <- lm(Cq_mean ~ log10(St_Quant), data = std_curve)

# Extract slope and y-intercept
slope <- coef(linear_model)[2]       # Slope (m)
y_intercept <- coef(linear_model)[1] # Y-intercept (b)
efficiency <- (-1+(10^(-1/slope))) ## amplification efficiency: E = 10^(-1/slope) - 1
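
As a quick sanity check (a small addition, not part of the original script), the efficiency can be reported as a percentage: a slope near -3.32 corresponds to ~100% efficiency, and assays are generally considered acceptable in roughly the 90-110% range.

## Efficiency as a percentage, plus the R^2 of the curve fit
round(efficiency * 100, 1)
summary(linear_model)$r.squared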

## Standard curve calculations (assuming std_curve is already grouped by Sample)
std_curve <- std_curve %>%

  ## calculate detection rates (number of Cqs present / number of replicates)
  mutate(detection_rate = sum(!is.na(Cq)) / n(),

  ## calculate Cq stats - SD, and CV
         Cq_sd = sd(Cq, na.rm=TRUE),
         #Cq_cv = (Cq_sd / Cq_mean) * 100 ## old method 
         Cq_cv = sqrt(((1+efficiency) ^ ((Cq_sd^2) * log(1+efficiency))) - 1)) %>%
  ungroup()
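
The Cq_cv line above converts the standard deviation of Cq into a coefficient of variation on the copy-number scale using the assay efficiency, i.e. CV = sqrt((1 + E)^(sd^2 * ln(1 + E)) - 1), the form used in common eDNA LOD/LOQ guidance (e.g., Klymus et al. 2020). A toy example with hypothetical numbers:

## Hypothetical example: a Cq SD of 0.5 at ~95% efficiency gives a CV of ~0.34
E_demo  <- 0.95
sd_demo <- 0.5
sqrt(((1 + E_demo)^((sd_demo^2) * log(1 + E_demo))) - 1)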

Confirming negatives were clean and filtering them out

## all output should have NAs in Cq column
## if any negatives have values, then pause and re-run those samples if needed
std_curve %>% filter(Sample == "neg")
## # A tibble: 3 × 11
##   Well  Fluor Target  Content Sample    Cq St_Quant Cq_mean detection_rate Cq_sd
##   <chr> <chr> <chr>   <chr>   <chr>  <dbl>    <dbl>   <dbl>          <dbl> <dbl>
## 1 F12   SYBR  Library NTC     neg       NA       NA     NaN              0    NA
## 2 G12   SYBR  Library NTC     neg       NA       NA     NaN              0    NA
## 3 H12   SYBR  Library NTC     neg       NA       NA     NaN              0    NA
## # ℹ 1 more variable: Cq_cv <dbl>
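
Before dropping the negatives, an optional guard (a sketch, not part of the original workflow) can halt the script if any NTC amplified, rather than relying on the visual check above:

## Stop early if any NTC produced a Cq value
stopifnot(all(is.na(std_curve$Cq[std_curve$Sample == "neg"])))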
std_curve <- std_curve %>% filter(!Sample == "neg")

Calculating LOD and LOQ based on discrete thresholds

Making list of standard curve quantities

# Assuming std_curve is your data frame
quants <- std_curve %>% dplyr::select(St_Quant) %>% distinct() %>% arrange(desc(St_Quant))

# Creating a factored list of these standard curve values (not used again in this document)
quant_list <- as.factor(quants$St_Quant)

Find the Limit of Detection (LOD)

# Find the highest standard quantity whose detection rate falls below 0.95
first_below_LOD_threshold <- std_curve %>% filter(detection_rate < 0.95) %>% arrange(desc(St_Quant)) %>%
  slice(1) %>% dplyr::select(St_Quant)

# Get the standard quantity of that first value below the threshold
below_threshold_LOD_quant <- first_below_LOD_threshold$St_Quant[1]

# Find the index of this quantity in the ordered list
LOD_index <- which(quants$St_Quant == below_threshold_LOD_quant)

# LOD = the next-highest standard, i.e., the lowest quantity still meeting the threshold
LOD <- quants$St_Quant[LOD_index - 1]
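
One edge case worth guarding against (a sketch, not in the original script): if every standard meets the 0.95 threshold, the filter above returns zero rows and LOD comes back length-0. Falling back to the lowest standard tested keeps downstream plotting from failing:

## Fallback if no standard fell below the detection threshold
if (nrow(first_below_LOD_threshold) == 0) {
  LOD <- min(quants$St_Quant)
}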

Find the Limit of Quantification (LOQ)

# Find the highest standard quantity whose CV exceeds the 0.35 threshold
first_above_LOQ_threshold <- std_curve %>% filter(Cq_cv > 0.350) %>% arrange(desc(St_Quant)) %>%
  slice(1) %>% dplyr::select(St_Quant)

# Get the standard quantity of that first value above the threshold
above_threshold_LOQ_quant <- first_above_LOQ_threshold$St_Quant[1]

# Find the index of this quantity in the ordered list
LOQ_index <- which(quants$St_Quant == above_threshold_LOQ_quant)

# LOQ = the next-highest standard, i.e., the lowest quantity with CV at or below 0.35
LOQ <- quants$St_Quant[LOQ_index - 1]
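
The same guard applies here (again a sketch): if no standard exceeds the 0.35 CV threshold, fall back to the lowest standard so LOQ is defined.

## Fallback if no standard exceeded the CV threshold
if (nrow(first_above_LOQ_threshold) == 0) {
  LOQ <- min(quants$St_Quant)
}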

Using modeling to calculate LOD and LOQ

## Candidate models from the original multi-target script (DAT2 and Targets are not defined in this document; kept commented for reference)
# 
# LOQ1 <- nls(Cq.CV~SSasymp(log10(Standards),Asym,R0,lrc),
#                   data=DAT2[DAT2$Target==Targets[i],])
# 
# 
# LOQ2 <- lm(Cq.CV~log10(Standards),data=DAT2[DAT2$Target==Targets[i],])
# 
# 
# LOQ3 <- lm(Cq.CV~poly(log10(Standards),2),data=DAT2[DAT2$Target==Targets[i],])
# 
# LOQ4 <- lm(Cq.CV~poly(log10(Standards),3),data=DAT2[DAT2$Target==Targets[i],])
# 
# LOQ5 <- lm(Cq.CV~poly(log10(Standards),4),data=DAT2[DAT2$Target==Targets[i],])
# 
# LOQ6 <- lm(Cq.CV~poly(log10(Standards),5),data=DAT2[DAT2$Target==Targets[i],])
# 
# LOQ7 <- lm(Cq.CV~poly(log10(Standards),6),data=DAT2[DAT2$Target==Targets[i],])

## find residual standard error of each fit and pick the best fitting model
## (a runnable sketch follows below)
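
A runnable version of that comparison, adapted to the std_curve object used in this document (DAT2 and Targets in the commented code belong to a separate multi-target script and are not defined here):

## Candidate fits of CV as a function of log10 standard quantity
cv_dat <- std_curve %>%
  filter(!is.na(Cq_cv)) %>%
  distinct(St_Quant, Cq_cv)

cv_models <- list(
  linear = lm(Cq_cv ~ log10(St_Quant), data = cv_dat),
  poly2  = lm(Cq_cv ~ poly(log10(St_Quant), 2), data = cv_dat),
  poly3  = lm(Cq_cv ~ poly(log10(St_Quant), 3), data = cv_dat)
)

## Residual standard error of each fit; the smallest indicates the best model
sapply(cv_models, sigma)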

Plotting standard curve

std_curve %>% 
  ## Keep only standards detected in >50% of replicates when fitting the curve
  filter(detection_rate > 0.50) %>%

  ## Plotting that std. curve 
  ggplot(., aes(x = log10(St_Quant), y = Cq)) +

  geom_smooth(method = 'lm', se = FALSE, color = "darkred") +

  stat_regline_equation(size = 4, color = 'darkred',
                        aes(label = after_stat(rr.label)), hjust=1,
                        label.x.npc = "right", label.y.npc = "top") +

  geom_point(color = 'black', fill = 'white', shape = 21, size = 3.5, alpha=0.8) +

  ## ADDING LOD LINE -- if no line shows up, the entire curve sits above the LOD (all points detectable)
  geom_vline(xintercept = log10(LOD), linetype = "dashed", color = "grey40") +
  annotate("text", x = log10(LOD), y = max(std_curve$Cq, na.rm = TRUE) * 0.3,
           label = paste("LOD =", LOD, "copies"),
           angle = 90, vjust = -0.5, hjust = -0.1, color = "grey40", size = 4) +

  ## ADDING LOQ LINE -- if no line shows up, the entire curve sits above the LOQ (fully quantitative)
  geom_vline(xintercept = log10(LOQ), linetype = "dashed", color = "grey40") +
  annotate("text", x = log10(LOQ), y = max(std_curve$Cq, na.rm = TRUE) * 0.3,
           label = paste("LOQ =", LOQ, "copies"),
           angle = 90, vjust = -0.5, hjust = -0.1, color = "grey40", size = 4) +

  theme_bw() +

  labs(
    x = "Log10 standard quantity (copies)",
    y = "Cq"
  ) +

  theme(axis.text.y = element_text(size=10, color="grey20"),
        axis.text.x = element_text(size=10, color="grey20"),
        axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0), size=12, face="bold"),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), size=12, face="bold"))

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_regline_equation()`).

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave("example output/Standard_curve.png", width=6, height=4)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_regline_equation()`).

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

Calculating slope and y-intercept

These values feed into the copy-number calculations later in the workflow.

# Fit a linear model to calculate slope and intercept
linear_model <- lm(Cq_mean ~ log10(St_Quant), data = std_curve)

# Extract slope and y-intercept
slope <- coef(linear_model)[2]      # Slope (m)
y_intercept <- coef(linear_model)[1] # Y-intercept (b)

Reading in datafiles and merging into one large dataframe

### USER EDITS:
## 1. Replace path of files 
## 2. In str_after_nth(file.ID, "results/", 1), make sure this matches the folder ending referenced in list.files

df <- 
  # list files in directory following a particular pattern
  list.files(path = 'example input/qPCR_results/', pattern = ".csv", full.names = TRUE) %>%

  # name each list element with its file path (this becomes file.ID in the next step)
  set_names(.) %>% 

  # join all files together in one data frame by file ID, skipping first 19 rows
  map_dfr(~read.csv(., skip = 19), .id = "file.ID") %>% 

  # turn file.ID into just plate information (plate.ID)
  mutate(file.ID = str_after_nth(file.ID, "results/", 1),
         file.ID = str_before_nth(file.ID, ".csv", 1)) %>%
  dplyr::rename(plate.ID = file.ID, Sample_ID = Sample) %>%

  ## RI STRIPED BASS SPECIFIC CODE FOR SAMPLE ID MISMATCHES
  mutate(Sample_ID = gsub(" #1", "_1", Sample_ID),
         Sample_ID = gsub(" #2", "_2", Sample_ID)) %>%

  mutate(Sample_ID = case_when(
    str_detect(Well, "4") & Sample_ID == "2/21/24_ONE_NTR" ~ paste0(Sample_ID, "_1"),
    str_detect(Well, "5") & Sample_ID == "2/21/24_ONE_NTR" ~ paste0(Sample_ID, "_2"),
    TRUE ~ Sample_ID
  ))

head(df)
##              plate.ID Well Fluor  Target Content       Sample_ID    Cq
## 1 26 AUG 2024 PLATE 1  A01  SYBR Library    Unkn 1/17/24_IPC_GCO    NA
## 2 26 AUG 2024 PLATE 1  A02  SYBR Library    Unkn 1/17/24_IPC_BBC    NA
## 3 26 AUG 2024 PLATE 1  A03  SYBR Library    Unkn 1/17/24_NAN_EDI    NA
## 4 26 AUG 2024 PLATE 1  A04  SYBR Library    Unkn 1/17/24_EPS_DMF 34.79
## 5 26 AUG 2024 PLATE 1  A05  SYBR Library    Unkn 1/17/24_WBJ_MHB    NA
## 6 26 AUG 2024 PLATE 1  A06  SYBR Library    Unkn 1/17/24_NBT_SAK    NA
##   Starting.Quantity..SQ.
## 1                     NA
## 2                     NA
## 3                     NA
## 4                     NA
## 5                     NA
## 6                     NA

Reading in meta df

The Sample_ID in the metadata needs to match the Sample_ID from the qPCR output.

meta <- read_xlsx("example input/client_metadata_example.xlsx") %>%

  ## RI STRIPED BASS SPECIFIC CODE FOR SAMPLE ID ISSUES
  ## For other projects, address any Sample_ID differences (Sample_ID on qPCR output needs to match meta)
  mutate(Sample_ID = gsub(" #1", "_1", Sample_ID),
         Sample_ID = gsub(" #2", "_2", Sample_ID))

Evaluating No Template Controls (NTCs; Negatives)

NTC <- df %>% 
  filter(grepl("neg", Sample_ID, ignore.case = TRUE))

## Calculate number of negatives with Cq values 
sum(!is.na(NTC$Cq))
## [1] 1
## Calculate total number of negative wells
nrow(NTC)
## [1] 120
## Calculate number of plates
length(unique(NTC$plate.ID))
## [1] 20
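
For reporting, the NTC amplification rate can be summarized directly (a small convenience addition):

## Percent of NTC wells showing any amplification
sum(!is.na(NTC$Cq)) / nrow(NTC) * 100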

Spike information

For each sample, calculate the distance from the Positive Spike-in value.

spike_samples <- df %>% 
  ## subset to spiked samples 
  filter(grepl("spike|pos", Sample_ID, ignore.case = TRUE)) %>%

  ## remove spike from SampleID column only if 'pos' is NOT in the Sample_ID
  mutate(Sample_ID = case_when(
    !grepl("pos", Sample_ID, ignore.case = TRUE) ~ str_before_nth(Sample_ID, " spike", 1),
    TRUE ~ Sample_ID
  )) %>%

  ## group by plate ID and sample ID 
  group_by(plate.ID, Sample_ID) %>%

  ## calculate mean of spikes per plate and sample 
  ## ungroup by Sample_ID so the next calculation is only done grouped by Plate 
  mutate(Filter_Cq = mean(Cq)) %>% 
  ungroup(Sample_ID) %>%

  ## calculate difference from the plate's positive-control Cq value
  ## (all 'pos' rows within a plate share the same mean, so the subtraction recycles safely)
  mutate(Cq_diff = Filter_Cq - Filter_Cq[grepl("pos", Sample_ID, ignore.case = TRUE)]) %>%
  ungroup(); spike_samples
## # A tibble: 480 × 10
##    plate.ID    Well  Fluor Target Content Sample_ID    Cq Starting.Quantity..SQ.
##    <chr>       <chr> <chr> <chr>  <chr>   <chr>     <dbl> <lgl>                 
##  1 26 AUG 202… G01   SYBR  Libra… Unkn    1/17/24_…  10.5 NA                    
##  2 26 AUG 202… G02   SYBR  Libra… Unkn    1/17/24_…  10.6 NA                    
##  3 26 AUG 202… G03   SYBR  Libra… Unkn    1/17/24_…  10.8 NA                    
##  4 26 AUG 202… G04   SYBR  Libra… Unkn    1/17/24_…  10.8 NA                    
##  5 26 AUG 202… G05   SYBR  Libra… Unkn    1/17/24_…  10.5 NA                    
##  6 26 AUG 202… G06   SYBR  Libra… Unkn    1/17/24_…  11.0 NA                    
##  7 26 AUG 202… G07   SYBR  Libra… Unkn    1/17/24_…  10.8 NA                    
##  8 26 AUG 202… G08   SYBR  Libra… Unkn    1/17/24_…  11   NA                    
##  9 26 AUG 202… G09   SYBR  Libra… Unkn    1/17/24_…  10.7 NA                    
## 10 26 AUG 202… G10   SYBR  Libra… Unkn    1/18/24_…  10.9 NA                    
## # ℹ 470 more rows
## # ℹ 2 more variables: Filter_Cq <dbl>, Cq_diff <dbl>
## Remove any lab error samples 
## RI Striped Bass only 
## removing 2/21/24 Tap Blank 1 (pipetting issue) 
spike_samples <- spike_samples %>%
  filter(!Sample_ID == "2/21/24_Tap Blank_1")

Plot those distance values and save to output folder

# Create a jitter position object (shared by points and labels so they stay aligned)
jitter_pos <- position_jitter(width = 0.45, seed = 123)

## Plot samples 
spike_samples %>% dplyr::select(Target, Sample_ID, Cq_diff) %>% distinct() %>%

  ggplot(., aes(x=Target, y = Cq_diff)) + 
  geom_hline(yintercept = c(-2, 2), linetype = "dotted", color = "grey50", linewidth = 1.5) +

  geom_jitter(aes(fill = abs(Cq_diff) > 2), 
              size = 2, alpha = 0.5, color = 'black', shape = 21, #width = 0.45,
              position = jitter_pos) +

  scale_fill_manual(values = c("TRUE" = "red", "FALSE" = "white")) +
  geom_text_repel(
    aes(label = ifelse(abs(Cq_diff) > 2, Sample_ID, "")),  position = jitter_pos,
    size = 3, box.padding = 0.5, point.padding = 0.2, force = 2
  ) +

  labs(
    x = "Sample",
    y = "Distance from Positive Control"
  ) +

  theme_bw() +

  ## keep the y-axis spanning at least [-2.5, 2.5], extending to the data range if wider
  coord_cartesian(
    ylim = c(
      min(-2.5, min(spike_samples$Cq_diff, na.rm = TRUE)),
      max(2.5, max(spike_samples$Cq_diff, na.rm = TRUE))
    )) +

  ## theme variables
    theme(panel.background=element_rect(fill='white', colour='black'),
        legend.position = "none",
        axis.text.y = element_text(size=10, color="grey20"),
        axis.text.x = element_blank(),
        axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0), size=12, face="bold"),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), size=12, face="bold"))

ggsave("example output/Inhibition.png", width=4, height=4)

Filter-level Cq, number of replicates, and copy number calculations

## Complete calculations
filters_df <- df %>% 
  ## subset out the spiked samples 
  filter(!grepl("spike|pos|neg", Sample_ID, ignore.case = TRUE)) %>%

  ## group by plate ID and sample ID 
  group_by(plate.ID, Sample_ID) %>%

  ## calculate mean of Cq per plate and sample, replacing NaN with NA
  mutate(Filter_Cq = if_else(is.nan(mean(Cq, na.rm = TRUE)), NA_real_, mean(Cq, na.rm = TRUE))) %>%

  ## calculate # of replicates 
  mutate(Filter_Num_Replicates = sum(!is.na(Cq))) %>%

  ## calculate copy number 
  mutate(Filter_Copy_Num = 10^((Filter_Cq-y_intercept)/slope)) %>%

  ## summarize df with specific columns 
  dplyr::select(plate.ID, Sample_ID, Filter_Cq, Filter_Num_Replicates, Filter_Copy_Num) %>% 
  distinct() %>% ungroup()
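
The copy-number step inverts the standard curve equation Cq = slope * log10(quantity) + intercept. A worked example with hypothetical values:

## Hypothetical values: slope = -3.4, y-intercept = 38, observed Cq = 28
10^((28 - 38) / -3.4)  # ~873 copies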

Addressing duplicate samples (a.k.a., re-runs)

This code assumes the user always wants the most recent plate when a sample is re-run. If this is not the case, chat with the Fisheries team to make sure the code reflects the user’s needs.

filters_df <- filters_df %>%
  ## Separating Plate.ID into Date and Plate Number
  separate(plate.ID, c("Plate_date", "Plate_number"), sep = " PLATE") %>%

  ## Change Date column to a Date format 
  mutate(Plate_date = dmy(Plate_date)) %>%

  ## Group by Sample_ID and keep only the row with the most recent date
  group_by(Sample_ID) %>%
  slice_max(Plate_date, n = 1, with_ties = FALSE) %>% ungroup()

## confirm the number of rows matches the number of unique Sample IDs (output should be TRUE)
nrow(filters_df) == length(unique(filters_df$Sample_ID))
## [1] TRUE

Combining with meta and collapsing by sample

samples_df <- filters_df %>% right_join(meta, ., by = "Sample_ID") %>%
  group_by(Date, Sample_Location) %>%

  ## mean Ct value across filters, replacing NaN (no detections) with NA
  mutate(`Mean Ct` = if_else(is.nan(mean(Filter_Cq, na.rm = TRUE)), 
                                    NA_real_, mean(Filter_Cq, na.rm = TRUE)),

         ## take highest value of number of replicates
         `Number of Replicates` = max(Filter_Num_Replicates, na.rm=TRUE),

         ## sum the copy-number values across filters (NA if nothing was detected)
         `Mean Copy Number` = ifelse(sum(Filter_Copy_Num, na.rm=TRUE) == 0, NA, 
                            sum(Filter_Copy_Num, na.rm=TRUE))
           ) %>%

  ungroup() %>%

  ## summarizing df 
  dplyr::select(Date, Sample_Location, Sample_Type, Number_of_Filters, 
                `Mean Ct`, `Number of Replicates`, `Mean Copy Number`) %>%
  distinct() %>%

  ## adding new SampleID back in
  unite(Sample_ID, Date, Sample_Location, sep = " ", remove=F)

head(samples_df)
## # A tibble: 6 × 8
##   Sample_ID    Date                Sample_Location Sample_Type Number_of_Filters
##   <chr>        <dttm>              <chr>           <chr>       <lgl>            
## 1 2024-01-17 … 2024-01-17 00:00:00 DI Blank #1     Blank       NA               
## 2 2024-01-17 … 2024-01-17 00:00:00 DI Blank #2     Blank       NA               
## 3 2024-01-17 … 2024-01-17 00:00:00 GBP_OCA         Field       NA               
## 4 2024-01-17 … 2024-01-17 00:00:00 IPC_GCO         Field       NA               
## 5 2024-01-17 … 2024-01-17 00:00:00 IPC_BBC         Field       NA               
## 6 2024-01-17 … 2024-01-17 00:00:00 NAN_EDI         Field       NA               
## # ℹ 3 more variables: `Mean Ct` <dbl>, `Number of Replicates` <int>,
## #   `Mean Copy Number` <dbl>
## Confirm that the number of samples output from qPCR matches the number of samples in the metadata (output = TRUE)
nrow(samples_df) == nrow(meta %>% dplyr::select(-Sample_ID) %>% distinct())
## [1] TRUE

Normalizing data

## normalize data 
normalized_df <- samples_df %>%
  ## take log10 of copy number 
  mutate(`Mean Copy Number Normalized` = log10(`Mean Copy Number` + 1))

## upper outlier cutoff (Tukey fence: Q3 + 1.5 * IQR)
cutoff <- quantile(normalized_df$`Mean Copy Number Normalized`, na.rm = TRUE, probs=0.75) + 
  1.5*IQR(normalized_df$`Mean Copy Number Normalized`, na.rm=TRUE)

## lower outlier cutoff (Tukey fence: Q1 - 1.5 * IQR)
cutoff_below <- quantile(normalized_df$`Mean Copy Number Normalized`, na.rm = TRUE, probs=0.25) - 
  1.5*IQR(normalized_df$`Mean Copy Number Normalized`, na.rm=TRUE)

## Output the values that fall outside the cutoffs
normalized_df %>% filter(`Mean Copy Number Normalized` > cutoff) 
## # A tibble: 3 × 9
##   Sample_ID    Date                Sample_Location Sample_Type Number_of_Filters
##   <chr>        <dttm>              <chr>           <chr>       <lgl>            
## 1 2024-01-18 … 2024-01-18 00:00:00 POP_SEC         Field       NA               
## 2 2024-01-18 … 2024-01-18 00:00:00 POP_OUT         Field       NA               
## 3 2024-01-18 … 2024-01-18 00:00:00 PJP_CFL         Field       NA               
## # ℹ 4 more variables: `Mean Ct` <dbl>, `Number of Replicates` <int>,
## #   `Mean Copy Number` <dbl>, `Mean Copy Number Normalized` <dbl>
normalized_df %>% filter(`Mean Copy Number Normalized` < cutoff_below) 
## # A tibble: 0 × 9
## # ℹ 9 variables: Sample_ID <chr>, Date <dttm>, Sample_Location <chr>,
## #   Sample_Type <chr>, Number_of_Filters <lgl>, Mean Ct <dbl>,
## #   Number of Replicates <int>, Mean Copy Number <dbl>,
## #   Mean Copy Number Normalized <dbl>
## we move forward with the upper cutoff, but confirm that no values fall below the lower cutoff 

Adding present/absent information

normalized_df <- normalized_df %>%
  mutate(Detection = case_when(
    is.na(`Mean Copy Number Normalized`) ~ "Absent",
    TRUE ~ "Present"
  ))

Blank-specific information

blank_df <- normalized_df %>% subset(Sample_Type == "Blank")

detection_counts <- blank_df %>%
  count(Detection) %>%
  mutate(percentage = n / sum(n) * 100,
         label = paste0(Detection, "\n", "n=", n, "\n", round(percentage, 1), "%"))

ggplot(detection_counts, aes(x = "", y = n, fill = Detection)) +
  geom_bar(stat = "identity", width = 1, color = "black", alpha=0.75) +
  coord_polar("y", start = 0) +
  geom_text(aes(label = label), 
            position = position_stack(vjust = 0.5), 
            size = 5,
            fontface = "bold") + 
  theme_void() +
  theme(legend.position = "none",
        #plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
        ) +
  scale_fill_manual(values = c("#e9ecef", "#669bbc"))

ggsave("example output/Blanks_piechart.png", width=4, height=4)

Sample preliminary data

Pie chart

fieldsamples_df <- normalized_df %>% subset(Sample_Type == "Field")

field_detection_counts <- fieldsamples_df %>%
  count(Detection) %>%
  mutate(percentage = n / sum(n) * 100,
         label = paste0(Detection, "\n", "n=", n, "\n", round(percentage, 1), "%"))

ggplot(field_detection_counts, aes(x = "", y = n, fill = Detection)) +
  geom_bar(stat = "identity", width = 1, color = "black", alpha=0.75) +
  coord_polar("y", start = 0) +
  geom_text(aes(label = label), 
            position = position_stack(vjust = 0.5), 
            size = 5,
            fontface = "bold") + 
  theme_void() +
  theme(legend.position = "none",
        #plot.title = element_text(hjust = 0.5, size = 16, face = "bold")
        ) +
  scale_fill_manual(values = c("#e9ecef", "#669bbc"))

ggsave("example output/Fields_piechart.png", width=4, height=4)

Bar chart

fieldsamples_df %>% subset(Detection == "Present") %>%

  ggplot(., aes(x=Sample_ID, y=`Mean Copy Number Normalized`)) + 
  geom_bar(stat = "identity", width = 0.7, alpha=0.5, fill = "#669bbc") +
  labs(
    y="Log10-Normalized Copy Number",
    x = "Sample ID"
  ) +

  ## Outlier line
  geom_hline(yintercept = cutoff, color = "darkred", linetype = "dashed", linewidth = 0.5) +
  annotate("text", y = cutoff, x = 2, label = "Outlier", hjust = 0.4, vjust = -0.5, color = "grey40", size = 3) +

  ## LOD line
  geom_hline(yintercept = log10(LOD), linetype = "dashed", color = "grey40") +
  annotate("text", y = log10(LOD), x = 2, label = "LOD", hjust = 0.5, vjust = -0.5, color = "grey40", size = 3) +

  ## LOQ line
  geom_hline(yintercept = log10(LOQ), linetype = "dashed", color = "grey40") +
  annotate("text", y = log10(LOQ), x = 2, label = "LOQ", hjust = 0.5, vjust = -0.5, color = "grey40", size = 3) +

  theme_bw() +
    ## theme variables
    theme(panel.background=element_rect(fill='white', colour='black'),
        #legend.position = c(0.98, 0.98),  # This puts the legend in the top right corner
        legend.position = "right",
        legend.justification = c(1, 1),  # This aligns the legend to the top right
        axis.text.y = element_text(size=8, color="grey20"),
        axis.text.x = element_text(size=6, color="grey20", angle=90, vjust=0.5, hjust=1),
        axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0), size=10, face="bold"),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), size=10, face="bold"),
        # New theme elements for strip appearance
        strip.background = element_rect(fill = "white", color = "black"),
        strip.text = element_text(face = "bold", size = 9)
    )

ggsave("example output/Fieldsamples_barchart.png", width = 9.5, height=5)

Show distance of outliers

fieldsamples_df %>% ggplot(., aes(x=Sample_Type, y=`Mean Copy Number`, 
                                  fill = ifelse(is.na(`Mean Copy Number Normalized`) | is.na(cutoff), FALSE, 
                                          `Mean Copy Number Normalized` > cutoff))) + 
  geom_jitter(width=0.2, size=3.5, color='black', shape=21, alpha=0.5) +
  scale_fill_manual(values = c("#545E56", "#c1121f"),
                    labels = c("Normal", "Outlier")) +
  theme_bw() +
  #geom_hline(yintercept = cutoff, linetype = "dotted", color = "grey50") +
  labs(x = "Sample",
       fill = "Outlier Detection"
       ) +
      theme(panel.background=element_rect(fill='white', colour='black'),
        legend.position = "right",
        axis.text.y = element_text(size=8, color="grey20"),
        axis.text.x = element_blank(),
        axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0), size=10, face="bold"),
        axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), size=10, face="bold"),
    ) +
    guides(fill = guide_legend(override.aes = list(alpha = 1)))
## Warning: Removed 82 rows containing missing values or values outside the scale range
## (`geom_point()`).

ggsave("example output/Outliers.png", width = 5, height=4.5)
## Warning: Removed 82 rows containing missing values or values outside the scale range
## (`geom_point()`).

Exporting data

blank_df %>% mutate(Date = as.Date(Date)) %>% 
  dplyr::select(Date, Sample_Location, `Mean Ct`, `Number of Replicates`, 
                `Mean Copy Number`, `Mean Copy Number Normalized`, Detection) %>%

  unite(Sample, Date, Sample_Location, sep = " ", remove = TRUE) %>%

  # round numeric columns to 2 decimal places
  mutate(across(where(is.numeric), ~round(., 2))) %>%

  write_xlsx("example output/Results_Blanks.xlsx")

### adding some meta for Rich (RI Striped Bass specific; the join below is currently commented out)
rich_meta <- read_xlsx("C:/BoxDrive/Box/Science/Fisheries/Projects/eDNA/RI Striped Bass/metadata/eDNA_Data_RI_STB_2024.xlsx") %>%
  dplyr::rename(Date = Sample_Date, Sample_Location = Station_Code) %>%
  mutate(Sample_Time = format(Sample_Time, format = "%H:%M:%S"))


normalized_df %>% #full_join(rich_meta, ., by = c("Date", "Sample_Location")) %>%
  mutate(Date = as.Date(Date)) %>%
  dplyr::select(-Number_of_Filters) %>% write_xlsx("example output/Results.xlsx")