Metabarcoding data quality: eDNA metabarcoding base script
.Rmd script
This script evaluates your sequence quality and taxonomic assignment quality. Figures produced in this script can go into supplemental data for a manuscript.
Load libraries
library(dplyr) # for data transformation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse) # for data transformation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ ggplot2 3.5.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2) # for plotting
library(readxl) ## for reading in excel files
library(viridis)
## Loading required package: viridisLite
library(hrbrthemes)
library(ggrepel)
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(Rmisc)
## Loading required package: lattice
## Loading required package: plyr
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
##
## The following object is masked from 'package:purrr':
##
## compact
##
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
library(writexl)
# removing scientific notation
## remove line or comment out if not desired
options(scipen=999)
Load data
### User edits:
### 1. Replace the 3 paths below: edit example_input to your project specific path
### 2. Confirm your sampleIDs match between metadata, results df, and filtering stats output
filtering_stats <- read_tsv("example_input/overall_summary.tsv", show_col_types = FALSE) %>% dplyr::rename(sampleID = sample)
meta <- read_excel("example_input/metadata.xlsx")
results <- read_xlsx("example_output/Results_rawreads_long_format.xlsx") %>%
## calculate sum of reads
group_by(sampleID) %>%
dplyr::mutate(`Number of reads` = sum(reads)) %>%
## calculate relative abundance
group_by(sampleID, Species_name) %>%
dplyr::mutate(`Relative Abundance` = reads/`Number of reads`) %>%
## factor the Category list
dplyr::mutate(Category = factor(Category, levels = c("Human", "Livestock", "Other", "Unassigned", "Bird",
"Sea Turtle", "Elasmobranch", "Marine Mammal", "Teleost Fish")))
ASV_breakdown <- read_xlsx("example_output/ASV_breakdown.xlsx") %>%
## factor the Category list
dplyr::mutate(Category = factor(Category, levels = c("Human", "Livestock", "Other", "Unassigned", "Bird",
"Sea Turtle", "Elasmobranch", "Marine Mammal", "Teleost Fish")))
Sequence data
Data Transformation
No user edits.
df <- full_join(filtering_stats, meta, by = "sampleID") %>%
# filtering out columns we don't need
dplyr::select(-cutadapt_reverse_complemented) %>%
# removing percentage icon from cutadapt_passing_filters_percent
dplyr::mutate(cutadapt_passing_filters_percent = gsub("%", "", cutadapt_passing_filters_percent)) %>%
# confirming that all columns of interest are numerical
dplyr::mutate_at(vars(2:10), as.numeric) %>%
# data transformation so all columns of interest are together
gather("measure", "value", 2:10)
Plotting
Suggested webpage to choose colors: https://coolors.co/
### User edits:
### 1. Change paths of output to desired folder (data/figures is suggested data structure)
### 2. Change x axis and color, size based on metadata desired
### 3. Change custom colors and sizes if desired and number of colors and sizes based on metadata variable chosen
df %>%
filter(!is.na(Month)) %>%
## USER EDITS IN LINE BELOW
ggplot(., aes(x=Month, y=value)) +
## adding points in jitter format
geom_jitter(width=0.15, alpha=0.5, fill="#0077b6", color='black', size=1, shape=21) +
## option for additional boxplots if desired (uncomment to add)
#geom_boxplot() +
## using facet_wrap to create grid based on variables and factor() to order them in custom format
facet_wrap(~factor(measure, levels=c('cutadapt_total_processed', 'cutadapt_passing_filters',
'cutadapt_passing_filters_percent', 'DADA2_input',
'filtered', 'denoisedF', 'denoisedR', 'merged', 'nonchim')), scales = "free") +
## graph asthetics
theme_bw() +
ylab("Number of reads") +
theme(panel.background=element_rect(fill='white', colour='black'),
strip.background=element_rect(fill='white', colour='black'),
strip.text = element_text(size = 10, face="bold"),
legend.position = "right",
axis.text.y = element_text(size=7, color="grey30"),
axis.text.x = element_text(angle=45, hjust=1),
axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0), size=11, face="bold"),
axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), size=11, face="bold"))
## Warning: Removed 45 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggsave("example_output/Figures/SampleReport_FilteringStats.png", width = 11, height=9)
## Warning: Removed 45 rows containing missing values or values outside the scale range
## (`geom_point()`).
Determine sequencing outliers
Determine outlier
## Subset to start and final read counts
outliers <- df %>% subset(measure == "nonchim") %>% spread(measure, value)
## Calculate mean, standard deviation, and standard error
outlier_summary <- summarySE(outliers, measurevar = c("nonchim"))
## Determine cutoffs based on 2 standard deviations away from the mean
cutoff_low <- outlier_summary$nonchim - (outlier_summary$sd*2)
cutoff_high <- outlier_summary$nonchim + (outlier_summary$sd*2)
## Print samples that will be labeled as outliers
outliers %>% filter(nonchim < cutoff_low)
## # A tibble: 0 × 11
## # ℹ 11 variables: sampleID <chr>, Month <chr>, Site <chr>, Depth <chr>,
## # SampleType <chr>, Lease_area <chr>, Latitude <dbl>, Longitude <dbl>,
## # Sequencing Round 1 <chr>, SampleDate <dttm>, nonchim <dbl>
outliers %>% filter(nonchim > cutoff_high)
## # A tibble: 0 × 11
## # ℹ 11 variables: sampleID <chr>, Month <chr>, Site <chr>, Depth <chr>,
## # SampleType <chr>, Lease_area <chr>, Latitude <dbl>, Longitude <dbl>,
## # Sequencing Round 1 <chr>, SampleDate <dttm>, nonchim <dbl>
Export table with outlier information
### Create table of outliers and export
outlier_export <- outliers %>%
## Rename nonchim column
dplyr::rename(`Number of Filtered Reads`= nonchim) %>%
## Create lower threshold, upper threshold, and sequencing outlier columns
dplyr::mutate(`Lower Threshold` = cutoff_low,
`Upper Threshold` = cutoff_high) %>%
dplyr::mutate(`Sequencing Outlier` = case_when(
`Number of Filtered Reads` > cutoff_high ~ "Outlier",
`Number of Filtered Reads` < cutoff_low ~ "Outlier",
TRUE ~ NA
))
## Export dataframe made
outlier_export %>% write_xlsx("example_output/Results_Outliers.xlsx")
Condensed plot used in contract reporting.
### User edits:
### 1. Change paths of output to desired folder (data/figures is suggested data structure)
### 2. Change x axis and color, size based on metadata desired
### 3. Change custom colors and sizes if desired and number of colors and sizes based on metadata variable chosen
df %>%
subset(measure == "cutadapt_total_processed" | measure == "nonchim") %>%
## USER EDITS IN LINE BELOW
ggplot(., aes(x=measure, y=value)) +
geom_hline(yintercept = cutoff_low, color = "grey80", linetype = "dashed") +
## adding points in jitter format
geom_boxplot(outlier.shape = NA, fill=NA, aes(color = measure)) +
geom_jitter(width=0.15, shape=21, alpha=0.25, size=1.5, color = 'black', aes(fill = measure)) +
## graph asthetics
theme_bw() +
labs(
y="Number of reads",
x="Bioinformatic Step"
) +
## USER EDITS IN MANUAL CODE BELOW
# scale_color_manual(values = c("red3", "lightblue", "purple2", "gold", "green4", "black")) +
# scale_size_manual(values = c(21,17)) +
scale_fill_manual(values = c("#264653", "#2a9d8f")) +
scale_color_manual(values = c("#264653", "#2a9d8f")) +
scale_x_discrete(labels = c("cutadapt_total_processed" = "Start",
"nonchim" = "Final")) +
theme(panel.background=element_rect(fill='white', colour='black'),
strip.background=element_rect(fill='white', colour='black'),
strip.text = element_text(size = 10, face="bold"),
legend.position = "none",
axis.text.y = element_text(size=10, color="black"),
axis.text.x = element_text(size=10, color="black"),
axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0), size=11, face="bold"),
axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), size=11, face="bold"))
## Warning: Removed 10 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_hline()`).
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
ggsave("example_output/SampleReport_FilteringStats_Condensed.png", width = 4, height=4)
## Warning: Removed 10 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_hline()`).
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_point()`).
Plot taxonomy
Data transformation
No user edits.
order_vector <- c("Teleost Fish", "Marine Mammal", "Elasmobranch", "Sea Turtle", "Bird",
"Livestock", "Other", "Human", "Unassigned") # Replace with your categories
results_summary <- results %>%
group_by(Category) %>%
reframe(sum_reads = sum(reads))
general_stats <- results %>%
group_by(Category) %>%
reframe(sum_reads = sum(reads)) %>%
dplyr::mutate(total = sum(sum_reads),
percent = sum_reads/total*100) %>% dplyr::select(Category, percent) %>% distinct() %>%
## round to 2 decimal places
dplyr::mutate(across(c('percent'), round, 4))
## Warning: There was 1 warning in `dplyr::mutate()`.
## ℹ In argument: `across(c("percent"), round, 4)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
ASV_summary <- ASV_breakdown %>%
group_by(Category) %>%
reframe(count = n_distinct(ASV_ID))
species_summary <- results %>%
group_by(Category) %>%
reframe(count = n_distinct(Species_name))
Graph color and order options
# fill_colors <- c("Human" = "#e76f51",
# "Livestock" = "#FF740A",
# "Other" = "#FE9E20",
# "Unassigned" = "#FFC571",
# "Bird" = "#FFECC2",
# "Sea Turtle" = "#C8D2B1",
# "Elasmobranch" = "#DCF1F9",
# "Marine Mammal" ="#8CD0EC",
# "Teleost Fish" = "#3FB1DE"
# )
fill_colors <- c("Human" = "#FE9E20",
"Livestock" = "#FCCA46",
"Other" = "#FFECC2",
"Unassigned" = "grey85",
"Bird" = "#C8D2B1",
"Sea Turtle" = "#A1C181",
"Elasmobranch" = "#97ADCB",
"Marine Mammal" ="#DCF1F9",
"Teleost Fish" = "#85B6CB"
)
Relative abundance by category
results %>% group_by(sampleID, Category) %>%
reframe(group_sum = sum(reads),
group_relab = group_sum/`Number of reads`
) %>% distinct() %>%
ggplot(., aes(y=group_relab, x=Category)) +
geom_boxplot(outlier.shape = NA, aes(color=Category), fill=NA) +
geom_jitter(aes(fill=Category), width=0.2, shape=21, color='black', size = 0.75, alpha=0.35) +
scale_color_manual(values = fill_colors) +
scale_fill_manual(values = fill_colors) +
labs(color = "Category") +
scale_x_discrete(labels = scales::label_wrap(10), limits = order_vector) +
theme_bw() +
xlab("Category") + ylab("Relative abundance") +
theme(panel.background=element_rect(fill='white', colour='black'),
legend.position = "none",
axis.text.y = element_text(size=7, color="grey30"),
axis.text.x = element_text(size=7), # angle=45, hjust=1
axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0), size=11, face="bold"),
axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), size=11, face="bold"))
ggsave("example_output/Figures/Categories_relative_abundance.png", width = 6.5, height = 4)
Human specific reads by Sample Type
results %>% group_by(sampleID, Category) %>%
reframe(group_sum = sum(reads),
group_relab = group_sum/`Number of reads`
) %>% distinct() %>%
## subset to include Human and metadata
subset(Category == "Human") %>%
left_join(., meta, by = "sampleID") %>%
ggplot(., aes(x=SampleType, y=group_relab)) +
geom_boxplot(outlier.shape=NA, fill=NA, color = "#e76f51") +
geom_jitter(fill="#e76f51", shape=21, alpha=0.5, width=0.2) +
labs(
x = "Sample Type",
y = "Relative Abundance"
) +
theme_bw() +
theme(panel.background=element_rect(fill='white', colour='black'),
legend.position = "none",
axis.text.y = element_text(size=7, color="grey30"),
axis.text.x = element_text(size=7),
axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0), size=11, face="bold"),
axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), size=11, face="bold"))
ggsave("example_output/Figures/Human.png", width=4, height=3)
Piechart
Reads Piechart. Used in contract report.
### User edits:
### 1. Change paths of output to desired folder (data/figures is suggested data structure)
### 2. Change scale brewer color if desired
piechart <- general_stats %>%
dplyr::mutate(csum = rev(cumsum(rev(percent))),
pos = percent/2 + lead(csum, 1),
pos = if_else(is.na(pos), percent/2, pos)) %>%
dplyr::mutate_if(is.numeric, round, digits = 4)
unassigned_percent <- piechart %>%
filter(Category == "Unassigned") %>%
pull(percent)
piechart_reads <- general_stats %>%
ggplot(., aes(x="", y = percent, fill = Category)) +
geom_col(color = "black", width=1.25) +
geom_label_repel(data = piechart,
aes(y = pos, label = paste0(percent, "%")),
size = 3, nudge_x = 1, show.legend = FALSE) +
coord_polar(theta = "y") +
scale_fill_manual(values = fill_colors) +
theme_bw() +
# labs(
# x = NULL, y = NULL, fill = "Category",
# caption = paste0("Unassigned reads = ", unassigned_percent, "%")
# ) +
theme(
plot.background = element_rect(fill = "white", colour = NA),
plot.margin = margin(t = 5, r = 5, b = 5, l = 5, unit = "pt"),
legend.position = "none",
panel.border = element_blank(), # Remove panel border
panel.grid = element_blank(), # Remove grid lines
axis.ticks = element_blank(), # Remove axis ticks
axis.text = element_blank(), # Remove axis text
axis.title = element_blank() # Remove axis titles
) +
ggtitle("Raw reads (%)") +
xlab("") + ylab("") + labs(fill = "Category"); piechart_reads
ggsave("example_output/Figures/Category_breakdown_percent_rawreads.png", width=4, height=3)
ASV Piechart (NOT used in contract report)
### User edits:
### 1. Change paths of output to desired folder (data/figures is suggested data structure)
### 2. Change scale brewer color if desired
piechart_ASV <- ASV_summary %>%
dplyr::mutate(csum = rev(cumsum(rev(count))),
pos = count/2 + lead(csum, 1),
pos = if_else(is.na(pos), count/2, pos))
ASV_summary %>%
ggplot(., aes(x="", y = count, fill = Category)) +
geom_col(color = "black", width=1.25) +
geom_label_repel(data = piechart_ASV,
aes(y = pos, label = paste0(count)),
size = 3, nudge_x = 1, show.legend = FALSE) +
coord_polar(theta = "y") +
scale_fill_manual(values = fill_colors) +
theme_bw() +
theme(
plot.background = element_rect(fill = "white", colour = NA),
plot.margin = margin(t = 5, r = 5, b = 5, l = 5, unit = "pt"),
panel.border = element_blank(), # Remove panel border
panel.grid = element_blank(), # Remove grid lines
axis.ticks = element_blank(), # Remove axis ticks
axis.text = element_blank(), # Remove axis text
#legend.position = "none"
axis.title = element_blank() # Remove axis titles
) +
ggtitle("Number of ASVs") +
xlab("") + ylab("") + labs(fill = "Category")
ggsave("example_output/Figures/Category_breakdown_ASVs.png", width=4, height=3)
Number of species pie chart. Used in contract report.
piechart_spp <- species_summary %>%
dplyr::mutate(csum = rev(cumsum(rev(count))),
pos = count/2 + lead(csum, 1),
pos = if_else(is.na(pos), count/2, pos))
piechart_species_plot <- species_summary %>%
ggplot(., aes(x="", y = count, fill = Category)) +
geom_col(color = "black", width=1.25) +
geom_label_repel(data = piechart_spp,
aes(y = pos, label = paste0(count)),
size = 3, nudge_x = 1, show.legend = FALSE) +
coord_polar(theta = "y") +
scale_fill_manual(values = fill_colors) +
theme_bw() +
theme(
plot.background = element_rect(fill = "white", colour = NA),
plot.margin = margin(t = 5, r = 5, b = 5, l = 5, unit = "pt"),
panel.border = element_blank(), # Remove panel border
panel.grid = element_blank(), # Remove grid lines
axis.ticks = element_blank(), # Remove axis ticks
axis.text = element_blank(), # Remove axis text
axis.title = element_blank() # Remove axis titles
) +
ggtitle("Number of Taxonomic Assignments") +
xlab("") + ylab("") + labs(fill = "Category"); piechart_species_plot
ggsave("example_output/Figures/Category_breakdown_species.png", width=4, height=3)
Plot together and export
plot_grid(piechart_reads, piechart_species_plot,
ncol=2,
align = "vh"
)
ggsave("example_output/Figures/Category_breakdown_contract.png", width=14, height=4)