Detecting outliers based on raw read counts from fastq files
If paths are not working, select Knit > Project Directory. Ignore this comment if you’re not using an R project.
Load libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse) # for data transformation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ readr 2.1.5
## ✔ ggplot2 4.0.0 ✔ stringr 1.6.0
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.2.0 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2) # for plotting
library(readxl) ## for reading in excel files
library(ggrepel)
library(writexl)
library(vegan) ## rareify
## Loading required package: permute
library(strex)
# removing scientific notation
## remove line or comment out if not desired
options(scipen=999)
set.seed(123)
Load read count data
## If loading from Basespace
basespace_data <- read.csv("docs/eDNA 12S metab/example_input/TID12SRun.demultiplex_stats.csv") %>%
subset(!Sample.ID=="Undetermined") %>%
mutate(`GMGI Sample ID` = str_before_nth(Sample.ID, "-12S", 1))
basespace_data$Reads <- as.numeric(gsub(",", "", basespace_data$Reads))
## If loading from MultiQC
multiqc_data <- read_csv("docs/eDNA 12S metab/example_input/general_stats_table.csv", trim_ws=TRUE) %>%
## creating new column called Reads by multiplying M Seqs column by 1,000,000
mutate(Reads = `M Seqs`*1000000)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 92 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Sample
## dbl (2): % Dups, % GC
## num (1): M Seqs
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
multiqc_summary <- multiqc_data %>%
## cleaning sample ID to remove R1 and R2 information
mutate(`GMGI Sample ID` = str_before_nth(Sample, "-12S", 1)) %>%
dplyr::select(`GMGI Sample ID`, Reads) %>% distinct()
Load metadata
meta <- read_xlsx("docs/eDNA 12S metab/example_input/metadata_tidal_cycle.xlsx")
data <- basespace_data %>%
## joining meta information to export outliers as the
left_join(., meta %>% dplyr::select(`GMGI Sample ID`, `Project Sample ID`), by = "GMGI Sample ID")
Calculate outlier
Identify outliers based on MAD calculation
outliers <- data %>% dplyr::select(`GMGI Sample ID`, `Project Sample ID`, Reads) %>%
mutate(
Median = median(Reads),
MAD = mad(Reads, constant=1),
Mi = 0.6745*(abs(Reads - Median)/MAD),
Outlier = ifelse(Mi > 3.5, "Outlier", ""),
Upper_threshold = (3.5*MAD+0.6745*Median)/0.6745,
Lower_threshold =-1*(3.5*MAD-0.6745*Median)/0.6745
) %>%
arrange(desc(Reads))
Identify high and low thresholds
## Print high and low thresholds for report
## If low threshold is <0, then just put 0 in the report
unique(outliers$Upper_threshold)
## [1] 101586.7
unique(outliers$Lower_threshold)
## [1] -5950.718
## Samples above outlier
outliers %>% filter(Reads > Upper_threshold)
## GMGI Sample ID Project Sample ID Reads Median MAD Mi Outlier
## 1 TDL-FS-15 9-16-2022_CP_11 106924 47818 10362 3.847423 Outlier
## Upper_threshold Lower_threshold
## 1 101586.7 -5950.718
### mean
mean(data$Reads)
## [1] 51865.63
### median (used for sub-sampling)
median(data$Reads)
## [1] 47818
Export outliers spreadsheet
outliers %>%
### for client projects, remove `GMGI Sample ID`
dplyr::select(`GMGI Sample ID`, `Project Sample ID`, Reads, Outlier) %>%
write_xlsx("docs/eDNA 12S metab/example_output/Outliers.xlsx")
Plot
You can adjust the GMGI or Project Sample ID as needed. Use the Project Sample ID for client reports.
outliers %>%
mutate(Outlier = ifelse(Outlier == "Outlier", "Outlier", "NotOutlier")) %>%
ggplot(., aes(x=`GMGI Sample ID`, y=Reads)) +
geom_point(aes(fill = Outlier), shape = 21, size = 3) +
scale_fill_manual(
values = c("Outlier" = "darkred", "NotOutlier" = "lightblue"),
guide = "none" # remove legend if not needed
) +
geom_text(
data = ~ subset(.x, Reads > unique(outliers$Upper_threshold)),
aes(label = `Project Sample ID`),
vjust = 2.5,
size = 3.5
) +
geom_hline(yintercept = unique(outliers$Upper_threshold), linetype = "dashed", linewidth = 0.7, color = "grey70") +
theme_bw() +
theme(
axis.text.x = element_text(angle=90, vjust=0.5, hjust=1),
axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0), size=12, face="bold"),
axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), size=12, face="bold")
)

ggsave("docs/eDNA 12S metab/example_output/figures/Outliers.png", width=7, height=4)
Plot for report
outliers_plot <- outliers %>%
mutate(
Outlier = ifelse(Outlier == "Outlier", "Outlier", "NotOutlier"),
x = "x"
)
outliers_plot %>%
ggplot(., aes(x = x, y = Reads)) +
geom_boxplot(outlier.shape = NA, fill = NA, color = "#264653", width=0.75) +
geom_jitter(
width = 0.15,
shape = 21,
alpha = 0.4,
size = 2.5,
color = "black",
aes(fill = Outlier)
) +
geom_hline(
yintercept = unique(outliers_plot$Upper_threshold),
color = "grey60",
linetype = "dotted",
size=0.75
) +
ggrepel::geom_text_repel(
data = subset(outliers_plot, Outlier == "Outlier"),
aes(label = `Project Sample ID`),
color = "black",
size = 4,
nudge_y = 0.1 * max(outliers_plot$Reads),
direction = "y",
segment.colour = NA,
point.padding = unit(0.7, "lines"),
box.padding = unit(0.75, "lines")
) +
scale_fill_manual(
values = c("Outlier" = "darkred", "NotOutlier" = "#264653"),
guide = "none"
) +
theme_bw() +
labs(y = "Number of reads",
x = "") +
theme(
axis.text.x = element_blank(),
axis.text.y = element_text(size = 10, color = "black"),
axis.title.y = element_text(margin = margin(r = 10), size = 11, face = "bold"),
legend.position = "none"
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggsave("docs/eDNA 12S metab/example_output/figures/SampleReport_Outliers.png", width = 4, height=5)