Detecting outliers based on raw read counts from fastq files

If paths are not working, select Knit > Project Directory. Ignore this comment if you’re not using an R project.

Load libraries

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyverse) # for data transformation

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ readr     2.1.5
## ✔ ggplot2   4.0.0     ✔ stringr   1.6.0
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.2.0     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2) # for plotting
library(readxl) ## for reading in excel files
library(ggrepel)
library(writexl)
library(vegan) ## rareify

## Loading required package: permute

library(strex)

# removing scientific notation
## remove line or comment out if not desired 
options(scipen=999)
set.seed(123)

Load read count data

## If loading from Basespace
basespace_data <- read.csv("docs/eDNA 12S metab/example_input/TID12SRun.demultiplex_stats.csv") %>%
  subset(!Sample.ID=="Undetermined") %>%
  mutate(`GMGI Sample ID` = str_before_nth(Sample.ID, "-12S", 1))

basespace_data$Reads <- as.numeric(gsub(",", "", basespace_data$Reads))

## If loading from MultiQC
multiqc_data <- read_csv("docs/eDNA 12S metab/example_input/general_stats_table.csv", trim_ws=TRUE) %>%

  ## creating new column called Reads by multiplying M Seqs column by 1,000,000
  mutate(Reads = `M Seqs`*1000000)

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 92 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Sample
## dbl (2): % Dups, % GC
## num (1): M Seqs
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

multiqc_summary <- multiqc_data %>%

  ## cleaning sample ID to remove R1 and R2 information
  mutate(`GMGI Sample ID` = str_before_nth(Sample, "-12S", 1)) %>%
  dplyr::select(`GMGI Sample ID`, Reads) %>% distinct()

Load metadata

meta <- read_xlsx("docs/eDNA 12S metab/example_input/metadata_tidal_cycle.xlsx") 

data <- basespace_data %>%
  ## joining meta information to export outliers as the 
  left_join(., meta %>% dplyr::select(`GMGI Sample ID`, `Project Sample ID`), by = "GMGI Sample ID")

Calculate outlier

Identify outliers based on MAD calculation

outliers <- data %>% dplyr::select(`GMGI Sample ID`, `Project Sample ID`, Reads) %>%
  mutate(
    Median = median(Reads),
    MAD = mad(Reads, constant=1),

    Mi = 0.6745*(abs(Reads - Median)/MAD),
    Outlier = ifelse(Mi > 3.5, "Outlier", ""),


    Upper_threshold = (3.5*MAD+0.6745*Median)/0.6745,
    Lower_threshold =-1*(3.5*MAD-0.6745*Median)/0.6745
  ) %>%

  arrange(desc(Reads))

Identify high and low thresholds

## Print high and low thresholds for report 
## If low threshold is <0, then just put 0 in the report 
unique(outliers$Upper_threshold)

## [1] 101586.7

unique(outliers$Lower_threshold)

## [1] -5950.718

## Samples above outlier 
outliers %>% filter(Reads > Upper_threshold)

##   GMGI Sample ID Project Sample ID  Reads Median   MAD       Mi Outlier
## 1      TDL-FS-15   9-16-2022_CP_11 106924  47818 10362 3.847423 Outlier
##   Upper_threshold Lower_threshold
## 1        101586.7       -5950.718

### mean
mean(data$Reads)

## [1] 51865.63

### median (used for sub-sampling)
median(data$Reads)

## [1] 47818

Export outliers spreadsheet

outliers %>% 

  ### for client projects, remove `GMGI Sample ID`
  dplyr::select(`GMGI Sample ID`, `Project Sample ID`, Reads, Outlier) %>%

  write_xlsx("docs/eDNA 12S metab/example_output/Outliers.xlsx")

Plot

You can adjust the GMGI or Project Sample ID as needed. Use the Project Sample ID for client reports.

outliers %>% 
  mutate(Outlier = ifelse(Outlier == "Outlier", "Outlier", "NotOutlier")) %>%

  ggplot(., aes(x=`GMGI Sample ID`, y=Reads)) + 

  geom_point(aes(fill = Outlier), shape = 21, size = 3) +
  scale_fill_manual(
    values = c("Outlier" = "darkred", "NotOutlier" = "lightblue"),
    guide = "none"     # remove legend if not needed
  ) +

  geom_text(
    data = ~ subset(.x, Reads > unique(outliers$Upper_threshold)),
    aes(label = `Project Sample ID`),
    vjust = 2.5,
    size = 3.5
  ) +

  geom_hline(yintercept = unique(outliers$Upper_threshold), linetype = "dashed", linewidth = 0.7, color = "grey70") +

  theme_bw() +
  theme(
    axis.text.x = element_text(angle=90, vjust=0.5, hjust=1),
    axis.title.y = element_text(margin = margin(t = 0, r = 10, b = 0, l = 0), size=12, face="bold"),
    axis.title.x = element_text(margin = margin(t = 10, r = 0, b = 0, l = 0), size=12, face="bold")
  )

ggsave("docs/eDNA 12S metab/example_output/figures/Outliers.png", width=7, height=4)

Plot for report

outliers_plot <- outliers %>%
  mutate(
    Outlier = ifelse(Outlier == "Outlier", "Outlier", "NotOutlier"),
    x = "x"
  ) 

outliers_plot %>%
  ggplot(., aes(x = x, y = Reads)) +
  geom_boxplot(outlier.shape = NA, fill = NA, color = "#264653", width=0.75) +
  geom_jitter(
    width = 0.15,
    shape = 21,
    alpha = 0.4,
    size = 2.5,
    color = "black",
    aes(fill = Outlier)
  ) +
  geom_hline(
    yintercept = unique(outliers_plot$Upper_threshold),
    color = "grey60",
    linetype = "dotted",
    size=0.75
  ) +

  ggrepel::geom_text_repel(
    data = subset(outliers_plot, Outlier == "Outlier"),
    aes(label = `Project Sample ID`),
    color = "black",
    size = 4,
    nudge_y = 0.1 * max(outliers_plot$Reads),
    direction = "y",
    segment.colour = NA,
    point.padding = unit(0.7, "lines"),
    box.padding = unit(0.75, "lines")
  ) +

  scale_fill_manual(
    values = c("Outlier" = "darkred", "NotOutlier" = "#264653"),
    guide = "none"
  ) +

  theme_bw() +
  labs(y = "Number of reads",
       x = "") +

  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_text(size = 10, color = "black"),
    axis.title.y = element_text(margin = margin(r = 10), size = 11, face = "bold"),
    legend.position = "none"
  )

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggsave("docs/eDNA 12S metab/example_output/figures/SampleReport_Outliers.png", width = 4, height=5)