Skip to content

Datatable preparation base script for eDNA metabarcoding

.Rmd script

This script takes your Blast output from the GMGI database, Mitofish database, and NCBI database to create one datatable with read counts and taxonomic assignment.

Workflow summary:
1. Load libraries
2. Load metadata
3. Load BLAST output from GMGI, Mitofish, and NCBI
4. Load DADA2 ASV Table
5. Taxonomic Assignment
- 5a. Identify ASVs with multiple hits from GMGI’s database
- 5b. Identify entries that mismatch between GMGI, Mitofish, and NCBI databases
- 5c. Assign taxonomy based on hierarchical approach
- 5d. Edit taxonomy annotations based on mismatch table choices
- 5e. Adjusting common name and category for those entries that don’t have one (from Mito or NCBI)
6. Filtering: Filter ASV by less than 0.1% reads and then collapse by group
7. Collapsing read counts by species name
8. Creating results output

Load libraries

library(ggplot2) ## for plotting
library(dplyr) ## for data table manipulation
## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr) ## for data table manipulation
library(readr) ## for reading in tsv files
library(readxl) ## for reading in excel files
library(stringr) ## for data transformation
library(strex) ## for data transformation
library(writexl) ## for excel output
library(purrr) ## for data transformation
library(funrar) ## for make_relative()
library(tidyverse) ## for data table manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Biostrings) ## for reading in fasta file 
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## 
## The following objects are masked from 'package:lubridate':
## 
##     second, second<-
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## 
## The following object is masked from 'package:utils':
## 
##     findMatches
## 
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## 
## The following object is masked from 'package:lubridate':
## 
##     %within%
## 
## The following object is masked from 'package:purrr':
## 
##     reduce
## 
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## The following object is masked from 'package:grDevices':
## 
##     windows
## 
## Loading required package: XVector
## 
## Attaching package: 'XVector'
## 
## The following object is masked from 'package:purrr':
## 
##     compact
## 
## Loading required package: GenomeInfoDb
## 
## Attaching package: 'Biostrings'
## 
## The following object is masked from 'package:base':
## 
##     strsplit
library(data.table)  ## for data table manipulation
## 
## Attaching package: 'data.table'
## 
## The following object is masked from 'package:IRanges':
## 
##     shift
## 
## The following objects are masked from 'package:S4Vectors':
## 
##     first, second
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(microDecon) ## for microDecon 

Metadata input

Identify paths for metadata and project data

Each user needs to write in their specific directory outputs prior to the file name. The default working directory is this document so the folder where this script is saved for the user. To change the working directory to the Rproject directory, select ‘Knit’ and ‘Knit Directory’ > ‘Project Directory’.

### User edits:
### 1. change paths of input and output as desired

## GMGI Fish database (curated reference metadata: species, common name, category)
path_GMGIdb <- "C:/BoxDrive/Box/Science/Fisheries/Projects/eDNA/Metabarcoding Lab Resources/Reference Databases/GMGI_Vert_Ref.xlsx"

## BLAST results from the three reference databases
path_blast_gmgi <- "docs/eDNA 12S metab/example_input/blast/BLASTResults_GMGI.txt"
path_blast_mito <- "docs/eDNA 12S metab/example_input/blast/BLASTResults_Mito.txt"
path_blast_ncbi_taxassigned <- "docs/eDNA 12S metab/example_input/blast/NCBI_taxassigned.txt"
path_blast_ncbi <- "docs/eDNA 12S metab/example_input/blast/BLASTResults_NCBI.txt"

## ASV table results
## confirm that the ASV_table.len.tsv name is correct for user's project
path_asv_table <- "docs/eDNA 12S metab/example_input/ASV_table.len.tsv"
path_seq_file <- "docs/eDNA 12S metab/example_input/ASV_seqs.len.fasta"
path_output_summary <- "docs/eDNA 12S metab/example_input/overall_summary.tsv"

Create output files

### TAXONOMIC CHOICES
# output paths for the review spreadsheets the fisheries team fills in by hand
path_choice_required <- "docs/eDNA 12S metab/example_output/taxonomic_assignments/Choice_required_GMGI_multiplehits.xlsx"
path_choice_required_edited <- "docs/eDNA 12S metab/example_output/taxonomic_assignments/Choice_required_GMGI_multiplehits_edited.xlsx"

path_disagree_list <- "docs/eDNA 12S metab/example_output/taxonomic_assignments/Three_DB_comparison_taxonomic_ID.xlsx"
path_disagree_list_edited <- "docs/eDNA 12S metab/example_output/taxonomic_assignments/Three_DB_comparison_taxonomic_ID_edited.xlsx"

path_commonnames_add <- "docs/eDNA 12S metab/example_output/taxonomic_assignments/CommonNames_required.xlsx"
path_commonnames_add_edited <- "docs/eDNA 12S metab/example_output/taxonomic_assignments/CommonNames_required_edited.xlsx"

## filtering reports: reads removed at each filtering step
reads_filtered_percent_out <- "docs/eDNA 12S metab/example_output/Filtered_Percent_removed_reads.xlsx"
reads_filtered10_out <- "docs/eDNA 12S metab/example_output/Filtered_10_removed_reads.xlsx"
reads_decontaminated_out <- "docs/eDNA 12S metab/example_output/Filtered_Decontaminated_removed_reads.RData"

## summary report sheets
Species_breakdown_sheet <- "docs/eDNA 12S metab/example_output/Summary_Species_level.xlsx"
ASV_breakdown_sheet <- "docs/eDNA 12S metab/example_output/Summary_ASV_level.xlsx"

## final results tables
results_filtered_df <- "docs/eDNA 12S metab/example_output/ASV_table_filtered.rds"
results_matrix_df <- "docs/eDNA 12S metab/example_output/Results_matrix.xlsx"
results_relative_df <- "docs/eDNA 12S metab/example_output/Results_matrix_relative.xlsx"

Load project metadata

Metadata specific to each project. This contains information about each sample (e.g., month, site, time, sample type, etc.). Confirm that sample IDs match those used in the ASV_table.len.tsv file.

### User edits:
### 1. change path of metadata file

## EXCEL metadata: one row per sample (month, site, time, sample type, etc.)
meta <- read_excel("docs/eDNA 12S metab/example_input/metadata_tidal_cycle.xlsx") %>%
  ## ampliseq expects "_" while the MiSeq writes "-", so normalize the sample IDs
  mutate(`GMGI Sample ID` = str_replace_all(`GMGI Sample ID`, "-", "_"))

Load database metadata

No user edits in this section because paths have already been set above.

# Load GMGI database information (common name, species name, etc.).
# The Ref column is renamed to "sseqid" to match the BLAST output header,
# and the leading ">" that FASTA headers carry is stripped from each entry.
gmgi_db <- read_xlsx(path_GMGIdb, sheet = 1) %>%
  dplyr::rename(sseqid = Ref) %>%
  mutate(sseqid = str_remove_all(sseqid, ">"))

BLAST data input

No user edits unless user changed blastn parameters from fisheries team default.

## Standard blastn outfmt-6 column names and their classes: the first two
## columns are identifiers (character), the remaining ten are numeric stats.
blast_col_headers <- c("ASV_ID", "sseqid", "pident", "length", "mismatch",
                       "gapopen", "qstart", "qend", "sstart", "send",
                       "evalue", "bitscore")
blast_col_classes <- c(rep("character", 2), rep("numeric", 10))

GMGI database

No user edits.

## Read the GMGI BLAST hits and attach the curated database metadata.
## NOTE(review): the Human de-duplication below relies on the BLAST output
## already being sorted by descending pident within each ASV -- confirm this
## holds for the blastn parameters used.
Blast_GMGI <- read.table(path_blast_gmgi, header=F, col.names = blast_col_headers, colClasses = blast_col_classes) %>%
  ## blast changes spaces to hyphens so we need to change that back to match our metadata
  mutate(sseqid = gsub("-", " ", sseqid)) %>%

  ## join with GMGI database information
  left_join(., gmgi_db, by = "sseqid") %>%

  ## only keep the first human hit if there are multiple (this is already ordered by pident);
  ## row_number() > 1 drops every "Human" hit after the first within each ASV
  group_by(ASV_ID) %>%
  filter(!(str_detect(sseqid, "Human") & row_number() > 1)) %>%
  ungroup()

## Check how many ASVs were identified with the GMGI Database
length(unique(Blast_GMGI$ASV_ID)) 
## [1] 391

Mitofish database

No user edits.

## Read the Mitofish BLAST hits; the sseqid field doubles as the species label.
Blast_Mito <- read.table(path_blast_mito, header=F, col.names = blast_col_headers, colClasses = blast_col_classes) %>%
  # renaming sseqid to species name
  dplyr::rename(Species_name = sseqid) %>%

  # replacing _ with spaces
  mutate(Species_name = gsub("_", " ", Species_name),

         ## strip the "gb|<accession>|" prefix: keep only the text after the
         ## second "|" so Species_name holds just the species label
         Species_name = str_after_nth(Species_name, "\\|", 2)

         )

NCBI database

No user edits.

## Parse the NCBI taxonomy assignments: one lineage string per taxid,
## split into one column per taxonomic rank.
NCBI_taxassigned <- read.delim2(path_blast_ncbi_taxassigned, header=F, col.names = c("staxid", "Phylo")) %>%
  ## creating taxonomic assignment columns (lineage is ";"-separated)
  separate(Phylo, c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species_name"), sep = ";") %>%
  ## creating species column based on Species_name
  ## (text after the first space, i.e. the specific epithet)
  mutate(., species = str_after_nth(Species_name, " ", 1))

## Read the NCBI BLAST hits and attach the parsed taxonomy by NCBI taxid.
Blast_NCBI <- read.table(path_blast_ncbi, header=F,
                           col.names = c("ASV_ID", "sseqid", "sscinames", "staxid", "pident", "length", "mismatch",
                                         "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"),
                           colClasses = c(rep("character", 3), "integer", rep("numeric", 9))) %>%
  left_join(., NCBI_taxassigned, by = "staxid")

Load ASV sequences

No user edits.

### Read in fasta file of ASV sequences
fasta <- readDNAStringSet(path_seq_file)

## Lookup table ASV_ID -> DNA sequence, used to attach sequences to the
## review spreadsheets exported below
fasta_db <- data.table(
  ASV_ID = names(fasta),
  sequence = as.character(fasta)
)

Load DADA2 ASV Table

The column headers will be the Sample IDs and the first column is the ASV ID. ASVs are given a “rank” based on sum of reads from that ASV (pre-filtering). ‘Random’ indicates that if ASVs are tied, then the code will randomly assign a rank for those tied. Because we don’t need an exact rank here, ‘random’ will do for a tie-breaker.

No user edits.

## Read the DADA2 ASV count table (rows = ASVs, columns = samples).
ASV_table <- read_tsv(path_asv_table, show_col_types = FALSE) %>%
  ## calculate the sum of all reads for each ASV
  mutate(., ASV_sum = rowSums(across(where(is.numeric)))) %>% 

  ## calculate a ranking based on those sum calculated above
  ## (ties.method='random' breaks ties arbitrarily, so ranks of tied ASVs
  ## are NOT reproducible between runs -- acceptable per the notes above)
  mutate(ASV_rank = rank(-ASV_sum, ties.method='random')) %>%

  ## move the sum and rank columns to after ASV_ID and arrange by rank
  relocate(c(ASV_sum,ASV_rank), .after = ASV_ID) %>% arrange((ASV_rank)) %>%

  ## editing the rank value to be out of the max #, e.g. "3/422"
  ## (note: ASV_rank becomes character from here on)
  mutate(ASV_rank = paste0(ASV_rank, "/", max(ASV_rank, na.rm = TRUE)))

## creating list of rankings for joining onto the review tables later
ASV_rank_list <- ASV_table %>% dplyr::select(ASV_ID, ASV_sum, ASV_rank)

Taxonomic Assignment

Identifying where NCBI, Mito, and GMGI disagree on tax assignment. With the hierarchical approach, ASVs that match to GMGI and several other databases will only result in GMGI assignment. By reviewing this df, we can be sure we aren’t missing an assignment in our GMGI curated database.

Sub-workflow:
1. Identify any ASVs that contain multiple hits within the GMGI database.
2. Identify entries that mismatch between GMGI, Mitofish, and NCBI databases.
3. Assign taxonomy based on hierarchical approach.
4. Edit taxonomy annotations based on mismatch table.
5. Adjusting common name for those entries that don’t have one (from Mito or NCBI).

Identify any ASVs that contain multiple hits within the GMGI database

At this point, a fisheries team member needs to make choices about which taxonomic assignment to accept.

Create list of those ASVs with multiple hits

No user edits.

## Build the review table of ASVs whose best GMGI hits are tied on pident;
## a fisheries team member resolves these by hand in the exported sheet.
multiple_hit_choice <- Blast_GMGI %>% dplyr::group_by(ASV_ID) %>%
  ## take top percent identity hit, count the number of top hits, and filter to those with more than 1 top hit 
  slice_max(pident, n=1) %>% dplyr::count() %>% filter(n>1) %>%
  dplyr::rename(No_tied_hits=n) %>%

  ## adding BLAST_GMGI information with these ASVs and ASV rank and sum
  left_join(., Blast_GMGI, by = "ASV_ID") %>%
  left_join(., ASV_rank_list, by = "ASV_ID") %>%

  ## adding choice column for next steps (user marks 'x' on the desired hit)
  mutate(Choice = NA) %>%

  ## adding sequencing information 
  left_join(., fasta_db, by = "ASV_ID") %>%

  ## moving database percent ID to be next to Blast percent ID
  ## (db_percent_ID comes from gmgi_db via the earlier join)
  relocate(c(db_percent_ID, ASV_sum, ASV_rank), .after = pident)


## export this data frame as excel sheet for the user to fill in
multiple_hit_choice %>% write_xlsx(path_choice_required)

Based on the output above, user needs to make some choices. In the excel spreadsheet, user needs to mark ‘x’ on the choices desired while leaving the other entries blank.

Choosing one of several hits.

Load choice edited dataset. No user edits.

## Load the user-edited choice sheet; rows without an 'x' in Choice are dropped.
multiple_hit_choice_edited <- read_xlsx(path_choice_required_edited) %>%
  ## keep only the hits the user marked
  drop_na(Choice) %>%
  ## only the key columns are needed downstream
  dplyr::select(ASV_ID, sseqid, Choice)

A for loop will filter Blast_GMGI df based on these choices. No user edits.

# Create the edited df in one vectorized step instead of the original
# per-row loop: for every ASV that appears in the choice table, keep only
# the (ASV_ID, sseqid) pairs the user selected; ASVs without a recorded
# choice pass through untouched. Unlike the loop, this also behaves
# correctly when the user accepted more than one hit for the same ASV
# (the loop's case_when recycled a length->1 vector in that case).
chosen_pairs <- paste(multiple_hit_choice_edited$ASV_ID,
                      multiple_hit_choice_edited$sseqid)

Blast_GMGI_edited <- Blast_GMGI %>%
  filter(!(ASV_ID %in% multiple_hit_choice_edited$ASV_ID) |
           paste(ASV_ID, sseqid) %in% chosen_pairs)

Confirming that all entries have been dealt with

No user edits.

### Check the below output to confirm the filtering steps above worked (if it worked, it won't be in output)
### i.e. no ASV should still have more than one tied top-pident hit
Blast_GMGI_edited %>% dplyr::group_by(ASV_ID) %>% slice_max(pident, n=1) %>% dplyr::count() %>% filter(n>1)
## # A tibble: 0 × 2
## # Groups:   ASV_ID [0]
## # ℹ 2 variables: ASV_ID <chr>, n <int>

Identify entries that mismatch between GMGI, Mitofish, and NCBI databases

Creating a df called “Disagree”. Review the output before moving onto the next section.

No user edits.

## Create GMGI species input: single best GMGI hit per ASV
## (with_ties = FALSE guarantees exactly one row per ASV)
disagree_input_GMGI <- Blast_GMGI_edited %>% 
  dplyr::select(ASV_ID, GMGI_pident = pident, GMGI_db_ID = db_percent_ID, GMGI_Species = Species_name) %>%
  group_by(ASV_ID) %>% slice_max(GMGI_pident, n = 1, with_ties = FALSE) %>% ungroup()

## Create Mitofish input: all hits collapsed into one ";"-separated string per ASV
disagree_input_Mito <- Blast_Mito %>%
  dplyr::select(ASV_ID, Mito_Species = Species_name) %>% distinct() %>%
  dplyr::group_by(ASV_ID) %>% mutate(Mito_Species = paste0(Mito_Species, collapse = ";")) %>% distinct() %>% ungroup()

## Create NCBI input: same collapsing as Mitofish
disagree_input_NCBI <- Blast_NCBI %>% 
  dplyr::select(ASV_ID, NCBI_Species = Species_name) %>% distinct() %>% 
  group_by(ASV_ID) %>% mutate(NCBI_Species = paste0(NCBI_Species, collapse = ";")) %>% distinct() %>% ungroup()

## Combine all three dfs into disagree_df with ASV rank information
## (full_join keeps ASVs that only some databases assigned)
disagree_df <- disagree_input_GMGI %>%
  full_join(disagree_input_Mito, by = "ASV_ID") %>%
  full_join(disagree_input_NCBI, by = "ASV_ID") %>%
  full_join(., ASV_rank_list, by = "ASV_ID") %>%
  mutate(Choice = NA) %>%
  left_join(., fasta_db, by = "ASV_ID") 


## Filtering the disagree_df to only output the entries that disagree with multiple entries
filtered_disagree_df <- disagree_df %>%

  ## rowwise() makes each filter condition evaluate per ASV (per row);
  ## this is why the scalar && and c_across() are valid below
  rowwise() %>%

  ## Filtering out rows that are unassigned across all 3 db 
  filter(! (is.na(GMGI_Species) && is.na(Mito_Species) && is.na(NCBI_Species) )) %>%

  ## Filter out rows that are GMGI assigned and empty from Mito and NCBI
  filter(! (!is.na(GMGI_Species) && is.na(Mito_Species) && is.na(NCBI_Species) )) %>%

  ## Filtering out rows that have the same information across databases (ONLY in rows with a GMGI entry)
  ## n_distinct(..., na.rm = TRUE) > 1 means at least two databases disagree
  filter(ifelse(!is.na(GMGI_Species), 
          n_distinct(c_across(GMGI_Species:NCBI_Species), na.rm = TRUE) > 1, TRUE))

## export this data frame as excel sheet for the user to review and edit
filtered_disagree_df %>% write_xlsx(path_disagree_list)

Assign taxonomy based on hierarchical approach

Taxonomic identification is taken from GMGI 100%, then GMGI <100%, then Mitofish 100%, and finally NCBI 100%.

No user edits.

## Hierarchical taxonomic assignment: GMGI top hit first, then Mitofish,
## then NCBI, else "Unassigned". Assumes the tied-top-hit check above
## returned zero rows; otherwise slice_max() would fan out ASV rows in
## the first join.
ASV_table_taxID <- ASV_table %>% 

  ## 1. Top hit from GMGI's database
  left_join(Blast_GMGI_edited %>%  group_by(ASV_ID) %>%
              slice_max(pident, n = 1) %>%
                            dplyr::select(ASV_ID, Species_name),
            by = join_by(ASV_ID)) %>%

  ## 2. Mitofish database
  ### join df, select ASV_ID and Species_name columns, rename Species_name to Mito, call only distinct rows
  left_join(., Blast_Mito %>% dplyr::select(ASV_ID, Mito = Species_name) %>% distinct() %>%

              ### group by ASV_ID, and collapse all species names separated by ;, then take only distinct rows
              group_by(ASV_ID) %>% mutate(Mito = paste0(Mito, collapse = ";")) %>% distinct(), by = "ASV_ID") %>%

  ### if GMGI annotation is NA, then replace with Mitofish 
  mutate(., Species_name = ifelse(is.na(Species_name), Mito, Species_name)) %>%

  ## 3. NCBI database; same functions as above
  left_join(., Blast_NCBI %>% dplyr::select(ASV_ID, NCBI = Species_name) %>% distinct() %>%
              group_by(ASV_ID) %>% mutate(NCBI = paste0(NCBI, collapse = ";")) %>% distinct(), by = "ASV_ID") %>%
  mutate(., Species_name = ifelse(is.na(Species_name), NCBI, Species_name)) %>%

  ## 4. if Species name is STILL not filled, call it "Unassigned"
  mutate(., Species_name = ifelse(is.na(Species_name), "Unassigned", Species_name)) %>%  

  ## removing the Mito and NCBI helper columns
  dplyr::select(-Mito, -NCBI) %>%

  ## move species name to be after ASV_ID
  relocate(., c(Species_name), .after = ASV_ID)

Edit taxonomy annotations based on mismatch table

Override any annotations with edited taxonomic identification table.
No user edits.

## read in edited df 
taxonomic_choice <- read_xlsx(path_disagree_list_edited) %>%
    ## selecting only columns needed 
  dplyr::select(ASV_ID, Choice)  

## Override Species_name with the user's Choice wherever one was recorded.
## Vectorized replacement of the original per-row loop with two guards:
##  - blank Choice cells are ignored (the loop overwrote Species_name with
##    NA for any ASV listed in the sheet, even without a choice)
##  - duplicated ASV_IDs in the choice table no longer fan out rows or
##    recycle vectors (first occurrence wins)
ASV_table_taxID_edited <- ASV_table_taxID %>%
  left_join(taxonomic_choice %>%
              filter(!is.na(Choice)) %>%
              distinct(ASV_ID, .keep_all = TRUE),
            by = "ASV_ID") %>%
  mutate(Species_name = coalesce(Choice, Species_name)) %>%
  dplyr::select(-Choice)

Confirm all entries are dealt with

No user edits.

## Output will be blank: a remaining ";" marks an unresolved multi-database
## assignment that was not handled in the mismatch table
ASV_table_taxID_edited %>% dplyr::select(Species_name) %>% distinct() %>% 
  filter(., grepl(";", Species_name)) %>% arrange(Species_name) 
## # A tibble: 0 × 1
## # ℹ 1 variable: Species_name <chr>

Adjusting common name for those entries that don’t have one (from Mito or NCBI)

No user edits.

### add common name and category columns from GMGI's curated database
ASV_table_taxID_edited <- ASV_table_taxID_edited %>%
  left_join(., gmgi_db %>% dplyr::select(Species_name, Common_name, Category) %>% distinct(), by = "Species_name") %>%
  relocate(., c(Common_name, Category), .after = Species_name)

### export entries with no common name for the user to fill in;
### empty taxonomy columns are added so the edited sheet carries the full schema
ASV_table_taxID_edited %>% dplyr::select(Species_name, Common_name) %>% 
  filter(is.na(Common_name)) %>% distinct() %>%
  mutate(Category = NA, Kingdom = NA, Phylum = NA, Class = NA, 
         Order = NA, Family = NA, Genus = NA, species = NA, PhylogenySorting = NA) %>%
  mutate(
    ## the "Unassigned" species row gets "Unassigned" in every column
    across(everything(), ~case_when(
      Species_name == "Unassigned" ~ "Unassigned",
      TRUE ~ .x
    ))) %>% 
  write_xlsx(path_commonnames_add)

Editing common names and category when needed.

## read in edited df 
commonNames_annotated <- read_xlsx(path_commonnames_add_edited)

## Fill in Common_name / Category from the user-edited table, matched by
## Species_name. Vectorized replacement of the original per-row loop:
##  - blank cells in the edited table leave the existing value untouched
##    (the loop overwrote matched rows even when the edit was NA)
##  - duplicated species rows no longer recycle vectors inside case_when
##    (first occurrence wins)
ASV_table_taxID_annotated <- ASV_table_taxID_edited %>%
  left_join(commonNames_annotated %>%
              dplyr::select(Species_name,
                            Common_name_new = Common_name,
                            Category_new = Category) %>%
              distinct(Species_name, .keep_all = TRUE),
            by = "Species_name") %>%
  mutate(Common_name = coalesce(Common_name_new, Common_name),
         Category = coalesce(Category_new, Category)) %>%
  dplyr::select(-Common_name_new, -Category_new)

## printing list of species name without common names 
## after filling in common names above, this output should be zero rows
ASV_table_taxID_annotated %>% dplyr::select(Species_name, Common_name) %>% filter(is.na(Common_name)) %>% distinct()
## # A tibble: 0 × 2
## # ℹ 2 variables: Species_name <chr>, Common_name <chr>

Filtering

Filter out reads that are less than 10 in each sample per ASV

## Hard count filter: within each ASV, zero out any per-sample count below 10.
## Columns 1-6 are ID/annotation columns, hence the 7:ncol(.) range.
ASV_table_taxID_filtered_step1 <- ASV_table_taxID_annotated %>%
  ## telling the df we are doing the following function by rows (ASVs)
  rowwise() %>%

  ## Filter out reads that are less than 10 in each sample per ASV
  mutate(across(.cols = (7:ncol(.)),
                .fns = ~ ifelse(.x<10, 0, .x))) %>% ungroup()

nrow(ASV_table_taxID_filtered_step1)  
## [1] 422
## Remove rows with zero sums across the table
ASV_table_taxID_filtered_step2 <- ASV_table_taxID_filtered_step1 %>%
  # Keep only rows where the sum across columns 7:ncol(.) is greater than 0
  filter(rowSums(across(7:ncol(.))) > 0)

nrow(ASV_table_taxID_filtered_step2) 
## [1] 400

Create output of reads that were filtered out

## Report the reads that the <10 filter removed: keep only counts below 10
## and blank the retained side. The retained side is counts >= 10 (the
## filter above zeroes counts < 10), so the complement here must use >= 10;
## the previous > 10 wrongly reported counts of exactly 10 as removed.
ASV_table_taxID_annotated %>%
  ## telling the df we are doing the following function by rows (ASVs)
  rowwise() %>%

  ## zero out the counts that survived the filter, leaving only removed reads
  mutate(across(.cols = (7:ncol(.)),
                .fns = ~ ifelse(.x >= 10, 0, .x))) %>% ungroup() %>%

  write_xlsx(reads_filtered10_out)

Filter out reads that are less than a certain % of ASV (row) total per sample

Values are based on the MiSeq used because the flow cell elicits different adapter hopping tendencies:
- Legacy MiSeq = 0.001
- New MiSeq = 0.0025

## Minimum within-ASV read proportion; flow-cell dependent:
##  - Legacy MiSeq = 0.001
##  - New MiSeq    = 0.0025
min_read_prop <- 0.001

ASV_table_taxID_filtered <- ASV_table_taxID_filtered_step2 %>%
  ## telling the df we are doing the following function by rows (ASVs)
  rowwise() %>%

  ## blank (NA) any count below min_read_prop of its ASV's row total
  mutate(across(.cols = 7:ncol(.),
                .fns = ~ ifelse((.x / sum(.x, na.rm = TRUE)) < min_read_prop, NA, .x)))

Create output of reads that were filtered out

## Report the reads the percent filter removed. Two fixes vs. the original:
##  - complement the filter exactly: retained reads have proportion >= 0.001,
##    so blank those here (the original > 0.001 wrongly reported reads exactly
##    at the threshold as removed)
##  - start from _step2, the table the filter was actually applied to (the
##    original used _step1, whose all-zero rows yield NaN proportions and
##    produce meaningless all-NA rows in the report)
ASV_table_taxID_filtered_step2 %>%
  ## telling the df we are doing the following function by rows (ASVs)
  rowwise() %>%

  mutate(across(.cols = 7:ncol(.),
                .fns = ~ ifelse((.x / sum(.x, na.rm = TRUE)) >= 0.001, NA, .x))) %>%

  write_xlsx(reads_filtered_percent_out)

Filter out samples with less than X number of reads

Remove samples that don’t meet minimum read count thresholds. GMGI Fisheries team minimum is 1,000 reads.

### View final number of reads for each sample (column sums over sample columns)
colsums <- as.data.frame(colSums(ASV_table_taxID_filtered[, 7:ncol(ASV_table_taxID_filtered)], na.rm = TRUE)) %>%
  dplyr::rename(sum=1) %>% arrange(sum) %>% rownames_to_column(var = "sampleID")

range(colsums$sum)
## [1]     0 90067
# ### Identify samples with <1000 reads (GMGI Fisheries team minimum)
# low_read_samples <- colsums %>% filter(sum < 1000) %>% pull(sampleID)
# 
# ### Remove those samples from dataset
# ASV_table_taxID_filtered <- ASV_table_taxID_filtered %>%
#   dplyr::select(-all_of(low_read_samples))
# 
# ### Verify
# range(colSums(ASV_table_taxID_filtered[, 7:ncol(ASV_table_taxID_filtered)], na.rm = TRUE))

Creating results Rdata output

## Inspect the final filtered ASV-level table before saving
ASV_table_taxID_filtered
## # A tibble: 400 × 55
## # Rowwise: 
##    ASV_ID       Species_name Common_name Category ASV_sum ASV_rank TDL_EC_49_12S
##    <chr>        <chr>        <chr>       <chr>      <dbl> <chr>            <dbl>
##  1 e9d13e82268… Menidia men… Atlantic s… Teleost…  867666 1/422             1417
##  2 80b85e59cac… Fundulus he… Mummichog   Teleost…  478263 2/422             5299
##  3 0c345c9a2aa… Brevoortia … Atlantic m… Teleost…  215802 3/422              406
##  4 8961e2d14d8… Clupeidae sp Atlantic m… Teleost…  188456 4/422               75
##  5 6a9c2d5770b… Ammodytes d… Northern s… Teleost…   56436 5/422             2766
##  6 1186c621e04… Pseudopleur… Winter or … Teleost…   55158 6/422             6011
##  7 ef311a4fd2c… Fundulus ma… Striped ki… Teleost…   49796 7/422               98
##  8 ea364c18219… Homo sapiens Human       Human      49413 8/422             3537
##  9 9a193f7029a… Gallus gall… Chicken     Livesto…   36080 9/422               12
## 10 d38ddf42991… Apeltes qua… Fourspine … Teleost…   24082 10/422            2895
## # ℹ 390 more rows
## # ℹ 48 more variables: TDL_FS_10_12S <dbl>, TDL_FS_11_12S <dbl>,
## #   TDL_FS_12_12S <dbl>, TDL_FS_13_12S <lgl>, TDL_FS_14_12S <dbl>,
## #   TDL_FS_15_12S <dbl>, TDL_FS_16_12S <dbl>, TDL_FS_17_12S <dbl>,
## #   TDL_FS_18_12S <dbl>, TDL_FS_19_12S <dbl>, TDL_FS_1_12S <dbl>,
## #   TDL_FS_20_12S <dbl>, TDL_FS_21_12S <dbl>, TDL_FS_22_12S <dbl>,
## #   TDL_FS_23_12S <dbl>, TDL_FS_24_12S <dbl>, TDL_FS_25_12S <dbl>, …
## Persist the pre-decontamination ASV-level table as .rds for downstream scripts
saveRDS(ASV_table_taxID_filtered, results_filtered_df)

Decontaminating based on blanks

Optional if blanks are included in dataset. User edits:
1. Change which samples are removed for having 0 total reads. If none, then comment out that line.
2. Edit the number of blanks and number of samples.

## Add annotation as needed for the name of the NTC or blank
df_for_microDecon <- ASV_table_taxID_filtered %>%
  ## microDecon needs only ASV_ID plus count columns: drop annotation columns
  dplyr::select(-(Species_name:Category), -ASV_sum, -ASV_rank) %>%

  ## remove any samples with 0 reads total (project-specific; edit as needed)
  dplyr::select(-TDL_FS_13_12S) %>%

  ## replace NAs with zeros so every count column is complete
  dplyr::mutate(across(where(is.numeric), ~ tidyr::replace_na(.x, 0))) %>%
  ## blank/control columns (BK or EC in the name) go right after ASV_ID
  relocate(matches("BK|EC"), .after = ASV_ID) 

## re-formatting to a base data.frame for the decon() call below
df_for_microDecon <- as.data.frame(df_for_microDecon)

## print the number of field samples in dataframe 
sum(grepl("_FS", colnames(df_for_microDecon)))
## [1] 47
## print the number of blank/control columns (BK or EC)
sum(grepl("BK|EC", colnames(df_for_microDecon)))
## [1] 1
## numb.blanks = the number of blank columns present in the dataset
## numb.ind = the number of field samples per group
## (update both to match the counts printed above)

decontaminated <- decon(data = df_for_microDecon, 

                        ## number of blank columns [integer]
                        numb.blanks=1, 

                        ## number of individuals per group [integer]
                        numb.ind=c(47),

                        ## Indicate if taxa columns are present (T/F)
                        ## no species information here
                        taxa=F)

Exporting

### Reads removed by decontamination, saved for the project record
removed <- decontaminated$reads.removed
save(removed, file = reads_decontaminated_out)

decontamin_df <- decontaminated$decon.table

## if multiple blanks, a new column called Mean.blank is created, if not the column name remains the same
## re-attach the annotation columns, then drop the control columns
ASV_table_taxID_decontaminated <- decontamin_df %>% full_join(
  ASV_table_taxID_filtered %>% dplyr::select(ASV_ID, Species_name, Common_name, Category, ASV_sum, ASV_rank), ., 
  by = "ASV_ID") %>%

  dplyr::select(-matches("_EC")) %>%

  ## removing ASVs that were removed by decontamin
  ## these are characterized by NA read counts across every sample so remove NAs
  na.omit()

At this point:
- ASV_table_taxID_filtered = ASV-level pre-decontaminated matrix
- ASV_table_taxID_decontaminated = ASV-level post-decontaminated matrix

Generate species list

## Species-level summary: total filtered reads and percent of all reads
species_list <- ASV_table_taxID_decontaminated %>% dplyr::select(Species_name, Category, ASV_sum) %>% 

  ## calculate total filtered reads per species/category
  dplyr::group_by(Species_name, Category) %>%
  reframe(Filtered_reads = sum(ASV_sum)) %>%

  ## arrange by stats
  arrange(desc(Filtered_reads)) %>%
  mutate(
    Percent = round(Filtered_reads / sum(Filtered_reads) * 100, 4)
  )

species_list %>% write_xlsx(Species_breakdown_sheet)

## sanity check: one row per species (no species split across categories)
nrow(species_list) == length(unique(species_list$Species_name)) ## should be TRUE
## [1] TRUE
length(unique(species_list$Species_name))  
## [1] 107

Collapsing read counts by species name

No user edits.

## Collapse ASV-level counts to species-level counts
ASV_table_taxID_collapsed <- ASV_table_taxID_decontaminated %>% 
  # removing original ASV_ID to collapse
  dplyr::select(-ASV_ID, -ASV_sum, -ASV_rank) %>%  

  ## group by Species_name and sample
  dplyr::group_by(Species_name, Common_name, Category) %>%

  ## sum down column by species name and sample to collapse
  dplyr::summarise(across(.cols = (1:last_col()),            
                          .fns = ~ sum(., na.rm = TRUE)), 
                     .groups = 'drop')

# Print number of sample columns before and after collapsing
ncol(ASV_table_taxID_filtered %>% dplyr::select(-Species_name, -Common_name, -Category, -ASV_ID, -ASV_sum, -ASV_rank))
## [1] 49
ncol(ASV_table_taxID_collapsed %>% dplyr::select(-Species_name, -Common_name, -Category))
## [1] 47
length(unique(ASV_table_taxID_collapsed$Species_name))  
## [1] 107

Check if any rows or columns == 0

### Rows: species whose counts are all zero (expect none)
ASV_table_taxID_collapsed %>%
  filter(rowSums(across(where(is.numeric))) == 0)
## # A tibble: 0 × 50
## # ℹ 50 variables: Species_name <chr>, Common_name <chr>, Category <chr>,
## #   TDL_FS_10_12S <dbl>, TDL_FS_11_12S <dbl>, TDL_FS_12_12S <dbl>,
## #   TDL_FS_14_12S <dbl>, TDL_FS_15_12S <dbl>, TDL_FS_16_12S <dbl>,
## #   TDL_FS_17_12S <dbl>, TDL_FS_18_12S <dbl>, TDL_FS_19_12S <dbl>,
## #   TDL_FS_1_12S <dbl>, TDL_FS_20_12S <dbl>, TDL_FS_21_12S <dbl>,
## #   TDL_FS_22_12S <dbl>, TDL_FS_23_12S <dbl>, TDL_FS_24_12S <dbl>,
## #   TDL_FS_25_12S <dbl>, TDL_FS_26_12S <dbl>, TDL_FS_27_12S <dbl>, …
### Columns: samples whose counts are all zero (expect none)
ASV_table_taxID_collapsed %>%
  select(where(~ is.numeric(.x) && sum(.x, na.rm = TRUE) == 0))
## # A tibble: 107 × 0

Create reports

Species and ASV breakdown reports

### ASV-level breakdown sheet (ID through rank columns only);
### the total number of ASVs is from unfiltered ASVs 
ASV_table_taxID_filtered %>% dplyr::select(ASV_ID:ASV_rank) %>%
  write_xlsx(ASV_breakdown_sheet)

## unique taxonomic assignments, not including the unassigned category (for the report)
length(unique(ASV_table_taxID_collapsed$Species_name)) - 1
## [1] 106

Results tables

# Named lookup vector: names are the original GMGI sample IDs, values are
# the project sample IDs they are renamed to. (Named id_map rather than
# `map` to avoid shadowing purrr::map.)
id_map <- setNames(meta$`Project Sample ID`, meta$`GMGI Sample ID`)

# Rename only the column headers that appear in the lookup table
matched_cols <- names(ASV_table_taxID_collapsed) %in% names(id_map)
names(ASV_table_taxID_collapsed)[matched_cols] <-
  id_map[names(ASV_table_taxID_collapsed)[matched_cols]]

# exporting that matrix
ASV_table_taxID_collapsed %>% write_xlsx(results_matrix_df)

### Calculating relative abundance: each sample column divided by its column sum
df_relab <- ASV_table_taxID_collapsed %>%
  mutate(across(4:last_col(), ~ .x / sum(.x), .names = "{.col}")) 

df_relab %>% write_xlsx(results_relative_df)