Datatable preparation base script for eDNA metabarcoding
.Rmd script
This script takes your Blast output from the GMGI database, Mitofish database, and NCBI database to create one datatable with read counts and taxonomic assignment.
Workflow summary:
1. Load libraries
2. Load metadata
3. Load BLAST output from GMGI, Mitofish, and NCBI
4. Load DADA2 ASV Table
5. Taxonomic Assignment
- 5a. Identify ASVs with multiple hits from GMGI’s database
- 5b. Identify entries that mismatch between GMGI, Mitofish, and NCBI
databases
- 5c. Assign taxonomy based on hierarchical approach
- 5d. Edit taxonomy annotations based on mismatch table choices
- 5e. Adjusting common name and category for those entries that don’t
have one (from Mito or NCBI)
6. Filtering: Filter ASV by less than 0.1% reads and then collapse by
group
7. Collapsing read counts by species name
8. Creating results output
Load libraries
library(ggplot2) ## for plotting
library(dplyr) ## for data table manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr) ## for data table manipulation
library(readr) ## for reading in tsv files
library(readxl) ## for reading in excel files
library(stringr) ## for data transformation
library(strex) ## for data transformation
library(writexl) ## for excel output
library(purrr) ## for data transformation
library(funrar) ## for make_relative()
library(tidyverse) ## for data table manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(Biostrings) ## for reading in fasta file
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
##
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Loading required package: S4Vectors
## Loading required package: stats4
##
## Attaching package: 'S4Vectors'
##
## The following objects are masked from 'package:lubridate':
##
## second, second<-
##
## The following object is masked from 'package:tidyr':
##
## expand
##
## The following objects are masked from 'package:dplyr':
##
## first, rename
##
## The following object is masked from 'package:utils':
##
## findMatches
##
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
##
## The following object is masked from 'package:lubridate':
##
## %within%
##
## The following object is masked from 'package:purrr':
##
## reduce
##
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
##
## The following object is masked from 'package:grDevices':
##
## windows
##
## Loading required package: XVector
##
## Attaching package: 'XVector'
##
## The following object is masked from 'package:purrr':
##
## compact
##
## Loading required package: GenomeInfoDb
##
## Attaching package: 'Biostrings'
##
## The following object is masked from 'package:base':
##
## strsplit
library(data.table) ## for data table manipulation
##
## Attaching package: 'data.table'
##
## The following object is masked from 'package:IRanges':
##
## shift
##
## The following objects are masked from 'package:S4Vectors':
##
## first, second
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(microDecon) ## for microDecon
Metadata input
Identify paths for metadata and project data
Each user needs to prepend their own directory to each input and output file name below. The default working directory is the folder where this script is saved. To change the working directory to the Rproject directory, select ‘Knit’ and ‘Knit Directory’ > ‘Project Directory’.
### User edits:
### 1. change paths of input and output as desired
## Idiomatic R uses `<-` (not `=`) for top-level assignment.
## GMGI curated fish reference database (Excel workbook)
path_GMGIdb <- "C:/BoxDrive/Box/Science/Fisheries/Projects/eDNA/Metabarcoding Lab Resources/Reference Databases/GMGI_Vert_Ref.xlsx"
## BLAST results (tabular outfmt-6 text files, one per database)
path_blast_gmgi <- "docs/eDNA 12S metab/example_input/blast/BLASTResults_GMGI.txt"
path_blast_mito <- "docs/eDNA 12S metab/example_input/blast/BLASTResults_Mito.txt"
path_blast_ncbi_taxassigned <- "docs/eDNA 12S metab/example_input/blast/NCBI_taxassigned.txt"
path_blast_ncbi <- "docs/eDNA 12S metab/example_input/blast/BLASTResults_NCBI.txt"
## ASV table results from the DADA2/ampliseq run
## confirm that the ASV_table.len.tsv name is correct for user's project
path_asv_table <- "docs/eDNA 12S metab/example_input/ASV_table.len.tsv"
path_seq_file <- "docs/eDNA 12S metab/example_input/ASV_seqs.len.fasta"
path_output_summary <- "docs/eDNA 12S metab/example_input/overall_summary.tsv"
Create output files
### TAXONOMIC CHOICES
# Output paths. Idiomatic `<-` assignment with consistent spacing
# (the original mixed `=` with and without spaces).
## spreadsheets the user edits by hand, and their edited copies
path_choice_required <- "docs/eDNA 12S metab/example_output/taxonomic_assignments/Choice_required_GMGI_multiplehits.xlsx"
path_choice_required_edited <- "docs/eDNA 12S metab/example_output/taxonomic_assignments/Choice_required_GMGI_multiplehits_edited.xlsx"
path_disagree_list <- "docs/eDNA 12S metab/example_output/taxonomic_assignments/Three_DB_comparison_taxonomic_ID.xlsx"
path_disagree_list_edited <- "docs/eDNA 12S metab/example_output/taxonomic_assignments/Three_DB_comparison_taxonomic_ID_edited.xlsx"
path_commonnames_add <- "docs/eDNA 12S metab/example_output/taxonomic_assignments/CommonNames_required.xlsx"
path_commonnames_add_edited <- "docs/eDNA 12S metab/example_output/taxonomic_assignments/CommonNames_required_edited.xlsx"
## reports of reads removed by each filtering step
reads_filtered_percent_out <- "docs/eDNA 12S metab/example_output/Filtered_Percent_removed_reads.xlsx"
reads_filtered10_out <- "docs/eDNA 12S metab/example_output/Filtered_10_removed_reads.xlsx"
reads_decontaminated_out <- "docs/eDNA 12S metab/example_output/Filtered_Decontaminated_removed_reads.RData"
## summary sheets and final result matrices
Species_breakdown_sheet <- "docs/eDNA 12S metab/example_output/Summary_Species_level.xlsx"
ASV_breakdown_sheet <- "docs/eDNA 12S metab/example_output/Summary_ASV_level.xlsx"
results_filtered_df <- "docs/eDNA 12S metab/example_output/ASV_table_filtered.rds"
results_matrix_df <- "docs/eDNA 12S metab/example_output/Results_matrix.xlsx"
results_relative_df <- "docs/eDNA 12S metab/example_output/Results_matrix_relative.xlsx"
Load project metadata
Metadata specific to each project. This contains information about each sample (e.g., month, site, time, sample type, etc.). Confirm that sample IDs match those used in the ASV_table.len.tsv file.
### User edits:
### 1. change path of metadata file
## EXCEL
## Project metadata: one row per sample (month, site, time, sample type, ...).
## Sample IDs must match the column headers of the ASV_table.len.tsv file.
meta <- read_excel("docs/eDNA 12S metab/example_input/metadata_tidal_cycle.xlsx") %>%
  ## ampliseq uses "_" in sample IDs while the MiSeq uses "-",
  ## so normalize every hyphen to an underscore
  mutate(`GMGI Sample ID` = str_replace_all(`GMGI Sample ID`, "-", "_"))
Load database metadata
No user edits in this section because paths have already been set above.
# Load GMGI reference database information (common name, species name, etc.).
# `sseqid` is used later as the join key against the BLAST subject IDs.
gmgi_db <- read_xlsx(path_GMGIdb, sheet = 1) %>% dplyr::rename(sseqid = Ref) %>%
## removing ">" (FASTA header marker) from the beginning of entries within the Ref column
mutate(sseqid = gsub(">", "", sseqid))
BLAST data input
No user edits unless user changed blastn parameters from fisheries team default.
## Column names and classes for the tabular BLAST output (outfmt 6),
## shared by the GMGI and Mitofish result files: two ID columns
## (character) followed by ten numeric statistics.
blast_col_headers <- c(
  "ASV_ID", "sseqid", "pident", "length", "mismatch", "gapopen",
  "qstart", "qend", "sstart", "send", "evalue", "bitscore"
)
blast_col_classes <- c("character", "character", rep("numeric", 10))
GMGI database
No user edits.
## BLAST hits against the GMGI curated database, annotated with database info.
## Style note (review): header=F should ideally be header=FALSE.
Blast_GMGI <- read.table(path_blast_gmgi, header=F, col.names = blast_col_headers, colClasses = blast_col_classes) %>%
## blast changes spaces to hyphens, so change them back to match our metadata
mutate(sseqid = gsub("-", " ", sseqid)) %>%
## join with GMGI database information (adds Species_name, db_percent_ID, etc.)
left_join(., gmgi_db, by = "sseqid") %>%
## only keep the first human hit if there are multiple (BLAST output is already ordered by pident)
## NOTE(review): this drops ANY "Human" hit that is not the ASV's first row,
## even when row 1 is a non-Human hit -- confirm that is the intended behavior
group_by(ASV_ID) %>%
filter(!(str_detect(sseqid, "Human") & row_number() > 1)) %>%
ungroup()
## Check how many ASVs were identified with the GMGI Database
length(unique(Blast_GMGI$ASV_ID))
## [1] 391
Mitofish database
No user edits.
## BLAST hits against the Mitofish database. Mitofish encodes the species
## name in the subject sequence ID, so sseqid is parsed into Species_name.
Blast_Mito <- read.table(path_blast_mito, header=F, col.names = blast_col_headers, colClasses = blast_col_classes) %>%
# renaming sseqid to species name
dplyr::rename(Species_name = sseqid) %>%
# replacing _ with spaces
mutate(Species_name = gsub("_", " ", Species_name),
## keep only the text after the second "|", stripping the "gb|ACCESSION|" prefix
## NOTE(review): assumes every Mitofish sseqid has at least two "|" separators -- confirm
Species_name = str_after_nth(Species_name, "\\|", 2)
)
NCBI database
No user edits.
## NCBI taxonomy lookup: maps each staxid to its semicolon-delimited lineage.
NCBI_taxassigned <- read.delim2(path_blast_ncbi_taxassigned, header=F, col.names = c("staxid", "Phylo")) %>%
## split the lineage string into one column per taxonomic rank
separate(Phylo, c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species_name"), sep = ";") %>%
## species epithet = everything after the first space in Species_name
mutate(., species = str_after_nth(Species_name, " ", 1))
## BLAST hits against NCBI (extended outfmt with sscinames/staxid),
## joined to the lineage table above.
## NOTE(review): the join assumes staxid parses to the same type in both
## files (integer here vs. read.delim2's default for the lookup) -- confirm.
Blast_NCBI <- read.table(path_blast_ncbi, header=F,
col.names = c("ASV_ID", "sseqid", "sscinames", "staxid", "pident", "length", "mismatch",
"gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"),
colClasses = c(rep("character", 3), "integer", rep("numeric", 9))) %>%
left_join(., NCBI_taxassigned, by = "staxid")
Load ASV sequences
No user edits.
### Read in the ASV sequence fasta file (Biostrings DNAStringSet)
fasta <- readDNAStringSet(path_seq_file)
## Build an ASV_ID -> sequence lookup table from the fasta headers/sequences.
## Kept as a data.table deliberately; it is later left_join()ed onto tibbles.
fasta_db <- data.table(
ASV_ID = names(fasta),
sequence = as.character(fasta)
)
Load DADA2 ASV Table
The column headers will be the Sample IDs and the first column is the ASV ID. ASVs are given a “rank” based on sum of reads from that ASV (pre-filtering). ‘Random’ indicates that if ASVs are tied, then the code will randomly assign a rank for those tied. Because we don’t need an exact rank here, ‘random’ will do for a tie-breaker.
No user edits.
## DADA2 ASV table: first column ASV_ID, remaining columns are per-sample read counts.
ASV_table <- read_tsv(path_asv_table, show_col_types = FALSE) %>%
## calculate the sum of all reads for each ASV (across every numeric/sample column)
mutate(., ASV_sum = rowSums(across(where(is.numeric)))) %>%
## rank ASVs by total reads; ties broken randomly
## NOTE(review): ties.method='random' makes ranks non-reproducible between
## runs unless set.seed() is called first -- acceptable since only a rough
## rank is needed, but worth knowing
mutate(ASV_rank = rank(-ASV_sum, ties.method='random')) %>%
## move the sum and rank columns to after ASV_ID and arrange by rank
relocate(c(ASV_sum,ASV_rank), .after = ASV_ID) %>% arrange((ASV_rank)) %>%
## format the rank as "rank/total" (e.g. "1/422")
mutate(ASV_rank = paste0(ASV_rank, "/", max(ASV_rank, na.rm = TRUE)))
## lookup table of ASV_ID -> sum and rank, joined onto several later tables
ASV_rank_list <- ASV_table %>% dplyr::select(ASV_ID, ASV_sum, ASV_rank)
Taxonomic Assignment
Identifying where NCBI, Mito, and GMGI disagree on tax assignment. With the hierarchical approach, ASVs that match to GMGI and several other databases will only result in GMGI assignment. By reviewing this df, we can be sure we aren’t missing an assignment in our GMGI curated database.
Sub-workflow:
1. Identify any ASVs that contain multiple hits within the GMGI
database.
2. Identify entries that mismatch between GMGI, Mitofish, and NCBI
databases.
3. Assign taxonomy based on hierarchical approach.
4. Edit taxonomy annotations based on mismatch table.
5. Adjusting common name for those entries that don’t have one (from
Mito or NCBI).
Identify any ASVs that contain multiple hits within the GMGI database
At this point, a fisheries team member needs to make choices about which taxonomic assignment to accept.
Create list of those ASVs with multiple hits
No user edits.
## ASVs whose best GMGI hit is tied between two or more references; these
## require a manual choice by a fisheries team member.
multiple_hit_choice <- Blast_GMGI %>% dplyr::group_by(ASV_ID) %>%
## take top percent identity hit, count the number of top hits, and filter to those with more than 1 top hit
slice_max(pident, n=1) %>% dplyr::count() %>% filter(n>1) %>%
dplyr::rename(No_tied_hits=n) %>%
## re-attach the full Blast_GMGI rows for these ASVs plus their rank and sum
left_join(., Blast_GMGI, by = "ASV_ID") %>%
left_join(., ASV_rank_list, by = "ASV_ID") %>%
## empty Choice column the user fills in ("x" next to the chosen hit)
mutate(Choice = NA) %>%
## adding sequencing information
left_join(., fasta_db, by = "ASV_ID") %>%
## moving database percent ID (db_percent_ID, from gmgi_db) to be next to the BLAST percent ID
relocate(c(db_percent_ID, ASV_sum, ASV_rank), .after = pident)
## export this data frame as an excel sheet for the user to annotate
multiple_hit_choice %>% write_xlsx(path_choice_required)
Based on the output above, user needs to make some choices. In the excel spreadsheet, user needs to mark ‘x’ on the choices desired while leaving the other entries blank.
Choosing one of several hits.
Load choice edited dataset. No user edits.
## Read back the user-annotated multiple-hit spreadsheet, keeping only the
## rows the user marked (non-blank Choice) and only the key columns.
multiple_hit_choice_edited <- read_xlsx(path_choice_required_edited) %>%
  drop_na(Choice) %>%
  dplyr::select(ASV_ID, sseqid, Choice)
A for loop will filter Blast_GMGI df based on these choices. No user edits.
# Apply the user's multiple-hit choices in one vectorized step instead of a
# per-ASV for loop: for every ASV that has a chosen hit, keep only the row
# matching the chosen sseqid; ASVs without a choice keep all of their rows.
# Row order of Blast_GMGI is preserved (matters for later tie-breaking).
# NOTE(review): assumes at most one chosen sseqid per ASV_ID -- the loop
# this replaces made the same assumption (a duplicate choice would have
# emptied the ASV there; here it would duplicate rows).
Blast_GMGI_edited <- Blast_GMGI %>%
  left_join(multiple_hit_choice_edited %>%
              dplyr::select(ASV_ID, chosen_sseqid = sseqid),
            by = "ASV_ID") %>%
  filter(is.na(chosen_sseqid) | sseqid == chosen_sseqid) %>%
  dplyr::select(-chosen_sseqid)
Confirming that all entries have been dealt with
No user edits.
### Sanity check: re-run the tied-top-hit detection on the edited table.
### If every choice was applied, no ASV still has >1 top hit and the output is empty.
Blast_GMGI_edited %>% dplyr::group_by(ASV_ID) %>% slice_max(pident, n=1) %>% dplyr::count() %>% filter(n>1)
## # A tibble: 0 × 2
## # Groups: ASV_ID [0]
## # ℹ 2 variables: ASV_ID <chr>, n <int>
Identify entries that mismatch between GMGI, Mitofish, and NCBI databases
Creating a df called “Disagree”. Review the output before moving onto the next section.
No user edits.
## Create GMGI species input: one row per ASV, its single best GMGI hit
## (with_ties = FALSE breaks pident ties by row order)
disagree_input_GMGI <- Blast_GMGI_edited %>%
dplyr::select(ASV_ID, GMGI_pident = pident, GMGI_db_ID = db_percent_ID, GMGI_Species = Species_name) %>%
group_by(ASV_ID) %>% slice_max(GMGI_pident, n = 1, with_ties = FALSE) %>% ungroup()
## Create Mitofish input: all distinct species per ASV collapsed into one
## ";"-separated string (one row per ASV)
disagree_input_Mito <- Blast_Mito %>%
dplyr::select(ASV_ID, Mito_Species = Species_name) %>% distinct() %>%
dplyr::group_by(ASV_ID) %>% mutate(Mito_Species = paste0(Mito_Species, collapse = ";")) %>% distinct() %>% ungroup()
## Create NCBI input: same collapsing as Mitofish
disagree_input_NCBI <- Blast_NCBI %>%
dplyr::select(ASV_ID, NCBI_Species = Species_name) %>% distinct() %>%
group_by(ASV_ID) %>% mutate(NCBI_Species = paste0(NCBI_Species, collapse = ";")) %>% distinct() %>% ungroup()
## Combine all three dfs into disagree_df with ASV rank information,
## an empty Choice column for the user, and the ASV sequence
disagree_df <- disagree_input_GMGI %>%
full_join(disagree_input_Mito, by = "ASV_ID") %>%
full_join(disagree_input_NCBI, by = "ASV_ID") %>%
full_join(., ASV_rank_list, by = "ASV_ID") %>%
mutate(Choice = NA) %>%
left_join(., fasta_db, by = "ASV_ID")
## Keep only the entries where the three databases genuinely disagree.
## rowwise() makes the scalar &&/c_across logic below operate per ASV.
filtered_disagree_df <- disagree_df %>%
rowwise() %>%
## drop rows unassigned across all 3 databases
filter(! (is.na(GMGI_Species) && is.na(Mito_Species) && is.na(NCBI_Species) )) %>%
## drop rows assigned by GMGI only (nothing to disagree with)
filter(! (!is.na(GMGI_Species) && is.na(Mito_Species) && is.na(NCBI_Species) )) %>%
## drop rows where all databases report the same species (ONLY checked for rows with a GMGI entry;
## c_across spans the three *_Species columns, which are adjacent by construction)
filter(ifelse(!is.na(GMGI_Species),
n_distinct(c_across(GMGI_Species:NCBI_Species), na.rm = TRUE) > 1, TRUE))
## export this data frame as an excel sheet for the user to resolve
filtered_disagree_df %>% write_xlsx(path_disagree_list)
Assign taxonomy based on hierarchical approach
Taxonomic identification is taken from GMGI 100%, then GMGI <100%, then Mitofish 100%, and finally NCBI 100%.
No user edits.
## Hierarchical taxonomic assignment: GMGI first, then Mitofish, then NCBI.
ASV_table_taxID <- ASV_table %>%
## 1. Top hit from GMGI's database
## NOTE(review): slice_max here keeps ties, so an ASV with tied top hits
## would be duplicated by this join -- the multiple-hit choice step above
## is what prevents that; confirm it always runs first
left_join(Blast_GMGI_edited %>% group_by(ASV_ID) %>%
slice_max(pident, n = 1) %>%
dplyr::select(ASV_ID, Species_name),
by = join_by(ASV_ID)) %>%
## 2. Mitofish database
### join df, select ASV_ID and Species_name columns, rename Species_name to Mito, call only distinct rows
left_join(., Blast_Mito %>% dplyr::select(ASV_ID, Mito = Species_name) %>% distinct() %>%
### group by ASV_ID, and collapse all species names separated by ;, then take only distinct rows
group_by(ASV_ID) %>% mutate(Mito = paste0(Mito, collapse = ";")) %>% distinct(), by = "ASV_ID") %>%
### if GMGI annotation is NA, then fall back to Mitofish
mutate(., Species_name = ifelse(is.na(Species_name), Mito, Species_name)) %>%
## 3. NCBI database; same collapsing and fallback as above
left_join(., Blast_NCBI %>% dplyr::select(ASV_ID, NCBI = Species_name) %>% distinct() %>%
group_by(ASV_ID) %>% mutate(NCBI = paste0(NCBI, collapse = ";")) %>% distinct(), by = "ASV_ID") %>%
mutate(., Species_name = ifelse(is.na(Species_name), NCBI, Species_name)) %>%
## 4. if Species name is STILL not filled, call it "Unassigned"
mutate(., Species_name = ifelse(is.na(Species_name), "Unassigned", Species_name)) %>%
## drop the helper Mito and NCBI columns
dplyr::select(-Mito, -NCBI) %>%
## move species name to be after ASV_ID
relocate(., c(Species_name), .after = ASV_ID)
Edit taxonomy annotations based on mismatch table
Override any annotations with edited taxonomic identification table.
No user edits.
## Read the user-edited three-database comparison sheet.
## Fix: keep only rows where the user actually entered a Choice. Without
## this filter, a blank Choice cell would overwrite that ASV's
## Species_name with NA in the override step below. (The analogous
## multiple-hit workflow already filters !is.na(Choice).)
taxonomic_choice <- read_xlsx(path_disagree_list_edited) %>%
  filter(!is.na(Choice)) %>%
  dplyr::select(ASV_ID, Choice)
# Override Species_name with the user's Choice in one vectorized join
# instead of looping over every chosen ASV. coalesce() keeps the
# hierarchical assignment wherever there is no (or a blank) choice,
# which also guards against a blank cell wiping out an assignment.
# NOTE(review): assumes at most one choice row per ASV_ID, as the loop
# this replaces effectively did.
ASV_table_taxID_edited <- ASV_table_taxID %>%
  left_join(taxonomic_choice, by = "ASV_ID") %>%
  mutate(Species_name = coalesce(Choice, Species_name)) %>%
  dplyr::select(-Choice)
Confirm all entries are dealt with
No user edits.
## Sanity check: collapsed multi-database names still contain ";", so any
## remaining ";" means an unresolved disagreement. Output should be blank.
ASV_table_taxID_edited %>% dplyr::select(Species_name) %>% distinct() %>%
filter(., grepl(";", Species_name)) %>% arrange(Species_name)
## # A tibble: 0 × 1
## # ℹ 1 variable: Species_name <chr>
Adjusting common name for those entries that don’t have one (from Mito or NCBI)
No user edits.
### Add Common_name and Category from the GMGI database (NA for species
### that only exist in Mitofish/NCBI).
### NOTE(review): this chunk reassigns ASV_table_taxID_edited onto itself;
### running it twice would duplicate the joined columns -- run once only.
ASV_table_taxID_edited <- ASV_table_taxID_edited %>%
left_join(., gmgi_db %>% dplyr::select(Species_name, Common_name, Category) %>% distinct(), by = "Species_name") %>%
relocate(., c(Common_name, Category), .after = Species_name)
### Export the entries with no common name for the user to fill in,
### with empty taxonomy columns; the "Unassigned" species gets every
### column pre-filled with "Unassigned".
ASV_table_taxID_edited %>% dplyr::select(Species_name, Common_name) %>%
filter(is.na(Common_name)) %>% distinct() %>%
mutate(Category = NA, Kingdom = NA, Phylum = NA, Class = NA,
Order = NA, Family = NA, Genus = NA, species = NA, PhylogenySorting = NA) %>%
mutate(
across(everything(), ~case_when(
Species_name == "Unassigned" ~ "Unassigned",
TRUE ~ .x
))) %>%
write_xlsx(path_commonnames_add)
Editing common names and category when needed.
## Read the user-edited common-name sheet.
commonNames_annotated <- read_xlsx(path_commonnames_add_edited)
## Apply the user's Common_name and Category in one vectorized join instead
## of a per-species for loop. Matching the loop's semantics, every species
## present in the sheet is overwritten (even with NA if the user left a
## cell blank); species not in the sheet are untouched.
## NOTE(review): assumes one row per Species_name in the edited sheet, as
## the loop this replaces did.
ASV_table_taxID_annotated <- ASV_table_taxID_edited %>%
  left_join(commonNames_annotated %>%
              dplyr::select(Species_name,
                            new_Common_name = Common_name,
                            new_Category = Category),
            by = "Species_name") %>%
  mutate(
    in_sheet = Species_name %in% commonNames_annotated$Species_name,
    Common_name = ifelse(in_sheet, new_Common_name, Common_name),
    Category = ifelse(in_sheet, new_Category, Category)
  ) %>%
  dplyr::select(-new_Common_name, -new_Category, -in_sheet)
## Sanity check: species still missing a common name after the edits above.
## This output should be empty (zero rows).
ASV_table_taxID_annotated %>% dplyr::select(Species_name, Common_name) %>% filter(is.na(Common_name)) %>% distinct()
## # A tibble: 0 × 2
## # ℹ 2 variables: Species_name <chr>, Common_name <chr>
Filtering
Filter out reads that are less than 10 in each sample per ASV
## Step 1: zero out any cell with fewer than 10 reads.
## ifelse() is already vectorized across whole columns, so the original
## rowwise()/ungroup() pair (one ifelse per cell) is unnecessary -- same
## result, much faster. Columns 1-6 are annotation columns (ASV_ID,
## Species_name, Common_name, Category, ASV_sum, ASV_rank); sample read
## counts start at column 7.
ASV_table_taxID_filtered_step1 <- ASV_table_taxID_annotated %>%
  mutate(across(.cols = 7:ncol(.),
                .fns = ~ ifelse(.x < 10, 0, .x)))
nrow(ASV_table_taxID_filtered_step1)
## [1] 422
## Step 2: drop ASVs whose read counts are now zero in every sample
ASV_table_taxID_filtered_step2 <- ASV_table_taxID_filtered_step1 %>%
  filter(rowSums(across(7:ncol(.))) > 0)
nrow(ASV_table_taxID_filtered_step2)
## [1] 400
Create output of reads that were filtered out
## Report of the reads removed by the <10 filter: the exact complement of
## step 1 (kept cells blanked to 0, removed cells shown).
## Fix: the original blanked cells with .x > 10, which wrongly reported
## cells equal to 10 as "removed" even though the filter keeps them;
## the complement of (.x < 10) is (.x >= 10). rowwise() dropped as in
## step 1 -- ifelse() is vectorized.
ASV_table_taxID_annotated %>%
  mutate(across(.cols = 7:ncol(.),
                .fns = ~ ifelse(.x >= 10, 0, .x))) %>%
  write_xlsx(reads_filtered10_out)
Filter out reads that are less than a certain % of ASV (row) total per sample
Values are based on the MiSeq used because the flow cell elicits
different adapter hopping tendencies:
- Legacy MiSeq = 0.001
- New MiSeq = 0.0025
## Remove cells below the percent cutoff of their ASV's (row) total.
## Legacy MiSeq cutoff = 0.001; New MiSeq = 0.0025.
## Fix: in the original, .x inside a rowwise() across() is a single cell,
## so sum(.x) == .x and the ratio was always 1 (NaN for zeros) -- the
## percent filter never removed a real read count and only converted
## zeros to NA. Compute the true per-row total first. Zero cells still
## become NA (0/total < cutoff), so downstream NA handling is unchanged;
## step 2 guarantees every row total is > 0.
percent_cutoff <- 0.001
ASV_table_taxID_filtered <- ASV_table_taxID_filtered_step2 %>%
  mutate(ASV_row_total = rowSums(across(7:ncol(ASV_table_taxID_filtered_step2)), na.rm = TRUE)) %>%
  mutate(across(.cols = 7:ncol(ASV_table_taxID_filtered_step2),
                .fns = ~ ifelse(.x / ASV_row_total < percent_cutoff, NA, .x))) %>%
  dplyr::select(-ASV_row_total)
Create output of reads that were filtered out
## Report of the reads the percent filter removed: the complement of the
## filter above (cells at or above the cutoff are blanked, removed cells
## are shown). Fixes two bugs: (1) sum(.x) under rowwise() is the cell
## itself, so the ratio was always 1 and the report was meaningless;
## (2) the complement of (ratio < 0.001) is (ratio >= 0.001), not
## (ratio > 0.001). Built from step1 (pre zero-row removal) as before;
## all-zero rows (total 0) are guarded so they keep their zeros.
ASV_table_taxID_filtered_step1 %>%
  mutate(ASV_row_total = rowSums(across(7:ncol(ASV_table_taxID_filtered_step1)), na.rm = TRUE)) %>%
  mutate(across(.cols = 7:ncol(ASV_table_taxID_filtered_step1),
                .fns = ~ ifelse(ASV_row_total > 0 & .x / ASV_row_total >= 0.001, NA, .x))) %>%
  dplyr::select(-ASV_row_total) %>%
  write_xlsx(reads_filtered_percent_out)
Filter out samples with less than X number of reads
Remove samples that don’t meet minimum read count thresholds. GMGI Fisheries team minimum is 1,000 reads.
### Per-sample read totals after filtering (one row per sample, ascending)
colsums <- as.data.frame(colSums(ASV_table_taxID_filtered[, 7:ncol(ASV_table_taxID_filtered)], na.rm = TRUE)) %>%
dplyr::rename(sum=1) %>% arrange(sum) %>% rownames_to_column(var = "sampleID")
range(colsums$sum)
## [1] 0 90067
# Optional: remove samples below the 1,000-read team minimum.
# Left commented out for this example dataset; uncomment to apply.
# ### Identify samples with <1000 reads
# low_read_samples <- colsums %>% filter(sum < 1000) %>% pull(sampleID)
#
# ### Remove those samples from dataset
# ASV_table_taxID_filtered <- ASV_table_taxID_filtered %>%
# dplyr::select(-all_of(low_read_samples))
#
# ### Verify
# range(colSums(ASV_table_taxID_filtered[, 7:ncol(ASV_table_taxID_filtered)], na.rm = TRUE))
Creating results Rdata output
ASV_table_taxID_filtered
## # A tibble: 400 × 55
## # Rowwise:
## ASV_ID Species_name Common_name Category ASV_sum ASV_rank TDL_EC_49_12S
## <chr> <chr> <chr> <chr> <dbl> <chr> <dbl>
## 1 e9d13e82268… Menidia men… Atlantic s… Teleost… 867666 1/422 1417
## 2 80b85e59cac… Fundulus he… Mummichog Teleost… 478263 2/422 5299
## 3 0c345c9a2aa… Brevoortia … Atlantic m… Teleost… 215802 3/422 406
## 4 8961e2d14d8… Clupeidae sp Atlantic m… Teleost… 188456 4/422 75
## 5 6a9c2d5770b… Ammodytes d… Northern s… Teleost… 56436 5/422 2766
## 6 1186c621e04… Pseudopleur… Winter or … Teleost… 55158 6/422 6011
## 7 ef311a4fd2c… Fundulus ma… Striped ki… Teleost… 49796 7/422 98
## 8 ea364c18219… Homo sapiens Human Human 49413 8/422 3537
## 9 9a193f7029a… Gallus gall… Chicken Livesto… 36080 9/422 12
## 10 d38ddf42991… Apeltes qua… Fourspine … Teleost… 24082 10/422 2895
## # ℹ 390 more rows
## # ℹ 48 more variables: TDL_FS_10_12S <dbl>, TDL_FS_11_12S <dbl>,
## # TDL_FS_12_12S <dbl>, TDL_FS_13_12S <lgl>, TDL_FS_14_12S <dbl>,
## # TDL_FS_15_12S <dbl>, TDL_FS_16_12S <dbl>, TDL_FS_17_12S <dbl>,
## # TDL_FS_18_12S <dbl>, TDL_FS_19_12S <dbl>, TDL_FS_1_12S <dbl>,
## # TDL_FS_20_12S <dbl>, TDL_FS_21_12S <dbl>, TDL_FS_22_12S <dbl>,
## # TDL_FS_23_12S <dbl>, TDL_FS_24_12S <dbl>, TDL_FS_25_12S <dbl>, …
saveRDS(ASV_table_taxID_filtered, results_filtered_df)
Decontaminating based on blanks
Optional if blanks are included in dataset. User edits:
1. Change the samples to be removed with 0 zeros. If none, then comment
out that line.
2. Edit the number of blanks and number of samples.
## Build the microDecon input: ASV_ID first, then blank/control columns
## (matching "BK|EC"), then field samples. Adjust the pattern if the NTC
## or blank is named differently in your project.
df_for_microDecon <- ASV_table_taxID_filtered %>%
dplyr::select(-(Species_name:Category), -ASV_sum, -ASV_rank) %>%
## remove any samples with 0 reads total
## NOTE(review): hard-coded sample name -- update (or comment out) per project
dplyr::select(-TDL_FS_13_12S) %>%
## microDecon expects counts, not NA: replace NAs with zeros
dplyr::mutate(across(where(is.numeric), ~ tidyr::replace_na(.x, 0))) %>%
relocate(matches("BK|EC"), .after = ASV_ID)
## microDecon wants a plain data.frame, not a tibble
df_for_microDecon <- as.data.frame(df_for_microDecon)
## count of field samples -- must match numb.ind in decon() below
sum(grepl("_FS", colnames(df_for_microDecon)))
## [1] 47
sum(grepl("BK|EC", colnames(df_for_microDecon)))
## [1] 1
## numb.blanks = the number of blank columns present in the dataset
## numb.ind = the number of field samples (must match the counts printed above)
decontaminated <- decon(data = df_for_microDecon,
## number of blank columns [integer]
numb.blanks=1,
## number of individuals per group [integer]; 47 field samples here
numb.ind=c(47),
## Indicate if taxa columns are present (T/F)
## no species information here
taxa=F)
Exporting
### Save the reads microDecon removed, for the audit trail
removed <- decontaminated$reads.removed
save(removed, file = reads_decontaminated_out)
decontamin_df <- decontaminated$decon.table
## if multiple blanks, a new column called Mean.blank is created; if not, the column name remains the same
## Re-attach the taxonomy/rank columns to the decontaminated counts and
## drop the control (_EC) columns.
ASV_table_taxID_decontaminated <- decontamin_df %>% full_join(
ASV_table_taxID_filtered %>% dplyr::select(ASV_ID, Species_name, Common_name, Category, ASV_sum, ASV_rank), .,
by = "ASV_ID") %>%
dplyr::select(-matches("_EC")) %>%
## drop ASVs removed entirely by microDecon -- these appear as NA read
## counts across every sample after the full_join, so na.omit() drops them
na.omit()
At this point:
- ASV_table_taxID_filtered = ASV-level pre-decontaminated matrix
- ASV_table_taxID_decontaminated = ASV-level post-decontaminated
matrix
Generate species list
## Species-level summary: total post-decontamination reads per species
## (and category), highest first, plus each species' percent share of
## all filtered reads.
species_list <- ASV_table_taxID_decontaminated %>%
  dplyr::select(Species_name, Category, ASV_sum) %>%
  dplyr::group_by(Species_name, Category) %>%
  reframe(Filtered_reads = sum(ASV_sum)) %>%
  arrange(desc(Filtered_reads)) %>%
  mutate(Percent = round(Filtered_reads / sum(Filtered_reads) * 100, 4))
## write the species breakdown report
write_xlsx(species_list, Species_breakdown_sheet)
nrow(species_list) == length(unique(species_list$Species_name)) ## should be TRUE
## [1] TRUE
length(unique(species_list$Species_name))
## [1] 107
Collapsing read counts by species name
No user edits.
## Collapse ASV-level counts to species-level counts: sum every sample
## column within each Species_name/Common_name/Category group.
ASV_table_taxID_collapsed <- ASV_table_taxID_decontaminated %>%
# removing original ASV_ID to collapse
dplyr::select(-ASV_ID, -ASV_sum, -ASV_rank) %>%
## group by species identity (grouping columns are excluded from across() below)
dplyr::group_by(Species_name, Common_name, Category) %>%
## sum down each sample column within each species to collapse
dplyr::summarise(across(.cols = (1:last_col()),
.fns = ~ sum(., na.rm = TRUE)),
.groups = 'drop')
# Print the number of sample columns remaining in the pre-collapse table
ncol(ASV_table_taxID_filtered %>% dplyr::select(-Species_name, -Common_name, -Category, -ASV_ID, -ASV_sum, -ASV_rank))
## [1] 49
ncol(ASV_table_taxID_collapsed %>% dplyr::select(-Species_name, -Common_name, -Category))
## [1] 47
length(unique(ASV_table_taxID_collapsed$Species_name))
## [1] 107
Check if any rows or columns == 0
### Rows: species whose counts sum to zero across all samples (should be none)
ASV_table_taxID_collapsed %>%
filter(rowSums(across(where(is.numeric))) == 0)
## # A tibble: 0 × 50
## # ℹ 50 variables: Species_name <chr>, Common_name <chr>, Category <chr>,
## # TDL_FS_10_12S <dbl>, TDL_FS_11_12S <dbl>, TDL_FS_12_12S <dbl>,
## # TDL_FS_14_12S <dbl>, TDL_FS_15_12S <dbl>, TDL_FS_16_12S <dbl>,
## # TDL_FS_17_12S <dbl>, TDL_FS_18_12S <dbl>, TDL_FS_19_12S <dbl>,
## # TDL_FS_1_12S <dbl>, TDL_FS_20_12S <dbl>, TDL_FS_21_12S <dbl>,
## # TDL_FS_22_12S <dbl>, TDL_FS_23_12S <dbl>, TDL_FS_24_12S <dbl>,
## # TDL_FS_25_12S <dbl>, TDL_FS_26_12S <dbl>, TDL_FS_27_12S <dbl>, …
### Columns: samples whose counts sum to zero across all species (should be none)
ASV_table_taxID_collapsed %>%
select(where(~ is.numeric(.x) && sum(.x, na.rm = TRUE) == 0))
## # A tibble: 107 × 0
Create reports
Species and ASV breakdown reports
### ASV-level breakdown report (annotation columns only);
### the total number of ASVs is from unfiltered ASVs
ASV_table_taxID_filtered %>% dplyr::select(ASV_ID:ASV_rank) %>%
write_xlsx(ASV_breakdown_sheet)
## number of unique taxonomic assignments; the "- 1" excludes the
## "Unassigned" category (for the report)
length(unique(ASV_table_taxID_collapsed$Species_name)) - 1
## [1] 106
Results tables
# Rename sample columns from GMGI sample IDs to project sample IDs.
# Named vector: names = old (GMGI) IDs, values = new (project) IDs.
# Renamed the local from `map` so it no longer shadows purrr::map(),
# and the repeated `%in%` subexpression is computed once.
sample_id_map <- setNames(meta$`Project Sample ID`, meta$`GMGI Sample ID`)
# replace only the headers that appear in the metadata
matched <- names(ASV_table_taxID_collapsed) %in% names(sample_id_map)
names(ASV_table_taxID_collapsed)[matched] <- sample_id_map[names(ASV_table_taxID_collapsed)[matched]]
# exporting that matrix
ASV_table_taxID_collapsed %>% write_xlsx(results_matrix_df)
### Relative abundance: scale each sample (column) to sum to 1.
### Sample columns start at 4 (after Species_name, Common_name, Category).
### Fix: na.rm = TRUE so a single NA read count cannot turn an entire
### sample's column into NA; the redundant .names = "{.col}" (the
### default, overwrite-in-place) is dropped.
df_relab <- ASV_table_taxID_collapsed %>%
  mutate(across(4:last_col(), ~ .x / sum(.x, na.rm = TRUE)))
df_relab %>% write_xlsx(results_relative_df)