Datatable preparation base script for eDNA metabarcoding
.Rmd script
This script takes your Blast output from the GMGI database, Mitofish database, and NCBI database to create one datatable with read counts and taxonomic assignment.
Workflow summary:
1. Load libraries
2. Load metadata
3. Load BLAST output from GMGI, Mitofish, and NCBI
4. Load DADA2 ASV Table
5. Taxonomic Assignment
- 5a. Identify ASVs with multiple hits from GMGI’s database
- 5b. Identify entries that mismatch between GMGI, Mitofish, and NCBI
databases
- 5c. Assign taxonomy based on hierarchical approach
- 5d. Edit taxonomy annotations based on mismatch table choices
- 5e. Adjusting common name and category for those entries that don’t
have one (from Mito or NCBI)
6. Filtering: Filter ASV by less than 0.1% reads and then collapse by
group
7. Collapsing read counts by species name
8. Creating results output
Load libraries
library(ggplot2) ## for plotting
library(dplyr) ## for data table manipulation
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr) ## for data table manipulation
library(readr) ## for reading in tsv files
library(readxl) ## for reading in excel files
library(stringr) ## for data transformation
library(strex) ## for data transformation
library(writexl) ## for excel output
library(purrr) ## for data transformation
library(funrar) ## for make_relative()
library(tidyverse) ## for data table manipulation
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Metadata input
Identify paths for metadata and project data
Each user needs to write in their specific directory outputs prior to the file name. The default working directory is this document so the folder where this script is saved for the user. To change the workign directory to the Rproject directory, select ‘Knit’ and ‘Knit Directory’ > ‘Project Directory’.
### User edits:
### 1. change paths of input and output as desired
## GMGI Fish database
path_GMGIdb = "C:/BoxDrive/Box/Science/Fisheries/Projects/eDNA/Metabarcoding Lab Resources/Reference Databases/GMGI_Vert_Ref.xlsx"
path_fishbase_tax = "C:/BoxDrive/Box/Science/Fisheries/Projects/eDNA/Metabarcoding Lab Resources/Reference Databases/taxonomic_classification_fishbase.csv"
path_mitofish_tax = "C:/BoxDrive/Box/Science/Fisheries/Projects/eDNA/Metabarcoding Lab Resources/Reference Databases/taxonomic_classification_mitofish.csv"
## BLAST results
path_blast_gmgi = "example_input/BLASTResults_GMGI.txt"
path_blast_mito = "example_input/BLASTResults_Mito.txt"
path_blast_ncbi_taxassigned = "example_input/NCBI_taxassigned.txt"
path_blast_ncbi = "example_input/BLASTResults_NCBI.txt"
## ASV table results
## confirm that the ASV_table.len.tsv name is correct for user's project
path_asv_table = "example_input/ASV_table.len.tsv"
path_output_summary = "example_input/overall_summary.tsv"
# output paths
path_choice_required = "example_output/Taxonomic_assignments/Choice_required_GMGI_multiplehits.xlsx"
path_choice_required_edited="example_output/Taxonomic_assignments/Choice_required_GMGI_multiplehits_edited.xlsx"
path_disagree_list = "example_output/Taxonomic_assignments/SampleReport_taxonomic_ID.xlsx"
path_disagree_list_edited="example_output/Taxonomic_assignments/SampleReport_taxonomic_ID_edited.xlsx"
path_commonnames_add="example_output/Taxonomic_assignments/CommonNames_required.xlsx"
path_commonnames_add_edited="example_output/Taxonomic_assignments/CommonNames_required_edited.xlsx"
results_rawreads_matrix = "example_output/Results_1_rawreads_matrix.xlsx"
results_rawreads_long = "example_output/Results_rawreads_long_format.xlsx"
results_relab_matrix = "example_output/Results_2_relative_abundance_matrix.xlsx"
results_relab_long = "example_output/Results_relative_abundance_long_format.xlsx"
reads_filtered_out="example_output/ASV_reads_filtered_out.xlsx"
ASV_breakdown_sheet="example_output/ASV_breakdown.xlsx"
Load project metadata
Metadata specific to each project. This contains information about each sample (e.g., month, site, time, sample type, etc.). Confirm that sample IDs match those used in the ASV_table.len.tsv file.
### User edits:
### 1. change path of metadata file
## EXCEL
meta <- read_excel("example_input/metadata.xlsx")
## CSV
# meta <- read.csv("example_input/metadata.csv", header = TRUE)
Load database metadata
No user edits in this section because paths have already been set above.
# Load GMGI database information (common name, species name, etc.)
gmgi_db <- read_xlsx(path_GMGIdb, sheet = 1) %>% dplyr::rename(sseqid = Ref) %>%
## removing > from beginning of entires within Ref column
mutate(sseqid = gsub(">", "", sseqid))
BLAST data input
No user edits unless user changed blastn parameters from fisheries team default.
## Setting column header names and classes
blast_col_headers = c("ASV_ID", "sseqid", "pident", "length", "mismatch", "gapopen",
"qstart", "qend", "sstart", "send", "evalue", "bitscore")
blast_col_classes = c(rep("character", 2), rep("numeric", 10))
GMGI database
No user edits.
Blast_GMGI <- read.table(path_blast_gmgi, header=F, col.names = blast_col_headers, colClasses = blast_col_classes) %>%
## blast changes spaces to hyphons so we need to change that back to match our metadata
mutate(sseqid = gsub("-", " ", sseqid)) %>%
## join with GMGI database information
left_join(., gmgi_db, by = "sseqid")
## Check how many ASVs were identified with the GMGI Database
length(unique(Blast_GMGI$ASV_ID))
## [1] 986
Mitofish database
No user edits.
Blast_Mito <- read.table(path_blast_mito, header=F, col.names = blast_col_headers, colClasses = blast_col_classes) %>%
# renaming sseqid to species name
dplyr::rename(Species_name = sseqid) %>%
# replacing _ with spaces
mutate(Species_name = gsub("_", " ", Species_name),
## removing gb || sequence from species name
Species_name = str_after_nth(Species_name, "\\|", 2))
NCBI database
No user edits.
NCBI_taxassigned <- read.delim2(path_blast_ncbi_taxassigned, header=F, col.names = c("staxid", "Phylo")) %>%
## creating taxonomic assignment columns
separate(Phylo, c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species_name"), sep = ";") %>%
## creating species column based on Species_name
mutate(., species = str_after_nth(Species_name, " ", 1))
Blast_NCBI <- read.table(path_blast_ncbi, header=F,
col.names = c("ASV_ID", "sseqid", "sscinames", "staxid", "pident", "length", "mismatch",
"gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"),
colClasses = c(rep("character", 3), "integer", rep("numeric", 9))) %>%
left_join(., NCBI_taxassigned, by = "staxid")
Load DADA2 ASV Table
The column headers will be the Sample IDs and the first column is the ASV ID. ASVs are given a “rank” based on sum of reads from that ASV (pre-filtering). ‘Random’ indicates that if ASVs are tied, then the code will randomly assign a rank for those tied. Because we don’t need an exact rank here, ‘random’ will do for a tie-breaker.
No user edits.
ASV_table <- read_tsv(path_asv_table, show_col_types = FALSE) %>%
## calculate the sum of all reads for each ASV
mutate(., ASV_sum = rowSums(across(where(is.numeric)))) %>%
## calculate a ranking based on those sum calculated above
mutate(ASV_rank = rank(-ASV_sum, ties.method='random')) %>%
## move the sum and rank columns to after ASV_ID and arrange by rank
relocate(c(ASV_sum,ASV_rank), .after = ASV_ID) %>% arrange((ASV_rank))
## creating list of rankings
ASV_rank_list <- ASV_table %>% dplyr::select(ASV_ID, ASV_sum, ASV_rank)
Taxonomic Assignment
Identifying where NCBI, Mito, and GMGI disagree on tax assignment. With the hierarchial approach, ASVs that match to GMGI and several other databases will only result in GMGI assignment. By reviewing this df, we can be sure we aren’t missing an assignment in our GMGI curated database.
Sub-workflow:
1. Identify any ASVs that contain multiple hits within the GMGI
database.
2. Identify entries that mismatch between GMGI, Mitofish, and NCBI
databases.
3. Assign taxonomy based on hierarchical approach.
4. Edit taxonomy annotations based on mismatch table.
5. Adjusting common name for those entries that don’t have one (from
Mito or GMGI).
Identify any ASVs that contain multiple hits within the GMGI database
At this point, a fisheries team member needs to make choices about which taxonomic assignment to accept.
Create list of those ASVs with multiple hits
No user edits.
multiple_hit_choice <- Blast_GMGI %>% group_by(ASV_ID) %>%
## take top percent identity hit, count the number of top hits, and filter to those with more than 1 top hit
slice_max(pident, n=1) %>% count() %>% filter(n>1) %>%
## adding BLAST_GMGI information with these ASVs and ASV rank and sum
left_join(., Blast_GMGI, by = "ASV_ID") %>%
left_join(., ASV_rank_list, by = "ASV_ID") %>%
## moving database percent ID to be next to Blast percent ID
relocate(c(db_percent_ID, ASV_sum, ASV_rank), .after = pident) %>%
## adding choice column for next steps
mutate(Choice = NA)
## export this data frame as excel sheet
multiple_hit_choice %>% write_xlsx(path_choice_required)
Based on the output above, user needs to make some choices. In the excel spreadsheet, user needs to mark ‘x’ on the choices desired while leaving the other entries blank.
Choosing one of several hits.
Load choice edited dataset. No user edits.
multiple_hit_choice_edited <- read_xlsx(path_choice_required_edited) %>%
## selecting the choices made
filter(!is.na(Choice)) %>%
## selecting only columns needed
dplyr::select(ASV_ID, sseqid, Choice)
A for loop will filter Blast_GMGI df based on these choices. No user edits.
# Create a new edited df
Blast_GMGI_edited <- Blast_GMGI
# Loop through each row of the dataframe
for (i in multiple_hit_choice_edited$ASV_ID) {
# Extract the current row (will do this for each ASV_ID in the choice df)
current_row <- multiple_hit_choice_edited %>% subset(ASV_ID==i)
# Apply filter based on the current row's condition
Blast_GMGI_edited <- Blast_GMGI_edited %>%
filter(case_when(ASV_ID == current_row$ASV_ID ~ sseqid == current_row$sseqid,
TRUE ~ TRUE))
}
Confirming that all entries have been dealth with
No user edits.
### Check the below output to confirm the filtering steps above worked (if it worked, it won't be in output)
Blast_GMGI_edited %>% group_by(ASV_ID) %>% slice_max(pident, n=1) %>% count() %>% filter(n>1)
## # A tibble: 0 × 2
## # Groups: ASV_ID [0]
## # ℹ 2 variables: ASV_ID <chr>, n <int>
Identify entries that mismatch between GMGI, Mitofish, and NCBI databases
Creating a df called “Disagree”. Review the output before moving onto the next section.
No user edits.
## Create GMGI species input
disagree_input_GMGI <- Blast_GMGI_edited %>%
dplyr::select(ASV_ID, GMGI_pident = pident, GMGI_db_ID = db_percent_ID, GMGI_Species = Species_name) %>%
group_by(ASV_ID) %>% slice_max(GMGI_pident, n = 1, with_ties = FALSE) %>% ungroup()
## Create Mitofish input
disagree_input_Mito <- Blast_Mito %>%
dplyr::select(ASV_ID, Mito_Species = Species_name) %>% distinct() %>%
group_by(ASV_ID) %>% mutate(Mito_Species = paste0(Mito_Species, collapse = ";")) %>% distinct() %>% ungroup()
## Create NCBI input
disagree_input_NCBI <- Blast_NCBI %>%
dplyr::select(ASV_ID, NCBI_Species = Species_name) %>% distinct() %>%
group_by(ASV_ID) %>% mutate(NCBI_Species = paste0(NCBI_Species, collapse = ";")) %>% distinct() %>% ungroup()
## Combine all three dfs into disagree_df with ASV rank information
disagree_df <- disagree_input_GMGI %>%
full_join(disagree_input_Mito, by = "ASV_ID") %>%
full_join(disagree_input_NCBI, by = "ASV_ID") %>%
full_join(., ASV_rank_list, by = "ASV_ID") %>%
mutate(Choice = NA)
## Filtering the disagree_df to only output the entries that disagree with multiple entries
filtered_disagree_df <- disagree_df %>%
## grouping by ASV (row)
rowwise() %>%
## Filtering out rows that are unassigned across all 3 db
filter(! (is.na(GMGI_Species) && is.na(Mito_Species) && is.na(NCBI_Species) )) %>%
## Filter out rows that are GMGI assigned and empty from Mito and NCBI
filter(! (!is.na(GMGI_Species) && is.na(Mito_Species) && is.na(NCBI_Species) )) %>%
## Filtering out rows that have the same information across databases (ONLY in rows with a GMGI entry)
filter(ifelse(!is.na(GMGI_Species),
n_distinct(c_across(GMGI_Species:NCBI_Species), na.rm = TRUE) > 1, TRUE))
## export this data frame as excel sheet
filtered_disagree_df %>% write_xlsx(path_disagree_list)
Assign taxonomy based on hierarchical approach
Taxonomic identification is taken from GMGI 100%, then GMGI \<100%, then Mitofish 100%, and finally NCBI 100%.
No user edits.
ASV_table_taxID <- ASV_table %>%
## 1. Top hit from GMGI's database
left_join(Blast_GMGI_edited %>% group_by(ASV_ID) %>%
slice_max(pident, n = 1) %>%
dplyr::select(ASV_ID, Species_name),
by = join_by(ASV_ID)) %>%
## 2. Mitofish database
### join df, select ASV_ID and Species_name columns, rename Species_name to Mito, call only distinct rows
left_join(., Blast_Mito %>% dplyr::select(ASV_ID, Mito = Species_name) %>% distinct() %>%
### group by ASV_ID, and collapse all species names separated by ;, then take only distinct rows
group_by(ASV_ID) %>% mutate(Mito = paste0(Mito, collapse = ";")) %>% distinct(), by = "ASV_ID") %>%
### if GMGI annotation is NA, then replace with Mitofish
mutate(., Species_name = ifelse(is.na(Species_name), Mito, Species_name)) %>%
## 3. NCBI database; same functions as above
left_join(., Blast_NCBI %>% dplyr::select(ASV_ID, NCBI = Species_name) %>% distinct() %>%
group_by(ASV_ID) %>% mutate(NCBI = paste0(NCBI, collapse = ";")) %>% distinct(), by = "ASV_ID") %>%
mutate(., Species_name = ifelse(is.na(Species_name), NCBI, Species_name)) %>%
## 4. if Species name is STILL not filled, call it "Unassigned"
mutate(., Species_name = ifelse(is.na(Species_name), "Unassigned", Species_name)) %>%
## removing Mito spp and NCBI spp
dplyr::select(-Mito, -NCBI) %>%
## move species name to be after ASV_ID
relocate(., c(Species_name), .after = ASV_ID)
Edit taxonomy annotations based on mismatch table
Override any annotations with edited taxonomic identification table.
No user edits.
## read in edited df
taxonomic_choice <- read_xlsx(path_disagree_list_edited) %>%
## selecting only columns needed
dplyr::select(ASV_ID, Choice)
# Create a new edited df
ASV_table_taxID_edited <- ASV_table_taxID
# Loop through each row of the dataframe
for (i in taxonomic_choice$ASV_ID) {
# Extract the current row (will do this for each ASV_ID in the choice df)
current_row <- taxonomic_choice %>% subset(ASV_ID==i)
# Apply filter based on the current row's condition
ASV_table_taxID_edited <- ASV_table_taxID_edited %>%
mutate(Species_name = case_when(
ASV_ID == current_row$ASV_ID ~ current_row$Choice,
TRUE ~ Species_name))
}
Confirm all entries are dealt with
No user edits.
## Output will be blank
ASV_table_taxID_edited %>% dplyr::select(Species_name) %>% distinct() %>%
filter(., grepl(";", Species_name)) %>% arrange(Species_name)
## # A tibble: 0 × 1
## # ℹ 1 variable: Species_name <chr>
Adjusting common name for those entries that don’t have one (from Mito or NCBI)
No user edits.
### add common name column to df
ASV_table_taxID_edited <- ASV_table_taxID_edited %>%
left_join(., gmgi_db %>% dplyr::select(Species_name, Common_name, Category) %>% distinct(), by = "Species_name") %>%
relocate(., c(Common_name, Category), .after = Species_name)
### print entries with no common name
ASV_table_taxID_edited %>% dplyr::select(Species_name, Common_name) %>%
filter(is.na(Common_name)) %>% distinct() %>%
mutate(Category = NA, Kingdom = NA, Phylum = NA, Class = NA, Order = NA, Family = NA, Genus = NA, species = NA) %>%
mutate(
across(everything(), ~case_when(
Species_name == "Unassigned" ~ "Unassigned",
TRUE ~ .x
))) %>%
write_xlsx(path_commonnames_add)
Editing common names and category when needed.
## read in edited df
commonNames_annotated <- read_xlsx(path_commonnames_add_edited)
# Create a new edited df
ASV_table_taxID_annotated <- ASV_table_taxID_edited
# Loop through each row of the dataframe
for (i in commonNames_annotated$Species_name) {
# Extract the current row (will do this for each ASV_ID in the choice df)
current_row <- commonNames_annotated %>% subset(Species_name==i)
# Apply filter based on the current row's condition
ASV_table_taxID_annotated <- ASV_table_taxID_annotated %>%
mutate(Common_name = case_when(
Species_name == current_row$Species_name ~ current_row$Common_name,
TRUE ~ Common_name)) %>%
mutate(Category = case_when(
Species_name == current_row$Species_name ~ current_row$Category,
TRUE ~ Category))
}
## printing list of species name without common names
## after additions to mutate function above, this output should be zero
ASV_table_taxID_annotated %>% dplyr::select(Species_name, Common_name) %>% filter(is.na(Common_name)) %>% distinct()
## # A tibble: 0 × 2
## # ℹ 2 variables: Species_name <chr>, Common_name <chr>
Filtering: Filter ASV by less than 0.1% reads and then collapse by group
Filter out reads that are less than 0.1% of ASV (row) total per sample.
Create an output of what you’re losing with filtering.
No user edits.
ASV_table_taxID_filtered <- ASV_table_taxID_annotated %>%
## telling the df we are doing the following function by rows (ASVs)
rowwise() %>%
## filtering out any values that are less than 0.001 of the total ASV read # in each sample
mutate(across(.cols = (7:ncol(.)),
.fns = ~ ifelse((.x/ASV_sum)<0.001, NA, .x))) %>% ungroup()
## output of what we're losing
ASV_table_taxID_edited %>% rowwise() %>%
mutate(across(.cols = (7:ncol(.)),
.fns = ~ ifelse((.x/ASV_sum)>0.001, NA, .x))) %>% ungroup() %>% write_xlsx(reads_filtered_out)
## Export ASV break-down for 03-data_quality.Rmds
ASV_table_taxID_filtered %>% dplyr::select(ASV_ID, Species_name, Common_name, Category, ASV_sum, ASV_rank) %>%
write_xlsx(ASV_breakdown_sheet)
## Confirm number of samples is as expected (outputs should be the same)
ncol(ASV_table_taxID_annotated %>% dplyr::select(-ASV_ID, -Species_name, -Common_name, -Category, -ASV_sum, -ASV_rank))
## [1] 339
ncol(ASV_table_taxID_filtered %>% dplyr::select(-ASV_ID, -Species_name, -Common_name, -Category, -ASV_sum, -ASV_rank))
## [1] 339
Collapsing read counts by species name
No user edits.
ASV_table_taxID_collapsed <- ASV_table_taxID_filtered %>%
# removing original ASV_ID to collapse
dplyr::select(-ASV_ID, -ASV_sum, -ASV_rank) %>%
## group by Species_name and sample
dplyr::group_by(Species_name, Common_name, Category) %>%
## sum down column by species name and sample to collapse
dplyr::summarise(across(.cols = (1:last_col()),
.fns = ~ sum(., na.rm = TRUE)),
.groups = 'drop')
# Print number of samples
ncol(ASV_table_taxID_collapsed %>% dplyr::select(-Species_name, -Common_name, -Category))
## [1] 339
Creating results output
Raw reads results output.
No user edits.
### Number of total assignments -1 for unassigned
length(unique(ASV_table_taxID_collapsed$Species_name)) - 1
## [1] 106
## Raw reads matrix (wide format)
ASV_table_taxID_collapsed %>% write_xlsx(results_rawreads_matrix)
## Raw reads long format and filtering out entries with zero reads
ASV_table_taxID_collapsed %>%
gather("sampleID", "reads", c(4:last_col())) %>%
filter(reads > 0) %>%
left_join(., meta, by = "sampleID") %>%
write_xlsx(results_rawreads_long)
## exporting species summary
ASV_table_taxID_collapsed %>%
gather("sampleID", "reads", c(4:last_col())) %>%
group_by(Species_name, Common_name, Category) %>%
dplyr::select(-sampleID) %>%
summarise(sum_reads = sum(reads)) %>%
write_xlsx("example_output/Species_breakdown.xlsx")
## `summarise()` has grouped output by 'Species_name', 'Common_name'. You can
## override using the `.groups` argument.
Relative Abundance No user edits.
### Calculating relative abundance
df_relab <- ASV_table_taxID_collapsed %>%
gather("sampleID", "reads", 4:last_col()) %>%
group_by(sampleID) %>%
### total
mutate(sample_total = sum(reads)) %>%
group_by(sampleID, Species_name) %>%
## relab
mutate(relab = reads/sample_total) %>% ungroup() %>%
select(-reads, -sample_total)
df_relab %>%
left_join(., meta, by = "sampleID") %>%
write_xlsx(results_relab_long)
df_relab %>% spread(sampleID, relab) %>% write_xlsx(results_relab_matrix)