Datatable preparation base script for eDNA metabarcoding

.Rmd script

This script takes your Blast output from the GMGI database, Mitofish database, and NCBI database to create one datatable with read counts and taxonomic assignment.

Workflow summary:
1. Load libraries
2. Load metadata
3. Load BLAST output from GMGI, Mitofish, and NCBI
4. Load DADA2 ASV Table
5. Taxonomic Assignment
- 5a. Identify ASVs with multiple hits from GMGI’s database
- 5b. Identify entries that mismatch between GMGI, Mitofish, and NCBI databases
- 5c. Assign taxonomy based on hierarchical approach
- 5d. Edit taxonomy annotations based on mismatch table choices
- 5e. Adjusting common name and category for those entries that don’t have one (from Mito or NCBI)
6. Filtering: Filter ASV by less than 0.1% reads and then collapse by group
7. Collapsing read counts by species name
8. Creating results output

Load libraries

library(ggplot2) ## for plotting
library(dplyr) ## for data table manipulation

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr) ## for data table manipulation
library(readr) ## for reading in tsv files
library(readxl) ## for reading in excel files
library(stringr) ## for data transformation
library(strex) ## for data transformation
library(writexl) ## for excel output
library(purrr) ## for data transformation
library(funrar) ## for make_relative()
library(tidyverse) ## for data table manipulation

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Metadata input

Identify paths for metadata and project data

Each user needs to write in their specific directory outputs prior to the file name. The default working directory is this document so the folder where this script is saved for the user. To change the workign directory to the Rproject directory, select ‘Knit’ and ‘Knit Directory’ > ‘Project Directory’.

### User edits:
### 1. change paths of input and output as desired 

## GMGI Fish database
path_GMGIdb = "C:/BoxDrive/Box/Science/Fisheries/Projects/eDNA/Metabarcoding Lab Resources/Reference Databases/GMGI_Vert_Ref.xlsx"
path_fishbase_tax = "C:/BoxDrive/Box/Science/Fisheries/Projects/eDNA/Metabarcoding Lab Resources/Reference Databases/taxonomic_classification_fishbase.csv"
path_mitofish_tax = "C:/BoxDrive/Box/Science/Fisheries/Projects/eDNA/Metabarcoding Lab Resources/Reference Databases/taxonomic_classification_mitofish.csv"

## BLAST results
path_blast_gmgi = "example_input/BLASTResults_GMGI.txt"
path_blast_mito = "example_input/BLASTResults_Mito.txt"
path_blast_ncbi_taxassigned = "example_input/NCBI_taxassigned.txt"
path_blast_ncbi = "example_input/BLASTResults_NCBI.txt"

## ASV table results 
## confirm that the ASV_table.len.tsv name is correct for user's project
path_asv_table = "example_input/ASV_table.len.tsv"
path_output_summary = "example_input/overall_summary.tsv"

# output paths 
path_choice_required = "example_output/Taxonomic_assignments/Choice_required_GMGI_multiplehits.xlsx"
path_choice_required_edited="example_output/Taxonomic_assignments/Choice_required_GMGI_multiplehits_edited.xlsx"

path_disagree_list = "example_output/Taxonomic_assignments/SampleReport_taxonomic_ID.xlsx"
path_disagree_list_edited="example_output/Taxonomic_assignments/SampleReport_taxonomic_ID_edited.xlsx"

path_commonnames_add="example_output/Taxonomic_assignments/CommonNames_required.xlsx"
path_commonnames_add_edited="example_output/Taxonomic_assignments/CommonNames_required_edited.xlsx"

results_rawreads_matrix = "example_output/Results_1_rawreads_matrix.xlsx"
results_rawreads_long = "example_output/Results_rawreads_long_format.xlsx"
results_relab_matrix = "example_output/Results_2_relative_abundance_matrix.xlsx"
results_relab_long = "example_output/Results_relative_abundance_long_format.xlsx"

reads_filtered_out="example_output/ASV_reads_filtered_out.xlsx"
ASV_breakdown_sheet="example_output/ASV_breakdown.xlsx"

Load project metadata

Metadata specific to each project. This contains information about each sample (e.g., month, site, time, sample type, etc.). Confirm that sample IDs match those used in the ASV_table.len.tsv file.

### User edits:
### 1. change path of metadata file

## EXCEL
meta <- read_excel("example_input/metadata.xlsx")
## CSV 
# meta <- read.csv("example_input/metadata.csv", header = TRUE)

Load database metadata

No user edits in this section because paths have already been set above.

# Load GMGI database information (common name, species name, etc.)
gmgi_db <- read_xlsx(path_GMGIdb, sheet = 1) %>% dplyr::rename(sseqid = Ref) %>%
  ## removing > from beginning of entires within Ref column
  mutate(sseqid = gsub(">", "", sseqid))

BLAST data input

No user edits unless user changed blastn parameters from fisheries team default.

## Setting column header names and classes
blast_col_headers = c("ASV_ID", "sseqid", "pident", "length", "mismatch", "gapopen",
                                        "qstart", "qend", "sstart", "send", "evalue", "bitscore")
blast_col_classes = c(rep("character", 2), rep("numeric", 10))

GMGI database

No user edits.

Blast_GMGI <- read.table(path_blast_gmgi, header=F, col.names = blast_col_headers, colClasses = blast_col_classes) %>%
  ## blast changes spaces to hyphons so we need to change that back to match our metadata
  mutate(sseqid = gsub("-", " ", sseqid)) %>%
  ## join with GMGI database information
  left_join(., gmgi_db, by = "sseqid")

## Check how many ASVs were identified with the GMGI Database
length(unique(Blast_GMGI$ASV_ID))

## [1] 986

Mitofish database

No user edits.

Blast_Mito <- read.table(path_blast_mito, header=F, col.names = blast_col_headers, colClasses = blast_col_classes) %>%
  # renaming sseqid to species name
  dplyr::rename(Species_name = sseqid) %>%

  # replacing _ with spaces
  mutate(Species_name = gsub("_", " ", Species_name),

         ## removing gb || sequence from species name 
         Species_name = str_after_nth(Species_name, "\\|", 2))

NCBI database

No user edits.

NCBI_taxassigned <- read.delim2(path_blast_ncbi_taxassigned, header=F, col.names = c("staxid", "Phylo")) %>%
  ## creating taxonomic assignment columns
  separate(Phylo, c("Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species_name"), sep = ";") %>%
  ## creating species column based on Species_name
  mutate(., species = str_after_nth(Species_name, " ", 1))

Blast_NCBI <- read.table(path_blast_ncbi, header=F,
                           col.names = c("ASV_ID", "sseqid", "sscinames", "staxid", "pident", "length", "mismatch",
                                         "gapopen", "qstart", "qend", "sstart", "send", "evalue", "bitscore"),
                           colClasses = c(rep("character", 3), "integer", rep("numeric", 9))) %>%
  left_join(., NCBI_taxassigned, by = "staxid")

Load DADA2 ASV Table

The column headers will be the Sample IDs and the first column is the ASV ID. ASVs are given a “rank” based on sum of reads from that ASV (pre-filtering). ‘Random’ indicates that if ASVs are tied, then the code will randomly assign a rank for those tied. Because we don’t need an exact rank here, ‘random’ will do for a tie-breaker.

No user edits.

ASV_table <- read_tsv(path_asv_table, show_col_types = FALSE) %>%
  ## calculate the sum of all reads for each ASV
  mutate(., ASV_sum = rowSums(across(where(is.numeric)))) %>% 

  ## calculate a ranking based on those sum calculated above
  mutate(ASV_rank = rank(-ASV_sum, ties.method='random')) %>%

  ## move the sum and rank columns to after ASV_ID and arrange by rank
  relocate(c(ASV_sum,ASV_rank), .after = ASV_ID) %>% arrange((ASV_rank))

## creating list of rankings
ASV_rank_list <- ASV_table %>% dplyr::select(ASV_ID, ASV_sum, ASV_rank)

Taxonomic Assignment

Identifying where NCBI, Mito, and GMGI disagree on tax assignment. With the hierarchial approach, ASVs that match to GMGI and several other databases will only result in GMGI assignment. By reviewing this df, we can be sure we aren’t missing an assignment in our GMGI curated database.

Sub-workflow:
1. Identify any ASVs that contain multiple hits within the GMGI database.
2. Identify entries that mismatch between GMGI, Mitofish, and NCBI databases.
3. Assign taxonomy based on hierarchical approach.
4. Edit taxonomy annotations based on mismatch table.
5. Adjusting common name for those entries that don’t have one (from Mito or GMGI).

Identify any ASVs that contain multiple hits within the GMGI database

At this point, a fisheries team member needs to make choices about which taxonomic assignment to accept.

Create list of those ASVs with multiple hits

No user edits.

multiple_hit_choice <- Blast_GMGI %>% group_by(ASV_ID) %>%
  ## take top percent identity hit, count the number of top hits, and filter to those with more than 1 top hit 
  slice_max(pident, n=1) %>% count() %>% filter(n>1) %>%

  ## adding BLAST_GMGI information with these ASVs and ASV rank and sum
  left_join(., Blast_GMGI, by = "ASV_ID") %>%
  left_join(., ASV_rank_list, by = "ASV_ID") %>%

  ## moving database percent ID to be next to Blast percent ID
  relocate(c(db_percent_ID, ASV_sum, ASV_rank), .after = pident) %>%

  ## adding choice column for next steps 
  mutate(Choice = NA)

## export this data frame as excel sheet 
multiple_hit_choice %>% write_xlsx(path_choice_required)

Based on the output above, user needs to make some choices. In the excel spreadsheet, user needs to mark ‘x’ on the choices desired while leaving the other entries blank.

Choosing one of several hits.

Load choice edited dataset. No user edits.

multiple_hit_choice_edited <- read_xlsx(path_choice_required_edited) %>%
  ## selecting the choices made
  filter(!is.na(Choice)) %>%
  ## selecting only columns needed 
  dplyr::select(ASV_ID, sseqid, Choice)

A for loop will filter Blast_GMGI df based on these choices. No user edits.

# Create a new edited df
Blast_GMGI_edited <- Blast_GMGI 

# Loop through each row of the dataframe
for (i in multiple_hit_choice_edited$ASV_ID) {
  # Extract the current row (will do this for each ASV_ID in the choice df)
  current_row <- multiple_hit_choice_edited %>% subset(ASV_ID==i)

  # Apply filter based on the current row's condition
  Blast_GMGI_edited <- Blast_GMGI_edited %>%
    filter(case_when(ASV_ID == current_row$ASV_ID ~ sseqid == current_row$sseqid,
           TRUE ~ TRUE))
}

Confirming that all entries have been dealth with

No user edits.

### Check the below output to confirm the filtering steps above worked (if it worked, it won't be in output)
Blast_GMGI_edited %>% group_by(ASV_ID) %>% slice_max(pident, n=1) %>% count() %>% filter(n>1)

## # A tibble: 0 × 2
## # Groups:   ASV_ID [0]
## # ℹ 2 variables: ASV_ID <chr>, n <int>

Identify entries that mismatch between GMGI, Mitofish, and NCBI databases

Creating a df called “Disagree”. Review the output before moving onto the next section.

No user edits.

## Create GMGI species input
disagree_input_GMGI <- Blast_GMGI_edited %>% 
  dplyr::select(ASV_ID, GMGI_pident = pident, GMGI_db_ID = db_percent_ID, GMGI_Species = Species_name) %>%
  group_by(ASV_ID) %>% slice_max(GMGI_pident, n = 1, with_ties = FALSE) %>% ungroup()

## Create Mitofish input
disagree_input_Mito <- Blast_Mito %>%
  dplyr::select(ASV_ID, Mito_Species = Species_name) %>% distinct() %>%
  group_by(ASV_ID) %>% mutate(Mito_Species = paste0(Mito_Species, collapse = ";")) %>% distinct() %>% ungroup()

## Create NCBI input
disagree_input_NCBI <- Blast_NCBI %>% 
  dplyr::select(ASV_ID, NCBI_Species = Species_name) %>% distinct() %>% 
  group_by(ASV_ID) %>% mutate(NCBI_Species = paste0(NCBI_Species, collapse = ";")) %>% distinct() %>% ungroup()

## Combine all three dfs into disagree_df with ASV rank information
disagree_df <- disagree_input_GMGI %>%
  full_join(disagree_input_Mito, by = "ASV_ID") %>%
  full_join(disagree_input_NCBI, by = "ASV_ID") %>%
  full_join(., ASV_rank_list, by = "ASV_ID") %>%
  mutate(Choice = NA)

## Filtering the disagree_df to only output the entries that disagree with multiple entries
filtered_disagree_df <- disagree_df %>%

  ## grouping by ASV (row)
  rowwise() %>%

  ## Filtering out rows that are unassigned across all 3 db 
  filter(! (is.na(GMGI_Species) && is.na(Mito_Species) && is.na(NCBI_Species) )) %>%

  ## Filter out rows that are GMGI assigned and empty from Mito and NCBI
  filter(! (!is.na(GMGI_Species) && is.na(Mito_Species) && is.na(NCBI_Species) )) %>%

  ## Filtering out rows that have the same information across databases (ONLY in rows with a GMGI entry)
  filter(ifelse(!is.na(GMGI_Species), 
          n_distinct(c_across(GMGI_Species:NCBI_Species), na.rm = TRUE) > 1, TRUE))

## export this data frame as excel sheet 
filtered_disagree_df %>% write_xlsx(path_disagree_list)

Assign taxonomy based on hierarchical approach

Taxonomic identification is taken from GMGI 100%, then GMGI \<100%, then Mitofish 100%, and finally NCBI 100%.

No user edits.

ASV_table_taxID <- ASV_table %>% 

  ## 1. Top hit from GMGI's database
  left_join(Blast_GMGI_edited %>%  group_by(ASV_ID) %>%
              slice_max(pident, n = 1) %>%
                            dplyr::select(ASV_ID, Species_name),
            by = join_by(ASV_ID)) %>%

  ## 2. Mitofish database
  ### join df, select ASV_ID and Species_name columns, rename Species_name to Mito, call only distinct rows
  left_join(., Blast_Mito %>% dplyr::select(ASV_ID, Mito = Species_name) %>% distinct() %>%

              ### group by ASV_ID, and collapse all species names separated by ;, then take only distinct rows
              group_by(ASV_ID) %>% mutate(Mito = paste0(Mito, collapse = ";")) %>% distinct(), by = "ASV_ID") %>%

  ### if GMGI annotation is NA, then replace with Mitofish 
  mutate(., Species_name = ifelse(is.na(Species_name), Mito, Species_name)) %>%

  ## 3. NCBI database; same functions as above
  left_join(., Blast_NCBI %>% dplyr::select(ASV_ID, NCBI = Species_name) %>% distinct() %>%
              group_by(ASV_ID) %>% mutate(NCBI = paste0(NCBI, collapse = ";")) %>% distinct(), by = "ASV_ID") %>%
  mutate(., Species_name = ifelse(is.na(Species_name), NCBI, Species_name)) %>%

  ## 4. if Species name is STILL not filled, call it "Unassigned"
  mutate(., Species_name = ifelse(is.na(Species_name), "Unassigned", Species_name)) %>%  

  ## removing Mito spp and NCBI spp
  dplyr::select(-Mito, -NCBI) %>%

  ## move species name to be after ASV_ID
  relocate(., c(Species_name), .after = ASV_ID)

Edit taxonomy annotations based on mismatch table

Override any annotations with edited taxonomic identification table.
No user edits.

## read in edited df 
taxonomic_choice <- read_xlsx(path_disagree_list_edited) %>%
    ## selecting only columns needed 
  dplyr::select(ASV_ID, Choice)  

# Create a new edited df
ASV_table_taxID_edited <- ASV_table_taxID 

# Loop through each row of the dataframe
for (i in taxonomic_choice$ASV_ID) {
  # Extract the current row (will do this for each ASV_ID in the choice df)
  current_row <- taxonomic_choice %>% subset(ASV_ID==i)

  # Apply filter based on the current row's condition
  ASV_table_taxID_edited <- ASV_table_taxID_edited %>%
    mutate(Species_name = case_when(
          ASV_ID == current_row$ASV_ID ~ current_row$Choice,
           TRUE ~ Species_name))
}

Confirm all entries are dealt with

No user edits.

## Output will be blank
ASV_table_taxID_edited %>% dplyr::select(Species_name) %>% distinct() %>% 
  filter(., grepl(";", Species_name)) %>% arrange(Species_name)

## # A tibble: 0 × 1
## # ℹ 1 variable: Species_name <chr>

Adjusting common name for those entries that don’t have one (from Mito or NCBI)

No user edits.

### add common name column to df
ASV_table_taxID_edited <- ASV_table_taxID_edited %>%
  left_join(., gmgi_db %>% dplyr::select(Species_name, Common_name, Category) %>% distinct(), by = "Species_name") %>%
  relocate(., c(Common_name, Category), .after = Species_name)

### print entries with no common name
ASV_table_taxID_edited %>% dplyr::select(Species_name, Common_name) %>% 
  filter(is.na(Common_name)) %>% distinct() %>%
  mutate(Category = NA, Kingdom = NA, Phylum = NA, Class = NA, Order = NA, Family = NA, Genus = NA, species = NA) %>%
  mutate(
    across(everything(), ~case_when(
      Species_name == "Unassigned" ~ "Unassigned",
      TRUE ~ .x
    ))) %>% 
  write_xlsx(path_commonnames_add)

Editing common names and category when needed.

## read in edited df 
commonNames_annotated <- read_xlsx(path_commonnames_add_edited)

# Create a new edited df
ASV_table_taxID_annotated <- ASV_table_taxID_edited 

# Loop through each row of the dataframe
for (i in commonNames_annotated$Species_name) {
  # Extract the current row (will do this for each ASV_ID in the choice df)
  current_row <- commonNames_annotated %>% subset(Species_name==i)

  # Apply filter based on the current row's condition
  ASV_table_taxID_annotated <- ASV_table_taxID_annotated %>%
    mutate(Common_name = case_when(
          Species_name == current_row$Species_name ~ current_row$Common_name,
           TRUE ~ Common_name)) %>%
    mutate(Category = case_when(
          Species_name == current_row$Species_name ~ current_row$Category,
           TRUE ~ Category))  
}

## printing list of species name without common names 
## after additions to mutate function above, this output should be zero 
ASV_table_taxID_annotated %>% dplyr::select(Species_name, Common_name) %>% filter(is.na(Common_name)) %>% distinct()

## # A tibble: 0 × 2
## # ℹ 2 variables: Species_name <chr>, Common_name <chr>

Filtering: Filter ASV by less than 0.1% reads and then collapse by group

Filter out reads that are less than 0.1% of ASV (row) total per sample.

Create an output of what you’re losing with filtering.

No user edits.

ASV_table_taxID_filtered <- ASV_table_taxID_annotated %>%
  ## telling the df we are doing the following function by rows (ASVs)
  rowwise() %>%

  ## filtering out any values that are less than 0.001 of the total ASV read # in each sample
  mutate(across(.cols = (7:ncol(.)),            
                .fns = ~ ifelse((.x/ASV_sum)<0.001, NA, .x))) %>% ungroup()

## output of what we're losing
ASV_table_taxID_edited %>% rowwise() %>%
  mutate(across(.cols = (7:ncol(.)),            
                .fns = ~ ifelse((.x/ASV_sum)>0.001, NA, .x))) %>% ungroup() %>% write_xlsx(reads_filtered_out)

## Export ASV break-down for 03-data_quality.Rmds
ASV_table_taxID_filtered %>% dplyr::select(ASV_ID, Species_name, Common_name, Category, ASV_sum, ASV_rank) %>%
  write_xlsx(ASV_breakdown_sheet)

## Confirm number of samples is as expected (outputs should be the same)
ncol(ASV_table_taxID_annotated %>% dplyr::select(-ASV_ID, -Species_name, -Common_name, -Category, -ASV_sum, -ASV_rank))

## [1] 339

ncol(ASV_table_taxID_filtered %>% dplyr::select(-ASV_ID, -Species_name, -Common_name, -Category, -ASV_sum, -ASV_rank))

## [1] 339

Collapsing read counts by species name

No user edits.

ASV_table_taxID_collapsed <- ASV_table_taxID_filtered %>% 
  # removing original ASV_ID to collapse
  dplyr::select(-ASV_ID, -ASV_sum, -ASV_rank) %>%  

  ## group by Species_name and sample
  dplyr::group_by(Species_name, Common_name, Category) %>%

  ## sum down column by species name and sample to collapse
  dplyr::summarise(across(.cols = (1:last_col()),            
                          .fns = ~ sum(., na.rm = TRUE)), 
                     .groups = 'drop')

# Print number of samples
ncol(ASV_table_taxID_collapsed %>% dplyr::select(-Species_name, -Common_name, -Category))

## [1] 339

Creating results output

Raw reads results output.
No user edits.

### Number of total assignments -1 for unassigned
length(unique(ASV_table_taxID_collapsed$Species_name)) - 1

## [1] 106

## Raw reads matrix (wide format)
ASV_table_taxID_collapsed %>% write_xlsx(results_rawreads_matrix)

## Raw reads long format and filtering out entries with zero reads
ASV_table_taxID_collapsed %>% 
  gather("sampleID", "reads", c(4:last_col())) %>%
  filter(reads > 0) %>% 
  left_join(., meta, by = "sampleID") %>%
  write_xlsx(results_rawreads_long)

## exporting species summary
ASV_table_taxID_collapsed %>% 
  gather("sampleID", "reads", c(4:last_col())) %>% 
  group_by(Species_name, Common_name, Category) %>%
  dplyr::select(-sampleID) %>%
  summarise(sum_reads = sum(reads)) %>%
  write_xlsx("example_output/Species_breakdown.xlsx")

## `summarise()` has grouped output by 'Species_name', 'Common_name'. You can
## override using the `.groups` argument.

Relative Abundance No user edits.

### Calculating relative abundance
df_relab <- ASV_table_taxID_collapsed %>% 
  gather("sampleID", "reads", 4:last_col()) %>%
  group_by(sampleID) %>%

  ### total
  mutate(sample_total = sum(reads)) %>%
  group_by(sampleID, Species_name) %>%

  ## relab
  mutate(relab = reads/sample_total) %>% ungroup() %>%
  select(-reads, -sample_total) 

df_relab %>%
  left_join(., meta, by = "sampleID") %>%
  write_xlsx(results_relab_long)

df_relab %>% spread(sampleID, relab) %>% write_xlsx(results_relab_matrix)