We first set up our R working environment by loading a few useful packages…

library(dada2)
library(ggplot2)
library(tidyverse)
library(DT)
library(ShortRead)
library(Biostrings)
library(biomformat)
library(purrr)

… and modify PATH to include bioinformatics tools.

Sys.setenv(PATH = paste("/usr/local/genome/Anaconda3/envs/seqkit-2.0.0/bin", "/usr/local/genome/Anaconda3/envs/frogs-4.0.1/bin", Sys.getenv("PATH"), sep = ":"))
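
To make sure these tools are actually visible from R, one can optionally check that they resolve on the modified PATH (an empty string would mean a tool was not found):

# Optional sanity check on the external tools used below
Sys.which(c("seqkit", "cutadapt", "remove_chimera.py"))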

Quality control

Seqkit

First, we use seqkit to obtain the number of reads per sample and the read lengths.

if(!file.exists(file.path(params$output_directory,"seqkit_raw.csv"))){
    system2(command="seqkit", args = c("stat",file.path(params$input_directory,paste0("*",params$fileext)), "--out-file", file.path(params$output_directory,"seqkit_raw.csv")), stdout = TRUE)
}

Here is the information about the raw FASTQ files:
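
In the rendered report this file is shown as an interactive table. A minimal way to load and display it, assuming the default whitespace-aligned layout written by seqkit stat, would be:

# Assumption: seqkit stat writes a whitespace-aligned table, hence read_table()
seqkit_raw <- readr::read_table(file.path(params$output_directory, "seqkit_raw.csv"))
DT::datatable(seqkit_raw, rownames = FALSE)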

Quality profiles

Let's look at the aggregated quality profile of the R1 reads:

fnFs <- sort(list.files(params$input_directory, pattern = "_R1.fastq.gz", full.names = TRUE)) # Extensions for R1 files
plotQualityProfile(fnFs, aggregate=TRUE)

and of the R2 reads:

fnRs <- sort(list.files(params$input_directory, pattern = "_R2.fastq.gz", full.names = TRUE)) # Extensions for R2 files
plotQualityProfile(fnRs, aggregate=TRUE)

Cleaning

Remove reads with ambiguous bases (Ns)

Reads containing ambiguous bases (Ns) are removed because DADA2 cannot handle them.

fnFs.filtN <- file.path(filtN_output_dir, basename(fnFs)) # Put N-filtered files in the filtN/ subdirectory
fnRs.filtN <- file.path(filtN_output_dir, basename(fnRs))

if(!file.exists(file.path(filtN_output_dir,"samples.rds"))){
  filterAndTrim(fnFs, fnFs.filtN, fnRs, fnRs.filtN, maxN = 0, multithread = params$threads)
  df_samples_filtN <- get_reads_number(filtN_output_dir)
  saveRDS(df_samples_filtN,file.path(filtN_output_dir,"samples.rds"))
}else{
  df_samples_filtN <- readRDS(file.path(filtN_output_dir,"samples.rds"))
}
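
get_reads_number() is a helper defined elsewhere in this workflow; purely for illustration, a minimal equivalent that counts reads per FASTQ file in a directory could look like the sketch below (the real helper may differ):

# Hypothetical sketch of a per-file read-counting helper (not the actual
# get_reads_number() used in this report)
get_reads_number_sketch <- function(dir, pattern = "\\.fastq\\.gz$") {
  counts <- ShortRead::countFastq(dir, pattern = pattern)  # one row per FASTQ file
  data.frame(file = rownames(counts), reads = counts$records, row.names = NULL)
}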

Remove primers

We now check the orientation of the primers in the reads and then remove them.

FWD <- params$forward_primer
REV <- params$reverse_primer

allOrients <- function(primer) {
    # Create all orientations of the input sequence
    require(Biostrings)
    dna <- DNAString(primer)  # Biostrings works with DNAString objects rather than character vectors
    orients <- c(Forward = dna, Complement = complement(dna), Reverse = reverse(dna), 
        RevComp = reverseComplement(dna))
    return(sapply(orients, toString))  # Convert back to character vector
}
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)
FWD.orients
##          Forward       Complement          Reverse          RevComp 
## "ACGGRAGGCWGCAG" "TGCCYTCCGWCGTC" "GACGWCGGARGGCA" "CTGCWGCCTYCCGT"
REV.orients
##               Forward            Complement               Reverse 
## "TACCAGGGTATCTAATCCT" "ATGGTCCCATAGATTAGGA" "TCCTAATCTATGGGACCAT" 
##               RevComp 
## "AGGATTAGATACCCTGGTA"

We first check the orientation of primers in one sample.

primerHits <- function(primer, fn) {
    # Counts number of reads in which the primer is found
    nhits <- vcountPattern(primer, sread(readFastq(fn)), fixed = FALSE)
    return(sum(nhits > 0))
}
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.filtN[[1]]), 
    FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.filtN[[1]]), 
    REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.filtN[[1]]), 
    REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.filtN[[1]]))
##                  Forward Complement Reverse RevComp
## FWD.ForwardReads   31410          0       0       1
## FWD.ReverseReads       1          0       0    1591
## REV.ForwardReads       0          0       0    1614
## REV.ReverseReads   30013          0       0       0

Then, cutadapt is used to remove the primers:

if(!dir.exists(cutadapt_output_dir)) dir.create(cutadapt_output_dir)
fnFs.cut <- file.path(cutadapt_output_dir, basename(fnFs))
fnRs.cut <- file.path(cutadapt_output_dir, basename(fnRs))

sample.names <- unname(sapply(fnFs, get.sample.name))

if(!file.exists(file.path(cutadapt_output_dir,"samples.rds")) || !file.exists(file.path(cutadapt_output_dir, "fnFs.cut.rds")) || !file.exists(file.path(cutadapt_output_dir, "fnRs.cut.rds"))){
  FWD.RC <- dada2:::rc(FWD)
  REV.RC <- dada2:::rc(REV)
  # Trim FWD and the reverse-complement of REV off of R1 (forward reads)
  R1.flags <- paste("-g", FWD, "-a", REV.RC) 
  # Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
  R2.flags <- paste("-G", REV, "-A", FWD.RC) 
  # Run Cutadapt
  for(i in seq_along(fnFs)) {
    logfile <- system2("cutadapt", args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
                               "-o", fnFs.cut[i], "-p", fnRs.cut[i], "--discard-untrimmed", # output files
                               fnFs.filtN[i], fnRs.filtN[i]), stdout=TRUE, stderr=TRUE) # input files
    write(logfile, file=file.path(cutadapt_output_dir,paste0(sample.names[i],".cutadapt.log")))
  }
  df_samples_cutadapt <- get_reads_number(cutadapt_output_dir)
  saveRDS(df_samples_cutadapt,file.path(cutadapt_output_dir,"samples.rds"))
  saveRDS(fnFs.cut, file.path(cutadapt_output_dir, "fnFs.cut.rds"))
  saveRDS(fnRs.cut, file.path(cutadapt_output_dir, "fnRs.cut.rds"))
}else{
  df_samples_cutadapt <- readRDS(file.path(cutadapt_output_dir,"samples.rds"))
  fnFs.cut <- readRDS(file.path(cutadapt_output_dir, "fnFs.cut.rds"))
  fnRs.cut <- readRDS(file.path(cutadapt_output_dir, "fnRs.cut.rds"))
}

Finally, as a sanity check, we will count the presence of primers in the first cutadapt-ed sample:

rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]), 
    FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]), 
    REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]), 
    REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))
##                  Forward Complement Reverse RevComp
## FWD.ForwardReads       0          0       0       0
## FWD.ReverseReads       0          0       0       0
## REV.ForwardReads       0          0       0       0
## REV.ReverseReads       0          0       0       0

Here is the summary of the cutadapt process:
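
A minimal way to display this summary, assuming df_samples_cutadapt holds one row per sample, is an interactive table:

# Per-sample read counts after primer removal (df_samples_cutadapt is built by
# the get_reads_number() helper used above)
DT::datatable(df_samples_cutadapt, rownames = FALSE)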

Filter on length and quality

fnFs.cut <- fnFs.cut[file.exists(fnFs.cut)]
fnRs.cut <- fnRs.cut[file.exists(fnRs.cut)]

filtFs <- file.path(filters_output_dir, basename(fnFs.cut))
filtRs <- file.path(filters_output_dir, basename(fnRs.cut))


if(!file.exists(file.path(filters_output_dir,"filtFs.rds")) || !file.exists(file.path(filters_output_dir, "filtRs.rds")) || !file.exists(file.path(filters_output_dir, "out.rds")) || !file.exists(file.path(filters_output_dir, "samples.rds"))){
  out <- filterAndTrim(fnFs.cut, filtFs, fnRs.cut, filtRs, maxN = 0, #maxEE = c(2, 2), 
                     truncQ = 2, minLen = 100, rm.phix = TRUE, compress = TRUE, multithread = params$threads)
  df_samples_filters <- get_reads_number(filters_output_dir)
  saveRDS(out, file.path(filters_output_dir, "out.rds"))
  saveRDS(filtFs, file.path(filters_output_dir, "filtFs.rds"))
  saveRDS(filtRs, file.path(filters_output_dir, "filtRs.rds"))
  saveRDS(df_samples_filters,file.path(filters_output_dir,"samples.rds"))
}else{
  df_samples_filters <- readRDS(file.path(filters_output_dir,"samples.rds"))
  out <- readRDS(file.path(filters_output_dir, "out.rds"))
  filtFs <- readRDS(file.path(filters_output_dir, "filtFs.rds"))
  filtRs <- readRDS(file.path(filters_output_dir, "filtRs.rds"))
}

Here is the summary of the filters:
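
filterAndTrim() also returns, in out, the number of reads going into and coming out of this step for every file; a quick way to inspect it:

# Reads in / reads out for the length and quality filtering step
out_df <- as.data.frame(out)
out_df$percent_kept <- round(100 * out_df$reads.out / out_df$reads.in, 1)
head(out_df)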

Dada2

Learn error rates

Error rates are learned by alternating between sample inference and error rate estimation until convergence.

dada2_output_dir <- file.path(params$output_directory, "dada2")
if(!dir.exists(dada2_output_dir)) dir.create(dada2_output_dir)
filtFs <- filtFs[file.exists(filtFs)]
filtRs <- filtRs[file.exists(filtRs)]
if(!file.exists(file.path(dada2_output_dir,"errF.rds")) && !file.exists(file.path(dada2_output_dir,"errR.rds"))){
  errF <- learnErrors(filtFs, multithread=params$threads)
  errR <- learnErrors(filtRs, multithread=params$threads)
  saveRDS(errF, file.path(dada2_output_dir, "errF.rds"))
  saveRDS(errR, file.path(dada2_output_dir, "errR.rds"))
}else{
  errF <- readRDS(file.path(dada2_output_dir,"errF.rds"))
  errR <- readRDS(file.path(dada2_output_dir,"errR.rds"))
}
plotErrors(errF, nominalQ=TRUE)

plotErrors(errR, nominalQ=TRUE)

Sample inference

The dada function removes all sequencing errors to reveal the members of the sequenced community.

if(!file.exists(file.path(dada2_output_dir,"dadaFs.rds")) && !file.exists(file.path(dada2_output_dir,"dadaRs.rds"))){
  dadaFs <- dada(filtFs, err=errF, multithread=params$threads)
  dadaRs <- dada(filtRs, err=errR, multithread=params$threads)
  saveRDS(dadaFs,file.path(dada2_output_dir,"dadaFs.rds"))
  saveRDS(dadaRs,file.path(dada2_output_dir,"dadaRs.rds"))
}else{
  dadaFs <- readRDS(file.path(dada2_output_dir,"dadaFs.rds"))
  dadaRs <- readRDS(file.path(dada2_output_dir,"dadaRs.rds"))
}
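
With more than one sample, dada() returns a named list of dada-class objects that can be inspected directly, e.g. for the first sample:

# Quick look at the denoising result of the first sample
dadaFs[[1]]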

Merge pairs

We now attempt to merge each denoised pair of forward and reverse reads.

if (!file.exists(file.path(dada2_output_dir,"mergers.rds")) | !file.exists(file.path(dada2_output_dir,"seqtab.rds"))){
  if(!params$its == TRUE){
    mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE, minOverlap = params$merge_min_overlap, maxMismatch = params$merge_max_mismatch)
    seqtab <- makeSequenceTable(mergers)
  }else{
    mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, returnRejects = TRUE, trimOverhang = TRUE, verbose = TRUE, maxMismatch = params$merge_max_mismatch)
    concats <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, justConcatenate=TRUE,verbose=TRUE)
    repair_merger <- function(merger, concat) {
      # Replace pairs rejected by mergePairs() with their concatenated version
      merger[!merger$accept, ] <- concat[!merger$accept, ]
      merger$sequence <- unlist(merger$sequence)
      return(merger)
    }
    repaired_mergers <- purrr::map2(mergers, concats, repair_merger)
    mergers <- repaired_mergers
    seqtab <- makeSequenceTable(mergers)
  }
  saveRDS(mergers,file.path(dada2_output_dir,"mergers.rds"))
  rownames(seqtab) <- sample.names <- unname(sapply(filtFs, get.sample.name))
  saveRDS(seqtab,file.path(dada2_output_dir,"seqtab.rds")) 
}else{
  mergers <- readRDS(file.path(dada2_output_dir,"mergers.rds"))
  seqtab <- readRDS(file.path(dada2_output_dir,"seqtab.rds"))
}

Finally, samples without remaining reads are removed: 0 sample(s) removed.

if(length(rownames(seqtab[rowSums(seqtab)==0, , drop=FALSE]))>0){
    rownames(seqtab[rowSums(seqtab)==0, , drop=FALSE])
}
seqtab_final <- seqtab[rowSums(seqtab)>0, , drop=FALSE] # drop=FALSE keeps the matrix structure even if a single sample remains
saveRDS(seqtab_final,file.path(dada2_output_dir,"seqtab.rds"))
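
Before handing the table over to FROGS, it can be useful to look at the distribution of merged sequence lengths:

# Distribution of merged sequence lengths in the final table
table(nchar(dada2::getSequences(seqtab_final)))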

We track reads through the pipeline:
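
The tracking code is not shown here; a sketch of the standard DADA2 recipe, assuming no sample was dropped between the filtering and denoising steps, would be:

# Illustrative read-tracking table across the main steps; rows of out, dadaFs,
# mergers and seqtab_final must describe the same samples in the same order
getN <- function(x) sum(dada2::getUniques(x))
track <- cbind(out,
               denoisedF = sapply(dadaFs, getN),
               denoisedR = sapply(dadaRs, getN),
               merged = sapply(mergers, getN),
               tabled = rowSums(seqtab_final))
head(track)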

frogs_output_dir <- file.path(params$output_directory, "FROGS")
if(!dir.exists(frogs_output_dir)) dir.create(frogs_output_dir)

seqtab <- readRDS(file.path(dada2_output_dir,"seqtab.rds"))

colnames(seqtab) <- gsub("NNNNNNNNNN", "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", colnames(seqtab))
    asv_seqs <- colnames(seqtab)
    asv_headers <- paste(">",colnames(seqtab),sep="")
    asv_headers <- if_else(grepl("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", asv_headers), paste0(">",asv_seqs, "_FROGS_combined"), asv_headers)
    asv_fasta <- c(rbind(asv_headers, asv_seqs))
    write(asv_fasta, file.path(dada2_output_dir,"dada2.fasta"))
    asv_tab <- t(seqtab)
    colnames(seqtab) <- if_else(grepl("NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN", colnames(seqtab)), paste0(colnames(seqtab), "_FROGS_combined"), colnames(seqtab))
    row.names(asv_tab) <- sub(">", "", asv_headers)
    write.table(asv_tab, file.path(dada2_output_dir,"dada2.tsv"), sep="\t", quote=F, col.names=NA)
    st.biom <- make_biom(t(seqtab))
    write_biom(st.biom, file.path(dada2_output_dir,"dada2.biom"))

FROGS

Now we use FROGS.

FROGS remove chimera

Chimeras are removed:

if(!file.exists(file.path(frogs_output_dir,"remove_chimera.biom")) | !file.exists(file.path(frogs_output_dir,"remove_chimera.fasta"))){
    system2(command="remove_chimera.py", args=c("--input-biom", file.path(dada2_output_dir,"dada2.biom"), "--input-fasta", file.path(dada2_output_dir,"dada2.fasta"), "--out-abundance", file.path(frogs_output_dir,"remove_chimera.biom"), "--summary", file.path(frogs_output_dir,"remove_chimera.html"), "--log-file", file.path(frogs_output_dir,"remove_chimera.log"), "--non-chimera", file.path(frogs_output_dir,"remove_chimera.fasta"), "--nb-cpus", params$threads))  
}

FROGS filters

OTUs are filtered on abundance and phiX contamination:

if(!file.exists(file.path(frogs_output_dir,"filters.biom")) | !file.exists(file.path(frogs_output_dir,"filters.fasta"))){
    system2(command="otu_filters.py", args=c("--input-biom", file.path(frogs_output_dir,"remove_chimera.biom"), "--input-fasta", file.path(frogs_output_dir,"remove_chimera.fasta"), "--output-biom", file.path(frogs_output_dir,"filters.biom"), "--summary", file.path(frogs_output_dir,"filters.html"), "--log-file", file.path(frogs_output_dir,"filters.log"), "--output-fasta", file.path(frogs_output_dir,"filters.fasta"), "--contaminant", params$phix, "--min-abundance", params$min_abundance, "--excluded", file.path(frogs_output_dir,"excluded.tsv"), "--nb-cpus", params$threads))   
}
if(params$its == TRUE){
  cat("## FROGS ITSx\n")
  if(!file.exists(file.path(frogs_output_dir,"itsx.biom"))){
  system2(command="itsx.py", args = c("--nb-cpus", params$threads, "--region", params$its_region, "--input-biom", file.path(frogs_output_dir,"filters.biom"), "--out-removed" , file.path(frogs_output_dir,"itsx_removed.fasta"),  "--input-fasta", file.path(frogs_output_dir,"filters.fasta"), "--check-its-only" , "--out-abundance", file.path(frogs_output_dir,"itsx.biom"), "--out-fasta", file.path(frogs_output_dir,"itsx.fasta"), "--summary", file.path(frogs_output_dir, "itsx.html"), "--log-file", file.path(frogs_output_dir,"itsx.log")), stdout=TRUE) # input files
  }
  cat("\n")
  cat(" - [FROGS ITSx report](FROGS/itsx.html)")
}

FROGS affiliation

Finally, OTUs are taxonomically affiliated against a dedicated reference databank:

if(!file.exists(file.path(frogs_output_dir,"affiliations.biom"))){
    if(params$its == TRUE){
        input_biom <- file.path(frogs_output_dir,"itsx.biom")
        input_fasta <- file.path(frogs_output_dir,"itsx.fasta")
    }else{
        input_biom <- file.path(frogs_output_dir,"filters.biom")
        input_fasta <- file.path(frogs_output_dir,"filters.fasta")
    }
    system2(command="affiliation_OTU.py", args=c("--input-biom", input_biom, "--input-fasta", input_fasta, "--output-biom", file.path(frogs_output_dir,"affiliations.biom"), "--summary", file.path(frogs_output_dir,"affiliation.html"), "--log-file", file.path(frogs_output_dir,"affiliation.log"), "--reference", params$reference, "--nb-cpus", params$threads))   
}

FROGS BIOM to TSV

if(!file.exists(file.path(frogs_output_dir,"affiliations.tsv"))){
    system2(command="biom_to_tsv.py", args=c("--input-biom", file.path(frogs_output_dir,"affiliations.biom"), "--input-fasta", file.path(frogs_output_dir,"filters.fasta"), "--output-tsv", file.path(frogs_output_dir,"affiliations.tsv"), "--output-multi-affi", file.path(frogs_output_dir,"multi_affiliations.tsv"), "--log-file", file.path(frogs_output_dir,"biomtotsv.log")))    
}

The ASV table is available here:
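
For instance, it can be loaded back into R for inspection (assuming the tab-separated layout written by biom_to_tsv.py):

# Load the final abundance/affiliation table produced by FROGS (assumed TSV)
asv_table <- read.delim(file.path(frogs_output_dir, "affiliations.tsv"), check.names = FALSE)
DT::datatable(head(asv_table, 50), rownames = FALSE)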

Reproducibility tokens

system2(command="remove_chimera.py", args=c("--version"), stdout = TRUE)
## [1] "4.0.1"
system2(command="cutadapt", args=c("--version"), stdout = TRUE)
## [1] "2.10"
system2(command="seqkit", args=c("version"), stdout = TRUE)
## [1] "seqkit v2.0.0"
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
## 
## Matrix products: default
## BLAS:   /usr/local/public/R-4.2.1/lib/R/lib/libRblas.so
## LAPACK: /usr/local/public/R-4.2.1/lib/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=fr_FR.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=fr_FR.UTF-8        LC_COLLATE=fr_FR.UTF-8    
##  [5] LC_MONETARY=fr_FR.UTF-8    LC_MESSAGES=fr_FR.UTF-8   
##  [7] LC_PAPER=fr_FR.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] biomformat_1.26.0           ShortRead_1.56.1           
##  [3] GenomicAlignments_1.34.0    SummarizedExperiment_1.28.0
##  [5] Biobase_2.58.0              MatrixGenerics_1.10.0      
##  [7] matrixStats_0.63.0          Rsamtools_2.14.0           
##  [9] GenomicRanges_1.50.2        Biostrings_2.66.0          
## [11] GenomeInfoDb_1.34.9         XVector_0.38.0             
## [13] IRanges_2.32.0              S4Vectors_0.36.1           
## [15] BiocParallel_1.32.5         BiocGenerics_0.44.0        
## [17] DT_0.27                     lubridate_1.9.2            
## [19] forcats_1.0.0               stringr_1.5.0              
## [21] dplyr_1.1.0                 purrr_1.0.1                
## [23] readr_2.1.4                 tidyr_1.3.0                
## [25] tibble_3.1.8                tidyverse_2.0.0            
## [27] ggplot2_3.4.1               dada2_1.26.0               
## [29] Rcpp_1.0.10                
## 
## loaded via a namespace (and not attached):
##  [1] bitops_1.0-7           RColorBrewer_1.1-3     tools_4.2.1           
##  [4] bslib_0.4.2            utf8_1.2.3             R6_2.5.1              
##  [7] DBI_1.1.3              colorspace_2.1-0       rhdf5filters_1.10.0   
## [10] withr_2.5.0            tidyselect_1.2.0       compiler_4.2.1        
## [13] cli_3.6.0              DelayedArray_0.24.0    labeling_0.4.2        
## [16] sass_0.4.5             scales_1.2.1           digest_0.6.31         
## [19] rmarkdown_2.20         jpeg_0.1-10            pkgconfig_2.0.3       
## [22] htmltools_0.5.4        highr_0.10             fastmap_1.1.0         
## [25] htmlwidgets_1.6.1      rlang_1.0.6            farver_2.1.1          
## [28] jquerylib_0.1.4        generics_0.1.3         hwriter_1.3.2.1       
## [31] jsonlite_1.8.4         crosstalk_1.2.0        RCurl_1.98-1.10       
## [34] magrittr_2.0.3         GenomeInfoDbData_1.2.9 interp_1.1-3          
## [37] Matrix_1.5-3           munsell_0.5.0          Rhdf5lib_1.20.0       
## [40] fansi_1.0.4            lifecycle_1.0.3        stringi_1.7.12        
## [43] yaml_2.3.7             zlibbioc_1.44.0        rhdf5_2.42.0          
## [46] plyr_1.8.8             grid_4.2.1             parallel_4.2.1        
## [49] crayon_1.5.2           deldir_1.0-6           lattice_0.20-45       
## [52] hms_1.1.2              knitr_1.42             pillar_1.8.1          
## [55] reshape2_1.4.4         codetools_0.2-19       glue_1.6.2            
## [58] evaluate_0.20          latticeExtra_0.6-30    RcppParallel_5.1.6    
## [61] png_0.1-8              vctrs_0.5.2            tzdb_0.3.0            
## [64] gtable_0.3.1           cachem_1.0.6           xfun_0.37             
## [67] timechange_0.2.0       ellipsis_0.3.2