We first set up our R working environment by loading a few useful packages…
# Packages used throughout the pipeline (the duplicate library(dada2) call
# present in the original has been removed).
library(dada2)       # denoising: learnErrors(), dada(), mergePairs(), ...
library(ggplot2)     # plots
library(tidyverse)   # data wrangling (dplyr::if_else() is used below)
library(DT)          # interactive tables in the knitted report
library(ShortRead)   # readFastq()/sread() for primer counting
library(Biostrings)  # DNAString, complement(), reverseComplement()
library(biomformat)  # make_biom()/write_biom() for FROGS input
library(purrr)       # map2() for the ITS merger repair
… and modify PATH to include bioinformatics tools.
# Prepend the seqkit and FROGS conda environments to PATH so the system2()
# calls below can locate the external binaries.
bioinf_tool_dirs <- c(
  "/usr/local/genome/Anaconda3/envs/seqkit-2.0.0/bin",
  "/usr/local/genome/Anaconda3/envs/frogs-4.0.1/bin"
)
Sys.setenv(PATH = paste(c(bioinf_tool_dirs, Sys.getenv("PATH")), collapse = ":"))
First, we use seqkit to obtain the number of reads per sample and the read lengths.
# Collect per-file read counts and length statistics for the raw FASTQ files
# with seqkit; the CSV is cached so re-knitting skips the external call.
if(!file.exists(file.path(params$output_directory,"seqkit_raw.csv"))){
# NOTE(review): the "*" glob relies on system2() passing the command line
# through a shell (true on Unix) — confirm on the target platform.
system2(command="seqkit", args = c("stat",file.path(params$input_directory,paste0("*",params$fileext)), "--out-file", file.path(params$output_directory,"seqkit_raw.csv")), stdout = TRUE)
}
Here is the information about the raw FASTQ files:
Let's look at the average quality profiles of R1:
# Forward (R1) FASTQ files, sorted so they pair up positionally with the R2
# list built below.
# NOTE(review): pattern is a regex, so "." matches any character; a stricter
# anchor would be "_R1\\.fastq\\.gz$".
fnFs <- sort(list.files(params$input_directory, pattern = "_R1.fastq.gz", full.names = TRUE)) # Extensions for R1 files
plotQualityProfile(fnFs, aggregate=TRUE) # one aggregated profile across all R1 files
and R2:
# Reverse (R2) FASTQ files, sorted to match the order of fnFs.
# NOTE(review): same regex caveat as for R1 — "." is unescaped.
fnRs <- sort(list.files(params$input_directory, pattern = "_R2.fastq.gz", full.names = TRUE)) # Extensions for R2 files
plotQualityProfile(fnRs, aggregate=TRUE) # one aggregated profile across all R2 files
Reads containing ambiguous bases are removed because dada2 cannot deal with them.
# Stage 1 filtering: remove reads containing any ambiguous base (N), which
# dada2's error model cannot handle. The per-sample read counts are cached
# in samples.rds so the expensive filterAndTrim() call runs only once.
fnFs.filtN <- file.path(filtN_output_dir, basename(fnFs)) # Put N-filtered files in filtN/ subdirectory
fnRs.filtN <- file.path(filtN_output_dir, basename(fnRs))
if(!file.exists(file.path(filtN_output_dir,"samples.rds"))){
filterAndTrim(fnFs, fnFs.filtN, fnRs, fnRs.filtN, maxN = 0, multithread = params$threads) # maxN = 0 drops any read containing an N
df_samples_filtN <- get_reads_number(filtN_output_dir) # per-sample read counts (helper defined elsewhere in this document)
saveRDS(df_samples_filtN,file.path(filtN_output_dir,"samples.rds"))
}else{
df_samples_filtN <- readRDS(file.path(filtN_output_dir,"samples.rds"))
}
We now remove primers (and check their orientation) from reads.
# Primer sequences (IUPAC ambiguity codes allowed) taken from the report
# parameters.
FWD <- params$forward_primer
REV <- params$reverse_primer
allOrients <- function(primer) {
  # Return the four orientations of a primer as a named character vector.
  #
  # @param primer Character scalar; may contain IUPAC ambiguity codes.
  # @return Named character vector: Forward, Complement, Reverse, RevComp.
  #
  # Biostrings is attached at the top of the document; requireNamespace()
  # (instead of the original require(), which only returns FALSE on failure)
  # makes a missing installation fail fast with a clear error.
  stopifnot(requireNamespace("Biostrings", quietly = TRUE))
  dna <- Biostrings::DNAString(primer) # Biostrings works w/ DNAString objects rather than character vectors
  orients <- c(Forward = dna,
               Complement = Biostrings::complement(dna),
               Reverse = Biostrings::reverse(dna),
               RevComp = Biostrings::reverseComplement(dna))
  # vapply() guarantees a character vector (sapply()'s return type depends
  # on its input, which is fragile in non-interactive code).
  vapply(orients, toString, character(1))
}
# Pre-compute all four orientations of each primer. The "##" lines below are
# the knitted output captured from a previous run of this document.
FWD.orients <- allOrients(FWD)
REV.orients <- allOrients(REV)
FWD.orients
## Forward Complement Reverse RevComp
## "ACGGRAGGCWGCAG" "TGCCYTCCGWCGTC" "GACGWCGGARGGCA" "CTGCWGCCTYCCGT"
REV.orients
## Forward Complement Reverse
## "TACCAGGGTATCTAATCCT" "ATGGTCCCATAGATTAGGA" "TCCTAATCTATGGGACCAT"
## RevComp
## "AGGATTAGATACCCTGGTA"
We first check the orientation of primers in one sample.
primerHits <- function(primer, fn) {
  # Count the reads of FASTQ file `fn` containing at least one match to
  # `primer`. fixed = FALSE lets IUPAC ambiguity codes in the primer match.
  reads <- sread(readFastq(fn))
  hit_counts <- vcountPattern(primer, reads, fixed = FALSE)
  sum(hit_counts > 0)
}
# Quick orientation sanity check on the FIRST sample only: count primer hits
# in all four orientations, in both read files. The "##" lines are knitted
# output from a previous run — FWD found forward in R1 and REV forward in R2,
# with the RevComp hits coming from read-through into the opposite primer.
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.filtN[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.filtN[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.filtN[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.filtN[[1]]))
## Forward Complement Reverse RevComp
## FWD.ForwardReads 37854 0 0 0
## FWD.ReverseReads 0 0 0 2690
## REV.ForwardReads 0 0 0 2806
## REV.ReverseReads 37137 0 0 0
Then, cutadapt is used to remove primers.
# Primer removal with cutadapt, one run per sample pair, cached on disk.
if(!dir.exists(cutadapt_output_dir)) dir.create(cutadapt_output_dir)
fnFs.cut <- file.path(cutadapt_output_dir, basename(fnFs))
fnRs.cut <- file.path(cutadapt_output_dir, basename(fnRs))
sample.names <- unname(sapply(fnFs, get.sample.name))
# Re-run cutadapt if ANY cached artifact is missing. The original condition
# tested "fnRs.cut.rds" twice and never "fnFs.cut.rds", and required ALL
# files to be absent, so a partially written cache fell through to readRDS()
# on files that did not exist.
cutadapt_cache <- file.path(cutadapt_output_dir,
                            c("samples.rds", "fnFs.cut.rds", "fnRs.cut.rds"))
if(!all(file.exists(cutadapt_cache))){
  FWD.RC <- dada2:::rc(FWD)
  REV.RC <- dada2:::rc(REV)
  # Trim FWD and the reverse-complement of REV off of R1 (forward reads)
  R1.flags <- paste("-g", FWD, "-a", REV.RC)
  # Trim REV and the reverse-complement of FWD off of R2 (reverse reads)
  R2.flags <- paste("-G", REV, "-A", FWD.RC)
  # Run cutadapt per sample pair, keeping one log file per sample.
  for(i in seq_along(fnFs)) {
    logfile <- system2("cutadapt", args = c(R1.flags, R2.flags, "-n", 2, # -n 2 required to remove FWD and REV from reads
                       "-o", fnFs.cut[i], "-p", fnRs.cut[i], "--discard-untrimmed", # output files
                       fnFs.filtN[i], fnRs.filtN[i]), stdout=TRUE, stderr=TRUE) # input files
    write(logfile, file=file.path(cutadapt_output_dir,paste0(sample.names[i],".cutadapt.log")))
  }
  df_samples_cutadapt <- get_reads_number(cutadapt_output_dir)
  saveRDS(df_samples_cutadapt,file.path(cutadapt_output_dir,"samples.rds"))
  saveRDS(fnFs.cut, file.path(cutadapt_output_dir, "fnFs.cut.rds"))
  saveRDS(fnRs.cut, file.path(cutadapt_output_dir, "fnRs.cut.rds"))
}else{
  df_samples_cutadapt <- readRDS(file.path(cutadapt_output_dir,"samples.rds"))
  fnFs.cut <- readRDS(file.path(cutadapt_output_dir, "fnFs.cut.rds"))
  fnRs.cut <- readRDS(file.path(cutadapt_output_dir, "fnRs.cut.rds"))
}
Finally, as a sanity check, we will count the presence of primers in the first cutadapt-ed sample:
# Sanity check: after cutadapt no primer should be detectable in any
# orientation in the first trimmed sample. The all-zero knitted output below
# is the expected result.
rbind(FWD.ForwardReads = sapply(FWD.orients, primerHits, fn = fnFs.cut[[1]]),
FWD.ReverseReads = sapply(FWD.orients, primerHits, fn = fnRs.cut[[1]]),
REV.ForwardReads = sapply(REV.orients, primerHits, fn = fnFs.cut[[1]]),
REV.ReverseReads = sapply(REV.orients, primerHits, fn = fnRs.cut[[1]]))
## Forward Complement Reverse RevComp
## FWD.ForwardReads 0 0 0 0
## FWD.ReverseReads 0 0 0 0
## REV.ForwardReads 0 0 0 0
## REV.ReverseReads 0 0 0 0
Here is the summary of the cutadapt process:
# Quality filtering of the primer-trimmed reads, cached on disk.
fnFs.cut <- fnFs.cut[file.exists(fnFs.cut)] # keep only samples cutadapt actually produced
fnRs.cut <- fnRs.cut[file.exists(fnRs.cut)]
filtFs <- file.path(filters_output_dir, basename(fnFs.cut))
filtRs <- file.path(filters_output_dir, basename(fnRs.cut))
# Re-run filterAndTrim() if any of the four cached artifacts is missing.
if(!file.exists(file.path(filters_output_dir,"filtFs.rds")) || !file.exists(file.path(filters_output_dir, "filtRs.rds")) || !file.exists(file.path(filters_output_dir, "out.rds")) || !file.exists(file.path(filters_output_dir, "samples.rds"))){
out <- filterAndTrim(fnFs.cut, filtFs, fnRs.cut, filtRs, maxN = 0, #maxEE = c(2, 2),
truncQ = 2, minLen = 100, rm.phix = TRUE, compress = TRUE, multithread = params$threads) # NOTE(review): maxEE filtering is commented out — confirm this is intentional
df_samples_filters <- get_reads_number(filters_output_dir)
saveRDS(out, file.path(filters_output_dir, "out.rds"))
saveRDS(filtFs, file.path(filters_output_dir, "filtFs.rds"))
saveRDS(filtRs, file.path(filters_output_dir, "filtRs.rds"))
saveRDS(df_samples_filters,file.path(filters_output_dir,"samples.rds"))
}else{
df_samples_filters <- readRDS(file.path(filters_output_dir,"samples.rds"))
out <- readRDS(file.path(filters_output_dir, "out.rds"))
filtFs <- readRDS(file.path(filters_output_dir, "filtFs.rds"))
filtRs <- readRDS(file.path(filters_output_dir, "filtRs.rds"))
}
Here is the summary of the filters:
Error rates are learned by alternating between sample inference and error rate estimation until convergence.
# dada2 output directory. Spell out params$output_directory: the original
# "params$output_dir" only resolved through $'s partial name matching, which
# breaks silently if another params entry starting with "output_dir" appears.
dada2_output_dir <- file.path(params$output_directory, "dada2")
if(!dir.exists(dada2_output_dir)) dir.create(dada2_output_dir)
filtFs <- filtFs[file.exists(filtFs)] # drop samples whose filtered file was not produced
filtRs <- filtRs[file.exists(filtRs)]
# Recompute the error models if EITHER cached model is missing ("||", as in
# the filtering stage above; the original "&&" skipped the run when only one
# file existed and then crashed in readRDS() on the missing one).
if(!file.exists(file.path(dada2_output_dir,"errF.rds")) || !file.exists(file.path(dada2_output_dir,"errR.rds"))){
  errF <- learnErrors(filtFs, multithread=params$threads)
  errR <- learnErrors(filtRs, multithread=params$threads)
  saveRDS(errF, file.path(dada2_output_dir, "errF.rds"))
  saveRDS(errR, file.path(dada2_output_dir, "errR.rds"))
}else{
  errF <- readRDS(file.path(dada2_output_dir,"errF.rds"))
  errR <- readRDS(file.path(dada2_output_dir,"errR.rds"))
}
plotErrors(errF, nominalQ=TRUE) # observed vs nominal error rates, forward reads
plotErrors(errR, nominalQ=TRUE) # observed vs nominal error rates, reverse reads
The dada function removes all sequencing errors to reveal the members of the sequenced community.
# Denoise: infer exact sequence variants per sample. Re-run if EITHER cached
# result is missing ("||"; the original "&&" skipped the run when only one
# file existed and then crashed in readRDS() on the missing one).
if(!file.exists(file.path(dada2_output_dir,"dadaFs.rds")) || !file.exists(file.path(dada2_output_dir,"dadaRs.rds"))){
  dadaFs <- dada(filtFs, err=errF, multithread=params$threads)
  dadaRs <- dada(filtRs, err=errR, multithread=params$threads)
  saveRDS(dadaFs,file.path(dada2_output_dir,"dadaFs.rds"))
  saveRDS(dadaRs,file.path(dada2_output_dir,"dadaRs.rds"))
}else{
  dadaFs <- readRDS(file.path(dada2_output_dir,"dadaFs.rds"))
  dadaRs <- readRDS(file.path(dada2_output_dir,"dadaRs.rds"))
}
We now attempt to merge each denoised pair of forward and reverse reads.
# Merge denoised read pairs into full sequences, cached on disk. For ITS
# data, pairs that fail to overlap are replaced by their simple
# concatenation (FROGS "combined" style) instead of being discarded.
# "||" (scalar, short-circuit) replaces the elementwise "|" in this scalar
# if() condition.
if (!file.exists(file.path(dada2_output_dir,"mergers.rds")) || !file.exists(file.path(dada2_output_dir,"seqtab.rds"))){
  # isTRUE() replaces the precedence-fragile "!params$its == TRUE" (which
  # parses as "(!params$its) == TRUE") and tolerates a NULL/absent flag.
  if(!isTRUE(params$its)){
    mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE, minOverlap = params$merge_min_overlap, maxMismatch = params$merge_max_mismatch)
    seqtab <- makeSequenceTable(mergers)
  }else{
    # Standard merge, keeping rejected pairs so they can be repaired below.
    mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, returnRejects = TRUE, trimOverhang = TRUE, verbose = TRUE, maxMismatch = params$merge_max_mismatch)
    # Same pairs, simply concatenated with an N spacer.
    concats <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, justConcatenate=TRUE,verbose=TRUE)
    repair_merger <- function(merger, concat) {
      # Replace the rows that failed to merge with their concatenated form.
      merger[!merger$accept, ] <- concat[!merger$accept, ]
      merger$sequence <- unlist(merger$sequence)
      return(merger)
    }
    mergers <- purrr::map2(mergers, concats, repair_merger)
    seqtab <- makeSequenceTable(mergers)
  }
  saveRDS(mergers,file.path(dada2_output_dir,"mergers.rds"))
  rownames(seqtab) <- sample.names <- unname(sapply(filtFs, get.sample.name))
  saveRDS(seqtab,file.path(dada2_output_dir,"seqtab.rds"))
}else{
  mergers <- readRDS(file.path(dada2_output_dir,"mergers.rds"))
  seqtab <- readRDS(file.path(dada2_output_dir,"seqtab.rds"))
}
Finally, samples without remaining reads are removed: 0 sample(s) removed.
# Report and drop samples left with zero reads. The original subset
# "seqtab[rowSums(seqtab)==0,]" collapsed to a bare vector when exactly one
# row matched (default drop = TRUE), so rownames() returned NULL and a
# single empty sample was never reported; indexing rownames() directly
# avoids that, and drop = FALSE keeps seqtab_final a matrix even when only
# one sample survives.
empty_samples <- rownames(seqtab)[rowSums(seqtab) == 0]
if(length(empty_samples) > 0){
empty_samples
}
seqtab_final <- seqtab[rowSums(seqtab) > 0, , drop = FALSE]
saveRDS(seqtab_final,file.path(dada2_output_dir,"seqtab.rds"))
We track reads through the pipeline:
# Export the ASV table to FASTA/TSV/biom for FROGS.
frogs_output_dir <- file.path(params$output_directory, "FROGS")
# Reuse the variable instead of re-deriving the same path (the original
# rebuilt file.path(params$output_directory, "FROGS") twice).
if(!dir.exists(frogs_output_dir)) dir.create(frogs_output_dir)
seqtab <- readRDS(file.path(dada2_output_dir,"seqtab.rds"))
# dada2 joins non-overlapping ITS pairs with a 10-N spacer; FROGS expects a
# 100-N spacer. strrep() replaces the unreadable literal N-strings.
colnames(seqtab) <- gsub(strrep("N", 10), strrep("N", 100), colnames(seqtab))
asv_seqs <- colnames(seqtab)
asv_headers <- paste0(">", asv_seqs)
# Sequences containing the 100-N spacer are concatenated (non-merged) pairs;
# tag them so FROGS recognises them as "combined".
is_combined <- grepl(strrep("N", 100), asv_seqs, fixed = TRUE)
asv_headers <- if_else(is_combined, paste0(">", asv_seqs, "_FROGS_combined"), asv_headers)
# Interleave headers and sequences into FASTA lines.
asv_fasta <- c(rbind(asv_headers, asv_seqs))
write(asv_fasta, file.path(dada2_output_dir,"dada2.fasta"))
asv_tab <- t(seqtab)
colnames(seqtab) <- if_else(is_combined, paste0(colnames(seqtab), "_FROGS_combined"), colnames(seqtab))
row.names(asv_tab) <- sub(">", "", asv_headers)
# quote=FALSE spelled out: bare F is a reassignable binding, not a keyword.
write.table(asv_tab, file.path(dada2_output_dir,"dada2.tsv"), sep="\t", quote=FALSE, col.names=NA)
st.biom <- make_biom(t(seqtab))
write_biom(st.biom, file.path(dada2_output_dir,"dada2.biom"))
Now we use FROGS.
Chimera are removed.
# FROGS chimera removal on the exported dada2 ASVs; skipped when both output
# files already exist.
if(!file.exists(file.path(frogs_output_dir,"remove_chimera.biom")) | !file.exists(file.path(frogs_output_dir,"remove_chimera.fasta"))){
system2(command="remove_chimera.py", args=c("--input-biom", file.path(dada2_output_dir,"dada2.biom"), "--input-fasta", file.path(dada2_output_dir,"dada2.fasta"), "--out-abundance", file.path(frogs_output_dir,"remove_chimera.biom"), "--summary", file.path(frogs_output_dir,"remove_chimera.html"), "--log-file", file.path(frogs_output_dir,"remove_chimera.log"), "--non-chimera", file.path(frogs_output_dir,"remove_chimera.fasta"), "--nb-cpus", params$threads))
}
OTUs are filtered on abundance and phiX contamination:
# FROGS OTU filters: remove OTUs below the minimum abundance and phiX
# contaminants; excluded OTUs are listed in excluded.tsv.
if(!file.exists(file.path(frogs_output_dir,"filters.biom")) | !file.exists(file.path(frogs_output_dir,"filters.fasta"))){
system2(command="otu_filters.py", args=c("--input-biom", file.path(frogs_output_dir,"remove_chimera.biom"), "--input-fasta", file.path(frogs_output_dir,"remove_chimera.fasta"), "--output-biom", file.path(frogs_output_dir,"filters.biom"), "--summary", file.path(frogs_output_dir,"filters.html"), "--log-file", file.path(frogs_output_dir,"filters.log"), "--output-fasta", file.path(frogs_output_dir,"filters.fasta"), "--contaminant", params$phix, "--min-abundance", params$min_abundance, "--excluded", file.path(frogs_output_dir,"excluded.tsv"), "--nb-cpus", params$threads))
}
# Optional ITSx step (ITS datasets only): keep only sequences containing the
# targeted ITS region.
if(params$its == TRUE){
cat("## FROGS ITSx\n") # markdown header written into the knitted report
if(!file.exists(file.path(frogs_output_dir,"itsx.biom"))){
system2(command="itsx.py", args = c("--nb-cpus", params$threads, "--region", params$its_region, "--input-biom", file.path(frogs_output_dir,"filters.biom"), "--out-removed" , file.path(frogs_output_dir,"itsx_removed.fasta"), "--input-fasta", file.path(frogs_output_dir,"filters.fasta"), "--check-its-only" , "--out-abundance", file.path(frogs_output_dir,"itsx.biom"), "--out-fasta", file.path(frogs_output_dir,"itsx.fasta"), "--summary", file.path(frogs_output_dir, "itsx.html"), "--log-file", file.path(frogs_output_dir,"itsx.log")), stdout=TRUE) # input files
}
cat("\n")
cat(" - [FROGS ITSx report](FROGS/itsx.html)") # link rendered in the report
}
Finally, OTUs are affiliated against a dedicated databank:
# Taxonomic affiliation of the OTUs against the reference databank, using
# the ITSx outputs when in ITS mode, then conversion of the affiliated biom
# to TSV for inspection.
if(!file.exists(file.path(frogs_output_dir,"affiliations.biom"))){
if(params$its == TRUE){
input_biom <- file.path(frogs_output_dir,"itsx.biom")
input_fasta <- file.path(frogs_output_dir,"itsx.fasta")
}else{
input_biom <- file.path(frogs_output_dir,"filters.biom")
input_fasta <- file.path(frogs_output_dir,"filters.fasta")
}
system2(command="affiliation_OTU.py", args=c("--input-biom", input_biom, "--input-fasta", input_fasta, "--output-biom", file.path(frogs_output_dir,"affiliations.biom"), "--summary", file.path(frogs_output_dir,"affiliation.html"), "--log-file", file.path(frogs_output_dir,"affiliation.log"), "--reference", params$reference, "--nb-cpus", params$threads))
}
if(!file.exists(file.path(frogs_output_dir,"affiliations.tsv"))){
# NOTE(review): this always passes filters.fasta, even in ITS mode where the
# affiliation step above used itsx.fasta — confirm this is intentional.
system2(command="biom_to_tsv.py", args=c("--input-biom", file.path(frogs_output_dir,"affiliations.biom"), "--input-fasta", file.path(frogs_output_dir,"filters.fasta"), "--output-tsv", file.path(frogs_output_dir,"affiliations.tsv"), "--output-multi-affi", file.path(frogs_output_dir,"multi_affiliations.tsv"), "--log-file", file.path(frogs_output_dir,"biomtotsv.log")))
}
The ASV table is available here:
# Record external tool versions for reproducibility; the "##" lines are the
# output captured when this document was last knitted.
system2(command="remove_chimera.py", args=c("--version"), stdout = TRUE)
## [1] "4.0.1"
system2(command="cutadapt", args=c("--version"), stdout = TRUE)
## [1] "2.10"
system2(command="seqkit", args=c("version"), stdout = TRUE)
## [1] "seqkit v2.0.0"
sessionInfo() # R and attached-package versions (output follows)
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/local/public/R-4.2.1/lib/R/lib/libRblas.so
## LAPACK: /usr/local/public/R-4.2.1/lib/R/lib/libRlapack.so
##
## locale:
## [1] LC_CTYPE=fr_FR.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=fr_FR.UTF-8 LC_COLLATE=fr_FR.UTF-8
## [5] LC_MONETARY=fr_FR.UTF-8 LC_MESSAGES=fr_FR.UTF-8
## [7] LC_PAPER=fr_FR.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=fr_FR.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] biomformat_1.26.0 ShortRead_1.56.1
## [3] GenomicAlignments_1.34.0 SummarizedExperiment_1.28.0
## [5] Biobase_2.58.0 MatrixGenerics_1.10.0
## [7] matrixStats_0.63.0 Rsamtools_2.14.0
## [9] GenomicRanges_1.50.2 Biostrings_2.66.0
## [11] GenomeInfoDb_1.34.9 XVector_0.38.0
## [13] IRanges_2.32.0 S4Vectors_0.36.1
## [15] BiocParallel_1.32.5 BiocGenerics_0.44.0
## [17] DT_0.27 lubridate_1.9.2
## [19] forcats_1.0.0 stringr_1.5.0
## [21] dplyr_1.1.0 purrr_1.0.1
## [23] readr_2.1.4 tidyr_1.3.0
## [25] tibble_3.1.8 tidyverse_2.0.0
## [27] ggplot2_3.4.1 dada2_1.26.0
## [29] Rcpp_1.0.10
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-7 RColorBrewer_1.1-3 tools_4.2.1
## [4] bslib_0.4.2 utf8_1.2.3 R6_2.5.1
## [7] DBI_1.1.3 colorspace_2.1-0 rhdf5filters_1.10.0
## [10] withr_2.5.0 tidyselect_1.2.0 compiler_4.2.1
## [13] cli_3.6.0 DelayedArray_0.24.0 labeling_0.4.2
## [16] sass_0.4.5 scales_1.2.1 digest_0.6.31
## [19] rmarkdown_2.20 jpeg_0.1-10 pkgconfig_2.0.3
## [22] htmltools_0.5.4 highr_0.10 fastmap_1.1.0
## [25] htmlwidgets_1.6.1 rlang_1.0.6 farver_2.1.1
## [28] jquerylib_0.1.4 generics_0.1.3 hwriter_1.3.2.1
## [31] jsonlite_1.8.4 crosstalk_1.2.0 RCurl_1.98-1.10
## [34] magrittr_2.0.3 GenomeInfoDbData_1.2.9 interp_1.1-3
## [37] Matrix_1.5-3 munsell_0.5.0 Rhdf5lib_1.20.0
## [40] fansi_1.0.4 lifecycle_1.0.3 stringi_1.7.12
## [43] yaml_2.3.7 zlibbioc_1.44.0 rhdf5_2.42.0
## [46] plyr_1.8.8 grid_4.2.1 parallel_4.2.1
## [49] crayon_1.5.2 deldir_1.0-6 lattice_0.20-45
## [52] hms_1.1.2 knitr_1.42 pillar_1.8.1
## [55] reshape2_1.4.4 codetools_0.2-19 glue_1.6.2
## [58] evaluate_0.20 latticeExtra_0.6-30 RcppParallel_5.1.6
## [61] png_0.1-8 vctrs_0.5.2 tzdb_0.3.0
## [64] gtable_0.3.1 cachem_1.0.6 xfun_0.37
## [67] timechange_0.2.0 ellipsis_0.3.2