Commit 6786a19f authored by Michael Nakai's avatar Michael Nakai
Browse files

Initial commit

parents
^OneStopShop\.Rproj$
^\.Rproj\.user$
^LICENSE\.md$
.Rproj.user
.Rhistory
.RData
Package: OneStopShop
Title: A Wrapper For A More Straightforward Workflow
Version: 0.99.0
Authors@R:
person(given = "Michael E.",
family = "Nakai",
role = c("aut", "cre"),
email = "mnakai.zj9@gmail.com",
comment = c(ORCID = "0000-0002-6400-8007"))
Description: A wrapper for multiple ecology/microbiome R packages that supports a more straightforward workflow.
License: GPL-3
Encoding: UTF-8
LazyData: true
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.1.1
biocViews:
Requires:
Biostrings,
dada2,
data.table,
DECIPHER,
microbiome,
phangorn,
phyloseq,
tidyverse,
vegan
This diff is collapsed.
# Generated by roxygen2: do not edit by hand
export(DADA2_main)
export(create_alpha_table)
export(create_rarefaction_curve)
export(create_tree)
export(format_q2_metadata)
export(handoff_to_phyloseq)
export(label_taxonomy)
export(plot_alpha_all)
export(plot_alpha_group)
export(plot_alpha_specific)
export(rarefy_phyloseq)
Version: 1.0
RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: Default
EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 4
Encoding: UTF-8
RnwWeave: Sweave
LaTeX: pdfLaTeX
AutoAppendNewline: Yes
StripTrailingWhitespace: Yes
LineEndingConversion: Posix
BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source
PackageRoxygenize: rd,collate,namespace
#' Helper function to replace slashes and backslashes depending on the OS type.
#'
#' @keywords internal
#'
#' @param pathstring A string of a filepath containing slashes or backslashes.
#'
#' @return A OS-dependent filepath string.
fileplatform <- function(pathstring) {
interm <- pathstring
ostype <- Sys.info()['sysname']
if (ostype == 'Windows') {
interm <- stringr::str_replace_all(interm, "/", "\\\\")
} else {
interm <- stringr::str_replace_all(interm, "\\\\", "/")
}
return(interm)
}
#' A helper function to quickly load phyloseq objects. Currently unused.
#'
#' @param output_dir A filepath to the output directory, without the final slash.
#' @param rarefy_samples A boolean showing whether a rarefied phyloseq object should be loaded or not.
#'
#' @return The loaded phyloseq object.
#'
#' @keywords internal
loadphyloseq <- function(output_dir, rarefy_samples=FALSE) {
if (rarefy_samples) {
phyloseqobj <- readRDS(paste0(output_dir, fileplatform("/Rarefaction/RDS_objects/rarefied_ps.RDS")))
} else {
phyloseqobj <- readRDS(paste0(output_dir, fileplatform("/dada2/RDS_objects/phyloseq_output.RDS")))
}
return(phyloseqobj)
}
#' Contains all DADA2 preprocessing, denoising, merging, and chimera removal processes.
#'
#' @param fastq_folderpath The filepath to the directory containing the FASTQ files to process.
#' @param outputpath The filepath to the directory where all filtered sequences and summary data should be output.
#' @param trim_left The number of base pairs to trim from the start of all sequences (min 0).
#' @param truncF The base number that forward reads should truncate at.
#' @param truncR The base number that reverse reads should truncate at.
#' @param forwardReadPattern A filename identifier that identifies FASTQ files with forward reads (ex: "_F").
#' @param reverseReadPattern A filename identifier that identifies FASTQ files with reverse reads (ex: "_R").
#' @param max_EE_allowed A vector containing the maximum error rates allowed (default c(2,2)).
#' @param max_N_allowed The maximum amount of unknown bases labelled N to allow before discarding a read (default 0).
#' @param min_length_allowed The minimum length a read can be. If the read length < min_length_allowed, the read is discarded (default 0).
#' @param min_quality_allowed The minimum quality score that a base can have before the entire read is discarded (default 0).
#' @param remove_phiX A boolean signifying whether reads assigned to phiX should be removed (default TRUE).
#' @param verbose_output A boolean signifying whether the main DADA2 step should be verbose or silent (default TRUE).
#' @param pooling A boolean signifying whether DADA2 should pool samples together prior to sample inference (default FALSE).
#'
#' @return Returns a table of sequences, all of which are non-chimeric and have been denoised and merged.
#'
#' @export
DADA2_main <- function(fastq_folderpath, outputpath, trim_left, truncF, truncR, forwardReadPattern, reverseReadPattern,
max_EE_allowed=c(2,2), max_N_allowed=0, max_length_allowed=10000, min_length_allowed=0,
min_quality_allowed=0, remove_phiX=TRUE, verbose_output=TRUE, pooling=FALSE) {
message("Starting file sort...")
# Sort into forward and reverse reads
fnFs <- sort(list.files(fastq_folderpath,
pattern=forwardReadPattern,
full.names = TRUE))
fnRs <- sort(list.files(fastq_folderpath,
pattern=reverseReadPattern,
full.names = TRUE))
# Extract sample names, assuming the sample names are before the first underscore
sample.names <- sapply(strsplit(basename(fnFs), "_"), `[`, 1)
# Make the output paths per filtered file, then label them with the corresponding sample names
dir.create(paste0(outputpath, fileplatform("/dada2/")), showWarnings = FALSE)
filtFs <- file.path(outputpath, fileplatform("dada2/filtered/"), paste0(sample.names, "_F_filt.fastq.gz"))
filtRs <- file.path(outputpath, fileplatform("dada2/filtered/"), paste0(sample.names, "_R_filt.fastq.gz"))
names(filtFs) <- sample.names
names(filtRs) <- sample.names
# Do the filtering and save the output
message("Starting DADA2 filtering and trimming...")
trunclens <- c(truncF, truncR)
out <- dada2::filterAndTrim(fnFs, filtFs, fnRs, filtRs,
truncLen = trunclens,
maxN = max_N_allowed,
maxEE = max_EE_allowed,
maxLen = max_length_allowed,
minLen = min_length_allowed,
minQ = min_quality_allowed,
truncQ = 2,
rm.phix = remove_phiX,
compress = TRUE, multithread = TRUE, verbose = verbose_output)
dada2_save_path <- paste0(outputpath, fileplatform("/dada2/summary/filterAndTrimOutput.csv"))
dir.create(paste0(outputpath, fileplatform("/dada2/summary/")), showWarnings = FALSE)
write.csv(out, dada2_save_path)
# Check the estimated error rates for possible base --> base errors
# This takes a while, depending on how many reads we have, so fire and forget it
# If the red/black lines trend in the same direction and fit relatively well together, then everything's OK
message("Checking estimated base --> base error rates...")
errF <- learnErrors(filtFs, multithread=TRUE)
errR <- learnErrors(filtRs, multithread=TRUE)
dada2::plotErrors(errF, nominalQ=TRUE)
# Apply the core sample inference algorithm (again, can take a while)
message("Applying sample inference algorithm...")
dadaFs <- dada2::dada(filtFs, err=errF, multithread=TRUE, pool = pooling)
dadaRs <- dada2::dada(filtRs, err=errR, multithread=TRUE, pool = pooling)
# Merge the paired-end reads
message("Merging forward and reverse reads...")
mergers <- dada2::mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=verbose_output)
# Make a sequence table
message("Making sequence table...")
seqtab <- dada2::makeSequenceTable(mergers)
# Check that many of the sequences are all around the same length
table_save_path <- paste0(outputpath, fileplatform("/dada2/summary/readLength.csv"))
lengthTable <- table(nchar(getSequences(seqtab)))
write.csv(lengthTable, table_save_path)
# Remove chimeras
message("Removing chimeras...")
seqtab.nochim <- dada2::removeBimeraDenovo(seqtab, method="consensus", multithread=TRUE, verbose=verbose_output)
# Output summary of all steps, then save
message("Creating DADA2 summary...")
getN <- function(x) sum(getUniques(x))
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
dada2_final_save_path <- paste0(outputpath, fileplatform("/dada2/summary/Summary.csv"))
write.csv(track, dada2_final_save_path)
# Save the final seqtab
dir.create(paste0(outputpath, fileplatform("/dada2/RDS_objects/")), showWarnings = FALSE)
saveRDS(seqtab.nochim, file = paste0(outputpath, fileplatform("/dada2/RDS_objects/seqtab_nochim.RDS")))
return(seqtab.nochim)
message("Finished DADA2 denoising\n")
}
#' Creates a phylogenetic tree using phangorn and DECEIPHER.
#'
#' @param seqtab A table of sequences. Can be generated via the [DADA2_main()] function.
#'
#' @return A phylogenetic tree object, used to make a full phyloseq object.
#'
#' @seealso [DADA2_main()] for sequence table creation.
#' @export
create_tree <- function(seqtab) {
message("Creating phylogenetic tree...")
seqs <- dada2::getSequences(seqtab)
names(seqs) <- seqs # This propagates to the tip labels of the tree
alignment <- DECIPHER::AlignSeqs(DNAStringSet(seqs), anchor=NA)
phang.align <- phangorn::phyDat(as(alignment, "matrix"), type="DNA")
dm <- phangorn::dist.ml(phang.align)
treeNJ <- phangorn::NJ(dm) # Note, tip order != sequence order
fit = phangorn::pml(treeNJ, data=phang.align)
fitGTR <- update(fit, k=4, inv=0.2)
fitGTR <- phangorn::optim.pml(fitGTR, model="GTR", optInv=TRUE, optGamma=TRUE,
rearrangement = "stochastic", control = pml.control(trace = 0)) # This takes a while
return(fitGTR)
message("Finished phylogenetic tree creation\n")
}
#' Formats metadata from a QIIME2-compatible .tsv file to a data frame, and prepares it for downstream use.
#'
#' @param metadata_path A filepath to the metadata file.
#'
#' @return A dataframe of the imported metadata.
#' @export
format_q2_metadata <- function(metadata_path) {
# Format the metadata for the phyloseq sample_data
message("Formatting metadata for phyloseq handoff...")
temp_sample_data <- as.data.frame(fread(metadata_path))
if (temp_sample_data[1,1] == "#q2:types") { # If the second row is #q2:types then delete it
temp_sample_data <- temp_sample_data[-c(1),]
}
rowname_sample_data <- temp_sample_data[,-1] # Reassign the first column as the rownames
rownames(rowname_sample_data) <- temp_sample_data[,1]
return(rowname_sample_data)
message("Finished formatting metadata\n")
}
#' Creates a phyloseq object from the supplied data. If outputpath is specified, save the phyloseq object.
#'
#' @param sequence_table A sequence table to include. Can be generated from [DADA2_main()].
#' @param metadata A metadata table to include. Can be generated from [format_q2_metadata()].
#' @param taxa_table A taxonomy table to include. Can be generated from [label_taxonomy()].
#' @param tree (optional) A phylogenetic tree to include. Can be generated from [create_tree()].
#' @param outputpath (optional) A filepath to a folder to output the saved phyloseq object as an RDS object.
#' @param substitute_ASV_names (optional) A boolean specifying whether ASV names should be substituted with ASV1, ASV2... The names will be stored in "dna".
#'
#' @return A phyloseq object consisting of the supplied sequence table, metadata, taxonomy table, and (optional) phylogenetic tree.
#'
#' @seealso [DADA2_main()] for sequence table creation, [format_q2_metadata()] for metadata importing, [label_taxonomy()] for taxonomy table creation, and [create_tree()] for phylogenetic tree creation.
#' @export
handoff_to_phyloseq <- function(sequence_table, metadata, taxa_table, tree=FALSE, outputpath=FALSE, substitute_ASV_names=FALSE){
message("Starting phyloseq handoff...")
if (tree == FALSE) {
ps_object <- phyloseq::phyloseq(phyloseq::otu_table(seqtab, taxa_are_rows=FALSE),
phyloseq::sample_data(metadata),
phyloseq::tax_table(taxa_table))
} else {
ps_object <- phyloseq::phyloseq(phyloseq::otu_table(seqtab, taxa_are_rows=FALSE),
phyloseq::sample_data(metadata),
phyloseq::phy_tree(tree),
phyloseq::tax_table(taxa_table))
}
if (substitute_ASV_names) {
dna <- Biostrings::DNAStringSet(taxa_names(ps_object))
names(dna) <- phyloseq::taxa_names(ps_object)
ps_object <- phyloseq::merge_phyloseq(ps_object, dna)
phyloseq::taxa_names(ps_object) <- paste0("ASV", seq(phyloseq::ntaxa(ps_object)))
}
if (outputpath) {
dir.create(paste0(outputpath, fileplatform("/dada2/")), showWarnings = FALSE)
dir.create(paste0(outputpath, fileplatform("/dada2/RDS_objects/")), showWarnings = FALSE)
saveRDS(ps_object, file = paste0(outputpath, fileplatform("/dada2/RDS_objects/phyloseq_output.RDS")))
}
return(ps_object)
message("Finished phyloseq handoff\n")
}
#' Assigns taxonomy classifications to a provided table of sequences.
#'
#' @param seqtab The provided table of sequences. Can be generated from DADA2_main().
#' @param pathToDatabase A filepath to a genus-level classification database compatible with DADA2.
#' @param species A boolean specifying whether specie-level classification should be attempted.
#' @param pathToSpecies A filepath to a species-level classification database compatible with DADA2.
#'
#' @return Returns a table of classified sequences.
#' @export
label_taxonomy <- function(seqtab, pathToDatabase, species=FALSE, pathToSpecies=FALSE) {
message("Assigning taxonomy...")
taxa <- dada2::assignTaxonomy(seqtab.nochim, pathToDatabase, multithread=TRUE)
if (species) {
message("Assigning species...")
taxa <- dada2::addSpecies(taxa, pathToSpecies)
}
return(taxa)
message("Finished taxonomy annotation")
}
#' Creates a rarefaction curve using vegan's rarecurve().
#'
#' @param phyloseq_object The phyloseq object to be loaded.
#' @param step_size The intervals the rarefaction curve should be crafted on.
#'
#' @return A rarefaction curve.
#'
#' @seealso <https://www.rdocumentation.org/packages/vegan/versions/2.4-2/topics/rarefy>
#' @export
create_rarefaction_curve <- function(phyloseq_object, step_size=20) {
message("Starting rarefaction curve generation...")
rarefaction_curve <- vegan::rarecurve(otu_table(ps_object, FALSE), step=step_size, cex=0.5)
return(rarefaction_curve)
message("Finished rarefaction curve generation\n")
}
#' Rarefies samples in the provided phyloseq object to the specified rarefaction depth. Optionally saves the resulting phyloseq object.
#'
#' @param phyloseq_object The phyloseq object to rarefy.
#' @param rarefaction_depth A number specifying the number of reads to rarefy to per sample.
#' @param outputpath (optional) A filepath specifying the folder to save the rarefied object to.
#'
#' @return A rarefied phyloseq object.
#' @export
rarefy_phyloseq <- function(phyloseq_object, rarefaction_depth, outputpath=FALSE) {
message("Starting rarefaction...")
# Rarefy the phyloseq object and save it
ps.rarefied = rarefy_even_depth(phyloseq_object, sample.size=rarefaction_depth, replace=FALSE)
if (outputpath) {
# Create the output folder
dir.create(paste0(outputpath, fileplatform("/Rarefaction")), showWarnings = FALSE)
dir.create(paste0(outputpath, fileplatform("/Rarefaction/RDS_objects")), showWarnings = FALSE)
saveRDS(ps.rarefied, file = paste0(outputpath, fileplatform("/Rarefaction/RDS_objects/rarefied_ps.RDS")))
}
return(ps.rarefied)
message("Finished rarefying samples")
}
#' Creates a table of alpha diversity metrics per sample. Optionally saves the table as a .tsv file.
#'
#' @param phyloseq_object The phyloseq object containing the target dataset.
#' @param outputpath (optional) A filepath to the folder that the table should be saved into.
#'
#' @return A table of alpha diversity metrics per sample.
#' @export
create_alpha_table <- function(phyloseq_object, outputpath=FALSE) {
message("Starting microbiome diversity table generation...")
tab <- microbiome::alpha(phyloseq_object, index = "all")
if (outputpath) {
dir.create(paste0(outputpath, fileplatform("/alpha_diversity_table")), showWarnings = FALSE)
write.table(tab,
file=paste0(outputpath, fileplatform("/alpha_diversity_table/alpha_diversity.tsv")),
row.names=TRUE,
col.names=TRUE,
na = "",
sep = "\t",
quote = FALSE)
}
return(tab)
message("Finished table generation\n")
}
#' Plots all general alpha diversity metrics on one plot.
#'
#' @param phyloseq_object The phyloseq object containing the data to be visualized.
#'
#' @return A plot showing all general metrics of alpha diversity.
#' @export
plot_alpha_all <- function(phyloseq_object) {
general_richness <- phyloseq::plot_richness(phyloseq_object)
return(general_richness)
}
#' Creates a list of alpha diversity plots with samples separated into groups based on the provided grouping variable.
#'
#' @param phyloseq_object The phyloseq object containing the dataset to be plotted.
#' @param grouping_variable The column name in the metadata that the samples should be grouped by. Multiple columns can be looped over if provided in a vector.
#' @param alpha_types The types of alpha diversity metrics to plot.
#'
#' @return Returns a list of alpha diversity plots (if grouping variable isn't a vector) or a list of lists (if grouping variable is a vector). Raises an error if alpha_types isn't a vector
#' @export
plot_alpha_group <- function(phyloseq_object, grouping_variable, alpha_types=c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")) {
# Check that alpha_types is a vector
if (!(is.vector(alpha_types))) {
stop("alpha_types is not a vector!")
}
if (!(is.vector(grouping_variable))) {
outputList <- vector("list", length = 0)
for (divType in alpha_types) {
outputList[[divType]] <- phyloseq::plot_richness(ps_object, x = grouping_variable, measures = divType)
}
}
if (is.vector(grouping_variable)) {
outputList <- vector("list", length = 0)
for (variab in grouping_variable) {
intermList <- vector("list", length = 0)
for (divType in alpha_types) {
intermList[[divType]] <- phyloseq::plot_richness(ps_object, x = variab, measures = divType)
}
outputList[[variab]] <- intermList
}
}
return(outputList)
}
#' Creates a list of alpha diversity plots based on the metrics specified.
#'
#' @param phyloseq_object The phyloseq object containing the relevant dataset.
#' @param alpha_types A vector containing the types of alpha diversity metrics to show.
#'
#' @return Returns a list of alpha diversity plots, or raises an error if alpha_types isn't a vector
#'
#' @seealso <https://www.rdocumentation.org/packages/phyloseq/versions/1.16.2/topics/plot_richness> for the primarily used function.
#' @export
plot_alpha_specific <- function(phyloseq_object, alpha_types=c("Observed", "Chao1", "ACE", "Shannon", "Simpson", "InvSimpson", "Fisher")) {
# Check that alpha_types is a vector
if (!(is.vector(alpha_types))) {
stop("alpha_types is not a vector!")
}
outputList <- vector("list", length = length(alpha_types))
i <- 1
for (divType in alpha_types) {
outputList[[i]] <- phyloseq::plot_richness(ps_object, measures = divType)
i <- i + 1
}
return(outputList)
}
% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/1-DADA2_main.R
\name{DADA2_main}
\alias{DADA2_main}
\title{Contains all DADA2 preprocessing, denoising, merging, and chimera removal processes.}
\usage{
DADA2_main(
fastq_folderpath,
outputpath,
trim_left,
truncF,
truncR,
forwardReadPattern,
reverseReadPattern,
max_EE_allowed = c(2, 2),
max_N_allowed = 0,
max_length_allowed = 10000,
min_length_allowed = 0,
min_quality_allowed = 0,
remove_phiX = TRUE,
verbose_output = TRUE,
pooling = FALSE
)
}
\arguments{
\item{fastq_folderpath}{The filepath to the directory containing the FASTQ files to process.}
\item{outputpath}{The filepath to the directory where all filtered sequences and summary data should be output.}
\item{trim_left}{The number of base pairs to trim from the start of all sequences (min 0).}
\item{truncF}{The base number that forward reads should truncate at.}
\item{truncR}{The base number that reverse reads should truncate at.}
\item{forwardReadPattern}{A filename identifier that identifies FASTQ files with forward reads (ex: "_F").}
\item{reverseReadPattern}{A filename identifier that identifies FASTQ files with reverse reads (ex: "_R").}
\item{max_EE_allowed}{A vector containing the maximum error rates allowed (default c(2,2)).}
\item{max_N_allowed}{The maximum amount of unknown bases labelled N to allow before discarding a read (default 0).}
\item{min_length_allowed}{The minimum length a read can be. If the read length < min_length_allowed, the read is discarded (default 0).}
\item{min_quality_allowed}{The minimum quality score that a base can have before the entire read is discarded (default 0).}
\item{remove_phiX}{A boolean signifying whether reads assigned to phiX should be removed (default TRUE).}
\item{verbose_output}{A boolean signifying whether the main DADA2 step should be verbose or silent (default TRUE).}
\item{pooling}{A boolean signifying whether DADA2 should pool samples together prior to sample inference (default FALSE).}
}
\value{
Returns a table of sequences, all of which are non-chimeric and have been denoised and merged.
}
\description{
Contains all DADA2 preprocessing, denoising, merging, and chimera removal processes.
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment