|
| 1 | +#' Differential abundance with DESeq2 |
| 2 | +#' |
| 3 | +#' EXPERIMENTAL: This function is still being tested and developed; use with caution. Uses the |
| 4 | +#' \code{\link{DESeq2}} package to conduct differential abundance analysis of count data. Counts can |
| 5 | +#' be of OTUs/ASVs or taxa. The plotting function \code{\link{heat_tree_matrix}} is useful for |
| 6 | +#' visualizing these results. See details section below for considerations on preparing data for |
| 7 | +#' this analysis. |
| 8 | +#' |
| 9 | +#' Data should be raw read counts, not rarefied, converted to proportions, or modified with any |
| 10 | +#' other technique designed to correct for sample size since \code{\link{DESeq2}} is designed to be |
| 11 | +#' used with count data and takes into accoutn unequal sample size when determining differential |
| 12 | +#' abundance. Warnings will be given if the data is not integers or all sample sizes are equal. |
| 13 | +#' |
| 14 | +#' @param obj A \code{\link[taxa]{taxmap}} object |
| 15 | +#' @param data The name of a table in \code{obj} that contains data for each sample in columns. |
| 16 | +#' @param cols The names/indexes of columns in \code{data} to use. By default, all numeric columns |
| 17 | +#' are used. Takes one of the following inputs: \describe{ \item{TRUE/FALSE:}{All/No columns will |
| 18 | +#' used.} \item{Character vector:}{The names of columns to use} \item{Numeric vector:}{The indexes |
| 19 | +#' of columns to use} \item{Vector of TRUE/FALSE of length equal to the number of columns:}{Use |
| 20 | +#' the columns corresponding to \code{TRUE} values.} } |
| 21 | +#' @param groups A vector defining how samples are grouped into "treatments". Must be the same order |
| 22 | +#' and length as \code{cols}. |
| 23 | +#' @param other_cols If \code{TRUE}, preserve all columns not in \code{cols} in the output. If |
| 24 | +#' \code{FALSE}, dont keep other columns. If a column names or indexes are supplied, only preserve |
| 25 | +#' those columns. |
| 26 | +#' @param lfc_shrinkage What technique to use to adjust the log fold change results for low counts. |
| 27 | +#' Useful for ranking and visualizing log fold changes. Must be one of the following: |
| 28 | +#' \describe{ |
| 29 | +#' \item{'none'}{No log fold change adjustments.} |
| 30 | +#' \item{'normal'}{The original DESeq2 shrinkage estimator} |
| 31 | +#' \item{'ashr'}{Adaptive shrinkage estimator from the \code{\link{ashr}} package, using a fitted mixture of normals prior.} |
| 32 | +#' } |
| 33 | +#' @param ... Passed to \code{\link[DESeq2]{results}} if the \code{lfc_shrinkage} option is "none" |
| 34 | +#' and to \code{\link[DESeq2]{lfcShrink}} otherwise. |
| 35 | +#' |
| 36 | +#' @return A tibble with at least the taxon ID of the thing tested, the groups compared, and the |
| 37 | +#' DESeq2 results. The \code{log2FoldChange} values will be positive if \code{treatment_1} is more |
| 38 | +#' abundant and \code{treatment_2}. |
| 39 | +#' |
| 40 | +#' @family calculations |
| 41 | +#' |
| 42 | +#' @examples |
| 43 | +#' \dontrun{ |
| 44 | +#' # Parse data for plotting |
| 45 | +#' x = parse_tax_data(hmp_otus, class_cols = "lineage", class_sep = ";", |
| 46 | +#' class_key = c(tax_rank = "taxon_rank", tax_name = "taxon_name"), |
| 47 | +#' class_regex = "^(.+)__(.+)$") |
| 48 | +#' |
| 49 | +#' # Get per-taxon counts |
| 50 | +#' x$data$tax_table <- calc_taxon_abund(x, data = "tax_data", cols = hmp_samples$sample_id) |
| 51 | +#' |
| 52 | +#' # Calculate difference between groups |
| 53 | +#' x$data$diff_table <- calc_diff_abund_deseq2(x, data = "tax_table", |
| 54 | +#' cols = hmp_samples$sample_id, |
| 55 | +#' groups = hmp_samples$body_site) |
| 56 | +#' |
| 57 | +#' # Plot results (might take a few minutes) |
| 58 | +#' heat_tree_matrix(x, |
| 59 | +#' data = "diff_table", |
| 60 | +#' node_size = n_obs, |
| 61 | +#' node_label = taxon_names, |
| 62 | +#' node_color = ifelse(is.na(padj) | padj > 0.05, 0, log2FoldChange), |
| 63 | +#' node_color_range = diverging_palette(), |
| 64 | +#' node_color_trans = "linear", |
| 65 | +#' node_color_interval = c(-3, 3), |
| 66 | +#' edge_color_interval = c(-3, 3), |
| 67 | +#' node_size_axis_label = "Number of OTUs", |
| 68 | +#' node_color_axis_label = "Log2 fold change") |
| 69 | +#' |
| 70 | +#' } |
| 71 | +#' |
| 72 | +#' @export |
| 73 | +calc_diff_abund_deseq2 <- function(obj, data, cols, groups, other_cols = FALSE, |
| 74 | + lfc_shrinkage = c('none', 'normal', 'ashr'), ...) { |
| 75 | + |
| 76 | + # Check that DESeq2 is intalled |
| 77 | + if (! requireNamespace("DESeq2", quietly = TRUE)) { |
| 78 | + stop('The "DESeq2" package needs to be installed for this function to work.', |
| 79 | + call. = FALSE) |
| 80 | + } |
| 81 | + |
| 82 | + # Parse options |
| 83 | + lfc_shrinkage <- match.arg(lfc_shrinkage) |
| 84 | + |
| 85 | + # Get abundance by sample data |
| 86 | + abund_data <- get_taxmap_table(obj, data) |
| 87 | + |
| 88 | + # Parse columns to use |
| 89 | + cols <- get_numeric_cols(obj, data, cols) |
| 90 | + |
| 91 | + # Check that columns contain integers only |
| 92 | + col_is_int <- vapply(cols, FUN.VALUE = logical(1), function(c) { |
| 93 | + all(abund_data[[c]] %% 1 == 0) |
| 94 | + }) |
| 95 | + non_int_cols <- cols[! col_is_int] |
| 96 | + if (length(non_int_cols) > 0) { |
| 97 | + stop(call. = FALSE, |
| 98 | + 'All data given to DESeq2 should be untransformed counts. The following columns contain non-integers:\n', |
| 99 | + limited_print(type = 'silent', non_int_cols, prefix = ' ')) |
| 100 | + } |
| 101 | + |
| 102 | + # Check for equal sample sizes |
| 103 | + if (length(unique(colSums(abund_data[cols]))) == 1) { |
| 104 | + warning(call. = FALSE, |
| 105 | + 'All columns have equal counts, suggesting counts were normalized (e.g. rarefied) to correct for sample size variation.', |
| 106 | + ' DESeq2 is designed to be used with unnormalized data.', |
| 107 | + ' Use untransformed counts if available.') |
| 108 | + } |
| 109 | + |
| 110 | + # Check groups option |
| 111 | + groups <- check_option_groups(groups, cols) |
| 112 | + |
| 113 | + # Find other columns |
| 114 | + # These might be added back to the output later |
| 115 | + other_cols <- get_taxmap_other_cols(obj, data, cols, other_cols) |
| 116 | + |
| 117 | + # Get every combination of groups to compare |
| 118 | + combinations <- t(utils::combn(unique(groups), 2)) |
| 119 | + combinations <- lapply(seq_len(nrow(combinations)), function(i) combinations[i, ]) |
| 120 | + |
| 121 | + # Format data for DESeq2 |
| 122 | + metadata <- data.frame(group = groups) |
| 123 | + deseq_data <- DESeq2::DESeqDataSetFromMatrix(countData = abund_data[cols], |
| 124 | + colData = metadata, |
| 125 | + design = ~ group) |
| 126 | + |
| 127 | + # Run DESeq2 |
| 128 | + deseq_result <- DESeq2::DESeq(deseq_data) |
| 129 | + |
| 130 | + # Make function to compare one pair of groups |
| 131 | + one_comparison <- function(treat_1, treat_2) { |
| 132 | + |
| 133 | + # Extract results for a single pair |
| 134 | + if (lfc_shrinkage == 'none') { |
| 135 | + pair_result <- DESeq2::results(deseq_result, contrast = c('group', treat_1, treat_2), ...) |
| 136 | + } else { |
| 137 | + pair_result <- DESeq2::lfcShrink(deseq_result, contrast = c('group', treat_1, treat_2), |
| 138 | + type = lfc_shrinkage, ...) |
| 139 | + } |
| 140 | + |
| 141 | + # Add treatments compared |
| 142 | + output <- cbind(data.frame(stringsAsFactors = FALSE, |
| 143 | + treatment_1 = treat_1, |
| 144 | + treatment_2 = treat_2), |
| 145 | + as.data.frame(pair_result)) |
| 146 | + |
| 147 | + # Add in other columns if specified |
| 148 | + output <- cbind(abund_data[, other_cols], output) |
| 149 | + |
| 150 | + return(output) |
| 151 | + } |
| 152 | + |
| 153 | + # Compare all pairs of treatments and combine |
| 154 | + output <- lapply(seq_len(length(combinations)), function(i) { |
| 155 | + one_comparison(combinations[[i]][1], combinations[[i]][2]) |
| 156 | + }) |
| 157 | + output <- do.call(rbind, output) |
| 158 | + |
| 159 | + # Convert to tibble and return |
| 160 | + dplyr::as_tibble(output) |
| 161 | +} |
0 commit comments