Skip to content

Commit 898475a

Browse files
use factor order in compare_groups
addresses #323 When an ordered factor is supplied to the `groups` argument of `compare_groups`, the order of levels is used to arrange the results. This can be used to change the order of groups when the output is used with `heat_tree_matrix`
1 parent 8146849 commit 898475a

File tree

5 files changed

+223
-3
lines changed

5 files changed

+223
-3
lines changed

DESCRIPTION

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
Package: metacoder
22
Title: Tools for Parsing, Manipulating, and Graphing Taxonomic Abundance Data
3-
Version: 0.3.5
3+
Version: 0.3.5.001
44
Authors@R: c(person("Zachary", "Foster", email =
55
"zacharyfoster1989@gmail.com", role = c("aut", "cre")),
66
person("Niklaus", "Grunwald", email =
@@ -52,7 +52,7 @@ Suggests:
5252
traits,
5353
biomformat
5454
VignetteBuilder: knitr
55-
RoxygenNote: 7.1.1
55+
RoxygenNote: 7.1.2
5656
Date: 2021-06-23
5757
Encoding: UTF-8
5858
biocViews:

NEWS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
# News
22

3+
## Current
4+
5+
* When an ordered factor is supplied to the `groups` argument of `compare_groups`, the order of levels is used to arrange the results. This can be used to change the order of groups when the output is used with `heat_tree_matrix` (issue [#323](https://github.com/grunwaldlab/metacoder/issues/323)).
6+
37
## metacoder 0.3.5
48

59
* Replaced depreciated `as.tbl` function from `dplyr`'

R/calculations.R

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -640,7 +640,12 @@ compare_groups <- function(obj, data, cols, groups,
640640

641641
# Get every combination of groups to compare
642642
if (is.null(combinations)) {
643-
combinations <- t(utils::combn(unique(groups), 2))
643+
if (is.ordered(groups)) {
644+
group_order <- factor(levels(groups), levels = levels(groups), order = TRUE)
645+
} else {
646+
group_order <- unique(groups)
647+
}
648+
combinations <- t(utils::combn(group_order, 2))
644649
combinations <- lapply(seq_len(nrow(combinations)), function(i) combinations[i, ])
645650
}
646651

R/heat_tree_matrix.R

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,8 @@ heat_tree_matrix <- function(obj, data, label_small_trees = FALSE,
7979
treat_1 <- as.character(diff_table$treatment_1)
8080
treat_2 <- as.character(diff_table$treatment_2)
8181
treatments <- unique(c(treat_1, treat_2))
82+
treat_counts <- sort(vapply(treatments, FUN.VALUE = numeric(1), function(x) sum(treat_1 == x)), decreasing = TRUE)
83+
treatments <- names(treat_counts)
8284
combinations <- t(utils::combn(seq_along(treatments), 2))
8385
layout_matrix <- matrix(rep(NA, (length(treatments))^2), nrow = length(treatments))
8486
for (index in 1:nrow(combinations)) {

scratch/search_ncbi.R

Lines changed: 209 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,209 @@
1+
2+
#' @keywords internal
3+
parse_ft <- function(text) {
4+
text <- gsub(text, pattern = "\n\t\t\t", replacement = "\t", fixed = TRUE)
5+
parts <- strsplit(text, "\n\n", fixed = TRUE)[[1]]
6+
part_data <- lapply(parts, function(x) {
7+
first_line <- sub(x, pattern = "\\n.+$", replacement = "")
8+
acc <- stringr::str_match(first_line, pattern = "\\|(.+)\\|")[,2]
9+
the_rest <- sub(x, pattern = "^>.+?\\n", replacement = "")
10+
# replace extra \t with , when there are multiple features
11+
lines <- strsplit(the_rest, "\n")[[1]]
12+
lines <- purrr::map2(stringr::str_locate_all(lines, "\t"), lines, function(matches, line) {
13+
if (nrow(matches) > 4) {
14+
for (i in matches[3:(nrow(matches) - 2), 1]) {
15+
substr(line, i, i) <- ","
16+
}
17+
}
18+
return(line)
19+
})
20+
the_rest <- paste0(lines, collapse = "\n")
21+
output <- readr::read_tsv(paste0(the_rest, "\n"), col_names = c("start", "end", "feature", "type", "name"), col_types = "ccccc")
22+
output <- tibble::as_tibble(cbind(list(acc = acc), output, stringsAsFactors = FALSE))
23+
})
24+
output <- dplyr::bind_rows(part_data)
25+
output$complete <- ifelse(startsWith(as.character(output$start), "<") | startsWith(as.character(output$end), ">"), FALSE, TRUE)
26+
output$start <- as.integer(gsub(output$start, pattern = "<", replacement = ""))
27+
output$end <- as.integer(gsub(output$end, pattern = ">", replacement = ""))
28+
return(output)
29+
}
30+
31+
#' @keywords internal
32+
parse_seqs <- function(text) {
33+
xml <- xml2::read_xml(text)
34+
tibble::tibble(acc = xml2::xml_text(xml2::xml_find_all(xml, "//TSeq_accver")),
35+
seq = xml2::xml_text(xml2::xml_find_all(xml, "//TSeq_sequence")),
36+
header = xml2::xml_text(xml2::xml_find_all(xml, "//TSeq_defline")),
37+
length = xml2::xml_text(xml2::xml_find_all(xml, "//TSeq_length")))
38+
}
39+
40+
#' Lookup gene sequences from NCBI
41+
#'
42+
#' Look for sequences of a particular gene for a list of species/isolates/genes from the Genbank
43+
#' nucleotide database.
44+
#'
45+
#' @param species The names of species to look up.
46+
#' @param genes The names of the genes to look up.
47+
#' @param isolates The names of isolates to look up. Must be the same length as \code{species} if
48+
#' used.
49+
#' @param extract_features If TRUE, return the sequence for each feature in the sequence annotation
50+
#' instead of the whole sequence.
51+
#' @param gene_name_in_feature If TRUE, only return features that have one of the gene names
52+
#' somewhere in their description. Only has an effect if extract_features is TRUE.
53+
#' @param flanking A vector of length 2. The number of base pairs before and after the target gene
54+
#' to include in the sequence returned. If the flanking sequence is not available, the sequence
55+
#' will be considered incomplete.
56+
#' @param db The name of the NCBI database to query. Only tested with "nucleotide", but a few others
57+
#' might work.
58+
#' @param pause The number of seconds to pause between each query. This avoids annoying NCBI and
59+
#' having them block you IP address. Should be at least 0.35 seconds if you dont have an NCBI API
60+
#' key and at least 0.1 seconds if you do.
61+
#' @param ... Additional terms to add to the search request for each species/isolate, see NCBI
62+
#' documentation for a complete list:
63+
#' http://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ESearch_
64+
#'
65+
#' @return
66+
#'
67+
#' A tibble (a type of data.frame) with the following columns, depending on settings:
68+
#'
69+
#' * species: The species search term associated with the result
70+
#' * isolate: The isolate search term associated with the result
71+
#' * query: The query used to search for sequences on NCBI
72+
#' * acc: The Genbank accession number
73+
#' * start: The index of the first base pair of the target gene in the original sequence
74+
#' * end: The index of the last base pair of the target gene in the original sequence
75+
#' * feature: The type of locus
76+
#' * type: The type of annotation for the sequence
77+
#' * name: The name of the locus
78+
#' * complete: If the locus was complete in the original sequence, according to the annotation
79+
#' * seq: The whole sequence the gene was found in
80+
#' * header: The name of the overall sequence the gene was found in
81+
#' * length: The length of the whole sequence
82+
#' * flank_start: The index of the first base pair of the target gene plus the flanking region in the original sequence
83+
#' * flank_end: The index of the last base pair of the target gene plus the flanking region in the original sequence
84+
#' * flank_complete: If the locus plus flanking region was complete in the original sequence
85+
#' * gene_seq: The target gene sequence plus the flanking region
86+
#' * gene_length: The length of the target gene plus the flanking region
87+
#'
88+
#' @examples \dontrun{
89+
#'
90+
#' # Search for the whole seqeunces for with P. infestans Cox I
91+
#' get_isolate_seqs(species = c("Phytophthora infestans"),
92+
#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1"),
93+
#' retmax = 100)
94+
#'
95+
#' # Search for the just the gene sequence for P. infestans Cox I
96+
#' get_isolate_seqs(species = c("Phytophthora infestans"),
97+
#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1"),
98+
#' retmax = 100,
99+
#' extract_features = TRUE)
100+
#'
101+
#' # Search for all the gene sequences in whole sequences that contain P. infestans Cox I
102+
#' get_isolate_seqs(species = c("Phytophthora infestans"),
103+
#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1"),
104+
#' retmax = 100,
105+
#' extract_features = TRUE,
106+
#' gene_name_in_feature = FALSE)
107+
#'
108+
#' # Search for whole sequences for P. infestans Cox I for just some isolates
109+
#' get_isolate_seqs(species = c("Phytophthora infestans", "Phytophthora infestans", "Phytophthora infestans"),
110+
#' isolates = c("44", "580", "180"),
111+
#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1"))
112+
#'
113+
#' # Search for just the gene sequences for P. infestans Cox I for just some isolates
114+
#' get_isolate_seqs(species = c("Phytophthora infestans", "Phytophthora infestans", "Phytophthora infestans"),
115+
#' isolates = c("44", "580", "180"),
116+
#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1"),
117+
#' extract_features = TRUE)
118+
#'
119+
#' # Search for P infestans ITS with flanking regions and subset for complete results
120+
#' result <- get_isolate_seqs(species = c("Phytophthora"),
121+
#' genes = c("internal transcribed spacer"),
122+
#' retmax = 300,
123+
#' extract_features = TRUE,
124+
#' flanking = c(300, 300))
125+
#' result[result$complete, ]
126+
#' result[result$flank_complete, ]
127+
#'
128+
#' }
129+
#'
130+
#' @export
131+
get_isolate_seqs <- function(species, genes, isolates = NULL, extract_features = FALSE,
132+
gene_name_in_feature = TRUE, flanking = c(0, 0),
133+
db = "nucleotide", pause = 0.5, ...) {
134+
135+
if (! is.numeric(flanking) || length(flanking) != 2) {
136+
stop('The "flanking" option must be a numeric vector of length 2')
137+
}
138+
139+
get_one <- function(name, isolate = NULL) {
140+
141+
# Wait a bit so NCBI doesnt get unhappy
142+
Sys.sleep(pause)
143+
144+
# Search for sequences
145+
if (is.null(isolate)) {
146+
query <- paste0('"', name, '"[Organism] AND (', paste0('"', genes, '"[All Fields]', collapse = " OR "), ')')
147+
} else {
148+
query <- paste0('"', name, '"[Organism] AND ("', isolate, '"[Isolate] OR "', isolate, '"[Strain]) AND (', paste0('"', genes, '"[All Fields]', collapse = " OR "), ')')
149+
}
150+
search <- rentrez::entrez_search(db, term = query, ...)
151+
if (length(search$ids) == 0) {
152+
return(NULL)
153+
}
154+
155+
if (extract_features) {
156+
# Parse features
157+
features <- parse_ft(rentrez::entrez_fetch(db, id = search$ids, rettype = "ft", retmode = "text"))
158+
if (gene_name_in_feature) {
159+
gene_in_feature <- purrr:::map_lgl(features$name, function(text) {
160+
purrr:::reduce(lapply(genes, function(gene) grepl(tolower(text), pattern = tolower(gene), fixed = TRUE)), `|`)
161+
})
162+
features <- features[gene_in_feature, ]
163+
}
164+
if (nrow(features) == 0) {
165+
return(NULL)
166+
}
167+
168+
# Parse sequences
169+
sequences <- parse_seqs(rentrez::entrez_fetch(db, id = search$ids, rettype = "fasta", retmode = "xml"))
170+
171+
# Join feature and sequence data
172+
output <- dplyr::left_join(features, sequences, by = "acc")
173+
174+
# Get positions to return
175+
output$flank_start <- purrr::map2_dbl(output$start, output$end, function(x, y) {
176+
min(c(x, y)) - flanking[1]
177+
})
178+
output$flank_end <- purrr::map2_dbl(output$start, output$end, function(x, y) {
179+
max(c(x, y)) + flanking[2]
180+
})
181+
output$flank_complete <- output$complete & output$flank_start >= 1 & output$flank_end <= nchar(output$seq)
182+
output$flank_start[output$flank_start < 1] <- 1
183+
output$flank_end[output$flank_end > nchar(output$seq)] <- nchar(output$seq)[output$flank_end > nchar(output$seq)]
184+
185+
# Subset sequences to fetures
186+
output$gene_seq <- substr(output$seq, output$flank_start, output$flank_end)
187+
output$gene_length <- nchar(output$gene_seq)
188+
189+
} else {
190+
output <- parse_seqs(rentrez::entrez_fetch(db, id = search$ids, rettype = "fasta", retmode = "xml"))
191+
}
192+
193+
# Add query info
194+
if (is.null(isolate)) {
195+
output <- tibble::as_tibble(cbind(list(species = name, query = query), output, stringsAsFactors = FALSE))
196+
} else {
197+
output <- tibble::as_tibble(cbind(list(species = name, isolate = isolate, query = query), output, stringsAsFactors = FALSE))
198+
}
199+
200+
return(output)
201+
}
202+
203+
if (is.null(isolates)) {
204+
return(dplyr::bind_rows(purrr::pmap(list(species), get_one)))
205+
} else {
206+
return(dplyr::bind_rows(purrr::pmap(list(species, isolates), get_one)))
207+
}
208+
209+
}

0 commit comments

Comments
 (0)