|
| 1 | + |
| 2 | +#' @keywords internal |
| 3 | +parse_ft <- function(text) { |
| 4 | + text <- gsub(text, pattern = "\n\t\t\t", replacement = "\t", fixed = TRUE) |
| 5 | + parts <- strsplit(text, "\n\n", fixed = TRUE)[[1]] |
| 6 | + part_data <- lapply(parts, function(x) { |
| 7 | + first_line <- sub(x, pattern = "\\n.+$", replacement = "") |
| 8 | + acc <- stringr::str_match(first_line, pattern = "\\|(.+)\\|")[,2] |
| 9 | + the_rest <- sub(x, pattern = "^>.+?\\n", replacement = "") |
| 10 | + # replace extra \t with , when there are multiple features |
| 11 | + lines <- strsplit(the_rest, "\n")[[1]] |
| 12 | + lines <- purrr::map2(stringr::str_locate_all(lines, "\t"), lines, function(matches, line) { |
| 13 | + if (nrow(matches) > 4) { |
| 14 | + for (i in matches[3:(nrow(matches) - 2), 1]) { |
| 15 | + substr(line, i, i) <- "," |
| 16 | + } |
| 17 | + } |
| 18 | + return(line) |
| 19 | + }) |
| 20 | + the_rest <- paste0(lines, collapse = "\n") |
| 21 | + output <- readr::read_tsv(paste0(the_rest, "\n"), col_names = c("start", "end", "feature", "type", "name"), col_types = "ccccc") |
| 22 | + output <- tibble::as_tibble(cbind(list(acc = acc), output, stringsAsFactors = FALSE)) |
| 23 | + }) |
| 24 | + output <- dplyr::bind_rows(part_data) |
| 25 | + output$complete <- ifelse(startsWith(as.character(output$start), "<") | startsWith(as.character(output$end), ">"), FALSE, TRUE) |
| 26 | + output$start <- as.integer(gsub(output$start, pattern = "<", replacement = "")) |
| 27 | + output$end <- as.integer(gsub(output$end, pattern = ">", replacement = "")) |
| 28 | + return(output) |
| 29 | +} |
| 30 | + |
| 31 | +#' @keywords internal |
| 32 | +parse_seqs <- function(text) { |
| 33 | + xml <- xml2::read_xml(text) |
| 34 | + tibble::tibble(acc = xml2::xml_text(xml2::xml_find_all(xml, "//TSeq_accver")), |
| 35 | + seq = xml2::xml_text(xml2::xml_find_all(xml, "//TSeq_sequence")), |
| 36 | + header = xml2::xml_text(xml2::xml_find_all(xml, "//TSeq_defline")), |
| 37 | + length = xml2::xml_text(xml2::xml_find_all(xml, "//TSeq_length"))) |
| 38 | +} |
| 39 | + |
| 40 | +#' Lookup gene sequences from NCBI |
| 41 | +#' |
| 42 | +#' Look for sequences of a particular gene for a list of species/isolates/genes from the Genbank |
| 43 | +#' nucleotide database. |
| 44 | +#' |
| 45 | +#' @param species The names of species to look up. |
| 46 | +#' @param genes The names of the genes to look up. |
| 47 | +#' @param isolates The names of isolates to look up. Must be the same length as \code{species} if |
| 48 | +#' used. |
| 49 | +#' @param extract_features If TRUE, return the sequence for each feature in the sequence annotation |
| 50 | +#' instead of the whole sequence. |
| 51 | +#' @param gene_name_in_feature If TRUE, only return features that have one of the gene names |
| 52 | +#' somewhere in their description. Only has an effect if extract_features is TRUE. |
| 53 | +#' @param flanking A vector of length 2. The number of base pairs before and after the target gene |
| 54 | +#' to include in the sequence returned. If the flanking sequence is not available, the sequence |
| 55 | +#' will be considered incomplete. |
| 56 | +#' @param db The name of the NCBI database to query. Only tested with "nucleotide", but a few others |
| 57 | +#' might work. |
| 58 | +#' @param pause The number of seconds to pause between each query. This avoids annoying NCBI and |
| 59 | +#' having them block you IP address. Should be at least 0.35 seconds if you dont have an NCBI API |
| 60 | +#' key and at least 0.1 seconds if you do. |
| 61 | +#' @param ... Additional terms to add to the search request for each species/isolate, see NCBI |
| 62 | +#' documentation for a complete list: |
| 63 | +#' http://www.ncbi.nlm.nih.gov/books/NBK25499/#_chapter4_ESearch_ |
| 64 | +#' |
| 65 | +#' @return |
| 66 | +#' |
| 67 | +#' A tibble (a type of data.frame) with the following columns, depending on settings: |
| 68 | +#' |
| 69 | +#' * species: The species search term associated with the result |
| 70 | +#' * isolate: The isolate search term associated with the result |
| 71 | +#' * query: The query used to search for sequences on NCBI |
| 72 | +#' * acc: The Genbank accession number |
| 73 | +#' * start: The index of the first base pair of the target gene in the original sequence |
| 74 | +#' * end: The index of the last base pair of the target gene in the original sequence |
| 75 | +#' * feature: The type of locus |
| 76 | +#' * type: The type of annotation for the sequence |
| 77 | +#' * name: The name of the locus |
| 78 | +#' * complete: If the locus was complete in the original sequence, according to the annotation |
| 79 | +#' * seq: The whole sequence the gene was found in |
| 80 | +#' * header: The name of the overall sequence the gene was found in |
| 81 | +#' * length: The length of the whole sequence |
| 82 | +#' * flank_start: The index of the first base pair of the target gene plus the flanking region in the original sequence |
| 83 | +#' * flank_end: The index of the last base pair of the target gene plus the flanking region in the original sequence |
| 84 | +#' * flank_complete: If the locus plus flanking region was complete in the original sequence |
| 85 | +#' * gene_seq: The target gene sequence plus the flanking region |
| 86 | +#' * gene_length: The length of the target gene plus the flanking region |
| 87 | +#' |
| 88 | +#' @examples \dontrun{ |
| 89 | +#' |
| 90 | +#' # Search for the whole seqeunces for with P. infestans Cox I |
| 91 | +#' get_isolate_seqs(species = c("Phytophthora infestans"), |
| 92 | +#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1"), |
| 93 | +#' retmax = 100) |
| 94 | +#' |
| 95 | +#' # Search for the just the gene sequence for P. infestans Cox I |
| 96 | +#' get_isolate_seqs(species = c("Phytophthora infestans"), |
| 97 | +#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1"), |
| 98 | +#' retmax = 100, |
| 99 | +#' extract_features = TRUE) |
| 100 | +#' |
| 101 | +#' # Search for all the gene sequences in whole sequences that contain P. infestans Cox I |
| 102 | +#' get_isolate_seqs(species = c("Phytophthora infestans"), |
| 103 | +#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1"), |
| 104 | +#' retmax = 100, |
| 105 | +#' extract_features = TRUE, |
| 106 | +#' gene_name_in_feature = FALSE) |
| 107 | +#' |
| 108 | +#' # Search for whole sequences for P. infestans Cox I for just some isolates |
| 109 | +#' get_isolate_seqs(species = c("Phytophthora infestans", "Phytophthora infestans", "Phytophthora infestans"), |
| 110 | +#' isolates = c("44", "580", "180"), |
| 111 | +#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1")) |
| 112 | +#' |
| 113 | +#' # Search for just the gene sequences for P. infestans Cox I for just some isolates |
| 114 | +#' get_isolate_seqs(species = c("Phytophthora infestans", "Phytophthora infestans", "Phytophthora infestans"), |
| 115 | +#' isolates = c("44", "580", "180"), |
| 116 | +#' genes = c("cox I", "cox 1", "cox1", "coxI", "cytochrome oxidase I", "cytochrome oxidase 1"), |
| 117 | +#' extract_features = TRUE) |
| 118 | +#' |
| 119 | +#' # Search for P infestans ITS with flanking regions and subset for complete results |
| 120 | +#' result <- get_isolate_seqs(species = c("Phytophthora"), |
| 121 | +#' genes = c("internal transcribed spacer"), |
| 122 | +#' retmax = 300, |
| 123 | +#' extract_features = TRUE, |
| 124 | +#' flanking = c(300, 300)) |
| 125 | +#' result[result$complete, ] |
| 126 | +#' result[result$flank_complete, ] |
| 127 | +#' |
| 128 | +#' } |
| 129 | +#' |
| 130 | +#' @export |
| 131 | +get_isolate_seqs <- function(species, genes, isolates = NULL, extract_features = FALSE, |
| 132 | + gene_name_in_feature = TRUE, flanking = c(0, 0), |
| 133 | + db = "nucleotide", pause = 0.5, ...) { |
| 134 | + |
| 135 | + if (! is.numeric(flanking) || length(flanking) != 2) { |
| 136 | + stop('The "flanking" option must be a numeric vector of length 2') |
| 137 | + } |
| 138 | + |
| 139 | + get_one <- function(name, isolate = NULL) { |
| 140 | + |
| 141 | + # Wait a bit so NCBI doesnt get unhappy |
| 142 | + Sys.sleep(pause) |
| 143 | + |
| 144 | + # Search for sequences |
| 145 | + if (is.null(isolate)) { |
| 146 | + query <- paste0('"', name, '"[Organism] AND (', paste0('"', genes, '"[All Fields]', collapse = " OR "), ')') |
| 147 | + } else { |
| 148 | + query <- paste0('"', name, '"[Organism] AND ("', isolate, '"[Isolate] OR "', isolate, '"[Strain]) AND (', paste0('"', genes, '"[All Fields]', collapse = " OR "), ')') |
| 149 | + } |
| 150 | + search <- rentrez::entrez_search(db, term = query, ...) |
| 151 | + if (length(search$ids) == 0) { |
| 152 | + return(NULL) |
| 153 | + } |
| 154 | + |
| 155 | + if (extract_features) { |
| 156 | + # Parse features |
| 157 | + features <- parse_ft(rentrez::entrez_fetch(db, id = search$ids, rettype = "ft", retmode = "text")) |
| 158 | + if (gene_name_in_feature) { |
| 159 | + gene_in_feature <- purrr:::map_lgl(features$name, function(text) { |
| 160 | + purrr:::reduce(lapply(genes, function(gene) grepl(tolower(text), pattern = tolower(gene), fixed = TRUE)), `|`) |
| 161 | + }) |
| 162 | + features <- features[gene_in_feature, ] |
| 163 | + } |
| 164 | + if (nrow(features) == 0) { |
| 165 | + return(NULL) |
| 166 | + } |
| 167 | + |
| 168 | + # Parse sequences |
| 169 | + sequences <- parse_seqs(rentrez::entrez_fetch(db, id = search$ids, rettype = "fasta", retmode = "xml")) |
| 170 | + |
| 171 | + # Join feature and sequence data |
| 172 | + output <- dplyr::left_join(features, sequences, by = "acc") |
| 173 | + |
| 174 | + # Get positions to return |
| 175 | + output$flank_start <- purrr::map2_dbl(output$start, output$end, function(x, y) { |
| 176 | + min(c(x, y)) - flanking[1] |
| 177 | + }) |
| 178 | + output$flank_end <- purrr::map2_dbl(output$start, output$end, function(x, y) { |
| 179 | + max(c(x, y)) + flanking[2] |
| 180 | + }) |
| 181 | + output$flank_complete <- output$complete & output$flank_start >= 1 & output$flank_end <= nchar(output$seq) |
| 182 | + output$flank_start[output$flank_start < 1] <- 1 |
| 183 | + output$flank_end[output$flank_end > nchar(output$seq)] <- nchar(output$seq)[output$flank_end > nchar(output$seq)] |
| 184 | + |
| 185 | + # Subset sequences to fetures |
| 186 | + output$gene_seq <- substr(output$seq, output$flank_start, output$flank_end) |
| 187 | + output$gene_length <- nchar(output$gene_seq) |
| 188 | + |
| 189 | + } else { |
| 190 | + output <- parse_seqs(rentrez::entrez_fetch(db, id = search$ids, rettype = "fasta", retmode = "xml")) |
| 191 | + } |
| 192 | + |
| 193 | + # Add query info |
| 194 | + if (is.null(isolate)) { |
| 195 | + output <- tibble::as_tibble(cbind(list(species = name, query = query), output, stringsAsFactors = FALSE)) |
| 196 | + } else { |
| 197 | + output <- tibble::as_tibble(cbind(list(species = name, isolate = isolate, query = query), output, stringsAsFactors = FALSE)) |
| 198 | + } |
| 199 | + |
| 200 | + return(output) |
| 201 | + } |
| 202 | + |
| 203 | + if (is.null(isolates)) { |
| 204 | + return(dplyr::bind_rows(purrr::pmap(list(species), get_one))) |
| 205 | + } else { |
| 206 | + return(dplyr::bind_rows(purrr::pmap(list(species, isolates), get_one))) |
| 207 | + } |
| 208 | + |
| 209 | +} |
0 commit comments