From 778aefaeedcf121f2243cdbecb1124f5d50af639 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 3 Oct 2025 15:11:40 -0400 Subject: [PATCH 1/2] indels support for madc2vcf_targets --- NAMESPACE | 1 + R/check_madc_sanity.R | 105 ++++++++ R/madc2vcf_targets.R | 345 +++++++++++++++--------- man/check_madc_sanity.Rd | 43 +++ man/madc2vcf_targets.Rd | 118 ++++++-- tests/testthat/test-check_madc_sanity.R | 10 + 6 files changed, 474 insertions(+), 148 deletions(-) create mode 100644 R/check_madc_sanity.R create mode 100644 man/check_madc_sanity.Rd create mode 100644 tests/testthat/test-check_madc_sanity.R diff --git a/NAMESPACE b/NAMESPACE index d47d95d..443314d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,7 @@ export(allele_freq_poly) export(calculate_Het) export(calculate_MAF) export(check_homozygous_trios) +export(check_madc_sanity) export(check_ped) export(check_replicates) export(dosage2vcf) diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R new file mode 100644 index 0000000..19d3ff6 --- /dev/null +++ b/R/check_madc_sanity.R @@ -0,0 +1,105 @@ +#' Run basic sanity checks on a MADC-style allele report +#' +#' @description +#' Performs five quick validations on an allele report: +#' 1) **Columns** – required columns are present (`CloneID`, `AlleleID`, `AlleleSequence`); +#' 2) **FixAlleleIDs** – first column’s first up-to-6 rows are not all blank or "*" +#' *and* both `_0001` and `_0002` appear in `AlleleID`; +#' 3) **IUPACcodes** – presence of non-ATCG characters in `AlleleSequence`; +#' 4) **LowerCase** – presence of lowercase a/t/c/g in `AlleleSequence`; +#' 5) **Indels** – reference/alternate allele lengths differ for the same `CloneID`. +#' +#' @param report A `data.frame` with at least the columns +#' `CloneID`, `AlleleID`, and `AlleleSequence`. The first column is also +#' used in the “FixAlleleIDs” check to inspect its first up to six entries. +#' +#' @details +#' - **IUPAC check:** Flags any character outside `ATCG` (case-insensitive), +#' which will include ambiguity codes (`N`, `R`, `Y`, etc.) and symbols like `-`. +#' - **Indels:** Rows are split by `AlleleID` containing `"Ref_0001"` vs `"Alt_0002"`, +#' merged by `CloneID`, and the lengths of `AlleleSequence` are compared. +#' - If required columns are missing, only **Columns** is evaluated (`FALSE`) and +#' `indel_clone_ids` is returned as `NULL`. +#' +#' @return A list with: +#' \describe{ +#' \item{checks}{Named logical vector with entries +#' `Columns`, `FixAlleleIDs`, `IUPACcodes`, `LowerCase`, `Indels`.} +#' \item{indel_clone_ids}{Character vector of `CloneID`s where ref/alt lengths differ. +#' Returns `character(0)` if none, or `NULL` when required columns are missing.} +#' } +#' +#' +#' @export +check_madc_sanity <- function(report) { + + # Initialize + checks <- c(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA) + messages <- list(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA) + + # Validate required columns + required <- c("CloneID", "AlleleID", "AlleleSequence") + missing_cols <- setdiff(required, names(report)) + checks["Columns"] <- length(missing_cols) == 0 + + if(checks[["Columns"]]){ + # ---- FixAlleleIDs ---- + # Check if first up-to-6 entries in the *first column* are all "" or "*" + n <- nrow(report) + idx <- seq_len(min(6L, n)) + first_col_vals <- report[[1]][idx] + all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE) + # Also require that both _0001 and _0002 appear in AlleleID + has_0001 <- any(grepl("_0001", report$AlleleID, fixed = TRUE), na.rm = TRUE) + has_0002 <- any(grepl("_0002", report$AlleleID, fixed = TRUE), na.rm = TRUE) + checks["FixAlleleIDs"] <- (!all_blank_or_star) & has_0001 & has_0002 + + # ---- IUPACcodes ---- + iu <- grepl("[^ATCG]", report$AlleleSequence, ignore.case = TRUE) + checks["IUPACcodes"] <- any(iu, na.rm = TRUE) + + # ---- LowerCase ---- + lc <- grepl("[atcg]", report$AlleleSequence) + checks["LowerCase"] <- any(lc, na.rm = TRUE) + + # ---- Indels ---- + refs <- subset(report, grepl("Ref_0001", AlleleID, fixed = TRUE), + select = c(CloneID, AlleleID, AlleleSequence)) + alts <- subset(report, grepl("Alt_0002", AlleleID, fixed = TRUE), + select = c(CloneID, AlleleID, AlleleSequence)) + + merged <- merge(refs, alts, by = "CloneID", suffixes = c("_ref", "_alt"), all = FALSE) + + if (nrow(merged) > 0) { + ref_len <- nchar(merged$AlleleSequence_ref, keepNA = TRUE) + alt_len <- nchar(merged$AlleleSequence_alt, keepNA = TRUE) + cmp_ok <- !is.na(ref_len) & !is.na(alt_len) + indel_mask <- cmp_ok & (ref_len != alt_len) + checks["Indels"] <- any(indel_mask) + indels <- if (any(indel_mask)) merged$CloneID[indel_mask] else character(0) + } else { + checks["Indels"] <- FALSE + indels <- character(0) + } + + # ---- Chrom Pos ---- + pos <- strsplit(report[,2], "_") + checks["ChromPos"] <- all(sapply(pos, length) == 2) + + } else indels <- NULL + + messages[["Columns"]] <- c("Required columns are present\n", + "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence\n") + messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good\n", + "MADC not processed by BI. Please contact us to assign allele IDs to your MADC according to the specie haplotype dabatase. This guarantee reproducibility between diferent datasets\n") + messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported\n", + "No IUPAC (non-ATCG) codes found in AlleleSequence\n") + messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence\n", + "No lowercase bases found in AlleleSequence\n") + messages[["Indels"]] <- c(paste("Indels found (ref/alt lengths differ) for the CloneIDs:",paste(indels, collapse = " ")), + "No indels found (ref/alt lengths match) for all CloneIDs\n") + messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good\n", + "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information\n") + + list(checks = checks, messages = messages, indel_clone_ids = indels) +} diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 1b02c31..fc022f9 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -1,45 +1,121 @@ #' Format MADC Target Loci Read Counts Into VCF #' -#' This function will extract the read count information from a MADC file target markers and convert to VCF file format. +#' Convert DArTag MADC target read counts to a VCF #' -#' The DArTag MADC file format is not commonly supported through existing tools. This function -#' will extract the read count information from a MADC file for the target markers and convert it to a VCF file format for the -#' genotyping panel target markers only +#' @description +#' Parses a DArTag **MADC** report and writes a **VCF v4.3** containing per-target +#' read counts for the panel’s target loci. This is useful because MADC is not +#' widely supported by general-purpose tools, while VCF is. #' -#' @param madc_file Path to MADC file -#' @param output.file output file name and path -#' @param botloci_file A string specifying the path to the file containing the target IDs designed in the bottom strand. -#' @param get_REF_ALT if TRUE recovers the reference and alternative bases by comparing the sequences. If more than one polymorphism are found for a tag, it is discarded. +#' @details +#' **What this function does** +#' - Runs basic sanity checks on the MADC file (column presence, fixed allele IDs, +#' IUPAC/ambiguous bases, lowercase bases, indels). +#' - Extracts reference and total read counts per sample and target. +#' - Derives `AD` (ref,alt) by subtraction (alt = total − ref). +#' - If `get_REF_ALT = TRUE`, attempts to recover true REF/ALT bases by comparing +#' the Ref/Alt probe sequences; targets with >1 polymorphism are discarded. +#' - Optionally accepts a `markers_info` CSV to supply `CHROM`, `POS`, `REF`, `ALT` +#' (and `Type`, `Indel_pos` when indels are present), bypassing sequence-based +#' inference. +#' +#' **Output VCF layout** +#' - `INFO` fields: +#' * `DP` — total depth across all samples for the locus +#' * `ADS` — total counts across samples in the order `ref,alt` +#' - `FORMAT` fields (per sample): +#' * `DP` — total reads (ref + alt) +#' * `RA` — reads supporting the reference allele +#' * `AD` — `"ref,alt"` counts +#' +#' **Strand handling** +#' If a target ID appears in `botloci_file`, its probe sequences are reverse- +#' complemented prior to base comparison so that REF/ALT are reported in the +#' top-strand genomic orientation. +#' +#' **Sanity check behavior** +#' - If required columns or fixed IDs are missing, the function `stop()`s. +#' - If IUPAC/lowercase/indels are detected and `markers_info` is **not** +#' provided, the function `stop()`s with a diagnostic message explaining what to fix. +#' +#' @param madc_file character. Path to the input MADC CSV file. +#' @param output.file character. Path to the output VCF file to write. +#' @param botloci_file character. Path to a plain-text file listing target IDs +#' designed on the **bottom** strand (one ID per line). Required when +#' `get_REF_ALT = TRUE` and `markers_info` is not provided. +#' @param markers_info character or `NULL`. Optional path to a CSV providing target +#' metadata. Required columns: `BI_markerID, Chr, Pos, Ref, Alt`. If indels are +#' present, also require `Type, Indel_pos`. When supplied, these values populate +#' `#CHROM, POS, REF, ALT` in the VCF directly. +#' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to infer REF/ALT +#' bases from the Ref/Alt probe sequences in the MADC file (with strand correction +#' using `botloci_file`). Targets with more than one difference between Ref/Alt +#' sequences are removed. +#' +#' @return (Invisibly) returns the path to `output.file`. The side effect is a +#' **VCF v4.3** written to disk containing one row per target and columns for all +#' samples in the MADC file. +#' +#' @section Dependencies: +#' Uses **dplyr**, **tidyr**, **tibble**, **reshape2**, **Biostrings** and base +#' **utils**. Helper functions expected in this package: `check_madc_sanity()`, +#' `get_countsMADC()`, `get_counts()`, and `check_botloci()`. +#' +#' @examples +#' # Example files shipped with the package +#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package = "BIGr") +#' bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", +#' package = "BIGr") +#' out_vcf <- tempfile(fileext = ".vcf") +#' +#' # Convert MADC to VCF (attempting to recover REF/ALT from probe sequences) +#' \dontrun{ +#' madc2vcf_targets( +#' madc_file = madc_file, +#' output.file = out_vcf, +#' botloci_file = bot_file, +#' get_REF_ALT = TRUE +#' ) +#' } +#' +#' # Clean up (example) +#' unlink(out_vcf) +#' +#' @seealso +#' `check_madc_sanity()`, `get_countsMADC()`, `check_botloci()` #' -#' @return A VCF file v4.3 with the target marker read count information #' @import dplyr #' @import tidyr #' @import tibble -#' @importFrom Rdpack reprompt #' @importFrom reshape2 melt dcast #' @importFrom utils write.table #' @importFrom Biostrings DNAString reverseComplement -#' @return A VCF file v4.3 with the target marker read count information -#' -#' @examples -#' # Load example files -#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") -#' bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") -#' -#' #Temp location (only for example) -#' output_file <- tempfile() -#' -#' # Convert MADC to VCF -#' madc2vcf_targets(madc_file = madc_file, -#' output.file = output_file, -#' get_REF_ALT = TRUE, -#' botloci_file = bot_file) -#' -#' rm(output_file) #' #' @export -madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = FALSE) { - #Making the VCF (This is highly dependent on snps being in a format where the SNP IDs are the CHR_POS) +madc2vcf_targets <- function(madc_file, + output.file, + botloci_file, + markers_info = NULL, + get_REF_ALT = FALSE) { + + # MADC checks + report <- read.csv(madc_file) + checks <- check_madc_sanity(report) + + messages_results <- mapply(function(check, message) { + if (check) message[1] else message[2] + }, checks$checks, checks$messages) + + if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){ + idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")])) + stop(paste("The MADC file does not pass the sanity checks:\n", + paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n"))) + } + + if(any(checks$checks[c("IUPACcodes", "LowerCase", "Indels")])){ + idx <- which((checks$checks[c("IUPACcodes", "LowerCase", "Indels")])) + if(is.null(markers_info)) stop(paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n")) + } matrices <- get_countsMADC(madc_file) ref_df <- data.frame(matrices[[1]], check.names = FALSE) @@ -56,18 +132,124 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = row.names(ad_df) <- row.names(ref_df) #Obtaining Chr and Pos information from the row_names - new_df <- size_df %>% - rownames_to_column(var = "row_name") %>% - separate(row_name, into = c("CHROM", "POS"), sep = "_") %>% - select(CHROM, POS) + if(is.null(markers_info)){ + new_df <- size_df %>% + rownames_to_column(var = "row_name") %>% + separate(row_name, into = c("CHROM", "POS"), sep = "_") %>% + select(CHROM, POS) + + # Remove leading zeros from the POS column + new_df$POS <- sub("^0+", "", new_df$POS) + + #Get read count sums + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) + + # Get REF and ALT + if(get_REF_ALT){ + if(is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.") + csv <- get_counts(madc_file) + # Keep only the ones that have alt and ref + csv <- csv[which(csv$CloneID %in% rownames(ad_df)),] + + # Get reverse complement the tag is present in botloci + botloci <- read.table(botloci_file, header = FALSE) + + # Check if the botloci file marker IDs match with the MADC file + checked_botloci <- check_botloci(botloci, csv) + botloci <- checked_botloci[[1]] + csv <- checked_botloci[[2]] + + # FIXED: Store original sequences before any transformation + csv$OriginalAlleleSequence <- csv$AlleleSequence + + # Apply reverse complement to sequences for bottom strand markers + idx <- which(csv$CloneID %in% botloci[,1]) + csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence)))) + + ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)] + ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] + alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] + alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] + + # FIXED: Get original sequences for SNP calling + orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)] + orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - # Remove leading zeros from the POS column - new_df$POS <- sub("^0+", "", new_df$POS) + if(all(sort(ref_ord) == sort(alt_ord))){ + # Order sequences consistently + ref_seq <- ref_seq[order(ref_ord)] + alt_seq <- alt_seq[order(alt_ord)] + orig_ref_seq <- orig_ref_seq[order(ref_ord)] + orig_alt_seq <- orig_alt_seq[order(alt_ord)] + ordered_clone_ids <- sort(ref_ord) - #Get read count sums - new_df$TotalRef <- rowSums(ref_df) - new_df$TotalAlt <- rowSums(alt_df) - new_df$TotalSize <- rowSums(size_df) + ref_base <- alt_base <- vector() + for(i in 1:length(orig_ref_seq)){ + # FIXED: Use original sequences for SNP calling + temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") + idx_diff <- which(temp_list[[1]] != temp_list[[2]]) + + if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences + ref_base[i] <- NA + alt_base[i] <- NA + } else if(length(idx_diff) == 1) { + orig_ref_base <- temp_list[[1]][idx_diff] + orig_alt_base <- temp_list[[2]][idx_diff] + + # FIXED: Apply reverse complement to bases only if marker is in botloci + if(ordered_clone_ids[i] %in% botloci[,1]) { + ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base))) + alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base))) + } else { + ref_base[i] <- orig_ref_base + alt_base[i] <- orig_alt_base + } + } else { + # No differences found + ref_base[i] <- NA + alt_base[i] <- NA + } + } + } else { + warning("There are missing reference or alternative sequence, the SNP bases could not be recovery.") + ref_base <- "." + alt_base <- "." + } + + } else { + ref_base <- "." + alt_base <- "." + } + } else { + # Verify markers_info file + df <- read.csv(markers_info) + if(checks$checks["Indels"]){ + if(!all(c("BI_markerID","Chr","Pos","Ref","Alt","Type", "Indel_pos") %in% colnames(df))) + stop("The markers_info dataframe must contain the following columns: BI_markerID, CHROM, POS, REF, ALT, Type, Indel_pos") + } + if(!all(c("BI_markerID","Chr","Pos","Ref","Alt") %in% colnames(df))) + stop("The markers_info dataframe must contain the following columns: BI_markerID, CHROM, POS, REF, ALT") + + if(!all(rownames(ad_df)%in% df$BI_markerID)) + warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.") + + matched <- df[match(rownames(ad_df), df$BI_markerID),] + + new_df <- data.frame( + CHROM = matched$Chr, + POS = matched$Pos + ) + + #Get read count sums + new_df$TotalRef <- rowSums(ref_df) + new_df$TotalAlt <- rowSums(alt_df) + new_df$TotalSize <- rowSums(size_df) + + ref_base <- matched$Ref + alt_base <- matched$Alt + } #Make a header separate from the dataframe vcf_header <- c( @@ -82,83 +264,6 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = '##FORMAT=' ) - # Get REF and ALT - if(get_REF_ALT){ - if(is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.") - csv <- get_counts(madc_file) - # Keep only the ones that have alt and ref - csv <- csv[which(csv$CloneID %in% rownames(ad_df)),] - - # Get reverse complement the tag is present in botloci - botloci <- read.table(botloci_file, header = FALSE) - - # Check if the botloci file marker IDs match with the MADC file - checked_botloci <- check_botloci(botloci, csv) - botloci <- checked_botloci[[1]] - csv <- checked_botloci[[2]] - - # FIXED: Store original sequences before any transformation - csv$OriginalAlleleSequence <- csv$AlleleSequence - - # Apply reverse complement to sequences for bottom strand markers - idx <- which(csv$CloneID %in% botloci[,1]) - csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence)))) - - ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)] - ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] - alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] - - # FIXED: Get original sequences for SNP calling - orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)] - orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)] - - if(all(sort(ref_ord) == sort(alt_ord))){ - # Order sequences consistently - ref_seq <- ref_seq[order(ref_ord)] - alt_seq <- alt_seq[order(alt_ord)] - orig_ref_seq <- orig_ref_seq[order(ref_ord)] - orig_alt_seq <- orig_alt_seq[order(alt_ord)] - ordered_clone_ids <- sort(ref_ord) - - ref_base <- alt_base <- vector() - for(i in 1:length(orig_ref_seq)){ - # FIXED: Use original sequences for SNP calling - temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") - idx_diff <- which(temp_list[[1]] != temp_list[[2]]) - - if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences - ref_base[i] <- NA - alt_base[i] <- NA - } else if(length(idx_diff) == 1) { - orig_ref_base <- temp_list[[1]][idx_diff] - orig_alt_base <- temp_list[[2]][idx_diff] - - # FIXED: Apply reverse complement to bases only if marker is in botloci - if(ordered_clone_ids[i] %in% botloci[,1]) { - ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base))) - alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base))) - } else { - ref_base[i] <- orig_ref_base - alt_base[i] <- orig_alt_base - } - } else { - # No differences found - ref_base[i] <- NA - alt_base[i] <- NA - } - } - } else { - warning("There are missing reference or alternative sequence, the SNP bases could not be recovery.") - ref_base <- "." - alt_base <- "." - } - - } else { - ref_base <- "." - alt_base <- "." - } - #Make the header#Make the VCF df vcf_df <- data.frame( CHROM = new_df$CHROM, @@ -233,12 +338,4 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = suppressWarnings( write.table(vcf_df, file = output.file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE) ) - #Unload all items from memory - rm(matrices) - rm(ref_df) - rm(alt_df) - rm(size_df) - rm(ad_df) - rm(vcf_df) - rm(geno_df) } diff --git a/man/check_madc_sanity.Rd b/man/check_madc_sanity.Rd new file mode 100644 index 0000000..494145e --- /dev/null +++ b/man/check_madc_sanity.Rd @@ -0,0 +1,43 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_madc_sanity.R +\name{check_madc_sanity} +\alias{check_madc_sanity} +\title{Run basic sanity checks on a MADC-style allele report} +\usage{ +check_madc_sanity(report) +} +\arguments{ +\item{report}{A \code{data.frame} with at least the columns +\code{CloneID}, \code{AlleleID}, and \code{AlleleSequence}. The first column is also +used in the “FixAlleleIDs” check to inspect its first up to six entries.} +} +\value{ +A list with: +\describe{ +\item{checks}{Named logical vector with entries +\code{Columns}, \code{FixAlleleIDs}, \code{IUPACcodes}, \code{LowerCase}, \code{Indels}.} +\item{indel_clone_ids}{Character vector of \code{CloneID}s where ref/alt lengths differ. +Returns \code{character(0)} if none, or \code{NULL} when required columns are missing.} +} +} +\description{ +Performs five quick validations on an allele report: +\enumerate{ +\item \strong{Columns} – required columns are present (\code{CloneID}, \code{AlleleID}, \code{AlleleSequence}); +\item \strong{FixAlleleIDs} – first column’s first up-to-6 rows are not all blank or "*" +\emph{and} both \verb{_0001} and \verb{_0002} appear in \code{AlleleID}; +\item \strong{IUPACcodes} – presence of non-ATCG characters in \code{AlleleSequence}; +\item \strong{LowerCase} – presence of lowercase a/t/c/g in \code{AlleleSequence}; +\item \strong{Indels} – reference/alternate allele lengths differ for the same \code{CloneID}. +} +} +\details{ +\itemize{ +\item \strong{IUPAC check:} Flags any character outside \code{ATCG} (case-insensitive), +which will include ambiguity codes (\code{N}, \code{R}, \code{Y}, etc.) and symbols like \code{-}. +\item \strong{Indels:} Rows are split by \code{AlleleID} containing \code{"Ref_0001"} vs \code{"Alt_0002"}, +merged by \code{CloneID}, and the lengths of \code{AlleleSequence} are compared. +\item If required columns are missing, only \strong{Columns} is evaluated (\code{FALSE}) and +\code{indel_clone_ids} is returned as \code{NULL}. +} +} diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd index a790460..fad847c 100644 --- a/man/madc2vcf_targets.Rd +++ b/man/madc2vcf_targets.Rd @@ -4,44 +4,114 @@ \alias{madc2vcf_targets} \title{Format MADC Target Loci Read Counts Into VCF} \usage{ -madc2vcf_targets(madc_file, output.file, botloci_file, get_REF_ALT = FALSE) +madc2vcf_targets( + madc_file, + output.file, + botloci_file, + markers_info = NULL, + get_REF_ALT = FALSE +) } \arguments{ -\item{madc_file}{Path to MADC file} +\item{madc_file}{character. Path to the input MADC CSV file.} -\item{output.file}{output file name and path} +\item{output.file}{character. Path to the output VCF file to write.} -\item{botloci_file}{A string specifying the path to the file containing the target IDs designed in the bottom strand.} +\item{botloci_file}{character. Path to a plain-text file listing target IDs +designed on the \strong{bottom} strand (one ID per line). Required when +\code{get_REF_ALT = TRUE} and \code{markers_info} is not provided.} -\item{get_REF_ALT}{if TRUE recovers the reference and alternative bases by comparing the sequences. If more than one polymorphism are found for a tag, it is discarded.} +\item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target +metadata. Required columns: \verb{BI_markerID, Chr, Pos, Ref, Alt}. If indels are +present, also require \verb{Type, Indel_pos}. When supplied, these values populate +\verb{#CHROM, POS, REF, ALT} in the VCF directly.} + +\item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to infer REF/ALT +bases from the Ref/Alt probe sequences in the MADC file (with strand correction +using \code{botloci_file}). Targets with more than one difference between Ref/Alt +sequences are removed.} } \value{ -A VCF file v4.3 with the target marker read count information - -A VCF file v4.3 with the target marker read count information +(Invisibly) returns the path to \code{output.file}. The side effect is a +\strong{VCF v4.3} written to disk containing one row per target and columns for all +samples in the MADC file. } \description{ -This function will extract the read count information from a MADC file target markers and convert to VCF file format. +Parses a DArTag \strong{MADC} report and writes a \strong{VCF v4.3} containing per-target +read counts for the panel’s target loci. This is useful because MADC is not +widely supported by general-purpose tools, while VCF is. } \details{ -The DArTag MADC file format is not commonly supported through existing tools. This function -will extract the read count information from a MADC file for the target markers and convert it to a VCF file format for the -genotyping panel target markers only +Convert DArTag MADC target read counts to a VCF + +\strong{What this function does} +\itemize{ +\item Runs basic sanity checks on the MADC file (column presence, fixed allele IDs, +IUPAC/ambiguous bases, lowercase bases, indels). +\item Extracts reference and total read counts per sample and target. +\item Derives \code{AD} (ref,alt) by subtraction (alt = total − ref). +\item If \code{get_REF_ALT = TRUE}, attempts to recover true REF/ALT bases by comparing +the Ref/Alt probe sequences; targets with >1 polymorphism are discarded. +\item Optionally accepts a \code{markers_info} CSV to supply \code{CHROM}, \code{POS}, \code{REF}, \code{ALT} +(and \code{Type}, \code{Indel_pos} when indels are present), bypassing sequence-based +inference. } -\examples{ -# Load example files -madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") -bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") -#Temp location (only for example) -output_file <- tempfile() +\strong{Output VCF layout} +\itemize{ +\item \code{INFO} fields: +\itemize{ +\item \code{DP} — total depth across all samples for the locus +\item \code{ADS} — total counts across samples in the order \verb{ref,alt} +} +\item \code{FORMAT} fields (per sample): +\itemize{ +\item \code{DP} — total reads (ref + alt) +\item \code{RA} — reads supporting the reference allele +\item \code{AD} — \code{"ref,alt"} counts +} +} -# Convert MADC to VCF -madc2vcf_targets(madc_file = madc_file, - output.file = output_file, - get_REF_ALT = TRUE, - botloci_file = bot_file) +\strong{Strand handling} +If a target ID appears in \code{botloci_file}, its probe sequences are reverse- +complemented prior to base comparison so that REF/ALT are reported in the +top-strand genomic orientation. -rm(output_file) +\strong{Sanity check behavior} +\itemize{ +\item If required columns or fixed IDs are missing, the function \code{stop()}s. +\item If IUPAC/lowercase/indels are detected and \code{markers_info} is \strong{not} +provided, the function \code{stop()}s with a diagnostic message explaining what to fix. +} +} +\section{Dependencies}{ +Uses \strong{dplyr}, \strong{tidyr}, \strong{tibble}, \strong{reshape2}, \strong{Biostrings} and base +\strong{utils}. Helper functions expected in this package: \code{check_madc_sanity()}, +\code{get_countsMADC()}, \code{get_counts()}, and \code{check_botloci()}. +} + +\examples{ +# Example files shipped with the package +madc_file <- system.file("example_MADC_FixedAlleleID.csv", package = "BIGr") +bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", + package = "BIGr") +out_vcf <- tempfile(fileext = ".vcf") + +# Convert MADC to VCF (attempting to recover REF/ALT from probe sequences) +\dontrun{ +madc2vcf_targets( + madc_file = madc_file, + output.file = out_vcf, + botloci_file = bot_file, + get_REF_ALT = TRUE +) +} + +# Clean up (example) +unlink(out_vcf) + +} +\seealso{ +\code{check_madc_sanity()}, \code{get_countsMADC()}, \code{check_botloci()} } diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R new file mode 100644 index 0000000..5053e55 --- /dev/null +++ b/tests/testthat/test-check_madc_sanity.R @@ -0,0 +1,10 @@ +test_that("check madc",{ + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + report <- read.csv(madc_file, check.names = FALSE) + + check_madc_sanity(report) + + +}) + + From 0b97b46434afcd7f441cb2b9705df2a9323232bf Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 14 Nov 2025 12:59:33 -0500 Subject: [PATCH 2/2] bugfix - if hapDB padding is not matching report --- R/madc2vcf_all.R | 16 ++++++++++++++-- R/utils.R | 1 + 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 329ac5a..125d540 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -135,7 +135,7 @@ madc2vcf_all <- function(madc = NULL, loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, verbose = TRUE){ if(!is.null(hap_seq)){ - hap_seq <- get_ref_alt_hap_seq(hap_seq) + hap_seq <- get_ref_alt_hap_seq(hap_seq, botloci) } nsamples <- ncol(report) - 3 @@ -376,7 +376,8 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ #' @param hap_seq haplotype db #' #' @noRd -get_ref_alt_hap_seq <- function(hap_seq){ +get_ref_alt_hap_seq <- function(hap_seq, botloci){ + headers <- hap_seq$V1[grep(">",hap_seq$V1)] headers <- gsub(">", "", headers) @@ -394,6 +395,17 @@ get_ref_alt_hap_seq <- function(hap_seq){ seqs <- sapply(seqs, function(x) paste0(x, collapse = "")) hap_seq <- data.frame(AlleleID = headers, AlleleSequence = seqs) + + # Check padding + hap_cloneID <- sapply(strsplit(hap_seq$AlleleID, "[|]"), function(x) x[1]) + botloci_cloneID <- botloci$V1 + + pad_hap <- unique(nchar(sub(".*_", "", hap_cloneID))) + pad_botloci <- unique(nchar(sub(".*_", "", botloci_cloneID))) + + if(length(pad_hap) > 1) stop("Check marker IDs in haplotype DB file. They should have the same padding.") + if(pad_hap != pad_botloci) stop("Check marker IDs padding in haplotype DB file. They should match the botloci file.") + return(hap_seq) } diff --git a/R/utils.R b/R/utils.R index c280ad2..2399560 100644 --- a/R/utils.R +++ b/R/utils.R @@ -56,6 +56,7 @@ check_botloci <- function(botloci, report, verbose=TRUE){ report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_", sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID))) ) + report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2)) } else { botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_", sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1)))