From 778aefaeedcf121f2243cdbecb1124f5d50af639 Mon Sep 17 00:00:00 2001
From: Cristianetaniguti <chaytaniguti@gmail.com>
Date: Fri, 3 Oct 2025 15:11:40 -0400
Subject: [PATCH 1/2] indels support for madc2vcf_targets

---
 NAMESPACE                               |   1 +
 R/check_madc_sanity.R                   | 105 ++++++++
 R/madc2vcf_targets.R                    | 345 +++++++++++++++---------
 man/check_madc_sanity.Rd                |  43 +++
 man/madc2vcf_targets.Rd                 | 118 ++++++--
 tests/testthat/test-check_madc_sanity.R |  10 +
 6 files changed, 474 insertions(+), 148 deletions(-)
 create mode 100644 R/check_madc_sanity.R
 create mode 100644 man/check_madc_sanity.Rd
 create mode 100644 tests/testthat/test-check_madc_sanity.R

diff --git a/NAMESPACE b/NAMESPACE
index d47d95d..443314d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,6 +4,7 @@ export(allele_freq_poly)
 export(calculate_Het)
 export(calculate_MAF)
 export(check_homozygous_trios)
+export(check_madc_sanity)
 export(check_ped)
 export(check_replicates)
 export(dosage2vcf)
diff --git a/R/check_madc_sanity.R b/R/check_madc_sanity.R
new file mode 100644
index 0000000..19d3ff6
--- /dev/null
+++ b/R/check_madc_sanity.R
@@ -0,0 +1,105 @@
+#' Run basic sanity checks on a MADC-style allele report
+#'
+#' @description
+#' Performs five quick validations on an allele report:
+#' 1) **Columns** – required columns are present (`CloneID`, `AlleleID`, `AlleleSequence`);
+#' 2) **FixAlleleIDs** – first column’s first up-to-6 rows are not all blank or "*"
+#'    *and* both `_0001` and `_0002` appear in `AlleleID`;
+#' 3) **IUPACcodes** – presence of non-ATCG characters in `AlleleSequence`;
+#' 4) **LowerCase** – presence of lowercase a/t/c/g in `AlleleSequence`;
+#' 5) **Indels** – reference/alternate allele lengths differ for the same `CloneID`.
+#'
+#' @param report A `data.frame` with at least the columns
+#'   `CloneID`, `AlleleID`, and `AlleleSequence`. The first column is also
+#'   used in the “FixAlleleIDs” check to inspect its first up to six entries.
+#'
+#' @details
+#' - **IUPAC check:** Flags any character outside `ATCG` (case-insensitive),
+#'   which will include ambiguity codes (`N`, `R`, `Y`, etc.) and symbols like `-`.
+#' - **Indels:** Rows are split by `AlleleID` containing `"Ref_0001"` vs `"Alt_0002"`,
+#'   merged by `CloneID`, and the lengths of `AlleleSequence` are compared.
+#' - If required columns are missing, only **Columns** is evaluated (`FALSE`) and
+#'   `indel_clone_ids` is returned as `NULL`.
+#'
+#' @return A list with:
+#' \describe{
+#'   \item{checks}{Named logical vector with entries
+#'     `Columns`, `FixAlleleIDs`, `IUPACcodes`, `LowerCase`, `Indels`.}
+#'   \item{indel_clone_ids}{Character vector of `CloneID`s where ref/alt lengths differ.
+#'     Returns `character(0)` if none, or `NULL` when required columns are missing.}
+#' }
+#'
+#'
+#' @export
+check_madc_sanity <- function(report) {
+
+  # Initialize
+  checks <- c(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA)
+  messages <-  list(Columns = NA, FixAlleleIDs = NA, IUPACcodes = NA, LowerCase = NA, Indels = NA, ChromPos = NA)
+
+  # Validate required columns
+  required <- c("CloneID", "AlleleID", "AlleleSequence")
+  missing_cols <- setdiff(required, names(report))
+  checks["Columns"] <- length(missing_cols) == 0
+
+  if(checks[["Columns"]]){
+    # ---- FixAlleleIDs ----
+    # Check if first up-to-6 entries in the *first column* are all "" or "*"
+    n <- nrow(report)
+    idx <- seq_len(min(6L, n))
+    first_col_vals <- report[[1]][idx]
+    all_blank_or_star <- all(first_col_vals %in% c("", "*"), na.rm = TRUE)
+    # Also require that both _0001 and _0002 appear in AlleleID
+    has_0001 <- any(grepl("_0001", report$AlleleID, fixed = TRUE), na.rm = TRUE)
+    has_0002 <- any(grepl("_0002", report$AlleleID, fixed = TRUE), na.rm = TRUE)
+    checks["FixAlleleIDs"] <- (!all_blank_or_star) & has_0001 & has_0002
+
+    # ---- IUPACcodes ----
+    iu <- grepl("[^ATCG]", report$AlleleSequence, ignore.case = TRUE)
+    checks["IUPACcodes"] <- any(iu, na.rm = TRUE)
+
+    # ---- LowerCase ----
+    lc <- grepl("[atcg]", report$AlleleSequence)
+    checks["LowerCase"] <- any(lc, na.rm = TRUE)
+
+    # ---- Indels ----
+    refs <- subset(report, grepl("Ref_0001", AlleleID, fixed = TRUE),
+                   select = c(CloneID, AlleleID, AlleleSequence))
+    alts <- subset(report, grepl("Alt_0002", AlleleID, fixed = TRUE),
+                   select = c(CloneID, AlleleID, AlleleSequence))
+
+    merged <- merge(refs, alts, by = "CloneID", suffixes = c("_ref", "_alt"), all = FALSE)
+
+    if (nrow(merged) > 0) {
+      ref_len <- nchar(merged$AlleleSequence_ref, keepNA = TRUE)
+      alt_len <- nchar(merged$AlleleSequence_alt, keepNA = TRUE)
+      cmp_ok <- !is.na(ref_len) & !is.na(alt_len)
+      indel_mask <- cmp_ok & (ref_len != alt_len)
+      checks["Indels"] <- any(indel_mask)
+      indels <- if (any(indel_mask)) merged$CloneID[indel_mask] else character(0)
+    } else {
+      checks["Indels"] <- FALSE
+      indels <- character(0)
+    }
+
+    # ---- Chrom Pos ----
+    pos <- strsplit(report[,2], "_")
+    checks["ChromPos"] <- all(sapply(pos, length) == 2)
+
+  } else indels <- NULL
+
+  messages[["Columns"]] <- c("Required columns are present\n",
+                           "One or more required columns missing. Verify if your file has columns: CloneID, AlleleID, AlleleSequence\n")
+  messages[["FixAlleleIDs"]] <- c("Fixed Allele IDs look good\n",
+                               "MADC not processed by BI. Please contact us to assign allele IDs to your MADC according to the specie haplotype dabatase. This guarantee reproducibility between diferent datasets\n")
+  messages[["IUPACcodes"]] <- c("IUPAC (non-ATCG) codes found in AlleleSequence. This codes are not currently supported\n",
+                             "No IUPAC (non-ATCG) codes found in AlleleSequence\n")
+  messages[["LowerCase"]] <- c("Lowercase bases found in AlleleSequence\n",
+                            "No lowercase bases found in AlleleSequence\n")
+  messages[["Indels"]] <- c(paste("Indels found (ref/alt lengths differ) for the CloneIDs:",paste(indels, collapse = " ")),
+                         "No indels found (ref/alt lengths match) for all CloneIDs\n")
+  messages[["ChromPos"]] <- c("Chromosome and Position format in CloneID look good\n",
+                             "CloneID does not have the expected Chromosome_Position format. Please check your CloneIDs or provide a file with this information\n")
+
+  list(checks = checks, messages = messages, indel_clone_ids = indels)
+}
diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R
index 1b02c31..fc022f9 100644
--- a/R/madc2vcf_targets.R
+++ b/R/madc2vcf_targets.R
@@ -1,45 +1,121 @@
 #' Format MADC Target Loci Read Counts Into VCF
 #'
-#' This function will extract the read count information from a MADC file target markers and convert to VCF file format.
+#' Convert DArTag MADC target read counts to a VCF
 #'
-#' The DArTag MADC file format is not commonly supported through existing tools. This function
-#' will extract the read count information from a MADC file for the target markers and convert it to a VCF file format for the
-#' genotyping panel target markers only
+#' @description
+#' Parses a DArTag **MADC** report and writes a **VCF v4.3** containing per-target
+#' read counts for the panel’s target loci. This is useful because MADC is not
+#' widely supported by general-purpose tools, while VCF is.
 #'
-#' @param madc_file Path to MADC file
-#' @param output.file output file name and path
-#' @param botloci_file A string specifying the path to the file containing the target IDs designed in the bottom strand.
-#' @param get_REF_ALT if TRUE recovers the reference and alternative bases by comparing the sequences. If more than one polymorphism are found for a tag, it is discarded.
+#' @details
+#' **What this function does**
+#' - Runs basic sanity checks on the MADC file (column presence, fixed allele IDs,
+#'   IUPAC/ambiguous bases, lowercase bases, indels).
+#' - Extracts reference and total read counts per sample and target.
+#' - Derives `AD` (ref,alt) by subtraction (alt = total − ref).
+#' - If `get_REF_ALT = TRUE`, attempts to recover true REF/ALT bases by comparing
+#'   the Ref/Alt probe sequences; targets with >1 polymorphism are discarded.
+#' - Optionally accepts a `markers_info` CSV to supply `CHROM`, `POS`, `REF`, `ALT`
+#'   (and `Type`, `Indel_pos` when indels are present), bypassing sequence-based
+#'   inference.
+#'
+#' **Output VCF layout**
+#' - `INFO` fields:
+#'   * `DP`   — total depth across all samples for the locus
+#'   * `ADS`  — total counts across samples in the order `ref,alt`
+#' - `FORMAT` fields (per sample):
+#'   * `DP`   — total reads (ref + alt)
+#'   * `RA`   — reads supporting the reference allele
+#'   * `AD`   — `"ref,alt"` counts
+#'
+#' **Strand handling**
+#' If a target ID appears in `botloci_file`, its probe sequences are reverse-
+#' complemented prior to base comparison so that REF/ALT are reported in the
+#' top-strand genomic orientation.
+#'
+#' **Sanity check behavior**
+#' - If required columns or fixed IDs are missing, the function `stop()`s.
+#' - If IUPAC/lowercase/indels are detected and `markers_info` is **not**
+#'   provided, the function `stop()`s with a diagnostic message explaining what to fix.
+#'
+#' @param madc_file character. Path to the input MADC CSV file.
+#' @param output.file character. Path to the output VCF file to write.
+#' @param botloci_file character. Path to a plain-text file listing target IDs
+#'   designed on the **bottom** strand (one ID per line). Required when
+#'   `get_REF_ALT = TRUE` and `markers_info` is not provided.
+#' @param markers_info character or `NULL`. Optional path to a CSV providing target
+#'   metadata. Required columns: `BI_markerID, Chr, Pos, Ref, Alt`. If indels are
+#'   present, also require `Type, Indel_pos`. When supplied, these values populate
+#'   `#CHROM, POS, REF, ALT` in the VCF directly.
+#' @param get_REF_ALT logical (default `FALSE`). If `TRUE`, attempts to infer REF/ALT
+#'   bases from the Ref/Alt probe sequences in the MADC file (with strand correction
+#'   using `botloci_file`). Targets with more than one difference between Ref/Alt
+#'   sequences are removed.
+#'
+#' @return (Invisibly) returns the path to `output.file`. The side effect is a
+#'   **VCF v4.3** written to disk containing one row per target and columns for all
+#'   samples in the MADC file.
+#'
+#' @section Dependencies:
+#' Uses **dplyr**, **tidyr**, **tibble**, **reshape2**, **Biostrings** and base
+#' **utils**. Helper functions expected in this package: `check_madc_sanity()`,
+#' `get_countsMADC()`, `get_counts()`, and `check_botloci()`.
+#'
+#' @examples
+#' # Example files shipped with the package
+#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package = "BIGr")
+#' bot_file  <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci",
+#'                          package = "BIGr")
+#' out_vcf <- tempfile(fileext = ".vcf")
+#'
+#' # Convert MADC to VCF (attempting to recover REF/ALT from probe sequences)
+#' \dontrun{
+#' madc2vcf_targets(
+#'   madc_file    = madc_file,
+#'   output.file  = out_vcf,
+#'   botloci_file = bot_file,
+#'   get_REF_ALT  = TRUE
+#' )
+#' }
+#'
+#' # Clean up (example)
+#' unlink(out_vcf)
+#'
+#' @seealso
+#' `check_madc_sanity()`, `get_countsMADC()`, `check_botloci()`
 #'
-#' @return A VCF file v4.3 with the target marker read count information
 #' @import dplyr
 #' @import tidyr
 #' @import tibble
-#' @importFrom Rdpack reprompt
 #' @importFrom reshape2 melt dcast
 #' @importFrom utils write.table
 #' @importFrom Biostrings DNAString reverseComplement
-#' @return A VCF file v4.3 with the target marker read count information
-#'
-#' @examples
-#' # Load example files
-#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
-#' bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr")
-#'
-#' #Temp location (only for example)
-#' output_file <- tempfile()
-#'
-#' # Convert MADC to VCF
-#' madc2vcf_targets(madc_file = madc_file,
-#'                  output.file = output_file,
-#'                  get_REF_ALT = TRUE,
-#'                  botloci_file = bot_file)
-#'
-#' rm(output_file)
 #'
 #' @export
-madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = FALSE) {
-  #Making the VCF (This is highly dependent on snps being in a format where the SNP IDs are the CHR_POS)
+madc2vcf_targets <- function(madc_file,
+                             output.file,
+                             botloci_file,
+                             markers_info = NULL,
+                             get_REF_ALT = FALSE) {
+
+  # MADC checks
+  report <- read.csv(madc_file)
+  checks <- check_madc_sanity(report)
+
+  messages_results <- mapply(function(check, message) {
+    if (check)  message[1] else message[2]
+  }, checks$checks, checks$messages)
+
+  if(any(!(checks$checks[c("Columns", "FixAlleleIDs")]))){
+    idx <- which(!(checks$checks[c("Columns", "FixAlleleIDs")]))
+    stop(paste("The MADC file does not pass the sanity checks:\n",
+               paste(messages_results[c("Columns", "FixAlleleIDs")[idx]], collapse = "\n")))
+  }
+
+  if(any(checks$checks[c("IUPACcodes", "LowerCase", "Indels")])){
+    idx <- which((checks$checks[c("IUPACcodes", "LowerCase", "Indels")]))
+    if(is.null(markers_info)) stop(paste(messages_results[c("IUPACcodes", "LowerCase", "Indels")[idx]], collapse = "\n"))
+  }
 
   matrices <- get_countsMADC(madc_file)
   ref_df <- data.frame(matrices[[1]], check.names = FALSE)
@@ -56,18 +132,124 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT =
   row.names(ad_df) <- row.names(ref_df)
 
   #Obtaining Chr and Pos information from the row_names
-  new_df <- size_df %>%
-    rownames_to_column(var = "row_name") %>%
-    separate(row_name, into = c("CHROM", "POS"), sep = "_") %>%
-    select(CHROM, POS)
+  if(is.null(markers_info)){
+    new_df <- size_df %>%
+      rownames_to_column(var = "row_name") %>%
+      separate(row_name, into = c("CHROM", "POS"), sep = "_") %>%
+      select(CHROM, POS)
+
+    # Remove leading zeros from the POS column
+    new_df$POS <- sub("^0+", "", new_df$POS)
+
+    #Get read count sums
+    new_df$TotalRef <- rowSums(ref_df)
+    new_df$TotalAlt <- rowSums(alt_df)
+    new_df$TotalSize <- rowSums(size_df)
+
+    # Get REF and ALT
+    if(get_REF_ALT){
+      if(is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.")
+      csv <- get_counts(madc_file)
+      # Keep only the ones that have alt and ref
+      csv <- csv[which(csv$CloneID %in% rownames(ad_df)),]
+
+      # Get reverse complement the tag is present in botloci
+      botloci <- read.table(botloci_file, header = FALSE)
+
+      # Check if the botloci file marker IDs match with the MADC file
+      checked_botloci <- check_botloci(botloci, csv)
+      botloci <- checked_botloci[[1]]
+      csv <- checked_botloci[[2]]
+
+      # FIXED: Store original sequences before any transformation
+      csv$OriginalAlleleSequence <- csv$AlleleSequence
+
+      # Apply reverse complement to sequences for bottom strand markers
+      idx <- which(csv$CloneID %in% botloci[,1])
+      csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence))))
+
+      ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)]
+      ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)]
+      alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)]
+      alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)]
+
+      # FIXED: Get original sequences for SNP calling
+      orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)]
+      orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)]
 
-  # Remove leading zeros from the POS column
-  new_df$POS <- sub("^0+", "", new_df$POS)
+      if(all(sort(ref_ord) == sort(alt_ord))){
+        # Order sequences consistently
+        ref_seq <- ref_seq[order(ref_ord)]
+        alt_seq <- alt_seq[order(alt_ord)]
+        orig_ref_seq <- orig_ref_seq[order(ref_ord)]
+        orig_alt_seq <- orig_alt_seq[order(alt_ord)]
+        ordered_clone_ids <- sort(ref_ord)
 
-  #Get read count sums
-  new_df$TotalRef <- rowSums(ref_df)
-  new_df$TotalAlt <- rowSums(alt_df)
-  new_df$TotalSize <- rowSums(size_df)
+        ref_base <- alt_base <- vector()
+        for(i in 1:length(orig_ref_seq)){
+          # FIXED: Use original sequences for SNP calling
+          temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "")
+          idx_diff <- which(temp_list[[1]] != temp_list[[2]])
+
+          if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences
+            ref_base[i] <- NA
+            alt_base[i] <- NA
+          } else if(length(idx_diff) == 1) {
+            orig_ref_base <- temp_list[[1]][idx_diff]
+            orig_alt_base <- temp_list[[2]][idx_diff]
+
+            # FIXED: Apply reverse complement to bases only if marker is in botloci
+            if(ordered_clone_ids[i] %in% botloci[,1]) {
+              ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base)))
+              alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base)))
+            } else {
+              ref_base[i] <- orig_ref_base
+              alt_base[i] <- orig_alt_base
+            }
+          } else {
+            # No differences found
+            ref_base[i] <- NA
+            alt_base[i] <- NA
+          }
+        }
+      } else {
+        warning("There are missing reference or alternative sequence, the SNP bases could not be recovery.")
+        ref_base <- "."
+        alt_base <- "."
+      }
+
+    } else {
+      ref_base <- "."
+      alt_base <- "."
+    }
+  } else {
+    # Verify markers_info file
+    df <- read.csv(markers_info)
+    if(checks$checks["Indels"]){
+      if(!all(c("BI_markerID","Chr","Pos","Ref","Alt","Type", "Indel_pos") %in% colnames(df)))
+        stop("The markers_info dataframe must contain the following columns: BI_markerID, CHROM, POS, REF, ALT, Type, Indel_pos")
+    }
+    if(!all(c("BI_markerID","Chr","Pos","Ref","Alt") %in% colnames(df)))
+      stop("The markers_info dataframe must contain the following columns: BI_markerID, CHROM, POS, REF, ALT")
+
+    if(!all(rownames(ad_df)%in% df$BI_markerID))
+      warning("Not all MADC CloneID was found in the markers_info file. These markers will be removed.")
+
+    matched <- df[match(rownames(ad_df), df$BI_markerID),]
+
+    new_df <- data.frame(
+      CHROM = matched$Chr,
+      POS = matched$Pos
+    )
+
+    #Get read count sums
+    new_df$TotalRef <- rowSums(ref_df)
+    new_df$TotalAlt <- rowSums(alt_df)
+    new_df$TotalSize <- rowSums(size_df)
+
+    ref_base <- matched$Ref
+    alt_base <- matched$Alt
+  }
 
   #Make a header separate from the dataframe
   vcf_header <- c(
@@ -82,83 +264,6 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT =
     '##FORMAT=<ID=AD,Number=R,Type=Integer,Description="Allelic depths for the ref and alt alleles in the order listed">'
   )
 
-  # Get REF and ALT
-  if(get_REF_ALT){
-    if(is.null(botloci_file)) stop("Please provide the botloci file to recover the reference and alternative bases.")
-    csv <- get_counts(madc_file)
-    # Keep only the ones that have alt and ref
-    csv <- csv[which(csv$CloneID %in% rownames(ad_df)),]
-
-    # Get reverse complement the tag is present in botloci
-    botloci <- read.table(botloci_file, header = FALSE)
-
-    # Check if the botloci file marker IDs match with the MADC file
-    checked_botloci <- check_botloci(botloci, csv)
-    botloci <- checked_botloci[[1]]
-    csv <- checked_botloci[[2]]
-
-    # FIXED: Store original sequences before any transformation
-    csv$OriginalAlleleSequence <- csv$AlleleSequence
-    
-    # Apply reverse complement to sequences for bottom strand markers
-    idx <- which(csv$CloneID %in% botloci[,1])
-    csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence))))
-
-    ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)]
-    ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)]
-    alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)]
-    alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)]
-    
-    # FIXED: Get original sequences for SNP calling
-    orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)]
-    orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)]
-
-    if(all(sort(ref_ord) == sort(alt_ord))){
-      # Order sequences consistently
-      ref_seq <- ref_seq[order(ref_ord)]
-      alt_seq <- alt_seq[order(alt_ord)]
-      orig_ref_seq <- orig_ref_seq[order(ref_ord)]
-      orig_alt_seq <- orig_alt_seq[order(alt_ord)]
-      ordered_clone_ids <- sort(ref_ord)
-
-      ref_base <- alt_base <- vector()
-      for(i in 1:length(orig_ref_seq)){
-        # FIXED: Use original sequences for SNP calling
-        temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "")
-        idx_diff <- which(temp_list[[1]] != temp_list[[2]])
-        
-        if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences
-          ref_base[i] <- NA
-          alt_base[i] <- NA
-        } else if(length(idx_diff) == 1) {
-          orig_ref_base <- temp_list[[1]][idx_diff]
-          orig_alt_base <- temp_list[[2]][idx_diff]
-          
-          # FIXED: Apply reverse complement to bases only if marker is in botloci
-          if(ordered_clone_ids[i] %in% botloci[,1]) {
-            ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base)))
-            alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base)))
-          } else {
-            ref_base[i] <- orig_ref_base
-            alt_base[i] <- orig_alt_base
-          }
-        } else {
-          # No differences found
-          ref_base[i] <- NA
-          alt_base[i] <- NA
-        }
-      }
-    } else {
-      warning("There are missing reference or alternative sequence, the SNP bases could not be recovery.")
-      ref_base <- "."
-      alt_base <- "."
-    }
-
-  } else {
-    ref_base <- "."
-    alt_base <- "."
-  }
-
   #Make the header#Make the VCF df
   vcf_df <- data.frame(
     CHROM = new_df$CHROM,
@@ -233,12 +338,4 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT =
   suppressWarnings(
     write.table(vcf_df, file = output.file, sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE, append = TRUE)
   )
-  #Unload all items from memory
-  rm(matrices)
-  rm(ref_df)
-  rm(alt_df)
-  rm(size_df)
-  rm(ad_df)
-  rm(vcf_df)
-  rm(geno_df)
 }
diff --git a/man/check_madc_sanity.Rd b/man/check_madc_sanity.Rd
new file mode 100644
index 0000000..494145e
--- /dev/null
+++ b/man/check_madc_sanity.Rd
@@ -0,0 +1,43 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/check_madc_sanity.R
+\name{check_madc_sanity}
+\alias{check_madc_sanity}
+\title{Run basic sanity checks on a MADC-style allele report}
+\usage{
+check_madc_sanity(report)
+}
+\arguments{
+\item{report}{A \code{data.frame} with at least the columns
+\code{CloneID}, \code{AlleleID}, and \code{AlleleSequence}. The first column is also
+used in the “FixAlleleIDs” check to inspect its first up to six entries.}
+}
+\value{
+A list with:
+\describe{
+\item{checks}{Named logical vector with entries
+\code{Columns}, \code{FixAlleleIDs}, \code{IUPACcodes}, \code{LowerCase}, \code{Indels}.}
+\item{indel_clone_ids}{Character vector of \code{CloneID}s where ref/alt lengths differ.
+Returns \code{character(0)} if none, or \code{NULL} when required columns are missing.}
+}
+}
+\description{
+Performs five quick validations on an allele report:
+\enumerate{
+\item \strong{Columns} – required columns are present (\code{CloneID}, \code{AlleleID}, \code{AlleleSequence});
+\item \strong{FixAlleleIDs} – first column’s first up-to-6 rows are not all blank or "*"
+\emph{and} both \verb{_0001} and \verb{_0002} appear in \code{AlleleID};
+\item \strong{IUPACcodes} – presence of non-ATCG characters in \code{AlleleSequence};
+\item \strong{LowerCase} – presence of lowercase a/t/c/g in \code{AlleleSequence};
+\item \strong{Indels} – reference/alternate allele lengths differ for the same \code{CloneID}.
+}
+}
+\details{
+\itemize{
+\item \strong{IUPAC check:} Flags any character outside \code{ATCG} (case-insensitive),
+which will include ambiguity codes (\code{N}, \code{R}, \code{Y}, etc.) and symbols like \code{-}.
+\item \strong{Indels:} Rows are split by \code{AlleleID} containing \code{"Ref_0001"} vs \code{"Alt_0002"},
+merged by \code{CloneID}, and the lengths of \code{AlleleSequence} are compared.
+\item If required columns are missing, only \strong{Columns} is evaluated (\code{FALSE}) and
+\code{indel_clone_ids} is returned as \code{NULL}.
+}
+}
diff --git a/man/madc2vcf_targets.Rd b/man/madc2vcf_targets.Rd
index a790460..fad847c 100644
--- a/man/madc2vcf_targets.Rd
+++ b/man/madc2vcf_targets.Rd
@@ -4,44 +4,114 @@
 \alias{madc2vcf_targets}
 \title{Format MADC Target Loci Read Counts Into VCF}
 \usage{
-madc2vcf_targets(madc_file, output.file, botloci_file, get_REF_ALT = FALSE)
+madc2vcf_targets(
+  madc_file,
+  output.file,
+  botloci_file,
+  markers_info = NULL,
+  get_REF_ALT = FALSE
+)
 }
 \arguments{
-\item{madc_file}{Path to MADC file}
+\item{madc_file}{character. Path to the input MADC CSV file.}
 
-\item{output.file}{output file name and path}
+\item{output.file}{character. Path to the output VCF file to write.}
 
-\item{botloci_file}{A string specifying the path to the file containing the target IDs designed in the bottom strand.}
+\item{botloci_file}{character. Path to a plain-text file listing target IDs
+designed on the \strong{bottom} strand (one ID per line). Required when
+\code{get_REF_ALT = TRUE} and \code{markers_info} is not provided.}
 
-\item{get_REF_ALT}{if TRUE recovers the reference and alternative bases by comparing the sequences. If more than one polymorphism are found for a tag, it is discarded.}
+\item{markers_info}{character or \code{NULL}. Optional path to a CSV providing target
+metadata. Required columns: \verb{BI_markerID, Chr, Pos, Ref, Alt}. If indels are
+present, also require \verb{Type, Indel_pos}. When supplied, these values populate
+\verb{#CHROM, POS, REF, ALT} in the VCF directly.}
+
+\item{get_REF_ALT}{logical (default \code{FALSE}). If \code{TRUE}, attempts to infer REF/ALT
+bases from the Ref/Alt probe sequences in the MADC file (with strand correction
+using \code{botloci_file}). Targets with more than one difference between Ref/Alt
+sequences are removed.}
 }
 \value{
-A VCF file v4.3 with the target marker read count information
-
-A VCF file v4.3 with the target marker read count information
+(Invisibly) returns the path to \code{output.file}. The side effect is a
+\strong{VCF v4.3} written to disk containing one row per target and columns for all
+samples in the MADC file.
 }
 \description{
-This function will extract the read count information from a MADC file target markers and convert to VCF file format.
+Parses a DArTag \strong{MADC} report and writes a \strong{VCF v4.3} containing per-target
+read counts for the panel’s target loci. This is useful because MADC is not
+widely supported by general-purpose tools, while VCF is.
 }
 \details{
-The DArTag MADC file format is not commonly supported through existing tools. This function
-will extract the read count information from a MADC file for the target markers and convert it to a VCF file format for the
-genotyping panel target markers only
+Convert DArTag MADC target read counts to a VCF
+
+\strong{What this function does}
+\itemize{
+\item Runs basic sanity checks on the MADC file (column presence, fixed allele IDs,
+IUPAC/ambiguous bases, lowercase bases, indels).
+\item Extracts reference and total read counts per sample and target.
+\item Derives \code{AD} (ref,alt) by subtraction (alt = total − ref).
+\item If \code{get_REF_ALT = TRUE}, attempts to recover true REF/ALT bases by comparing
+the Ref/Alt probe sequences; targets with >1 polymorphism are discarded.
+\item Optionally accepts a \code{markers_info} CSV to supply \code{CHROM}, \code{POS}, \code{REF}, \code{ALT}
+(and \code{Type}, \code{Indel_pos} when indels are present), bypassing sequence-based
+inference.
 }
-\examples{
-# Load example files
-madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
-bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr")
 
-#Temp location (only for example)
-output_file <- tempfile()
+\strong{Output VCF layout}
+\itemize{
+\item \code{INFO} fields:
+\itemize{
+\item \code{DP}   — total depth across all samples for the locus
+\item \code{ADS}  — total counts across samples in the order \verb{ref,alt}
+}
+\item \code{FORMAT} fields (per sample):
+\itemize{
+\item \code{DP}   — total reads (ref + alt)
+\item \code{RA}   — reads supporting the reference allele
+\item \code{AD}   — \code{"ref,alt"} counts
+}
+}
 
-# Convert MADC to VCF
-madc2vcf_targets(madc_file = madc_file,
-                 output.file = output_file,
-                 get_REF_ALT = TRUE,
-                 botloci_file = bot_file)
+\strong{Strand handling}
+If a target ID appears in \code{botloci_file}, its probe sequences are reverse-
+complemented prior to base comparison so that REF/ALT are reported in the
+top-strand genomic orientation.
 
-rm(output_file)
+\strong{Sanity check behavior}
+\itemize{
+\item If required columns or fixed IDs are missing, the function \code{stop()}s.
+\item If IUPAC/lowercase/indels are detected and \code{markers_info} is \strong{not}
+provided, the function \code{stop()}s with a diagnostic message explaining what to fix.
+}
+}
+\section{Dependencies}{
 
+Uses \strong{dplyr}, \strong{tidyr}, \strong{tibble}, \strong{reshape2}, \strong{Biostrings} and base
+\strong{utils}. Helper functions expected in this package: \code{check_madc_sanity()},
+\code{get_countsMADC()}, \code{get_counts()}, and \code{check_botloci()}.
+}
+
+\examples{
+# Example files shipped with the package
+madc_file <- system.file("example_MADC_FixedAlleleID.csv", package = "BIGr")
+bot_file  <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci",
+                         package = "BIGr")
+out_vcf <- tempfile(fileext = ".vcf")
+
+# Convert MADC to VCF (attempting to recover REF/ALT from probe sequences)
+\dontrun{
+madc2vcf_targets(
+  madc_file    = madc_file,
+  output.file  = out_vcf,
+  botloci_file = bot_file,
+  get_REF_ALT  = TRUE
+)
+}
+
+# Clean up (example)
+unlink(out_vcf)
+
+}
+\seealso{
+\code{check_madc_sanity()}, \code{get_countsMADC()}, \code{check_botloci()}
 }
diff --git a/tests/testthat/test-check_madc_sanity.R b/tests/testthat/test-check_madc_sanity.R
new file mode 100644
index 0000000..5053e55
--- /dev/null
+++ b/tests/testthat/test-check_madc_sanity.R
@@ -0,0 +1,10 @@
+test_that("check madc",{
+  madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
+  report <- read.csv(madc_file, check.names = FALSE)
+
+  check_madc_sanity(report)
+
+
+})
+
+

From 0b97b46434afcd7f441cb2b9705df2a9323232bf Mon Sep 17 00:00:00 2001
From: Cristianetaniguti <chaytaniguti@gmail.com>
Date: Fri, 14 Nov 2025 12:59:33 -0500
Subject: [PATCH 2/2] bugfix - if hapDB padding is not matching report

---
 R/madc2vcf_all.R | 16 ++++++++++++++--
 R/utils.R        |  1 +
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R
index 329ac5a..125d540 100644
--- a/R/madc2vcf_all.R
+++ b/R/madc2vcf_all.R
@@ -135,7 +135,7 @@ madc2vcf_all <- function(madc = NULL,
 loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, alignment_score_thr=40, verbose = TRUE){
 
   if(!is.null(hap_seq)){
-    hap_seq <- get_ref_alt_hap_seq(hap_seq)
+    hap_seq <- get_ref_alt_hap_seq(hap_seq, botloci)
   }
 
   nsamples <- ncol(report) - 3
@@ -376,7 +376,8 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){
 #' @param hap_seq haplotype db
 #'
 #' @noRd
-get_ref_alt_hap_seq <- function(hap_seq){
+get_ref_alt_hap_seq <- function(hap_seq, botloci){
+
   headers <- hap_seq$V1[grep(">",hap_seq$V1)]
   headers <- gsub(">", "", headers)
 
@@ -394,6 +395,17 @@ get_ref_alt_hap_seq <- function(hap_seq){
   seqs <- sapply(seqs, function(x) paste0(x, collapse = ""))
 
   hap_seq <- data.frame(AlleleID = headers, AlleleSequence = seqs)
+
+  # Check padding
+  hap_cloneID <- sapply(strsplit(hap_seq$AlleleID, "[|]"), function(x) x[1])
+  botloci_cloneID <- botloci$V1
+
+  pad_hap <- unique(nchar(sub(".*_", "", hap_cloneID)))
+  pad_botloci <- unique(nchar(sub(".*_", "", botloci_cloneID)))
+
+  if(length(pad_hap) > 1) stop("Check marker IDs in haplotype DB file. They should have the same padding.")
+  if(pad_hap != pad_botloci) stop("Check marker IDs padding in haplotype DB file. They should match the botloci file.")
+
   return(hap_seq)
 }
 
diff --git a/R/utils.R b/R/utils.R
index c280ad2..2399560 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -56,6 +56,7 @@ check_botloci <- function(botloci, report, verbose=TRUE){
         report$CloneID <- paste0(sub("_(.*)", "", report$CloneID), "_",
                                  sprintf(paste0("%0", pad_botloci, "d"), as.integer(sub(".*_", "", report$CloneID)))
         )
+        report$AlleleID <- paste0(report$CloneID, "|", sapply(strsplit(report$AlleleID, "[|]"), "[[",2))
       } else {
         botloci$V1 <- paste0(sub("_(.*)", "", botloci$V1), "_",
                              sprintf(paste0("%0", pad_madc, "d"), as.integer(sub(".*_", "", botloci$V1)))