Breeding-Insight · alex-sandercock · Sep 12, 2025 · Jun 4, 2025 · Jun 4, 2025 · Jun 11, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: BIGr
 Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species
-Version: 0.6.0
+Version: 0.6.1
 Authors@R: c(person(given='Alexander M.',
                     family='Sandercock',
                     email='ams866@cornell.edu',
@@ -53,15 +53,16 @@ Imports:
     Rdpack (>= 0.7),
     readr (>= 2.1.5),
     reshape2 (>= 1.4.4),
-    rrBLUP,
+    rlang,
     tidyr (>= 1.3.1),
     vcfR (>= 1.15.0),
     Rsamtools,
     Biostrings,
     pwalign,
     janitor,
     quadprog,
-    tibble
+    tibble,
+    stringr
 Suggests: 
     covr,
     spelling,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,5 +1,6 @@
 # Generated by roxygen2: do not edit by hand
 
+export(allele_freq_poly)
 export(calculate_Het)
 export(calculate_MAF)
 export(check_homozygous_trios)
@@ -16,11 +17,15 @@ export(madc2gmat)
 export(madc2vcf_all)
 export(madc2vcf_targets)
 export(merge_MADCs)
+export(solve_composition_poly)
+export(thinSNP)
 export(updog2vcf)
 import(dplyr)
 import(janitor)
 import(parallel)
 import(quadprog)
+import(rlang)
+import(stringr)
 import(tibble)
 import(tidyr)
 import(vcfR)
@@ -33,7 +38,6 @@ importFrom(pwalign,pairwiseAlignment)
 importFrom(readr,read_csv)
 importFrom(reshape2,dcast)
 importFrom(reshape2,melt)
-importFrom(rrBLUP,A.mat)
 importFrom(stats,cor)
 importFrom(stats,setNames)
 importFrom(utils,packageVersion)

diff --git a/R/breedtools_functions.R b/R/breedtools_functions.R
@@ -32,7 +32,7 @@
 #' allele_freqs <- allele_freq_poly(geno = geno_matrix, populations = pop_list, ploidy = 4)
 #' print(allele_freqs)
 #'
-#' @noRd
+#' @export
 allele_freq_poly <- function(geno, populations, ploidy = 2) {
 
   # Initialize returned df
@@ -168,7 +168,7 @@ QPsolve <- function(Y, X) {
 #'                                       ploidy = 4)
 #' print(composition)
 #'
-#' @noRd
+#' @export
 solve_composition_poly <- function(Y,
                                    X,
                                    ped = NULL,

diff --git a/R/madc2gmat.R b/R/madc2gmat.R
@@ -3,16 +3,17 @@
 #' Scale and normalize MADC read count data and convert it to an additive genomic relationship matrix.
 #'
 #'@details
-#' This function reads a MADC file, processes it to remove unnecessary columns, scales and normalizes the data, and
-#' then converts it into an additive genomic relationship matrix using the `A.mat` function from the `rrBLUP` package.
+#' This function reads a MADC file, processes it to remove unnecessary columns, and then
+#' converts it into an additive genomic relationship matrix using the first method proposed by VanRaden (2008).
 #' The resulting matrix can be used for genomic selection or other genetic analyses.
 #'
-#'@import dplyr
 #'@importFrom utils read.csv write.csv
-#'@importFrom rrBLUP A.mat
+#'@import tibble stringr dplyr tidyr
 #'
 #'@param madc_file Path to the MADC file to be filtered
 #'@param seed Optional seed for random number generation (default is NULL)
+#'@param method Method to use for processing the MADC data. Options are "unique" or "collapsed". Default is "collapsed".
+#'@param ploidy Numeric. Ploidy level of the samples (e.g., 2 for diploid, 4 for tetraploid)
 #'@param output.file Path to save the filtered data (if NULL, data will not be saved)
 #'
 #'@return data.frame or saved csv file
@@ -27,14 +28,17 @@
 #' # Converting to additive relationship matrix
 #' gmat <- madc2gmat(madc_file,
 #'                  seed = 123,
+#'                  ploidy=2,
 #'                  output.file = NULL)
 #'
-#'@references
-#'Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3).
+#'@references VanRaden, P. M. (2008). Efficient methods to compute genomic predictions. Journal of Dairy Science, 91(11), 4414-4423
+#'@author Alexander M. Sandercock
 #'
 #'@export
 madc2gmat <- function(madc_file,
                       seed = NULL,
+                      method = "collapsed",
+                      ploidy,
-                      ploidy,
+                      ploidy = 2,
-                      ploidy,
+                      ploidy = 2,
-                      ploidy,
+                      ploidy = 2,
-                      ploidy,
+                      ploidy = 2,
                       output.file = NULL) {
   #set seed if not null
   if (!is.null(seed)) {
@@ -62,39 +66,163 @@ madc2gmat <- function(madc_file,
     filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE)
   }
 
-  #Remove extra text after Ref and Alt (_001 or _002)
-  filtered_df$AlleleID <- sub("\\|Ref_001*", "|Ref", filtered_df$AlleleID)
-  filtered_df$AlleleID <- sub("\\|Alt_002", "|Alt", filtered_df$AlleleID)
+  #Convert to data.table
+  #filtered_df <- as.data.table(filtered_df)
+
+  if (method == "unique") {
+    #Get allele ratios
+    # --- Step 1: Calculate frequency of each specific allele using total locus count ---
+    # The denominator MUST be the sum of all reads at the locus.
+    allele_freq_df <- filtered_df %>%
+      group_by(CloneID) %>%
+      mutate(across(
+        .cols = where(is.numeric),
+        # The calculation is count / total_locus_count
+        .fns = ~ . / sum(.),
+        .names = "{.col}_freq"
+      )) %>%
+      ungroup()
+
+    #Rm object
+    rm(filtered_df)
+
+    # --- Step 2: Prepare data for reshaping ---
+    # Filter out the reference rows and create a unique marker name for each alternative allele
+    markers_to_pivot <- allele_freq_df %>%
+      filter(!str_ends(AlleleID, "\\|Ref_0001")) %>%
+      # Create a new, unique column for each marker (e.g., "chr1.1_000194324_RefMatch_0001")
+      # This makes each alternative allele its own column in the final matrix.
+      mutate(MarkerID = str_replace(AlleleID, "\\|", "_")) %>%
+      select(MarkerID, ends_with("_freq")) %>%
+      tibble::column_to_rownames(var = "MarkerID")
+    names(markers_to_pivot) <- str_remove(names(markers_to_pivot), "_freq$")
+    markers_to_pivot <- t(markers_to_pivot)
+    #Replace the NaN to NA
+    markers_to_pivot[is.nan(markers_to_pivot)] <- NA
+
+    #rm unneeded objects
+    rm(allele_freq_df)
+
+    #Step 3 is a robust transformation step, but probably not needed.
+    # --- Step 3: Reshape data into the Marker Matrix format ---
+    #marker_matrix_for_Amat <- markers_to_pivot %>%
+    #  pivot_longer(
+    #    cols = -MarkerID,
+    #    names_to = "SampleID",
+    #    values_to = "AlleleFreq"
+    #  ) %>%
+    #  mutate(SampleID = str_remove(SampleID, "_freq$")) %>%
+    #  pivot_wider(
+    #    names_from = MarkerID,
+    #    values_from = AlleleFreq,
+    # If a sample has zero reads for an allele, it won't appear after the filter.
+    # values_fill = 0 ensures these are explicitly set to zero frequency.
+    #    values_fill = 0
+    #  ) %>%
+    #  tibble::column_to_rownames("SampleID") %>%
+    #  as.matrix()
+  } else if (method == "collapsed"){
+
+    # This single pipeline calculates the final matrix.
+    # The output is the marker dosage matrix (X) to be used as input for A.mat()
+    markers_to_pivot <- filtered_df %>%
+
+      # --- Part A: Calculate the frequency of each allele at each locus ---
+      group_by(CloneID) %>%
+      mutate(across(
+        .cols = where(is.numeric),
+        # Use if_else to prevent 0/0, which results in NaN
+        .fns = ~ . / sum(.),
+        .names = "{.col}_freq"
+      )) %>%
+      ungroup() %>%
+
+      # --- Part B: Isolate and sum all alternative allele frequencies ---
+      #Note, the RefMatch counts are being counted as Reference allele counts, and not
+      #Alternative allele counts
+      # Filter to keep only the alternative alleles using base R's endsWith()
+      filter(!grepl("\\|Ref", AlleleID)) %>%
+      # Group by the main locus ID
+      group_by(CloneID) %>%
+      # For each locus, sum the frequencies of all its alternative alleles
+      summarise(across(ends_with("_freq"), sum), .groups = 'drop') %>%
+
+      # --- Part C: Reshape the data into the final matrix format ---
+      # Convert from a "long" format to a "wide" format
+      pivot_longer(
+        cols = -CloneID,
+        names_to = "SampleID",
+        values_to = "Dosage"
+      ) %>%
+      # Clean up the sample names using base R's sub()
+      mutate(SampleID = sub("_freq$", "", SampleID)) %>%
+      # Pivot to the final shape: samples in rows, markers in columns
+      pivot_wider(
+        names_from = CloneID,
+        values_from = Dosage,
+        # This ensures that if a sample/locus combination is missing,
+        # it gets a value of 0 instead of NA.
+        values_fill = 0
+      ) %>%
+      # Convert the 'SampleID' column into the actual row names of the dataframe
+      column_to_rownames("SampleID") %>%
+      # Convert the final dataframe into a true matrix object
+      as.matrix()
+    #Replace the NaN to NA
+    markers_to_pivot[is.nan(markers_to_pivot)] <- NA
+    #rm unneeded objects
+    rm(filtered_df)
 
-  #Removing extra columns
-  row.names(filtered_df) <- filtered_df$AlleleID
-  filtered_df <- filtered_df %>%
-    select(-c(AlleleID, CloneID, AlleleSequence))
-
-
-  #Scale and normalized data
-  message("Scaling and normalizing data to be -1,1")
-  # Function to scale a matrix to be between -1 and 1 for rrBLUP
-  scale_matrix <- function(mat) {
-    min_val <- min(mat)
-    max_val <- max(mat)
-
-    # Normalize to [0, 1]
-    normalized <- (mat - min_val) / (max_val - min_val)
-
-    # Scale to [-1, 1]
-    scaled <- 2 * normalized - 1
-
-    return(scaled)
+  } else {
+    stop("Invalid method specified. Use 'unique' or 'collapsed'.")
   }
 
-  # Apply the scaling function
-  filtered_df <- scale_matrix(filtered_df)
-
-  #Making additive relationship matrix
-  MADC.mat <- A.mat(t(filtered_df))
-
-  rm(filtered_df)
+  #VanRaden method
+  #This method is used to calculate the additive relationship matrix
+  compute_relationship_matrix <- function(mat,
+                                          ploidy,
+                                          method = "VanRaden",
+                                          impute = TRUE) {
+    ### Prepare matrix
+    #Remove any SNPs that are all NAs
+    mat <- mat[, colSums(is.na(mat)) < nrow(mat)]
+    #impute the missing values with the mean of the column
+    if (impute) {
+      mat <- data.frame(mat, check.names = FALSE, check.rows = FALSE) %>%
+        mutate_all(~ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)) %>%
+        as.matrix()
+    }
+
+    if (method == "VanRaden") {
+      # Calculate the additive relationship matrix using VanRaden's method
+
+      #Calculate allele frequencies
+      p <- apply(mat, 2, mean, na.rm = TRUE)
+      q <- 1 - p
+      #Calculate the denominator
+      #Note: need to use 1/ploidy for ratios, where ploidy should be used for dosages.
+      #This is because the ratios are from 0-1, which is what you get when dosage/ploidy, whereas dosages are from 0 to ploidy
+      denominator <- (1/as.numeric(ploidy))*sum(p * q, na.rm = TRUE)
+      #Get the numerator
+      Z <- scale(mat, center = TRUE, scale = FALSE)
+      ZZ <- (Z %*% t(Z))
+      #Calculate the additive relationship matrix
+      A.mat <- ZZ / denominator
+
+      return(A.mat)
+
+    } else {
+      stop("Unsupported method. Only 'VanRaden' is currently implemented.")
+    }
+  }
+  #Calculate the additive relationship matrix
+  MADC.mat <- compute_relationship_matrix(markers_to_pivot,
+                                          ploidy = ploidy,
+                                          method = "VanRaden",
+                                          impute = TRUE)
+
+  #Remove the markers_to_pivot object to free up memory
+  rm(markers_to_pivot)
 
   #Save the output to disk if file name provided
   if (!is.null(output.file)) {

diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R
@@ -97,28 +97,55 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT =
     botloci <- checked_botloci[[1]]
     csv <- checked_botloci[[2]]
 
+    # FIXED: Store original sequences before any transformation
+    csv$OriginalAlleleSequence <- csv$AlleleSequence
+
+    # Apply reverse complement to sequences for bottom strand markers
     idx <- which(csv$CloneID %in% botloci[,1])
     csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence))))
 
     ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)]
     ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)]
     alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)]
     alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)]
+
+    # FIXED: Get original sequences for SNP calling
+    orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)]
+    orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)]
 
     if(all(sort(ref_ord) == sort(alt_ord))){
+      # Order sequences consistently
       ref_seq <- ref_seq[order(ref_ord)]
       alt_seq <- alt_seq[order(alt_ord)]
+      orig_ref_seq <- orig_ref_seq[order(ref_ord)]
+      orig_alt_seq <- orig_alt_seq[order(alt_ord)]
+      ordered_clone_ids <- sort(ref_ord)
 
       ref_base <- alt_base <- vector()
-      for(i in 1:length(ref_seq)){
-        temp_list <- strsplit(c(ref_seq[i], alt_seq[i]), "")
-        idx <- which(temp_list[[1]] != temp_list[[2]])
-        if(length(idx) >1) { # If finds more than one polymorphism between Ref and Alt sequences
+      for(i in 1:length(orig_ref_seq)){
+        # FIXED: Use original sequences for SNP calling
+        temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "")
+        idx_diff <- which(temp_list[[1]] != temp_list[[2]])
+
+        if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences
           ref_base[i] <- NA
           alt_base[i] <- NA
+        } else if(length(idx_diff) == 1) {
+          orig_ref_base <- temp_list[[1]][idx_diff]
+          orig_alt_base <- temp_list[[2]][idx_diff]
+
+          # FIXED: Apply reverse complement to bases only if marker is in botloci
+          if(ordered_clone_ids[i] %in% botloci[,1]) {
+            ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base)))
+            alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base)))
+          } else {
+            ref_base[i] <- orig_ref_base
+            alt_base[i] <- orig_alt_base
+          }
         } else {
-          ref_base[i] <- temp_list[[1]][idx]
-          alt_base[i] <- temp_list[[2]][idx]
+          # No differences found
+          ref_base[i] <- NA
+          alt_base[i] <- NA
         }
       }
     } else {