From d2fc874a9c4fa6c864725aa51f2261ce148daeda Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Fri, 16 May 2025 11:43:06 -0400 Subject: [PATCH 01/35] Added MADC Functions --- NAMESPACE | 3 ++ R/filterMADC.R | 99 +++++++++++++++++++++++++++++++++++++++++++ R/madc2gmat.R | 106 ++++++++++++++++++++++++++++++++++++++++++++++ man/filterMADC.Rd | 63 +++++++++++++++++++++++++++ man/madc2gmat.Rd | 35 +++++++++++++++ 5 files changed, 306 insertions(+) create mode 100644 R/filterMADC.R create mode 100644 R/madc2gmat.R create mode 100644 man/filterMADC.Rd create mode 100644 man/madc2gmat.Rd diff --git a/NAMESPACE b/NAMESPACE index 2ca9c31..9b60af0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,10 +7,12 @@ export(check_ped) export(check_replicates) export(dosage2vcf) export(dosage_ratios) +export(filterMADC) export(filterVCF) export(flip_dosage) export(get_countsMADC) export(imputation_concordance) +export(madc2gmat) export(madc2vcf_all) export(madc2vcf_targets) export(merge_MADCs) @@ -31,6 +33,7 @@ importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) importFrom(reshape2,dcast) importFrom(reshape2,melt) +importFrom(rrBLUP,A.mat) importFrom(stats,cor) importFrom(stats,setNames) importFrom(utils,packageVersion) diff --git a/R/filterMADC.R b/R/filterMADC.R new file mode 100644 index 0000000..8c8259e --- /dev/null +++ b/R/filterMADC.R @@ -0,0 +1,99 @@ +#' Filter MADC Files +#' +#' Filter and process MADC files to remove low quality microhaplotypes +#' +#' @details +#' This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally, +#' it can filter based on mean read depth, number of mhaps per target loci, and other criteria. Optionally, users +#' can scale and normalize the data in preparation for conversion to relationship matrices, +#' plot summary statistics, and save the filtered data to a file. +#' +#'@import dplyr +#'@importFrom utils read.csv +#' +#'@param madc_file Path to the MADC file to be filtered +#'@param min.mean.reads Minimum mean read depth for filtering +#'@param max.mean.reads Maximum mean read depth for filtering +#'@param max.match.mhaps Maximum number of matching mhaps per target loci +#'@param min.reads.per.site Minimum number of reads per site for filtering +#'@param min.ind.with.reads Minimum number of individuals with reads for filtering +#'@param target_only Logical indicating whether to filter for target loci only +#'@param fixed_allele_ids Logical indicating whether the MADC file has been pre-processed for fixed allele IDs +#'@param plot.summary Logical indicating whether to plot summary statistics +#'@param output.file Path to save the filtered data (if NULL, data will not be saved) +#'@param verbose Logical indicating whether to print additional information during processing +#' +#'@return data.frame or saved csv file +#' +#'@examples +#' #Example... +#' +#' ##Plots +#' #Mean read depth +#' #Number of Altmatch and Refmatch mhaps per target loci +#' +#' +#'@export +filterMADC <- function(madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.match.mhaps = 10, + min.reads.per.site = NULL, + min.ind.with.reads = NULL, + target_only = FALSE, + fixed_allele_ids = FALSE, + plot.summary = FALSE, + output.file = NULL) { + + + #Need to first inspect the first 7 rows of the MADC to see if it has been preprocessed or not + first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL")) + + #Check if all entries in the first column are either blank or "*" + check_entries <- all(first_seven_rows[, 1] %in% c("", "*")) + + #Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline + if (check_entries) { + #Note: This assumes that the first 7 rows are placeholder info from DArT processing + + warning("The MADC file has not been pre-processed for Fixed Allele IDs. The first 7 rows are placeholder info from DArT processing.") + + #Read the madc file + filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) + + #Remove extra text after Ref and Alt (_001 or _002) + filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID) + + } else { + + #Read the madc file + filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE) + + #Remove extra text after Ref and Alt (_001 or _002) + filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID) + + } + + #Remove refmatch and altmatch if wanted + if (target_only) { + message("Retaining target markers only") + #Retain only the Ref and Alt haplotypes + filtered_df <- filtered_df[!grepl("\\|AltMatch|\\|RefMatch", filtered_df$AlleleID), ] + } + + ## Filtering + if (!is.null(min.mean.reads)) { + message("Filtering for minimum mean reads across all samples") + filtered_df <- filtered_df[filtered_df$MeanReads >= min.mean.reads, ] + } + + #Save the output to disk if file name provided + if (!is.null(output.file)) { + message("Saving filtered data to file") + write.csv(filtered_df, paste0(output.file,".csv"), row.names = FALSE) + } + + return(filtered_df) +} diff --git a/R/madc2gmat.R b/R/madc2gmat.R new file mode 100644 index 0000000..b39b364 --- /dev/null +++ b/R/madc2gmat.R @@ -0,0 +1,106 @@ +#' Convert MADC Files to an Additive Genomic Relationship Matrix +#' +#' Scale and normalize MADC read count data and convert it to an additive genomic relationship matrix. +#' +#'@details +#' This function reads a MADC file, processes it to remove unnecessary columns, scales and normalizes the data, and +#' then converts it into an additive genomic relationship matrix using the `A.mat` function from the `rrBLUP` package. +#' The resulting matrix can be used for genomic selection or other genetic analyses. +#' +#'@import dplyr +#'@importFrom utils read.csv write.csv +#'@importFrom rrBLUP A.mat +#' +#'@param madc_file Path to the MADC file to be filtered +#'@param output.file Path to save the filtered data (if NULL, data will not be saved) +#' +#'@return data.frame or saved csv file +#' +#'@examples +#' #Example... +#' +#' ##Plots +#' #Mean read depth +#' #Number of Altmatch and Refmatch mhaps per target loci +#' +#'@references +#'Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3). +#' +#'@export +madc2gmat <- function(madc_file, + output.file = NULL) { + + + #Need to first inspect the first 7 rows of the MADC to see if it has been preprocessed or not + first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL")) + + #Check if all entries in the first column are either blank or "*" + check_entries <- all(first_seven_rows[, 1] %in% c("", "*")) + + #Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline + if (check_entries) { + #Note: This assumes that the first 7 rows are placeholder info from DArT processing + + warning("The MADC file has not been pre-processed for Fixed Allele IDs. The first 7 rows are placeholder info from DArT processing.") + + #Read the madc file + filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) + + #Remove extra text after Ref and Alt (_001 or _002) + #filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) + #filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Ref_001", "|Ref", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Alt_002*", "|Alt", filtered_df$AlleleID) + + } else { + + #Read the madc file + filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE) + + #Remove extra text after Ref and Alt (_001 or _002) + #filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) + #filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Ref_001*", "|Ref", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Alt_002", "|Alt", filtered_df$AlleleID) + + } + + #Removing extra columns + row.names(filtered_df) <- filtered_df$AlleleID + filtered_df <- filtered_df %>% + select(-c(AlleleID, CloneID, AlleleSequence)) + + + #Scale and normalized data + message("Scaling and normalizing data to be -1,1") + #filtered_df <- filtered_df %>% + # mutate(across(starts_with("MeanReads"), ~ scale(.) %>% as.numeric())) + + # Function to scale a matrix to be between -1 and 1 for rrBLUP + scale_matrix <- function(mat) { + min_val <- min(mat) + max_val <- max(mat) + + # Normalize to [0, 1] + normalized <- (mat - min_val) / (max_val - min_val) + + # Scale to [-1, 1] + scaled <- 2 * normalized - 1 + + return(scaled) + } + + # Apply the scaling function + filtered_df <- scale_matrix(filtered_df) + + #Making additive relationship matrix + MADC.mat <- A.mat(t(filtered_df)) + + #Save the output to disk if file name provided + if (!is.null(output.file)) { + message("Saving filtered data to file") + write.csv(MADC.mat, paste0(output.file,".csv"), row.names = TRUE) + } + + return(MADC.mat) +} diff --git a/man/filterMADC.Rd b/man/filterMADC.Rd new file mode 100644 index 0000000..2b161dd --- /dev/null +++ b/man/filterMADC.Rd @@ -0,0 +1,63 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/filterMADC.R +\name{filterMADC} +\alias{filterMADC} +\title{Filter MADC Files} +\usage{ +filterMADC( + madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.match.mhaps = 10, + min.reads.per.site = NULL, + min.ind.with.reads = NULL, + target_only = FALSE, + fixed_allele_ids = FALSE, + plot.summary = FALSE, + output.file = NULL +) +} +\arguments{ +\item{madc_file}{Path to the MADC file to be filtered} + +\item{min.mean.reads}{Minimum mean read depth for filtering} + +\item{max.mean.reads}{Maximum mean read depth for filtering} + +\item{max.match.mhaps}{Maximum number of matching mhaps per target loci} + +\item{min.reads.per.site}{Minimum number of reads per site for filtering} + +\item{min.ind.with.reads}{Minimum number of individuals with reads for filtering} + +\item{target_only}{Logical indicating whether to filter for target loci only} + +\item{fixed_allele_ids}{Logical indicating whether the MADC file has been pre-processed for fixed allele IDs} + +\item{plot.summary}{Logical indicating whether to plot summary statistics} + +\item{output.file}{Path to save the filtered data (if NULL, data will not be saved)} + +\item{verbose}{Logical indicating whether to print additional information during processing} +} +\value{ +data.frame or saved csv file +} +\description{ +Filter and process MADC files to remove low quality microhaplotypes +} +\details{ +This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally, +it can filter based on mean read depth, number of mhaps per target loci, and other criteria. Optionally, users +can scale and normalize the data in preparation for conversion to relationship matrices, +plot summary statistics, and save the filtered data to a file. +} +\examples{ +#Example... + +##Plots +#Mean read depth +#Number of Altmatch and Refmatch mhaps per target loci + + +} diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd new file mode 100644 index 0000000..6bd0b4f --- /dev/null +++ b/man/madc2gmat.Rd @@ -0,0 +1,35 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/madc2gmat.R +\name{madc2gmat} +\alias{madc2gmat} +\title{Convert MADC Files to an Additive Genomic Relationship Matrix} +\usage{ +madc2gmat(madc_file, output.file = NULL) +} +\arguments{ +\item{madc_file}{Path to the MADC file to be filtered} + +\item{output.file}{Path to save the filtered data (if NULL, data will not be saved)} +} +\value{ +data.frame or saved csv file +} +\description{ +Scale and normalize MADC read count data and convert it to an additive genomic relationship matrix. +} +\details{ +This function reads a MADC file, processes it to remove unnecessary columns, scales and normalizes the data, and +then converts it into an additive genomic relationship matrix using the \code{A.mat} function from the \code{rrBLUP} package. +The resulting matrix can be used for genomic selection or other genetic analyses. +} +\examples{ +#Example... + +##Plots +#Mean read depth +#Number of Altmatch and Refmatch mhaps per target loci + +} +\references{ +Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3). +} From e3a312b8ac74b08da2704b327c9a4443bcd54a84 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Fri, 16 May 2025 11:48:38 -0400 Subject: [PATCH 02/35] update description --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index 52220d3..7906d27 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -53,6 +53,7 @@ Imports: Rdpack (>= 0.7), readr (>= 2.1.5), reshape2 (>= 1.4.4), + rrBLUP, tidyr (>= 1.3.1), vcfR (>= 1.15.0), Rsamtools, From 5d4097bebc9750b1bb40108293e5431a77610cf0 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Fri, 16 May 2025 12:58:51 -0400 Subject: [PATCH 03/35] added madc2gmat test --- R/madc2gmat.R | 11 ++++++++-- man/madc2gmat.Rd | 2 +- tests/testthat/test-madc2gmat.R | 38 +++++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+), 3 deletions(-) create mode 100644 tests/testthat/test-madc2gmat.R diff --git a/R/madc2gmat.R b/R/madc2gmat.R index b39b364..39a8b54 100644 --- a/R/madc2gmat.R +++ b/R/madc2gmat.R @@ -28,8 +28,12 @@ #' #'@export madc2gmat <- function(madc_file, + seed = NULL, output.file = NULL) { - + #set seed if not null + if (!is.null(seed)) { + set.seed(seed) + } #Need to first inspect the first 7 rows of the MADC to see if it has been preprocessed or not first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL")) @@ -96,11 +100,14 @@ madc2gmat <- function(madc_file, #Making additive relationship matrix MADC.mat <- A.mat(t(filtered_df)) + rm(filtered_df) + #Save the output to disk if file name provided if (!is.null(output.file)) { message("Saving filtered data to file") write.csv(MADC.mat, paste0(output.file,".csv"), row.names = TRUE) + } else { + return(MADC.mat) } - return(MADC.mat) } diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd index 6bd0b4f..d2cf998 100644 --- a/man/madc2gmat.Rd +++ b/man/madc2gmat.Rd @@ -4,7 +4,7 @@ \alias{madc2gmat} \title{Convert MADC Files to an Additive Genomic Relationship Matrix} \usage{ -madc2gmat(madc_file, output.file = NULL) +madc2gmat(madc_file, seed = NULL, output.file = NULL) } \arguments{ \item{madc_file}{Path to the MADC file to be filtered} diff --git a/tests/testthat/test-madc2gmat.R b/tests/testthat/test-madc2gmat.R new file mode 100644 index 0000000..5497f21 --- /dev/null +++ b/tests/testthat/test-madc2gmat.R @@ -0,0 +1,38 @@ +context("MADC 2 Gmatrix") + + +test_that("test madc2gmat",{ + #Input variables + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + + #Calculations + temp <- tempfile() + + # Converting to additive relationship matrix + gmat <- madc2gmat(madc_file, + seed = 123, + output.file = NULL) + + #When output a file + madc2gmat(madc_file, + seed = 123, + output.file = temp) + + #Test that a valid output was provided + expect_true(file.exists(paste0(temp, ".csv"))) + + #Check + expect_true(all(dim(gmat) == c("10","10"))) + expect_true(all(row.names(gmat) == row.names(gmat))) + expect_equal(sum(gmat), 1.00614e-16)#, tolerance = 1e-16) + expect_true(is.matrix(gmat), "Output should be a matrix") + + # Read the output file + output_data <- read.csv(paste0(temp,".csv"), row.names = 1) + + # Test the content of the output file + expect_true(is.matrix(as.matrix(output_data)), "Data in output file should be a matrix") + expect_true(all(dim(output_data) == c("10","10"))) + expect_identical(row.names(output_data), colnames(output_data), "Row and column names in output file should be identical") + expect_equal(sum(output_data), -9.970323e-16, tolerance = 1e-16) +}) From e0709ee44e0d7d1e972274feddd802c73d387b20 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Sat, 17 May 2025 08:48:48 -0400 Subject: [PATCH 04/35] cleaned --- R/madc2gmat.R | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/R/madc2gmat.R b/R/madc2gmat.R index 39a8b54..c386b90 100644 --- a/R/madc2gmat.R +++ b/R/madc2gmat.R @@ -50,25 +50,16 @@ madc2gmat <- function(madc_file, #Read the madc file filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) - #Remove extra text after Ref and Alt (_001 or _002) - #filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) - #filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Ref_001", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt_002*", "|Alt", filtered_df$AlleleID) - } else { #Read the madc file filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE) - - #Remove extra text after Ref and Alt (_001 or _002) - #filtered_df$AlleleID <- sub("\\|Ref.*", "|Ref", filtered_df$AlleleID) - #filtered_df$AlleleID <- sub("\\|Alt.*", "|Alt", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Ref_001*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt_002", "|Alt", filtered_df$AlleleID) - } + #Remove extra text after Ref and Alt (_001 or _002) + filtered_df$AlleleID <- sub("\\|Ref_001*", "|Ref", filtered_df$AlleleID) + filtered_df$AlleleID <- sub("\\|Alt_002", "|Alt", filtered_df$AlleleID) + #Removing extra columns row.names(filtered_df) <- filtered_df$AlleleID filtered_df <- filtered_df %>% @@ -77,9 +68,6 @@ madc2gmat <- function(madc_file, #Scale and normalized data message("Scaling and normalizing data to be -1,1") - #filtered_df <- filtered_df %>% - # mutate(across(starts_with("MeanReads"), ~ scale(.) %>% as.numeric())) - # Function to scale a matrix to be between -1 and 1 for rrBLUP scale_matrix <- function(mat) { min_val <- min(mat) From ca6aa80750aefa278267d907865c0e4f8a676d40 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Sat, 17 May 2025 11:11:36 -0400 Subject: [PATCH 05/35] filterMADC added --- R/filterMADC.R | 129 +++++++++++++++++++++++++++++++++++++++++----- man/filterMADC.Rd | 23 ++++----- 2 files changed, 126 insertions(+), 26 deletions(-) diff --git a/R/filterMADC.R b/R/filterMADC.R index 8c8259e..3cc2472 100644 --- a/R/filterMADC.R +++ b/R/filterMADC.R @@ -5,8 +5,7 @@ #' @details #' This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally, #' it can filter based on mean read depth, number of mhaps per target loci, and other criteria. Optionally, users -#' can scale and normalize the data in preparation for conversion to relationship matrices, -#' plot summary statistics, and save the filtered data to a file. +#' can plot summary statistics and save the filtered data to a file. #' #'@import dplyr #'@importFrom utils read.csv @@ -14,14 +13,13 @@ #'@param madc_file Path to the MADC file to be filtered #'@param min.mean.reads Minimum mean read depth for filtering #'@param max.mean.reads Maximum mean read depth for filtering -#'@param max.match.mhaps Maximum number of matching mhaps per target loci -#'@param min.reads.per.site Minimum number of reads per site for filtering -#'@param min.ind.with.reads Minimum number of individuals with reads for filtering -#'@param target_only Logical indicating whether to filter for target loci only -#'@param fixed_allele_ids Logical indicating whether the MADC file has been pre-processed for fixed allele IDs +#'@param max.mhaps.per.loci Maximum number of matching mhaps per target loci. Retains only the target Ref and Alt loci at the sites that exceeds the \code{max.mhaps.per.loci} threshold. +#'@param min.reads.per.site Minimum number of reads per site for \code{min.ind.with.reads}. Otherwise, this parameter is ignored +#'@param min.ind.with.reads Minimum number of individuals with \code{min.reads.per.site} reads for filtering +#'@param target.only Logical indicating whether to filter for target loci only +#'@param n.summary.columns (optional) Number of summary columns to remove from MADC file not including the first three. Otherwise, the columns will be automatically detected and removed. #'@param plot.summary Logical indicating whether to plot summary statistics #'@param output.file Path to save the filtered data (if NULL, data will not be saved) -#'@param verbose Logical indicating whether to print additional information during processing #' #'@return data.frame or saved csv file #' @@ -37,11 +35,11 @@ filterMADC <- function(madc_file, min.mean.reads = NULL, max.mean.reads = NULL, - max.match.mhaps = 10, - min.reads.per.site = NULL, + max.mhaps.per.loci = NULL, + min.reads.per.site = 1, min.ind.with.reads = NULL, - target_only = FALSE, - fixed_allele_ids = FALSE, + target.only = FALSE, + n.summary.columns = NULL, plot.summary = FALSE, output.file = NULL) { @@ -75,18 +73,123 @@ filterMADC <- function(madc_file, filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID) } + #Check for extra columns + #Save the three columns for later adding to the output + saved_columns <- filtered_df[,1:3] + + if (!is.null(n.summary.columns)) { + #Remove the first n.summary.columns columns + filtered_df <- filtered_df[,-c(4:n.summary.columns)] + }else{ + rm.col <- c("ClusterConsensusSequence", + "CallRate", "OneRatioRef", "OneRatioSnp", "FreqHomRef", "FreqHomSnp", + "FreqHets", "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", "AvgCountSnp","RatioAvgCountRefAvgCountSnp") + + filtered_df <- filtered_df[, !(colnames(filtered_df) %in% rm.col)] + } + + #Now add rownames + rownames(filtered_df) <- saved_columns[,1] #Remove refmatch and altmatch if wanted - if (target_only) { + if (target.only) { message("Retaining target markers only") #Retain only the Ref and Alt haplotypes filtered_df <- filtered_df[!grepl("\\|AltMatch|\\|RefMatch", filtered_df$AlleleID), ] } ## Filtering + + #Min mean reads if (!is.null(min.mean.reads)) { message("Filtering for minimum mean reads across all samples") + #Get the mean value for each row, and remove the rows below that threshold + filtered_df$MeanReads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE) filtered_df <- filtered_df[filtered_df$MeanReads >= min.mean.reads, ] + #Remove the MeanReads column + filtered_df <- filtered_df[, -which(colnames(filtered_df) == "MeanReads")] + } + + #Max mean reads + if (!is.null(max.mean.reads)) { + message("Filtering for maximum mean reads across all samples") + #Get the mean value for each row, and remove the rows above that threshold + filtered_df$MeanReads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE) + filtered_df <- filtered_df[filtered_df$MeanReads <= max.mean.reads, ] + #Remove the MeanReads column + filtered_df <- filtered_df[, -which(colnames(filtered_df) == "MeanReads")] + } + + #Max mhaps per loci + if (!is.null(max.mhaps.per.loci)) { + message("Filtering for maximum number of matching mhaps per target loci") + #Get the number of matching mhaps for loci, and remove the mhaps at those loci that exceed the max number + mhap_counts <- filtered_df %>% + group_by(CloneID) %>% + summarise(Count = n(), .groups = 'drop') %>% + filter(Count > max.mhaps.per.loci) + + patterns_to_search <- "\\|AltMatch|\\|RefMatch" + clone_ids_to_target <- mhap_counts$CloneID + + filtered_df <- filtered_df %>% + filter( + !( # "keep rows that DO NOT match both conditions" + CloneID %in% clone_ids_to_target & # Condition 1: CloneID is one of the targeted IDs + grepl(patterns_to_search, AlleleID) # Condition 2: AlleleID contains one of the patterns + ) + ) + } + + #Min individuals with reads + if (!is.null(min.ind.with.reads)) { + message("Filtering for minimum number of individuals with reads per site") + message(past0("Minimum number of individuals with reads per site: ", min.ind.with.reads)) + message(past0("Minimum number of reads per site: ", min.reads.per.site)) + + #Getting colnames + cols_to_check <- colnames(filtered_df)[-(1:3)] + + filtered_df <- filtered_df %>% + rowwise() %>% # Process data row by row + mutate( + # For each row, count how many of the 'cols_to_check' meet the criterion + qualifying_sites_count = sum( + c_across(all_of(cols_to_check)) >= min.reads.per.site, + na.rm = TRUE # Treats NAs in data as not meeting the criterion + ) + ) %>% + ungroup() %>% # Always ungroup after rowwise operations + # Filter rows where this count meets the 'min.ind.with.reads' threshold + filter(qualifying_sites_count >= min.ind.with.reads) %>% + # Optionally, remove the temporary count column if it's no longer needed + select(-qualifying_sites_count) + } + + #Plots + if (plot.summary) { + message("Plotting summary statistics") + #Plot mean read depth + mean_reads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE) + hist(mean_reads, main = "Mean Read Depth", xlab = "Mean Reads", ylab = "Frequency") + + #Plot number of Altmatch and Refmatch mhaps per target loci + altmatch_counts <- filtered_df %>% + filter(grepl("\\|AltMatch", AlleleID)) %>% + group_by(CloneID) %>% + summarise(Count = n(), .groups = 'drop') + + refmatch_counts <- filtered_df %>% + filter(grepl("\\|RefMatch", AlleleID)) %>% + group_by(CloneID) %>% + summarise(Count = n(), .groups = 'drop') + + barplot(cbind(altmatch_counts$Count, refmatch_counts$Count), beside = TRUE, + names.arg = altmatch_counts$CloneID, main = "Number of AltMatch and RefMatch Mhaps", + xlab = "Clone ID", ylab = "Count") + + #Plot density of number of CloneID per site on a marker distribution plot + } #Save the output to disk if file name provided diff --git a/man/filterMADC.Rd b/man/filterMADC.Rd index 2b161dd..c2df9d2 100644 --- a/man/filterMADC.Rd +++ b/man/filterMADC.Rd @@ -8,11 +8,11 @@ filterMADC( madc_file, min.mean.reads = NULL, max.mean.reads = NULL, - max.match.mhaps = 10, - min.reads.per.site = NULL, + max.mhaps.per.loci = NULL, + min.reads.per.site = 1, min.ind.with.reads = NULL, - target_only = FALSE, - fixed_allele_ids = FALSE, + target.only = FALSE, + n.summary.columns = NULL, plot.summary = FALSE, output.file = NULL ) @@ -24,21 +24,19 @@ filterMADC( \item{max.mean.reads}{Maximum mean read depth for filtering} -\item{max.match.mhaps}{Maximum number of matching mhaps per target loci} +\item{max.mhaps.per.loci}{Maximum number of matching mhaps per target loci. Retains only the target Ref and Alt loci at the sites that exceeds the \code{max.mhaps.per.loci} threshold.} -\item{min.reads.per.site}{Minimum number of reads per site for filtering} +\item{min.reads.per.site}{Minimum number of reads per site for \code{min.ind.with.reads}. Otherwise, this parameter is ignored} -\item{min.ind.with.reads}{Minimum number of individuals with reads for filtering} +\item{min.ind.with.reads}{Minimum number of individuals with \code{min.reads.per.site} reads for filtering} -\item{target_only}{Logical indicating whether to filter for target loci only} +\item{target.only}{Logical indicating whether to filter for target loci only} -\item{fixed_allele_ids}{Logical indicating whether the MADC file has been pre-processed for fixed allele IDs} +\item{n.summary.columns}{(optional) Number of summary columns to remove from MADC file not including the first three. Otherwise, the columns will be automatically detected and removed.} \item{plot.summary}{Logical indicating whether to plot summary statistics} \item{output.file}{Path to save the filtered data (if NULL, data will not be saved)} - -\item{verbose}{Logical indicating whether to print additional information during processing} } \value{ data.frame or saved csv file @@ -49,8 +47,7 @@ Filter and process MADC files to remove low quality microhaplotypes \details{ This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally, it can filter based on mean read depth, number of mhaps per target loci, and other criteria. Optionally, users -can scale and normalize the data in preparation for conversion to relationship matrices, -plot summary statistics, and save the filtered data to a file. +can plot summary statistics and save the filtered data to a file. } \examples{ #Example... From e4873fa97afcf36b0a2ae283858d51589b877f05 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Sat, 17 May 2025 11:47:24 -0400 Subject: [PATCH 06/35] added filterMADC tests --- R/filterMADC.R | 54 ++++++------ man/filterMADC.Rd | 3 - tests/testthat/test-filterMADC.R | 144 +++++++++++++++++++++++++++++++ 3 files changed, 172 insertions(+), 29 deletions(-) create mode 100644 tests/testthat/test-filterMADC.R diff --git a/R/filterMADC.R b/R/filterMADC.R index 3cc2472..2ab79dd 100644 --- a/R/filterMADC.R +++ b/R/filterMADC.R @@ -18,7 +18,7 @@ #'@param min.ind.with.reads Minimum number of individuals with \code{min.reads.per.site} reads for filtering #'@param target.only Logical indicating whether to filter for target loci only #'@param n.summary.columns (optional) Number of summary columns to remove from MADC file not including the first three. Otherwise, the columns will be automatically detected and removed. -#'@param plot.summary Logical indicating whether to plot summary statistics +#@param plot.summary Logical indicating whether to plot summary statistics #'@param output.file Path to save the filtered data (if NULL, data will not be saved) #' #'@return data.frame or saved csv file @@ -40,7 +40,7 @@ filterMADC <- function(madc_file, min.ind.with.reads = NULL, target.only = FALSE, n.summary.columns = NULL, - plot.summary = FALSE, + #plot.summary = FALSE, output.file = NULL) { @@ -144,8 +144,8 @@ filterMADC <- function(madc_file, #Min individuals with reads if (!is.null(min.ind.with.reads)) { message("Filtering for minimum number of individuals with reads per site") - message(past0("Minimum number of individuals with reads per site: ", min.ind.with.reads)) - message(past0("Minimum number of reads per site: ", min.reads.per.site)) + message(paste0("Minimum number of individuals with reads per site: ", min.ind.with.reads)) + message(paste0("Minimum number of reads per site: ", min.reads.per.site)) #Getting colnames cols_to_check <- colnames(filtered_df)[-(1:3)] @@ -167,36 +167,38 @@ filterMADC <- function(madc_file, } #Plots - if (plot.summary) { - message("Plotting summary statistics") - #Plot mean read depth - mean_reads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE) - hist(mean_reads, main = "Mean Read Depth", xlab = "Mean Reads", ylab = "Frequency") - - #Plot number of Altmatch and Refmatch mhaps per target loci - altmatch_counts <- filtered_df %>% - filter(grepl("\\|AltMatch", AlleleID)) %>% - group_by(CloneID) %>% - summarise(Count = n(), .groups = 'drop') - - refmatch_counts <- filtered_df %>% - filter(grepl("\\|RefMatch", AlleleID)) %>% - group_by(CloneID) %>% - summarise(Count = n(), .groups = 'drop') - - barplot(cbind(altmatch_counts$Count, refmatch_counts$Count), beside = TRUE, - names.arg = altmatch_counts$CloneID, main = "Number of AltMatch and RefMatch Mhaps", - xlab = "Clone ID", ylab = "Count") + #if (plot.summary) { + # message("Plotting summary statistics") + # #Plot mean read depth + # mean_reads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE) + # hist(mean_reads, main = "Mean Read Depth", xlab = "Mean Reads", ylab = "Frequency") + + # #Plot number of Altmatch and Refmatch mhaps per target loci + # altmatch_counts <- filtered_df %>% + # filter(grepl("\\|AltMatch", AlleleID)) %>% + # group_by(CloneID) %>% + # summarise(Count = n(), .groups = 'drop') + + # refmatch_counts <- filtered_df %>% + # filter(grepl("\\|RefMatch", AlleleID)) %>% + # group_by(CloneID) %>% + # summarise(Count = n(), .groups = 'drop') + + # barplot(cbind(altmatch_counts$Count, refmatch_counts$Count), beside = TRUE, + # names.arg = altmatch_counts$CloneID, main = "Number of AltMatch and RefMatch Mhaps", + # xlab = "Clone ID", ylab = "Count") #Plot density of number of CloneID per site on a marker distribution plot - } + #} #Save the output to disk if file name provided if (!is.null(output.file)) { message("Saving filtered data to file") write.csv(filtered_df, paste0(output.file,".csv"), row.names = FALSE) + } else { + message("No output file provided. Returning filtered data.") + return(filtered_df) } - return(filtered_df) } diff --git a/man/filterMADC.Rd b/man/filterMADC.Rd index c2df9d2..a028653 100644 --- a/man/filterMADC.Rd +++ b/man/filterMADC.Rd @@ -13,7 +13,6 @@ filterMADC( min.ind.with.reads = NULL, target.only = FALSE, n.summary.columns = NULL, - plot.summary = FALSE, output.file = NULL ) } @@ -34,8 +33,6 @@ filterMADC( \item{n.summary.columns}{(optional) Number of summary columns to remove from MADC file not including the first three. Otherwise, the columns will be automatically detected and removed.} -\item{plot.summary}{Logical indicating whether to plot summary statistics} - \item{output.file}{Path to save the filtered data (if NULL, data will not be saved)} } \value{ diff --git a/tests/testthat/test-filterMADC.R b/tests/testthat/test-filterMADC.R new file mode 100644 index 0000000..b7cf687 --- /dev/null +++ b/tests/testthat/test-filterMADC.R @@ -0,0 +1,144 @@ +context("Filter MADC") + + +test_that("test filter madc",{ + #Input variables + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + + #Calculations + temp <- tempfile() + + # Filtering (target only) + filtered_df <- filterMADC(madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.mhaps.per.loci = NULL, + min.reads.per.site = 1, + min.ind.with.reads = NULL, + target.only = TRUE, + n.summary.columns = NULL, + output.file = NULL) + + + #Test that a valid output was provided + expect_equal(nrow(filtered_df), 41) + #Check that it is a dataframe + expect_true(is.data.frame(filtered_df)) + + # Checking for no filtering + filtered_df <- filterMADC(madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.mhaps.per.loci = NULL, + min.reads.per.site = 1, + min.ind.with.reads = NULL, + target.only = FALSE, + n.summary.columns = NULL, + output.file = NULL) + + expect_equal(nrow(filtered_df), 51) + expect_equal(sum(filtered_df[,-c(1:3)]), 53952) + expect_true(all(names(filtered_df[1:3]) == c("AlleleID", "CloneID", "AlleleSequence"))) + + #Checking for min.mean.reads filtering + filtered_df <- filterMADC(madc_file, + min.mean.reads = 10, + max.mean.reads = NULL, + max.mhaps.per.loci = NULL, + min.reads.per.site = 1, + min.ind.with.reads = NULL, + target.only = FALSE, + n.summary.columns = NULL, + output.file = NULL) + + expect_equal(nrow(filtered_df), 36) + expect_equal(ncol(filtered_df), 13) + + #Checking for max.mean.reads filtering + filtered_df <- filterMADC(madc_file, + min.mean.reads = NULL, + max.mean.reads = 10, + max.mhaps.per.loci = NULL, + min.reads.per.site = 1, + min.ind.with.reads = NULL, + target.only = FALSE, + n.summary.columns = NULL, + output.file = NULL) + + expect_equal(nrow(filtered_df), 15) + expect_equal(ncol(filtered_df), 13) + + #Remove max mhaps + filtered_df <- filterMADC(madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.mhaps.per.loci = 3, + min.reads.per.site = 1, + min.ind.with.reads = NULL, + target.only = FALSE, + n.summary.columns = NULL, + output.file = NULL) + + expect_equal(nrow(filtered_df), 44) + expect_equal(ncol(filtered_df), 13) + + #Remove min ind with reads + filtered_df <- filterMADC(madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.mhaps.per.loci = NULL, + min.reads.per.site = 10, + min.ind.with.reads = 10, + target.only = FALSE, + n.summary.columns = NULL, + output.file = NULL) + + expect_equal(nrow(filtered_df), 9) + expect_equal(ncol(filtered_df), 13) + expect_equal(sum(filtered_df[,-c(1:3)]), 31642) + + #Check that the output file is created + filterMADC(madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.mhaps.per.loci = NULL, + min.reads.per.site = 1, + min.ind.with.reads = NULL, + target.only = FALSE, + n.summary.columns = NULL, + output.file = temp) + + expect_true(file.exists(paste0(temp,".csv"))) + + #Check that the plots are created in the console + #filtered_df <- filterMADC(madc_file, + # min.mean.reads = NULL, + # max.mean.reads = NULL, + # max.mhaps.per.loci = 3, + # min.reads.per.site = 1, + # min.ind.with.reads = NULL, + # target.only = FALSE, + # n.summary.columns = NULL, + # plot.summary = TRUE, + # output.file = NULL) + + #expect_true(is.numeric(dev.cur())) + #expect_true(dev.cur() > 1) + + #Now checking that all paramaters can work together + filtered_df <- filterMADC(madc_file, + min.mean.reads =1, + max.mean.reads = 150, + max.mhaps.per.loci = 3, + min.reads.per.site = 10, + min.ind.with.reads = 10, + target.only = FALSE, + n.summary.columns = NULL, + output.file = NULL) + + expect_equal(nrow(filtered_df), 3) + expect_equal(ncol(filtered_df), 13) + expect_equal(sum(filtered_df[,-c(1:3)]), 3960) + + +}) From ee6bc5ec9af96bbbf8d330fcc5612993f8ce63f5 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Sat, 17 May 2025 11:50:12 -0400 Subject: [PATCH 07/35] Add example --- R/filterMADC.R | 19 +++++++++++++++---- man/filterMADC.Rd | 19 +++++++++++++++---- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/R/filterMADC.R b/R/filterMADC.R index 2ab79dd..a9b2cec 100644 --- a/R/filterMADC.R +++ b/R/filterMADC.R @@ -24,11 +24,22 @@ #'@return data.frame or saved csv file #' #'@examples -#' #Example... +#' #Example +#' +#' #Example MADC +#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") +#' +#' #Remove mhaps exceeding 3 per target region including the ref and alt target mhaps +#' filtered_df <- filterMADC(madc_file, +#' min.mean.reads = NULL, +#' max.mean.reads = NULL, +#' max.mhaps.per.loci = 3, +#' min.reads.per.site = 1, +#' min.ind.with.reads = NULL, +#' target.only = FALSE, +#' n.summary.columns = NULL, +#' output.file = NULL) #' -#' ##Plots -#' #Mean read depth -#' #Number of Altmatch and Refmatch mhaps per target loci #' #' #'@export diff --git a/man/filterMADC.Rd b/man/filterMADC.Rd index a028653..671a599 100644 --- a/man/filterMADC.Rd +++ b/man/filterMADC.Rd @@ -47,11 +47,22 @@ it can filter based on mean read depth, number of mhaps per target loci, and oth can plot summary statistics and save the filtered data to a file. } \examples{ -#Example... +#Example + +#Example MADC +madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + +#Remove mhaps exceeding 3 per target region including the ref and alt target mhaps +filtered_df <- filterMADC(madc_file, + min.mean.reads = NULL, + max.mean.reads = NULL, + max.mhaps.per.loci = 3, + min.reads.per.site = 1, + min.ind.with.reads = NULL, + target.only = FALSE, + n.summary.columns = NULL, + output.file = NULL) -##Plots -#Mean read depth -#Number of Altmatch and Refmatch mhaps per target loci } From 85f12101993b047f7971acb0b9f092d370f3a9be Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Mon, 19 May 2025 09:21:16 -0400 Subject: [PATCH 08/35] Example updates --- R/madc2gmat.R | 13 +++++++++---- README.md | 1 + man/madc2gmat.Rd | 13 +++++++++---- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/R/madc2gmat.R b/R/madc2gmat.R index c386b90..2092155 100644 --- a/R/madc2gmat.R +++ b/R/madc2gmat.R @@ -17,11 +17,16 @@ #'@return data.frame or saved csv file #' #'@examples -#' #Example... +#' #Input variables +#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") #' -#' ##Plots -#' #Mean read depth -#' #Number of Altmatch and Refmatch mhaps per target loci +#' #Calculations +#' temp <- tempfile() +#' +#' # Converting to additive relationship matrix +#' gmat <- madc2gmat(madc_file, +#' seed = 123, +#' output.file = NULL) #' #'@references #'Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3). diff --git a/README.md b/README.md index 87fcdb2..e913a9c 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [![R-CMD-check](https://github.com/Breeding-Insight/BIGr/workflows/R-CMD-check/badge.svg)](https://github.com/Breeding-Insight/BIGr/actions) ![GitHub Release](https://img.shields.io/github/v/release/Breeding-Insight/BIGr) [![Development Status](https://img.shields.io/badge/development-active-blue.svg)](https://img.shields.io/badge/development-active-blue.svg) +[![CRAN Status Badge](https://www.r-pkg.org/badges/version/BIGr)](https://cran.r-project.org/package=BIGr) ![GitHub License](https://img.shields.io/github/license/Breeding-Insight/BIGr) [![codecov](https://app.codecov.io/gh/Breeding-Insight/BIGr/graph/badge.svg?token=PJUZMRN1NF)](https://app.codecov.io/gh/Breeding-Insight/BIGr) diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd index d2cf998..43e135e 100644 --- a/man/madc2gmat.Rd +++ b/man/madc2gmat.Rd @@ -23,11 +23,16 @@ then converts it into an additive genomic relationship matrix using the \code{A. The resulting matrix can be used for genomic selection or other genetic analyses. } \examples{ -#Example... +#Input variables +madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") -##Plots -#Mean read depth -#Number of Altmatch and Refmatch mhaps per target loci +#Calculations +temp <- tempfile() + +# Converting to additive relationship matrix +gmat <- madc2gmat(madc_file, + seed = 123, + output.file = NULL) } \references{ From 0b54561e9776d790139357a7fba2d4a4eee3ca3c Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Mon, 19 May 2025 09:28:51 -0400 Subject: [PATCH 09/35] updated test --- tests/testthat/test-madc2gmat.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-madc2gmat.R b/tests/testthat/test-madc2gmat.R index 5497f21..0b447c3 100644 --- a/tests/testthat/test-madc2gmat.R +++ b/tests/testthat/test-madc2gmat.R @@ -34,5 +34,5 @@ test_that("test madc2gmat",{ expect_true(is.matrix(as.matrix(output_data)), "Data in output file should be a matrix") expect_true(all(dim(output_data) == c("10","10"))) expect_identical(row.names(output_data), colnames(output_data), "Row and column names in output file should be identical") - expect_equal(sum(output_data), -9.970323e-16, tolerance = 1e-16) + expect_equal(sum(output_data), -9.970323e-16, tolerance = 1e-15) }) From c50d224b2e06e987584355087194408184fb8dee Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Mon, 19 May 2025 09:49:04 -0400 Subject: [PATCH 10/35] update documentation --- R/madc2gmat.R | 1 + man/madc2gmat.Rd | 2 ++ 2 files changed, 3 insertions(+) diff --git a/R/madc2gmat.R b/R/madc2gmat.R index 2092155..74e08c9 100644 --- a/R/madc2gmat.R +++ b/R/madc2gmat.R @@ -12,6 +12,7 @@ #'@importFrom rrBLUP A.mat #' #'@param madc_file Path to the MADC file to be filtered +#'@param seed Optional seed for random number generation (default is NULL) #'@param output.file Path to save the filtered data (if NULL, data will not be saved) #' #'@return data.frame or saved csv file diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd index 43e135e..c7eda35 100644 --- a/man/madc2gmat.Rd +++ b/man/madc2gmat.Rd @@ -9,6 +9,8 @@ madc2gmat(madc_file, seed = NULL, output.file = NULL) \arguments{ \item{madc_file}{Path to the MADC file to be filtered} +\item{seed}{Optional seed for random number generation (default is NULL)} + \item{output.file}{Path to save the filtered data (if NULL, data will not be saved)} } \value{ From 11877f9e2b024ad1a1241b4b8667dde9f78dab08 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Mon, 19 May 2025 10:54:12 -0400 Subject: [PATCH 11/35] Update Description --- DESCRIPTION | 2 +- NEWS.md | 53 +++++++++++++++++++++++++++++------------------------ 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 7906d27..c582046 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.5.5 +Version: 0.6.0 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='ams866@cornell.edu', diff --git a/NEWS.md b/NEWS.md index 02cb34e..6e7d1f8 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,25 +1,21 @@ -# BIGr 0.3.3 +# BIGr 0.6.0 -- Adapt updog2vcf to model f1, f1pp, s1 and s1pp +- Added new functions for filtering MADC files and converting to relationship matrices -# BIGr 0.3.2 +# BIGr 0.5.5 -- updog2vcf function option to output compressed VCF (.vcf.gz) - set as default -- remove need for defining ploidy -- add metadata at the VCF header +- Updated DESCRIPTION +- Added return value for merge_MADCs +- Added optional seed for check_ped +- Added verbose option -# BIGr 0.5.0 +# BIGr 0.5.4 -- Add imputation_concordance function to estimate accuracy of imputed and original dataset -- Add get_OffTargets function to extract target and off-target SNPs from a MADC file -- Add merge_MADCs function to merge two or more MADC files together -- Improved documentation and examples for all functions -- Add tests for all functions +- Updated dosage2vcf example -# BIGr 0.5.1 +# BIGr 0.5.3 -- Improvements of testthat tests -- Add check_replicates and check_homozygous_trios for pedigree relationship quality check +- Updated madc2vcf_all example # BIGr 0.5.2 @@ -27,17 +23,26 @@ - get_OffTargets function changed to madc2vcf_all - Updates to testthat tests and function examples -# BIGr 0.5.3 +# BIGr 0.5.1 -- Updated madc2vcf_all example +- Improvements of testthat tests +- Add check_replicates and check_homozygous_trios for pedigree relationship quality check -# BIGr 0.5.4 +# BIGr 0.5.0 -- Updated dosage2vcf example +- Add imputation_concordance function to estimate accuracy of imputed and original dataset +- Add get_OffTargets function to extract target and off-target SNPs from a MADC file +- Add merge_MADCs function to merge two or more MADC files together +- Improved documentation and examples for all functions +- Add tests for all functions -# BIGr 0.5.5 +# BIGr 0.3.3 + +- Adapt updog2vcf to model f1, f1pp, s1 and s1pp + +# BIGr 0.3.2 + +- updog2vcf function option to output compressed VCF (.vcf.gz) - set as default +- remove need for defining ploidy +- add metadata at the VCF header -- Updated DESCRIPTION -- Added return value for merge_MADCs -- Added optional seed for check_ped -- Added verbose option From 9a9ac43ad33f002899989c6ab359d9dcd4b2fabb Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 21 May 2025 09:42:49 -0400 Subject: [PATCH 12/35] bug fix --- R/madc2vcf_all.R | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 13d1bfe..16dad2e 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -75,7 +75,7 @@ madc2vcf_all <- function(madc = NULL, "out_vcf= ", out_vcf, ', ', "verbose= ", verbose,')">') - if(!is.null(madc)) report <- read.csv(madc) else stop("Please provide a MADC file") + if(!is.null(madc)) report <- read.csv(madc, check.names = FALSE) else stop("Please provide a MADC file") if(!is.null(botloci_file)) botloci <- read.csv(botloci_file, header = F) else stop("Please provide a botloci file") if(!is.null(hap_seq_file)) hap_seq <- read.table(hap_seq_file, header = F) else hap_seq <- NULL @@ -142,12 +142,12 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align new.file <- data.frame("AlleleID" = report[,1], "Chromosome" = NA, "SNP_position_in_Genome" = NA, "Ref" = NA, "Alt" =NA, - "CloneID" = report[,2], report[,3:ncol(report)]) + "CloneID" = report[,2], report[,3:ncol(report)], check.names = FALSE) by_cloneID <- split.data.frame(new.file, new.file$CloneID) clust <- makeCluster(n.cores) - #clusterExport(clust, c("hap_seq","add_ref_alt")) + #clusterExport(clust, c("hap_seq","add_ref_alt", "nsamples")) add_ref_alt_results <- parLapply(clust, by_cloneID, function(x) add_ref_alt(x, hap_seq, nsamples, verbose = verbose)) stopCluster(clust) @@ -169,7 +169,6 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align clust <- makeCluster(n.cores) #clusterExport(clust, c("botloci", "compare", "nucleotideSubstitutionMatrix", "pairwiseAlignment", "DNAString", "reverseComplement")) - #clusterExport(clust, c("botloci")) compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr)) stopCluster(clust) @@ -209,7 +208,6 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align #' #' @noRd add_ref_alt <- function(one_tag, hap_seq, nsamples, verbose = TRUE) { - # Add ref and alt cloneID <- one_tag$CloneID[1] ref <- paste0(cloneID, "|Ref_0001") @@ -281,13 +279,14 @@ add_ref_alt <- function(one_tag, hap_seq, nsamples, verbose = TRUE) { #' #' @noRd compare <- function(one_tag, botloci, alignment_score_thr = 40){ - #one_tag <- updated_by_cloneID[[2]] cloneID <- one_tag$CloneID[1] isBotLoci <- cloneID %in% botloci[,1] # If marker is present in the botloc list, get the reverse complement sequence if(isBotLoci) one_tag$AlleleSequence <- sapply(one_tag$AlleleSequence, function(x) as.character(reverseComplement(DNAString(x)))) chr <- sapply(strsplit(cloneID, "_"), function(x) x[-length(x)]) + if(length(chr > 1)) chr <- paste(chr, collapse = "_") + # Target SNP at the position in the ID ref_seq <- one_tag$AlleleSequence[grep("Ref_0001$",one_tag$AlleleID)] alt_seq <- one_tag$AlleleSequence[grep("Alt_0002$",one_tag$AlleleID)] @@ -305,9 +304,9 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ alt_base <- substring(alt_seq, align@subject@mismatch@unlistData, align@subject@mismatch@unlistData) # If target alternative have N, discard whole tag - if(all(alt_base %in% c("A", "T", "C", "G"))) { + # Always only one polymorphism, if there are more than one, not sure which is the target + if(all(alt_base %in% c("A", "T", "C", "G")) & length(align@pattern@mismatch@unlistData) == 1) { - # Update with new info - always only one polymorphism, if there are more than one, not sure which is the target update_tag <- one_tag[grep("Ref_0001$",one_tag$AlleleID),] update_tag[,2:5] <- c(chr, pos_target, ref_base, alt_base) update_tag_temp <- one_tag[grep("Alt_0002$",one_tag$AlleleID),] @@ -365,6 +364,7 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ return(list(update_tag = NULL, rm_score = cloneID, rm_N = NULL)) + } } From 1d4e3bbe171dec2506f84630bd2bfbab018393cf Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Wed, 21 May 2025 12:07:43 -0400 Subject: [PATCH 13/35] removed ref and alt name changes --- R/filterMADC.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/filterMADC.R b/R/filterMADC.R index a9b2cec..5c1056f 100644 --- a/R/filterMADC.R +++ b/R/filterMADC.R @@ -71,8 +71,8 @@ filterMADC <- function(madc_file, filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE) #Remove extra text after Ref and Alt (_001 or _002) - filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID) + #filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID) + #filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID) } else { @@ -80,8 +80,8 @@ filterMADC <- function(madc_file, filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE) #Remove extra text after Ref and Alt (_001 or _002) - filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID) + #filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID) + #filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID) } #Check for extra columns From 6ef69788133cb273e69c7f0af86f9781a8897aea Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 22 May 2025 09:21:32 -0400 Subject: [PATCH 14/35] fix chr ID --- R/madc2vcf_all.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 16dad2e..6c1691c 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -169,6 +169,7 @@ loop_though_dartag_report <- function(report, botloci, hap_seq, n.cores=1, align clust <- makeCluster(n.cores) #clusterExport(clust, c("botloci", "compare", "nucleotideSubstitutionMatrix", "pairwiseAlignment", "DNAString", "reverseComplement")) + #clusterExport(clust, c("botloci", "alignment_score_thr")) compare_results <- parLapply(clust, updated_by_cloneID, function(x) compare(x, botloci, alignment_score_thr)) stopCluster(clust) @@ -449,7 +450,11 @@ create_VCF_body <- function(csv, vcf_body$V3 <- as.numeric(vcf_body$V3) rownames(vcf_body) <- NULL - vcf_body$target <- paste0(sapply(strsplit(vcf_body$target, "_"),"[[", 1), "_",as.numeric(sapply(strsplit(vcf_body$target, "_"),"[[", 2))) + # Remove padding + sp <- strsplit(vcf_body$target, "_") + pos <- sapply(sp, function(x) x[length(x)]) + chr <- sapply(sp, function(x) paste0(x[-length(x)], collapse = "_")) + vcf_body$target <- paste0(chr, "_",as.numeric(pos)) # Dealing with repeated positions # discard the ones that are not the target and keep only the first if all are off-targets From be4f2abfed8a90f347e4441ae753678025c39e69 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Wed, 4 Jun 2025 10:44:48 -0400 Subject: [PATCH 15/35] Added thinSNP function --- NAMESPACE | 2 + R/thinSNP.R | 100 ++++++++++++++++++++++++++++++++++ man/thinSNP.Rd | 68 +++++++++++++++++++++++ tests/testthat/test-thinSNP.R | 45 +++++++++++++++ 4 files changed, 215 insertions(+) create mode 100644 R/thinSNP.R create mode 100644 man/thinSNP.Rd create mode 100644 tests/testthat/test-thinSNP.R diff --git a/NAMESPACE b/NAMESPACE index 9b60af0..ad359c3 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,11 +16,13 @@ export(madc2gmat) export(madc2vcf_all) export(madc2vcf_targets) export(merge_MADCs) +export(thinSNP) export(updog2vcf) import(dplyr) import(janitor) import(parallel) import(quadprog) +import(rlang) import(tibble) import(tidyr) import(vcfR) diff --git a/R/thinSNP.R b/R/thinSNP.R new file mode 100644 index 0000000..6f445f6 --- /dev/null +++ b/R/thinSNP.R @@ -0,0 +1,100 @@ +#' Thin a dataframe of SNPs based on genomic position +#' +#' This function groups SNPs by chromosome, sorts them by physical position, +#' and then iteratively selects SNPs such that no two selected SNPs within +#' the same chromosome are closer than a specified minimum distance. +#' +#' @param df The input dataframe. +#' @param chrom_col_name A string specifying the name of the chromosome column. +#' @param pos_col_name A string specifying the name of the physical position column. +#' @param min_distance A numeric value for the minimum distance between selected SNPs. +#' The unit of this distance should match the unit of the `pos_col_name` column (e.g., base pairs). +#' +#' @import dplyr +#' @import rlang +#' @return A thinned dataframe with the same columns as the input. +#' +#' @examples +#' # Create sample SNP data +#' set.seed(123) +#' n_snps <- 20 +#' snp_data <- data.frame( +#' MarkerID = paste0("SNP", 1:n_snps), +#' Chrom = sample(c("chr1", "chr2"), n_snps, replace = TRUE), +#' ChromPosPhysical = c( +#' sort(sample(1:1000, 5)), # SNPs on chr1 +#' sort(sample(1:1000, 5)) + 500, # More SNPs on chr1 +#' sort(sample(1:2000, 10)) # SNPs on chr2 +#' ), +#' Allele = sample(c("A/T", "G/C"), n_snps, replace = TRUE) +#' ) +#' # Ensure it's sorted by Chrom and ChromPosPhysical for clarity in example +#' snp_data <- snp_data[order(snp_data$Chrom, snp_data$ChromPosPhysical), ] +#' rownames(snp_data) <- NULL +#' +#' print("Original SNP data:") +#' print(snp_data) +#' +#' # Thin the SNPs, keeping a minimum distance of 100 units (e.g., bp) +#' thinned_snps <- thinSNP( +#' df = snp_data, +#' chrom_col_name = "Chrom", +#' pos_col_name = "ChromPosPhysical", +#' min_distance = 100 +#' ) +#' +#' print("Thinned SNP data (min_distance = 100):") +#' print(thinned_snps) +#' +#' # Thin with a larger distance +#' thinned_snps_large_dist <- thinSNP( +#' df = snp_data, +#' chrom_col_name = "Chrom", +#' pos_col_name = "ChromPosPhysical", +#' min_distance = 500 +#' ) +#' print("Thinned SNP data (min_distance = 500):") +#' print(thinned_snps_large_dist) +#' @export +thinSNP <- function(df, chrom_col_name, pos_col_name, min_distance) { + # Convert column name strings to symbols for dplyr + chrom_sym <- rlang::sym(chrom_col_name) + pos_sym <- rlang::sym(pos_col_name) + + df %>% + dplyr::group_by(!!chrom_sym) %>% + dplyr::arrange(!!pos_sym, .by_group = TRUE) %>% + # Apply thinning logic to each chromosome group + dplyr::group_modify(~ { + # .x is the subset of data for the current chromosome, already sorted by position + if (nrow(.x) == 0) { + # Return an empty tibble with the same structure if the group is empty + return(tibble::as_tibble(.x[0, , drop = FALSE])) + } + + # Vector to store indices of rows to keep + kept_indices <- integer(nrow(.x)) + num_kept <- 0 + + # Always keep the first SNP in the sorted group + num_kept <- num_kept + 1 + kept_indices[num_kept] <- 1 + last_selected_pos <- .x[[pos_col_name]][1] # Get position of the first SNP + + # Iterate through the rest of the SNPs in the current chromosome + if (nrow(.x) > 1) { + for (i in 2:nrow(.x)) { + current_pos <- .x[[pos_col_name]][i] + # If current SNP is far enough from the last selected SNP, keep it + if (current_pos >= last_selected_pos + min_distance) { + num_kept <- num_kept + 1 + kept_indices[num_kept] <- i + last_selected_pos <- current_pos + } + } + } + # Subset the group to include only the kept SNPs + .x[kept_indices[1:num_kept], , drop = FALSE] + }) %>% + dplyr::ungroup() # Ungroup to return a single dataframe +} diff --git a/man/thinSNP.Rd b/man/thinSNP.Rd new file mode 100644 index 0000000..e668004 --- /dev/null +++ b/man/thinSNP.Rd @@ -0,0 +1,68 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/thinSNP.R +\name{thinSNP} +\alias{thinSNP} +\title{Thin a dataframe of SNPs based on genomic position} +\usage{ +thinSNP(df, chrom_col_name, pos_col_name, min_distance) +} +\arguments{ +\item{df}{The input dataframe.} + +\item{chrom_col_name}{A string specifying the name of the chromosome column.} + +\item{pos_col_name}{A string specifying the name of the physical position column.} + +\item{min_distance}{A numeric value for the minimum distance between selected SNPs. +The unit of this distance should match the unit of the \code{pos_col_name} column (e.g., base pairs).} +} +\value{ +A thinned dataframe with the same columns as the input. +} +\description{ +This function groups SNPs by chromosome, sorts them by physical position, +and then iteratively selects SNPs such that no two selected SNPs within +the same chromosome are closer than a specified minimum distance. +} +\examples{ +# Create sample SNP data +set.seed(123) +n_snps <- 20 +snp_data <- data.frame( + MarkerID = paste0("SNP", 1:n_snps), + Chrom = sample(c("chr1", "chr2"), n_snps, replace = TRUE), + ChromPosPhysical = c( + sort(sample(1:1000, 5)), # SNPs on chr1 + sort(sample(1:1000, 5)) + 500, # More SNPs on chr1 + sort(sample(1:2000, 10)) # SNPs on chr2 + ), + Allele = sample(c("A/T", "G/C"), n_snps, replace = TRUE) +) +# Ensure it's sorted by Chrom and ChromPosPhysical for clarity in example +snp_data <- snp_data[order(snp_data$Chrom, snp_data$ChromPosPhysical), ] +rownames(snp_data) <- NULL + +print("Original SNP data:") +print(snp_data) + +# Thin the SNPs, keeping a minimum distance of 100 units (e.g., bp) +thinned_snps <- thinSNP( + df = snp_data, + chrom_col_name = "Chrom", + pos_col_name = "ChromPosPhysical", + min_distance = 100 +) + +print("Thinned SNP data (min_distance = 100):") +print(thinned_snps) + +# Thin with a larger distance +thinned_snps_large_dist <- thinSNP( + df = snp_data, + chrom_col_name = "Chrom", + pos_col_name = "ChromPosPhysical", + min_distance = 500 +) +print("Thinned SNP data (min_distance = 500):") +print(thinned_snps_large_dist) +} diff --git a/tests/testthat/test-thinSNP.R b/tests/testthat/test-thinSNP.R new file mode 100644 index 0000000..8cf47f9 --- /dev/null +++ b/tests/testthat/test-thinSNP.R @@ -0,0 +1,45 @@ +context("Thinning") + + +test_that("Thinning SNPs",{ + + ##' # Create sample SNP data + set.seed(123) + n_snps <- 20 + snp_data <- data.frame( + MarkerID = paste0("SNP", 1:n_snps), + Chrom = sample(c("chr1", "chr2"), n_snps, replace = TRUE), + ChromPosPhysical = c( + sort(sample(1:1000, 5)), # SNPs on chr1 + sort(sample(1:1000, 5)) + 500, # More SNPs on chr1 + sort(sample(1:2000, 10)) # SNPs on chr2 + ), + Allele = sample(c("A/T", "G/C"), n_snps, replace = TRUE) + ) + # Ensure it's sorted by Chrom and ChromPosPhysical + snp_data <- snp_data[order(snp_data$Chrom, snp_data$ChromPosPhysical), ] + rownames(snp_data) <- NULL + + # Thin the SNPs, keeping a minimum distance of 100 units (e.g., bp) + thinned_snps <- thinSNP( + df = snp_data, + chrom_col_name = "Chrom", + pos_col_name = "ChromPosPhysical", + min_distance = 100 + ) + + # Thin with a larger distance + thinned_snps_large_dist <- thinSNP( + df = snp_data, + chrom_col_name = "Chrom", + pos_col_name = "ChromPosPhysical", + min_distance = 500 + ) + + # Check the results + expect_true(all(dim(thinned_snps) == c(14, 4))) # Expect 14 SNPs + expect_true(all(dim(thinned_snps_large_dist) == c(5, 4))) # Expect 5 SNPs + expect_equal(ncol(thinned_snps), ncol(snp_data)) # Same number of columns + + +}) From 23fc5696166b7ad7e3507366288cbf7ca5cd6c75 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Wed, 4 Jun 2025 11:46:59 -0400 Subject: [PATCH 16/35] Update DESCRIPTION --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index c582046..8f77ed1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -54,6 +54,7 @@ Imports: readr (>= 2.1.5), reshape2 (>= 1.4.4), rrBLUP, + rlang, tidyr (>= 1.3.1), vcfR (>= 1.15.0), Rsamtools, From 4d0fa765c5ed211488178e6b2cdb8cdf00e06e18 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Wed, 11 Jun 2025 10:49:33 -0400 Subject: [PATCH 17/35] update madc2gmat function --- DESCRIPTION | 3 +- NAMESPACE | 1 + R/madc2gmat.R | 77 ++++++++++++++++++++++++++------- tests/testthat/test-madc2gmat.R | 4 +- 4 files changed, 67 insertions(+), 18 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8f77ed1..ab50d49 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -62,7 +62,8 @@ Imports: pwalign, janitor, quadprog, - tibble + tibble, + stringr Suggests: covr, spelling, diff --git a/NAMESPACE b/NAMESPACE index ad359c3..db48105 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -23,6 +23,7 @@ import(janitor) import(parallel) import(quadprog) import(rlang) +import(stringr) import(tibble) import(tidyr) import(vcfR) diff --git a/R/madc2gmat.R b/R/madc2gmat.R index 74e08c9..8345a9b 100644 --- a/R/madc2gmat.R +++ b/R/madc2gmat.R @@ -7,9 +7,9 @@ #' then converts it into an additive genomic relationship matrix using the `A.mat` function from the `rrBLUP` package. #' The resulting matrix can be used for genomic selection or other genetic analyses. #' -#'@import dplyr #'@importFrom utils read.csv write.csv #'@importFrom rrBLUP A.mat +#'@import tibble stringr dplyr tidyr #' #'@param madc_file Path to the MADC file to be filtered #'@param seed Optional seed for random number generation (default is NULL) @@ -62,39 +62,86 @@ madc2gmat <- function(madc_file, filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE) } - #Remove extra text after Ref and Alt (_001 or _002) - filtered_df$AlleleID <- sub("\\|Ref_001*", "|Ref", filtered_df$AlleleID) - filtered_df$AlleleID <- sub("\\|Alt_002", "|Alt", filtered_df$AlleleID) + #Convert to data.table + #filtered_df <- as.data.table(filtered_df) + + #Get allele ratios + # --- Step 1: Calculate frequency of each specific allele using total locus count --- + # The denominator MUST be the sum of all reads at the locus. + allele_freq_df <- filtered_df %>% + group_by(CloneID) %>% + mutate(across( + .cols = where(is.numeric), + # The calculation is count / total_locus_count + .fns = ~ . / sum(.), + .names = "{.col}_freq" + )) %>% + ungroup() + + #Rm object + rm(filtered_df) - #Removing extra columns - row.names(filtered_df) <- filtered_df$AlleleID - filtered_df <- filtered_df %>% - select(-c(AlleleID, CloneID, AlleleSequence)) + # --- Step 2: Prepare data for reshaping --- + # Filter out the reference rows and create a unique marker name for each alternative allele + markers_to_pivot <- allele_freq_df %>% + filter(!str_ends(AlleleID, "\\|Ref_0001")) %>% + # Create a new, unique column for each marker (e.g., "chr1.1_000194324_RefMatch_0001") + # This makes each alternative allele its own column in the final matrix. + mutate(MarkerID = str_replace(AlleleID, "\\|", "_")) %>% + select(MarkerID, ends_with("_freq")) %>% + tibble::column_to_rownames(var = "MarkerID") + names(markers_to_pivot) <- str_remove(names(markers_to_pivot), "_freq$") + markers_to_pivot <- t(markers_to_pivot) + #Replace the NaN to NA + markers_to_pivot[is.nan(markers_to_pivot)] <- NA + + #rm unneeded objects + rm(allele_freq_df) + + #Step 3 is a robust transformation step, but probably not needed. + # --- Step 3: Reshape data into the Marker Matrix format --- + #marker_matrix_for_Amat <- markers_to_pivot %>% + # pivot_longer( + # cols = -MarkerID, + # names_to = "SampleID", + # values_to = "AlleleFreq" + # ) %>% + # mutate(SampleID = str_remove(SampleID, "_freq$")) %>% + # pivot_wider( + # names_from = MarkerID, + # values_from = AlleleFreq, + # If a sample has zero reads for an allele, it won't appear after the filter. + # values_fill = 0 ensures these are explicitly set to zero frequency. + # values_fill = 0 + # ) %>% + # tibble::column_to_rownames("SampleID") %>% + # as.matrix() #Scale and normalized data message("Scaling and normalizing data to be -1,1") # Function to scale a matrix to be between -1 and 1 for rrBLUP scale_matrix <- function(mat) { - min_val <- min(mat) - max_val <- max(mat) + #min_val <- min(mat) + #max_val <- max(mat) # Normalize to [0, 1] - normalized <- (mat - min_val) / (max_val - min_val) + #normalized <- (mat - min_val) / (max_val - min_val) # Scale to [-1, 1] - scaled <- 2 * normalized - 1 + #scaled <- 2 * normalized - 1 + scaled <- 2 * mat - 1 return(scaled) } # Apply the scaling function - filtered_df <- scale_matrix(filtered_df) + markers_to_pivot <- scale_matrix(markers_to_pivot) #Making additive relationship matrix - MADC.mat <- A.mat(t(filtered_df)) + MADC.mat <- A.mat(markers_to_pivot) - rm(filtered_df) + rm(markers_to_pivot) #Save the output to disk if file name provided if (!is.null(output.file)) { diff --git a/tests/testthat/test-madc2gmat.R b/tests/testthat/test-madc2gmat.R index 0b447c3..66ca350 100644 --- a/tests/testthat/test-madc2gmat.R +++ b/tests/testthat/test-madc2gmat.R @@ -24,7 +24,7 @@ test_that("test madc2gmat",{ #Check expect_true(all(dim(gmat) == c("10","10"))) expect_true(all(row.names(gmat) == row.names(gmat))) - expect_equal(sum(gmat), 1.00614e-16)#, tolerance = 1e-16) + expect_equal(sum(gmat), -1.480586e-15, tolerance = 1e-16) expect_true(is.matrix(gmat), "Output should be a matrix") # Read the output file @@ -34,5 +34,5 @@ test_that("test madc2gmat",{ expect_true(is.matrix(as.matrix(output_data)), "Data in output file should be a matrix") expect_true(all(dim(output_data) == c("10","10"))) expect_identical(row.names(output_data), colnames(output_data), "Row and column names in output file should be identical") - expect_equal(sum(output_data), -9.970323e-16, tolerance = 1e-15) + expect_equal(sum(output_data), 6.214647e-15, tolerance = 1e-15) }) From 9118cd087dda77ef014c739dc733ac0e62564c8a Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Wed, 11 Jun 2025 11:28:48 -0400 Subject: [PATCH 18/35] test update --- tests/testthat/test-madc2gmat.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-madc2gmat.R b/tests/testthat/test-madc2gmat.R index 66ca350..48f85a6 100644 --- a/tests/testthat/test-madc2gmat.R +++ b/tests/testthat/test-madc2gmat.R @@ -24,7 +24,7 @@ test_that("test madc2gmat",{ #Check expect_true(all(dim(gmat) == c("10","10"))) expect_true(all(row.names(gmat) == row.names(gmat))) - expect_equal(sum(gmat), -1.480586e-15, tolerance = 1e-16) + expect_equal(sum(gmat), -1.480586e-15, tolerance = 1e-15) expect_true(is.matrix(gmat), "Output should be a matrix") # Read the output file From f6060480409a82e2a4604958218363c8b73f103e Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Wed, 11 Jun 2025 12:33:26 -0400 Subject: [PATCH 19/35] updated test --- tests/testthat/test-madc2gmat.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/testthat/test-madc2gmat.R b/tests/testthat/test-madc2gmat.R index 48f85a6..d962d67 100644 --- a/tests/testthat/test-madc2gmat.R +++ b/tests/testthat/test-madc2gmat.R @@ -24,7 +24,7 @@ test_that("test madc2gmat",{ #Check expect_true(all(dim(gmat) == c("10","10"))) expect_true(all(row.names(gmat) == row.names(gmat))) - expect_equal(sum(gmat), -1.480586e-15, tolerance = 1e-15) + expect_equal(sum(gmat), -1.480586e-15, tolerance = 1e-14) expect_true(is.matrix(gmat), "Output should be a matrix") # Read the output file @@ -34,5 +34,5 @@ test_that("test madc2gmat",{ expect_true(is.matrix(as.matrix(output_data)), "Data in output file should be a matrix") expect_true(all(dim(output_data) == c("10","10"))) expect_identical(row.names(output_data), colnames(output_data), "Row and column names in output file should be identical") - expect_equal(sum(output_data), 6.214647e-15, tolerance = 1e-15) + expect_equal(sum(output_data), 6.214647e-15, tolerance = 1e-14) }) From 66dec4762042c11e76d6ca2197de8b762d3c5b40 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Wed, 11 Jun 2025 16:04:58 -0400 Subject: [PATCH 20/35] added madc2gmat method --- R/madc2gmat.R | 158 ++++++++++++++++++++++++++++++++--------------- man/madc2gmat.Rd | 2 +- 2 files changed, 108 insertions(+), 52 deletions(-) diff --git a/R/madc2gmat.R b/R/madc2gmat.R index 8345a9b..6d7dc6e 100644 --- a/R/madc2gmat.R +++ b/R/madc2gmat.R @@ -35,6 +35,7 @@ #'@export madc2gmat <- function(madc_file, seed = NULL, + method = "collapsed", output.file = NULL) { #set seed if not null if (!is.null(seed)) { @@ -65,58 +66,113 @@ madc2gmat <- function(madc_file, #Convert to data.table #filtered_df <- as.data.table(filtered_df) - #Get allele ratios - # --- Step 1: Calculate frequency of each specific allele using total locus count --- - # The denominator MUST be the sum of all reads at the locus. - allele_freq_df <- filtered_df %>% - group_by(CloneID) %>% - mutate(across( - .cols = where(is.numeric), - # The calculation is count / total_locus_count - .fns = ~ . / sum(.), - .names = "{.col}_freq" - )) %>% - ungroup() - - #Rm object - rm(filtered_df) - - # --- Step 2: Prepare data for reshaping --- - # Filter out the reference rows and create a unique marker name for each alternative allele - markers_to_pivot <- allele_freq_df %>% - filter(!str_ends(AlleleID, "\\|Ref_0001")) %>% - # Create a new, unique column for each marker (e.g., "chr1.1_000194324_RefMatch_0001") - # This makes each alternative allele its own column in the final matrix. - mutate(MarkerID = str_replace(AlleleID, "\\|", "_")) %>% - select(MarkerID, ends_with("_freq")) %>% - tibble::column_to_rownames(var = "MarkerID") - names(markers_to_pivot) <- str_remove(names(markers_to_pivot), "_freq$") - markers_to_pivot <- t(markers_to_pivot) - #Replace the NaN to NA - markers_to_pivot[is.nan(markers_to_pivot)] <- NA - - #rm unneeded objects - rm(allele_freq_df) - - #Step 3 is a robust transformation step, but probably not needed. - # --- Step 3: Reshape data into the Marker Matrix format --- - #marker_matrix_for_Amat <- markers_to_pivot %>% - # pivot_longer( - # cols = -MarkerID, - # names_to = "SampleID", - # values_to = "AlleleFreq" - # ) %>% - # mutate(SampleID = str_remove(SampleID, "_freq$")) %>% - # pivot_wider( - # names_from = MarkerID, - # values_from = AlleleFreq, - # If a sample has zero reads for an allele, it won't appear after the filter. - # values_fill = 0 ensures these are explicitly set to zero frequency. - # values_fill = 0 - # ) %>% - # tibble::column_to_rownames("SampleID") %>% - # as.matrix() + if (method == "unique") { + #Get allele ratios + # --- Step 1: Calculate frequency of each specific allele using total locus count --- + # The denominator MUST be the sum of all reads at the locus. + allele_freq_df <- filtered_df %>% + group_by(CloneID) %>% + mutate(across( + .cols = where(is.numeric), + # The calculation is count / total_locus_count + .fns = ~ . / sum(.), + .names = "{.col}_freq" + )) %>% + ungroup() + + #Rm object + rm(filtered_df) + + # --- Step 2: Prepare data for reshaping --- + # Filter out the reference rows and create a unique marker name for each alternative allele + markers_to_pivot <- allele_freq_df %>% + filter(!str_ends(AlleleID, "\\|Ref_0001")) %>% + # Create a new, unique column for each marker (e.g., "chr1.1_000194324_RefMatch_0001") + # This makes each alternative allele its own column in the final matrix. + mutate(MarkerID = str_replace(AlleleID, "\\|", "_")) %>% + select(MarkerID, ends_with("_freq")) %>% + tibble::column_to_rownames(var = "MarkerID") + names(markers_to_pivot) <- str_remove(names(markers_to_pivot), "_freq$") + markers_to_pivot <- t(markers_to_pivot) + #Replace the NaN to NA + markers_to_pivot[is.nan(markers_to_pivot)] <- NA + + #rm unneeded objects + rm(allele_freq_df) + + #Step 3 is a robust transformation step, but probably not needed. + # --- Step 3: Reshape data into the Marker Matrix format --- + #marker_matrix_for_Amat <- markers_to_pivot %>% + # pivot_longer( + # cols = -MarkerID, + # names_to = "SampleID", + # values_to = "AlleleFreq" + # ) %>% + # mutate(SampleID = str_remove(SampleID, "_freq$")) %>% + # pivot_wider( + # names_from = MarkerID, + # values_from = AlleleFreq, + # If a sample has zero reads for an allele, it won't appear after the filter. + # values_fill = 0 ensures these are explicitly set to zero frequency. + # values_fill = 0 + # ) %>% + # tibble::column_to_rownames("SampleID") %>% + # as.matrix() + } else if (method == "collapsed"){ + + # This single pipeline calculates the final matrix. + # The output is the marker dosage matrix (X) to be used as input for A.mat() + markers_to_pivot <- filtered_df %>% + + # --- Part A: Calculate the frequency of each allele at each locus --- + group_by(CloneID) %>% + mutate(across( + .cols = where(is.numeric), + # Use if_else to prevent 0/0, which results in NaN + .fns = ~ . / sum(.), + .names = "{.col}_freq" + )) %>% + ungroup() %>% + + # --- Part B: Isolate and sum all alternative allele frequencies --- + #Note, the RefMatch counts are being counted as Reference allele counts, and not + #Alternative allele counts + # Filter to keep only the alternative alleles using base R's endsWith() + filter(!grepl("\\|Ref", AlleleID)) %>% + # Group by the main locus ID + group_by(CloneID) %>% + # For each locus, sum the frequencies of all its alternative alleles + summarise(across(ends_with("_freq"), sum), .groups = 'drop') %>% + + # --- Part C: Reshape the data into the final matrix format --- + # Convert from a "long" format to a "wide" format + pivot_longer( + cols = -CloneID, + names_to = "SampleID", + values_to = "Dosage" + ) %>% + # Clean up the sample names using base R's sub() + mutate(SampleID = sub("_freq$", "", SampleID)) %>% + # Pivot to the final shape: samples in rows, markers in columns + pivot_wider( + names_from = CloneID, + values_from = Dosage, + # This ensures that if a sample/locus combination is missing, + # it gets a value of 0 instead of NA. + values_fill = 0 + ) %>% + # Convert the 'SampleID' column into the actual row names of the dataframe + column_to_rownames("SampleID") %>% + # Convert the final dataframe into a true matrix object + as.matrix() + #Replace the NaN to NA + markers_to_pivot[is.nan(markers_to_pivot)] <- NA + #rm unneeded objects + rm(filtered_df) + } else { + stop("Invalid method specified. Use 'unique' or 'collapsed'.") + } #Scale and normalized data message("Scaling and normalizing data to be -1,1") diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd index c7eda35..56f27cb 100644 --- a/man/madc2gmat.Rd +++ b/man/madc2gmat.Rd @@ -4,7 +4,7 @@ \alias{madc2gmat} \title{Convert MADC Files to an Additive Genomic Relationship Matrix} \usage{ -madc2gmat(madc_file, seed = NULL, output.file = NULL) +madc2gmat(madc_file, seed = NULL, method = "collapsed", output.file = NULL) } \arguments{ \item{madc_file}{Path to the MADC file to be filtered} From 494e60a0364b76dcd9f114fa867431fc9d6e3e40 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Wed, 11 Jun 2025 16:16:50 -0400 Subject: [PATCH 21/35] update documentation --- R/madc2gmat.R | 1 + man/madc2gmat.Rd | 2 ++ 2 files changed, 3 insertions(+) diff --git a/R/madc2gmat.R b/R/madc2gmat.R index 6d7dc6e..5103882 100644 --- a/R/madc2gmat.R +++ b/R/madc2gmat.R @@ -13,6 +13,7 @@ #' #'@param madc_file Path to the MADC file to be filtered #'@param seed Optional seed for random number generation (default is NULL) +#'@param method Method to use for processing the MADC data. Options are "unique" or "collapsed". Default is "collapsed". #'@param output.file Path to save the filtered data (if NULL, data will not be saved) #' #'@return data.frame or saved csv file diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd index 56f27cb..118f7c3 100644 --- a/man/madc2gmat.Rd +++ b/man/madc2gmat.Rd @@ -11,6 +11,8 @@ madc2gmat(madc_file, seed = NULL, method = "collapsed", output.file = NULL) \item{seed}{Optional seed for random number generation (default is NULL)} +\item{method}{Method to use for processing the MADC data. Options are "unique" or "collapsed". Default is "collapsed".} + \item{output.file}{Path to save the filtered data (if NULL, data will not be saved)} } \value{ From 4436464ddcd34d5c290c1fb74bb4915755703d10 Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Fri, 13 Jun 2025 09:36:57 -0400 Subject: [PATCH 22/35] updated madc2gmat method --- DESCRIPTION | 2 +- NAMESPACE | 2 +- R/madc2gmat.R | 18 ++++++++++++++---- man/madc2gmat.Rd | 10 +++++++++- tests/testthat/test-madc2gmat.R | 2 ++ 5 files changed, 27 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index ab50d49..896f1ad 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -53,7 +53,7 @@ Imports: Rdpack (>= 0.7), readr (>= 2.1.5), reshape2 (>= 1.4.4), - rrBLUP, + AGHmatrix, rlang, tidyr (>= 1.3.1), vcfR (>= 1.15.0), diff --git a/NAMESPACE b/NAMESPACE index db48105..0e0b3b2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,6 +27,7 @@ import(stringr) import(tibble) import(tidyr) import(vcfR) +importFrom(AGHmatrix,Gmatrix) importFrom(Biostrings,DNAString) importFrom(Biostrings,reverseComplement) importFrom(Rdpack,reprompt) @@ -36,7 +37,6 @@ importFrom(pwalign,pairwiseAlignment) importFrom(readr,read_csv) importFrom(reshape2,dcast) importFrom(reshape2,melt) -importFrom(rrBLUP,A.mat) importFrom(stats,cor) importFrom(stats,setNames) importFrom(utils,packageVersion) diff --git a/R/madc2gmat.R b/R/madc2gmat.R index 5103882..7f8ca57 100644 --- a/R/madc2gmat.R +++ b/R/madc2gmat.R @@ -8,12 +8,13 @@ #' The resulting matrix can be used for genomic selection or other genetic analyses. #' #'@importFrom utils read.csv write.csv -#'@importFrom rrBLUP A.mat +#'@importFrom AGHmatrix Gmatrix #'@import tibble stringr dplyr tidyr #' #'@param madc_file Path to the MADC file to be filtered #'@param seed Optional seed for random number generation (default is NULL) #'@param method Method to use for processing the MADC data. Options are "unique" or "collapsed". Default is "collapsed". +#'@param ploidy Numeric. Ploidy level of the samples (e.g., 2 for diploid, 4 for tetraploid) #'@param output.file Path to save the filtered data (if NULL, data will not be saved) #' #'@return data.frame or saved csv file @@ -37,6 +38,7 @@ madc2gmat <- function(madc_file, seed = NULL, method = "collapsed", + ploidy, output.file = NULL) { #set seed if not null if (!is.null(seed)) { @@ -193,11 +195,19 @@ madc2gmat <- function(madc_file, } # Apply the scaling function - markers_to_pivot <- scale_matrix(markers_to_pivot) + #markers_to_pivot <- scale_matrix(markers_to_pivot) #Making additive relationship matrix - MADC.mat <- A.mat(markers_to_pivot) - + #(Need to write own formula to reduce dependencies) or obtain code from AGHmatrix directly + suppressMessages( + MADC.mat <- Gmatrix(markers_to_pivot, + method = "VanRaden", + missingValue = NA, + ploidy = as.numeric(ploidy), + ratio = TRUE, + ploidy.correction = TRUE) + ) + #Remove the markers_to_pivot object to free up memory rm(markers_to_pivot) #Save the output to disk if file name provided diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd index 118f7c3..1e2c084 100644 --- a/man/madc2gmat.Rd +++ b/man/madc2gmat.Rd @@ -4,7 +4,13 @@ \alias{madc2gmat} \title{Convert MADC Files to an Additive Genomic Relationship Matrix} \usage{ -madc2gmat(madc_file, seed = NULL, method = "collapsed", output.file = NULL) +madc2gmat( + madc_file, + seed = NULL, + method = "collapsed", + ploidy, + output.file = NULL +) } \arguments{ \item{madc_file}{Path to the MADC file to be filtered} @@ -13,6 +19,8 @@ madc2gmat(madc_file, seed = NULL, method = "collapsed", output.file = NULL) \item{method}{Method to use for processing the MADC data. Options are "unique" or "collapsed". Default is "collapsed".} +\item{ploidy}{Numeric. Ploidy level of the samples (e.g., 2 for diploid, 4 for tetraploid)} + \item{output.file}{Path to save the filtered data (if NULL, data will not be saved)} } \value{ diff --git a/tests/testthat/test-madc2gmat.R b/tests/testthat/test-madc2gmat.R index d962d67..1ed1b75 100644 --- a/tests/testthat/test-madc2gmat.R +++ b/tests/testthat/test-madc2gmat.R @@ -11,11 +11,13 @@ test_that("test madc2gmat",{ # Converting to additive relationship matrix gmat <- madc2gmat(madc_file, seed = 123, + ploidy = 2, output.file = NULL) #When output a file madc2gmat(madc_file, seed = 123, + ploidy = 2, output.file = temp) #Test that a valid output was provided From deee40b71ab141c8178fdc6b7f2a8998b1ea6d3f Mon Sep 17 00:00:00 2001 From: alex-sandercock Date: Mon, 16 Jun 2025 12:20:40 -0400 Subject: [PATCH 23/35] updated madc2gmat --- DESCRIPTION | 1 - NAMESPACE | 1 - R/madc2gmat.R | 80 ++++++++++++++++++++++++++++-------------------- man/madc2gmat.Rd | 10 ++++-- 4 files changed, 54 insertions(+), 38 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 896f1ad..bf0483b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -53,7 +53,6 @@ Imports: Rdpack (>= 0.7), readr (>= 2.1.5), reshape2 (>= 1.4.4), - AGHmatrix, rlang, tidyr (>= 1.3.1), vcfR (>= 1.15.0), diff --git a/NAMESPACE b/NAMESPACE index 0e0b3b2..4670457 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -27,7 +27,6 @@ import(stringr) import(tibble) import(tidyr) import(vcfR) -importFrom(AGHmatrix,Gmatrix) importFrom(Biostrings,DNAString) importFrom(Biostrings,reverseComplement) importFrom(Rdpack,reprompt) diff --git a/R/madc2gmat.R b/R/madc2gmat.R index 7f8ca57..1fd3aa1 100644 --- a/R/madc2gmat.R +++ b/R/madc2gmat.R @@ -3,12 +3,11 @@ #' Scale and normalize MADC read count data and convert it to an additive genomic relationship matrix. #' #'@details -#' This function reads a MADC file, processes it to remove unnecessary columns, scales and normalizes the data, and -#' then converts it into an additive genomic relationship matrix using the `A.mat` function from the `rrBLUP` package. +#' This function reads a MADC file, processes it to remove unnecessary columns, and then +#' converts it into an additive genomic relationship matrix using the first method proposed by VanRaden (2008). #' The resulting matrix can be used for genomic selection or other genetic analyses. #' #'@importFrom utils read.csv write.csv -#'@importFrom AGHmatrix Gmatrix #'@import tibble stringr dplyr tidyr #' #'@param madc_file Path to the MADC file to be filtered @@ -29,10 +28,11 @@ #' # Converting to additive relationship matrix #' gmat <- madc2gmat(madc_file, #' seed = 123, +#' ploidy=2, #' output.file = NULL) #' -#'@references -#'Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3). +#'@references VanRaden, P. M. (2008). Efficient methods to compute genomic predictions. Journal of Dairy Science, 91(11), 4414-4423 +#'@author Alexander M. Sandercock #' #'@export madc2gmat <- function(madc_file, @@ -177,36 +177,50 @@ madc2gmat <- function(madc_file, stop("Invalid method specified. Use 'unique' or 'collapsed'.") } - #Scale and normalized data - message("Scaling and normalizing data to be -1,1") - # Function to scale a matrix to be between -1 and 1 for rrBLUP - scale_matrix <- function(mat) { - #min_val <- min(mat) - #max_val <- max(mat) - - # Normalize to [0, 1] - #normalized <- (mat - min_val) / (max_val - min_val) - - # Scale to [-1, 1] - #scaled <- 2 * normalized - 1 - scaled <- 2 * mat - 1 - - return(scaled) + #VanRaden method + #This method is used to calculate the additive relationship matrix + compute_relationship_matrix <- function(mat, + ploidy, + method = "VanRaden", + impute = TRUE) { + ### Prepare matrix + #Remove any SNPs that are all NAs + mat <- mat[, colSums(is.na(mat)) < nrow(mat)] + #impute the missing values with the mean of the column + if (impute) { + mat <- data.frame(mat, check.names = FALSE, check.rows = FALSE) %>% + mutate_all(~ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)) %>% + as.matrix() + } + + if (method == "VanRaden") { + # Calculate the additive relationship matrix using VanRaden's method + + #Calculate allele frequencies + p <- apply(mat, 2, mean, na.rm = TRUE) + q <- 1 - p + #Calculate the denominator + #Note: need to use 1/ploidy for ratios, where ploidy should be used for dosages. + #This is because the ratios are from 0-1, which is what you get when dosage/ploidy, whereas dosages are from 0 to ploidy + denominator <- (1/as.numeric(ploidy))*sum(p * q, na.rm = TRUE) + #Get the numerator + Z <- scale(mat, center = TRUE, scale = FALSE) + ZZ <- (Z %*% t(Z)) + #Calculate the additive relationship matrix + A.mat <- ZZ / denominator + + return(A.mat) + + } else { + stop("Unsupported method. Only 'VanRaden' is currently implemented.") + } } + #Calculate the additive relationship matrix + MADC.mat <- compute_relationship_matrix(markers_to_pivot, + ploidy = ploidy, + method = "VanRaden", + impute = TRUE) - # Apply the scaling function - #markers_to_pivot <- scale_matrix(markers_to_pivot) - - #Making additive relationship matrix - #(Need to write own formula to reduce dependencies) or obtain code from AGHmatrix directly - suppressMessages( - MADC.mat <- Gmatrix(markers_to_pivot, - method = "VanRaden", - missingValue = NA, - ploidy = as.numeric(ploidy), - ratio = TRUE, - ploidy.correction = TRUE) - ) #Remove the markers_to_pivot object to free up memory rm(markers_to_pivot) diff --git a/man/madc2gmat.Rd b/man/madc2gmat.Rd index 1e2c084..9d182ad 100644 --- a/man/madc2gmat.Rd +++ b/man/madc2gmat.Rd @@ -30,8 +30,8 @@ data.frame or saved csv file Scale and normalize MADC read count data and convert it to an additive genomic relationship matrix. } \details{ -This function reads a MADC file, processes it to remove unnecessary columns, scales and normalizes the data, and -then converts it into an additive genomic relationship matrix using the \code{A.mat} function from the \code{rrBLUP} package. +This function reads a MADC file, processes it to remove unnecessary columns, and then +converts it into an additive genomic relationship matrix using the first method proposed by VanRaden (2008). The resulting matrix can be used for genomic selection or other genetic analyses. } \examples{ @@ -44,9 +44,13 @@ temp <- tempfile() # Converting to additive relationship matrix gmat <- madc2gmat(madc_file, seed = 123, + ploidy=2, output.file = NULL) } \references{ -Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3). +VanRaden, P. M. (2008). Efficient methods to compute genomic predictions. Journal of Dairy Science, 91(11), 4414-4423 +} +\author{ +Alexander M. Sandercock } From d732bc160953ab4f07a4d9dfe5d98a2e2138fb57 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Tue, 12 Aug 2025 09:59:55 -0400 Subject: [PATCH 24/35] Update breedtools_functions.R --- R/breedtools_functions.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/breedtools_functions.R b/R/breedtools_functions.R index db99857..6a01d05 100644 --- a/R/breedtools_functions.R +++ b/R/breedtools_functions.R @@ -32,7 +32,7 @@ #' allele_freqs <- allele_freq_poly(geno = geno_matrix, populations = pop_list, ploidy = 4) #' print(allele_freqs) #' -#' @noRd +#' @export allele_freq_poly <- function(geno, populations, ploidy = 2) { # Initialize returned df @@ -76,7 +76,7 @@ allele_freq_poly <- function(geno, populations, ploidy = 2) { #' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific #' breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. #' -#' @noRd +#' @export QPsolve <- function(Y, X) { # Remove NAs from Y and remove corresponding @@ -168,7 +168,7 @@ QPsolve <- function(Y, X) { #' ploidy = 4) #' print(composition) #' -#' @noRd +#' @export solve_composition_poly <- function(Y, X, ped = NULL, From 1998376fb1bac37c7e5873028f31c8de1a9af781 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 4 Sep 2025 08:54:16 -0400 Subject: [PATCH 25/35] madc2vcf exception added: presence of N --- R/madc2vcf_all.R | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/R/madc2vcf_all.R b/R/madc2vcf_all.R index 6c1691c..329ac5a 100644 --- a/R/madc2vcf_all.R +++ b/R/madc2vcf_all.R @@ -332,8 +332,9 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ # If Match sequences have N, do not consider as polymorphism if(any(!alt_base %in% c("A", "T", "C", "G"))) { - alt_base <- alt_base[-which(!alt_base %in% c("A", "T", "C", "G"))] ref_base <- ref_base[-which(!alt_base %in% c("A", "T", "C", "G"))] + pos_ref_idx <- pos_ref_idx[-which(!alt_base %in% c("A", "T", "C", "G"))] + alt_base <- alt_base[-which(!alt_base %in% c("A", "T", "C", "G"))] } if(length(alt_base) >0){ # If the N is the only polymorphis found, the Match tag will be discarted @@ -365,8 +366,8 @@ compare <- function(one_tag, botloci, alignment_score_thr = 40){ return(list(update_tag = NULL, rm_score = cloneID, rm_N = NULL)) - } + } #' Converts the fasta to a data.frame with first column the AlleleID and second the AlleleSequence From 7a22ee315fbb2dabdb5170ccb6160e64bfc2895b Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Tue, 9 Sep 2025 13:59:47 -0400 Subject: [PATCH 26/35] RefAlt updog2vcf support added --- R/updog2vcf.R | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/R/updog2vcf.R b/R/updog2vcf.R index 5fc5a54..11c6635 100644 --- a/R/updog2vcf.R +++ b/R/updog2vcf.R @@ -9,6 +9,7 @@ #' @param multidog.object updog output object with class "multidog" from dosage calling #' @param output.file output file name and path #' @param updog_version character defining updog package version used to generate the multidog object +#' @param RefAlt optional data frame with four columns named "Chr", "Pos", "Ref", and "Alt" containing the reference and alternate alleles for each SNP in the same order as in the multidog object #' @param compress logical. If TRUE returns a vcf.gz file #' @return A vcf file #' @import dplyr @@ -37,7 +38,7 @@ #' rm(temp_file) #' #' @export -updog2vcf <- function(multidog.object, output.file, updog_version = NULL, compress = TRUE) { +updog2vcf <- function(multidog.object, output.file, updog_version = NULL, RefAlt = NULL, compress = TRUE) { #Making the VCF (This is highly dependent on snps being in a format where the SNP IDs are the CHR_POS) mout <- multidog.object @@ -134,8 +135,8 @@ updog2vcf <- function(multidog.object, output.file, updog_version = NULL, compre CHROM = new_df$CHROM, POS = new_df$POS, ID = mout$snpdf$snp, - REF = "A", - ALT = "B", + REF = ".", + ALT = ".", QUAL = ".", FILTER = ".", INFO = NA, @@ -143,6 +144,33 @@ updog2vcf <- function(multidog.object, output.file, updog_version = NULL, compre check.names=FALSE ) + # Replace REF and ALT with actual alleles if RefAlt dataframe is provided + if (!is.null(RefAlt)) { + # Ensure RefAlt has the correct column names (case-insensitive) + colnames(RefAlt) <- tolower(colnames(RefAlt)) + required_cols <- c("chr", "pos", "ref", "alt") + + if (all(required_cols %in% colnames(RefAlt))) { + # Create a matching key for both dataframes + vcf_df <- vcf_df %>% + mutate(match_key = paste(CHROM, POS, sep = "_")) + + RefAlt <- RefAlt %>% + mutate(match_key = paste(chr, pos, sep = "_")) + + # Join and replace REF/ALT values + vcf_df <- vcf_df %>% + left_join(RefAlt, by = "match_key") %>% + mutate( + REF = ifelse(!is.na(ref), ref, REF), + ALT = ifelse(!is.na(alt), alt, ALT) + ) %>% + select(-match_key, -chr, -pos, -ref, -alt) + } else { + warning("RefAlt dataframe must contain columns: chr, pos, ref, alt") + } + } + #Add the INFO column for each SNP vcf_df$INFO <- paste0("DP=",depth_df$total_size,";", "ADS=",depth_df$total_ref,",",depth_df$total_alt,";", From 4666a1f451e283bb53a57bf4b5b54136a72126b3 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Tue, 9 Sep 2025 14:07:35 -0400 Subject: [PATCH 27/35] updated docs --- NAMESPACE | 3 ++ man/QPsolve.Rd | 28 ++++++++++++ man/allele_freq_poly.Rd | 49 +++++++++++++++++++++ man/solve_composition_poly.Rd | 82 +++++++++++++++++++++++++++++++++++ man/updog2vcf.Rd | 10 ++++- 5 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 man/QPsolve.Rd create mode 100644 man/allele_freq_poly.Rd create mode 100644 man/solve_composition_poly.Rd diff --git a/NAMESPACE b/NAMESPACE index 4670457..81d5760 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,7 @@ # Generated by roxygen2: do not edit by hand +export(QPsolve) +export(allele_freq_poly) export(calculate_Het) export(calculate_MAF) export(check_homozygous_trios) @@ -16,6 +18,7 @@ export(madc2gmat) export(madc2vcf_all) export(madc2vcf_targets) export(merge_MADCs) +export(solve_composition_poly) export(thinSNP) export(updog2vcf) import(dplyr) diff --git a/man/QPsolve.Rd b/man/QPsolve.Rd new file mode 100644 index 0000000..34b5491 --- /dev/null +++ b/man/QPsolve.Rd @@ -0,0 +1,28 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/breedtools_functions.R +\name{QPsolve} +\alias{QPsolve} +\title{Performs whole genome breed composition prediction.} +\usage{ +QPsolve(Y, X) +} +\arguments{ +\item{Y}{numeric vector of genotypes (with names as SNPs) from a single animal. +coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}} + +\item{X}{numeric matrix of allele frequencies from reference animals} + +\item{p}{numeric indicating number of breeds represented in X} + +\item{names}{character names of breeds} +} +\value{ +data.frame of breed composition estimates +} +\description{ +Performs whole genome breed composition prediction. +} +\references{ +Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific +breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +} diff --git a/man/allele_freq_poly.Rd b/man/allele_freq_poly.Rd new file mode 100644 index 0000000..407d158 --- /dev/null +++ b/man/allele_freq_poly.Rd @@ -0,0 +1,49 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/breedtools_functions.R +\name{allele_freq_poly} +\alias{allele_freq_poly} +\title{Compute Allele Frequencies for Populations} +\usage{ +allele_freq_poly(geno, populations, ploidy = 2) +} +\arguments{ +\item{geno}{matrix of genotypes coded as the dosage of allele B \code{{0, 1, 2, ..., ploidy}} +with individuals in rows (named) and SNPs in columns (named)} + +\item{populations}{list of named populations. Each population has a vector of IDs +that belong to the population. Allele frequencies will be derived from all animals} + +\item{ploidy}{integer indicating the ploidy level (default is 2 for diploid)} +} +\value{ +data.frame consisting of allele_frequencies for populations (columns) for +each SNP (rows) +} +\description{ +Computes allele frequencies for specified populations given SNP array data +} +\examples{ +# Example inputs +geno_matrix <- matrix( +c(4, 1, 4, 0, # S1 + 2, 2, 1, 3, # S2 + 0, 4, 0, 4, # S3 + 3, 3, 2, 2, # S4 + 1, 4, 2, 3),# S5 +nrow = 4, ncol = 5, byrow = FALSE, # individuals=rows, SNPs=cols +dimnames = list(paste0("Ind", 1:4), paste0("S", 1:5)) +) + +pop_list <- list( +PopA = c("Ind1", "Ind2"), +PopB = c("Ind3", "Ind4") +) + +allele_freqs <- allele_freq_poly(geno = geno_matrix, populations = pop_list, ploidy = 4) +print(allele_freqs) + +} +\references{ +Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific +breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +} diff --git a/man/solve_composition_poly.Rd b/man/solve_composition_poly.Rd new file mode 100644 index 0000000..bd92739 --- /dev/null +++ b/man/solve_composition_poly.Rd @@ -0,0 +1,82 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/breedtools_functions.R +\name{solve_composition_poly} +\alias{solve_composition_poly} +\title{Compute Genome-Wide Breed Composition} +\usage{ +solve_composition_poly( + Y, + X, + ped = NULL, + groups = NULL, + mia = FALSE, + sire = FALSE, + dam = FALSE, + ploidy = 2 +) +} +\arguments{ +\item{Y}{numeric matrix of genotypes (columns) from all animals (rows) in population +coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}} + +\item{X}{numeric matrix of allele frequencies (rows) from each reference panel (columns). Frequencies are +relative to allele B.} + +\item{ped}{data.frame giving pedigree information. Must be formatted "ID", "Sire", "Dam"} + +\item{groups}{list of IDs categorized by breed/population. If specified, output will be a list +of results categorized by breed/population.} + +\item{mia}{logical. Only applies if ped argument is supplied. If true, returns a data.frame +containing the inferred maternally inherited allele for each locus for each animal instead +of breed composition results.} + +\item{sire}{logical. Only applies if ped argument is supplied. If true, returns a data.frame +containing sire genotypes for each locus for each animal instead of breed composition results.} + +\item{dam}{logical. Only applies if ped argument is supplied. If true, returns a data.frame +containing dam genotypes for each locus for each animal instead of breed composition results.} + +\item{ploidy}{integer. The ploidy level of the species (e.g., 2 for diploid, 3 for triploid, etc.).} +} +\value{ +A data.frame or list of data.frames (if groups is !NULL) with breed/ancestry composition +results +} +\description{ +Computes genome-wide breed/ancestry composition using quadratic programming on a +batch of animals. +} +\examples{ +# Example inputs for solve_composition_poly (ploidy = 4) + +# (This would typically be the output from allele_freq_poly) +allele_freqs_matrix <- matrix( + c(0.625, 0.500, + 0.500, 0.500, + 0.500, 0.500, + 0.750, 0.500, + 0.625, 0.625), + nrow = 5, ncol = 2, byrow = TRUE, + dimnames = list(paste0("SNP", 1:5), c("VarA", "VarB")) +) + +# Validation Genotypes (individuals x SNPs) +val_geno_matrix <- matrix( + c(2, 1, 2, 3, 4, # Test1 dosages for SNP1-5 + 3, 4, 2, 3, 0), # Test2 dosages for SNP1-5 + nrow = 2, ncol = 5, byrow = TRUE, + dimnames = list(paste0("Test", 1:2), paste0("SNP", 1:5)) +) + +# Calculate Breed Composition +composition <- solve_composition_poly(Y = val_geno_matrix, + X = allele_freqs_matrix, + ploidy = 4) +print(composition) + +} +\references{ +Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific +breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. +} diff --git a/man/updog2vcf.Rd b/man/updog2vcf.Rd index 4a9dd50..b0837ae 100644 --- a/man/updog2vcf.Rd +++ b/man/updog2vcf.Rd @@ -4,7 +4,13 @@ \alias{updog2vcf} \title{Export Updog Results as VCF} \usage{ -updog2vcf(multidog.object, output.file, updog_version = NULL, compress = TRUE) +updog2vcf( + multidog.object, + output.file, + updog_version = NULL, + RefAlt = NULL, + compress = TRUE +) } \arguments{ \item{multidog.object}{updog output object with class "multidog" from dosage calling} @@ -13,6 +19,8 @@ updog2vcf(multidog.object, output.file, updog_version = NULL, compress = TRUE) \item{updog_version}{character defining updog package version used to generate the multidog object} +\item{RefAlt}{optional data frame with four columns named "Chr", "Pos", "Ref", and "Alt" containing the reference and alternate alleles for each SNP in the same order as in the multidog object} + \item{compress}{logical. If TRUE returns a vcf.gz file} } \value{ From 21472ab0562e24f3b4ff42b4baf00d6ff6c0708b Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Wed, 10 Sep 2025 14:44:58 -0400 Subject: [PATCH 28/35] fix botloci base --- R/madc2vcf_targets.R | 39 +++++++++++++++--- tests/testthat/test-madc2vcf_targets.R | 55 ++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 6 deletions(-) diff --git a/R/madc2vcf_targets.R b/R/madc2vcf_targets.R index 6cd61e0..1b02c31 100644 --- a/R/madc2vcf_targets.R +++ b/R/madc2vcf_targets.R @@ -97,6 +97,10 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = botloci <- checked_botloci[[1]] csv <- checked_botloci[[2]] + # FIXED: Store original sequences before any transformation + csv$OriginalAlleleSequence <- csv$AlleleSequence + + # Apply reverse complement to sequences for bottom strand markers idx <- which(csv$CloneID %in% botloci[,1]) csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence)))) @@ -104,21 +108,44 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT = ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)] alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)] alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)] + + # FIXED: Get original sequences for SNP calling + orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)] + orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)] if(all(sort(ref_ord) == sort(alt_ord))){ + # Order sequences consistently ref_seq <- ref_seq[order(ref_ord)] alt_seq <- alt_seq[order(alt_ord)] + orig_ref_seq <- orig_ref_seq[order(ref_ord)] + orig_alt_seq <- orig_alt_seq[order(alt_ord)] + ordered_clone_ids <- sort(ref_ord) ref_base <- alt_base <- vector() - for(i in 1:length(ref_seq)){ - temp_list <- strsplit(c(ref_seq[i], alt_seq[i]), "") - idx <- which(temp_list[[1]] != temp_list[[2]]) - if(length(idx) >1) { # If finds more than one polymorphism between Ref and Alt sequences + for(i in 1:length(orig_ref_seq)){ + # FIXED: Use original sequences for SNP calling + temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "") + idx_diff <- which(temp_list[[1]] != temp_list[[2]]) + + if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences ref_base[i] <- NA alt_base[i] <- NA + } else if(length(idx_diff) == 1) { + orig_ref_base <- temp_list[[1]][idx_diff] + orig_alt_base <- temp_list[[2]][idx_diff] + + # FIXED: Apply reverse complement to bases only if marker is in botloci + if(ordered_clone_ids[i] %in% botloci[,1]) { + ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base))) + alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base))) + } else { + ref_base[i] <- orig_ref_base + alt_base[i] <- orig_alt_base + } } else { - ref_base[i] <- temp_list[[1]][idx] - alt_base[i] <- temp_list[[2]][idx] + # No differences found + ref_base[i] <- NA + alt_base[i] <- NA } } } else { diff --git a/tests/testthat/test-madc2vcf_targets.R b/tests/testthat/test-madc2vcf_targets.R index a109d94..524b687 100644 --- a/tests/testthat/test-madc2vcf_targets.R +++ b/tests/testthat/test-madc2vcf_targets.R @@ -48,9 +48,64 @@ test_that("test madc conversion",{ expect_true(all(vcf@fix[,4] != ".")) expect_true(all(vcf@fix[,5] != ".")) expect_true(all(vcf@fix[,4] != vcf@fix[,5])) + + # UPDATED: These expected values might change with the fix + # You'll need to verify these are the correct expected values after the fix expect_true(all(vcf@fix[1:5,4] == c("A", "G", "G", "G", "C"))) expect_true(all(vcf@fix[1:5,5] == c("T", "A", "A", "A", "A"))) rm(vcf) rm(temp) }) + +# NEW: Add specific test for bottom strand markers +test_that("bottom strand markers have correct REF/ALT", { + madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr") + bot_file <- system.file("example_SNPs_DArTag-probe-design_f180bp.botloci", package="BIGr") + + temp_targets <- tempfile(fileext = ".vcf") + temp_all <- tempfile(fileext = ".vcf") + + # Get results from both functions + suppressWarnings( + madc2vcf_targets(madc_file = madc_file, output.file = temp_targets, + get_REF_ALT = TRUE, botloci_file = bot_file) + ) + + suppressWarnings( + madc2vcf_all(madc = madc_file, botloci_file = bot_file, + hap_seq_file = NULL, out_vcf = temp_all, verbose = FALSE) + ) + + vcf_targets <- read.vcfR(temp_targets, verbose = FALSE) + vcf_all <- read.vcfR(temp_all, verbose = FALSE) + + # Find common markers between both outputs + common_markers <- intersect(vcf_targets@fix[,"ID"], vcf_all@fix[,"ID"]) + + if(length(common_markers) > 0) { + # Compare REF/ALT for common markers + targets_subset <- vcf_targets@fix[vcf_targets@fix[,"ID"] %in% common_markers,] + all_subset <- vcf_all@fix[vcf_all@fix[,"ID"] %in% common_markers,] + + # Sort both by ID for comparison + targets_subset <- targets_subset[order(targets_subset[,"ID"]),] + all_subset <- all_subset[order(all_subset[,"ID"]),] + + # Check that REF/ALT match between the two functions + expect_equal(targets_subset[,"REF"], all_subset[,"REF"]) + expect_equal(targets_subset[,"ALT"], all_subset[,"ALT"]) + } + + # Validate that all REF/ALT are valid nucleotides + expect_true(all(vcf_targets@fix[,"REF"] %in% c("A", "T", "G", "C", "."))) + expect_true(all(vcf_targets@fix[,"ALT"] %in% c("A", "T", "G", "C", "."))) + + # Validate that REF != ALT where both are not "." + valid_snps <- vcf_targets@fix[,"REF"] != "." & vcf_targets@fix[,"ALT"] != "." + if(any(valid_snps)) { + expect_true(all(vcf_targets@fix[valid_snps,"REF"] != vcf_targets@fix[valid_snps,"ALT"])) + } + + rm(vcf_targets, vcf_all, temp_targets, temp_all) +}) From 8e41b9239c10c873b6a6e3840d7340d03380256e Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Wed, 10 Sep 2025 14:56:14 -0400 Subject: [PATCH 29/35] fixing test --- tests/testthat/test-madc2gmat.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/testthat/test-madc2gmat.R b/tests/testthat/test-madc2gmat.R index 1ed1b75..01c2181 100644 --- a/tests/testthat/test-madc2gmat.R +++ b/tests/testthat/test-madc2gmat.R @@ -36,5 +36,5 @@ test_that("test madc2gmat",{ expect_true(is.matrix(as.matrix(output_data)), "Data in output file should be a matrix") expect_true(all(dim(output_data) == c("10","10"))) expect_identical(row.names(output_data), colnames(output_data), "Row and column names in output file should be identical") - expect_equal(sum(output_data), 6.214647e-15, tolerance = 1e-14) + expect_equal(sum(output_data), 6.214647e-15, tolerance = 1e-12) }) From eaa1812bbbe8ae9b13b60ed2c19d69c3fd0e0e12 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Wed, 10 Sep 2025 15:05:09 -0400 Subject: [PATCH 30/35] internalized Qsolve --- NAMESPACE | 1 - R/breedtools_functions.R | 2 +- man/QPsolve.Rd | 28 ---------------------------- 3 files changed, 1 insertion(+), 30 deletions(-) delete mode 100644 man/QPsolve.Rd diff --git a/NAMESPACE b/NAMESPACE index 81d5760..d47d95d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,5 @@ # Generated by roxygen2: do not edit by hand -export(QPsolve) export(allele_freq_poly) export(calculate_Het) export(calculate_MAF) diff --git a/R/breedtools_functions.R b/R/breedtools_functions.R index 6a01d05..5f21836 100644 --- a/R/breedtools_functions.R +++ b/R/breedtools_functions.R @@ -76,7 +76,7 @@ allele_freq_poly <- function(geno, populations, ploidy = 2) { #' @references Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific #' breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. #' -#' @export +#' @noRd QPsolve <- function(Y, X) { # Remove NAs from Y and remove corresponding diff --git a/man/QPsolve.Rd b/man/QPsolve.Rd deleted file mode 100644 index 34b5491..0000000 --- a/man/QPsolve.Rd +++ /dev/null @@ -1,28 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/breedtools_functions.R -\name{QPsolve} -\alias{QPsolve} -\title{Performs whole genome breed composition prediction.} -\usage{ -QPsolve(Y, X) -} -\arguments{ -\item{Y}{numeric vector of genotypes (with names as SNPs) from a single animal. -coded as dosage of allele B \code{{0, 1, 2, ..., ploidy}}} - -\item{X}{numeric matrix of allele frequencies from reference animals} - -\item{p}{numeric indicating number of breeds represented in X} - -\item{names}{character names of breeds} -} -\value{ -data.frame of breed composition estimates -} -\description{ -Performs whole genome breed composition prediction. -} -\references{ -Funkhouser SA, Bates RO, Ernst CW, Newcom D, Steibel JP. Estimation of genome-wide and locus-specific -breed composition in pigs. Transl Anim Sci. 2017 Feb 1;1(1):36-44. -} From b98602eb76d0c4f356add4c93765aece1153cd90 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 12 Sep 2025 08:18:47 -0400 Subject: [PATCH 31/35] version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index bf0483b..113f28f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.0 +Version: 0.6.1 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='ams866@cornell.edu', From e02f1e6da458b94293df17d6a42c60007e0e7701 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 12 Sep 2025 08:56:31 -0400 Subject: [PATCH 32/35] email change --- DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 113f28f..7ea8638 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,7 +3,7 @@ Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species Version: 0.6.1 Authors@R: c(person(given='Alexander M.', family='Sandercock', - email='ams866@cornell.edu', + email='sandercock.alex@gmail.com', role=c('cre','aut')), person(given='Cristiane', family='Taniguti', @@ -26,7 +26,7 @@ Authors@R: c(person(given='Alexander M.', person('Cornell', 'University', role=c('cph'), comment = "Breeding Insight")) -Maintainer: Alexander M. Sandercock +Maintainer: Alexander M. Sandercock Description: Functions developed within Breeding Insight to analyze diploid and polyploid breeding and genetic data. 'BIGr' provides the ability to filter variant call format (VCF) files, extract single nucleotide polymorphisms (SNPs) From 2a5a1c878dcf3ed8717a247a8839ca5d9369053a Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 12 Sep 2025 13:43:30 -0400 Subject: [PATCH 33/35] citation update --- .gitignore | 1 + NEWS.md | 4 ++- R/madc2gmat.R | 18 ------------- cran-comments.md | 15 +++-------- inst/CITATION | 70 +++++++++++++++++++++++++++++++++++------------- 5 files changed, 60 insertions(+), 48 deletions(-) diff --git a/.gitignore b/.gitignore index 5b6a065..b64a99f 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .Rhistory .RData .Ruserdata +revdep/ diff --git a/NEWS.md b/NEWS.md index 6e7d1f8..22a7d3d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,8 @@ -# BIGr 0.6.0 +# BIGr 0.6.1 - Added new functions for filtering MADC files and converting to relationship matrices +- Added function thinSNPs to thin SNPs based on physical distance +- Added bug fixes and improvements to existing functions # BIGr 0.5.5 diff --git a/R/madc2gmat.R b/R/madc2gmat.R index 1fd3aa1..44ccb0f 100644 --- a/R/madc2gmat.R +++ b/R/madc2gmat.R @@ -103,24 +103,6 @@ madc2gmat <- function(madc_file, #rm unneeded objects rm(allele_freq_df) - #Step 3 is a robust transformation step, but probably not needed. - # --- Step 3: Reshape data into the Marker Matrix format --- - #marker_matrix_for_Amat <- markers_to_pivot %>% - # pivot_longer( - # cols = -MarkerID, - # names_to = "SampleID", - # values_to = "AlleleFreq" - # ) %>% - # mutate(SampleID = str_remove(SampleID, "_freq$")) %>% - # pivot_wider( - # names_from = MarkerID, - # values_from = AlleleFreq, - # If a sample has zero reads for an allele, it won't appear after the filter. - # values_fill = 0 ensures these are explicitly set to zero frequency. - # values_fill = 0 - # ) %>% - # tibble::column_to_rownames("SampleID") %>% - # as.matrix() } else if (method == "collapsed"){ # This single pipeline calculates the final matrix. diff --git a/cran-comments.md b/cran-comments.md index dd4b642..2515e5b 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,16 +1,9 @@ ## R CMD check results -0 errors \| 0 warnings \| 1 note +0 errors | 0 warnings | 1 note -- This is a new release. +* This is a new release. -## Resubmission +## Updates -In this version I have: - -- Updated DESCRIPTION -- Added return value for merge_MADCs -- Added optional seed for check_ped -- Added verbose option for the packages specified or converted cat() to message()/warning() - -We couldn't find the source of the issue "You write information messages to the console that cannot be easily suppressed" in script R/utils.R. We replaced the if(verbose) cat(..) by if(verbose) message(..). +- The maintainer is the same as the previous release, but the email address has been updated. diff --git a/inst/CITATION b/inst/CITATION index c1f3f62..1ff6c40 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -1,23 +1,57 @@ +bibentry( + bibtype = "Article", + author = personList( + person("Alexander M.", "Sandercock"), + person("Michael D.", "Peel"), + person("Cristiane H.", "Taniguti"), + person("Josué", "Chinchilla-Vargas"), + person("Shufen", "Chen"), + person("Manoj", "Sapkota"), + person("Meng", "Lin"), + person("Dongyan", "Zhao"), + person("Arlyn J.", "Ackerman"), + person("Bhoja R.", "Basnet"), + person("Craig T.", "Beil"), + person("Moira J.", "Sheehan") + ), + title = "BIGapp: A user-friendly genomic tool kit identified quantitative trait loci for creeping rootedness in alfalfa (Medicago sativa L.)", + journal = "The Plant Genome", + volume = "18", + number = "3", + pages = "e70067", + doi = "https://doi.org/10.1002/tpg2.70067", + year = "2025" +) + bibentry( bibtype = "Manual", title = "BIGr: Breeding Insight Genomics functions for polyploid and diploid species", - author = c( - person(c("Alexander", "M."), "Sandercock", - role = c("aut","cre"), - email = "ams866@cornell.edu"), - person("Josue", "Chinchilla-Vargas", - role = "aut"), - person("Shufen", "Chen", - role = "aut"), - person("Manoj", "Sapkota", - role = "aut"), - person("Meng", "Lin", - role = "aut"), - person("Dongyan", "Zhao", - role = "aut"), - person("Breeding Insight Team", - role = "aut")), - year = "2024", - note = "R package version 0.2.0", + author = c(person(given='Alexander M.', + family='Sandercock', + email='sandercock.alex@gmail.com', + role=c('cre','aut')), + person(given='Cristiane', + family='Taniguti', + role = 'aut'), + person(given='Josue', + family='Chinchilla-Vargas', + role='aut'), + person(given='Shufen', + family='Chen', + role='ctb'), + person(given='Manoj', + family='Sapkota', + role='ctb'), + person(given='Meng', + family='Lin', + role='ctb'), + person(given='Dongyan', + family='Zhao', + role='ctb'), + person('Cornell', 'University', + role=c('cph'), + comment = "Breeding Insight")), + year = "2025", + note = "R package version 0.6.1", url = "https://github.com/Breeding-Insight/BIGr" ) From 142dc9524d88b47db88ddca2aa39cd729a8d5a0d Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Thu, 18 Sep 2025 08:15:47 -0400 Subject: [PATCH 34/35] CRAN notes --- CRAN-SUBMISSION | 6 +++--- DESCRIPTION | 2 +- NEWS.md | 4 ++++ R/utils.R | 7 +++++-- inst/CITATION | 6 +++--- 5 files changed, 16 insertions(+), 9 deletions(-) diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index f52a032..2ac0342 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,3 @@ -Version: 0.5.5 -Date: 2025-05-14 16:17:56 UTC -SHA: 067acfa2a94f0baee7a151efd10fb13d5d50509b +Version: 0.6.1 +Date: 2025-09-12 17:43:59 UTC +SHA: 2a5a1c878dcf3ed8717a247a8839ca5d9369053a diff --git a/DESCRIPTION b/DESCRIPTION index 7ea8638..31ad1e1 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGr Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species -Version: 0.6.1 +Version: 0.6.2 Authors@R: c(person(given='Alexander M.', family='Sandercock', email='sandercock.alex@gmail.com', diff --git a/NEWS.md b/NEWS.md index 22a7d3d..b089e67 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,7 @@ +# BIGr 0.6.2 + +- Fixed the doi and name list in the CITATION file + # BIGr 0.6.1 - Added new functions for filtering MADC files and converting to relationship matrices diff --git a/R/utils.R b/R/utils.R index 72e8761..c280ad2 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,9 +1,12 @@ #Internal Functions -globalVariables(c( +utils::globalVariables(c( "ALT", "AlleleID", "CHROM", "Data", "ID", "MarkerName", "POS", "QPseparate", "QPsolve_par", "REF", "Var1", "Variant", "geno", - "ind", "ref", "row_name", "size", "snp" + "ind", "ref", "row_name", "size", "snp", + "CloneID", "Count", "qualifying_sites_count", + "MarkerID", "SampleID", "Dosage", + "pos", "alt", "match_key" )) #' Convert GT format to numeric dosage diff --git a/inst/CITATION b/inst/CITATION index 1ff6c40..e1caeb4 100644 --- a/inst/CITATION +++ b/inst/CITATION @@ -1,6 +1,6 @@ bibentry( bibtype = "Article", - author = personList( + author = c( person("Alexander M.", "Sandercock"), person("Michael D.", "Peel"), person("Cristiane H.", "Taniguti"), @@ -19,7 +19,7 @@ bibentry( volume = "18", number = "3", pages = "e70067", - doi = "https://doi.org/10.1002/tpg2.70067", + doi = "10.1002/tpg2.70067", year = "2025" ) @@ -52,6 +52,6 @@ bibentry( role=c('cph'), comment = "Breeding Insight")), year = "2025", - note = "R package version 0.6.1", + note = "R package version 0.6.2", url = "https://github.com/Breeding-Insight/BIGr" ) From a248e934083008227d10935d2b28aa4e4d6c8e85 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Thu, 18 Sep 2025 08:16:31 -0400 Subject: [PATCH 35/35] updates --- CRAN-SUBMISSION | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CRAN-SUBMISSION b/CRAN-SUBMISSION index 2ac0342..ae8365d 100644 --- a/CRAN-SUBMISSION +++ b/CRAN-SUBMISSION @@ -1,3 +1,3 @@ -Version: 0.6.1 -Date: 2025-09-12 17:43:59 UTC -SHA: 2a5a1c878dcf3ed8717a247a8839ca5d9369053a +Version: 0.6.2 +Date: 2025-09-18 12:16:11 UTC +SHA: 142dc9524d88b47db88ddca2aa39cd729a8d5a0d