Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: BIGr
Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species
Version: 0.6.0
Version: 0.6.1
Authors@R: c(person(given='Alexander M.',
family='Sandercock',
email='ams866@cornell.edu',
Expand Down Expand Up @@ -53,15 +53,16 @@ Imports:
Rdpack (>= 0.7),
readr (>= 2.1.5),
reshape2 (>= 1.4.4),
rrBLUP,
rlang,
tidyr (>= 1.3.1),
vcfR (>= 1.15.0),
Rsamtools,
Biostrings,
pwalign,
janitor,
quadprog,
tibble
tibble,
stringr
Suggests:
covr,
spelling,
Expand Down
6 changes: 5 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(allele_freq_poly)
export(calculate_Het)
export(calculate_MAF)
export(check_homozygous_trios)
Expand All @@ -16,11 +17,15 @@ export(madc2gmat)
export(madc2vcf_all)
export(madc2vcf_targets)
export(merge_MADCs)
export(solve_composition_poly)
export(thinSNP)
export(updog2vcf)
import(dplyr)
import(janitor)
import(parallel)
import(quadprog)
import(rlang)
import(stringr)
import(tibble)
import(tidyr)
import(vcfR)
Expand All @@ -33,7 +38,6 @@ importFrom(pwalign,pairwiseAlignment)
importFrom(readr,read_csv)
importFrom(reshape2,dcast)
importFrom(reshape2,melt)
importFrom(rrBLUP,A.mat)
importFrom(stats,cor)
importFrom(stats,setNames)
importFrom(utils,packageVersion)
Expand Down
4 changes: 2 additions & 2 deletions R/breedtools_functions.R
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
#' allele_freqs <- allele_freq_poly(geno = geno_matrix, populations = pop_list, ploidy = 4)
#' print(allele_freqs)
#'
#' @noRd
#' @export
allele_freq_poly <- function(geno, populations, ploidy = 2) {

# Initialize returned df
Expand Down Expand Up @@ -168,7 +168,7 @@ QPsolve <- function(Y, X) {
#' ploidy = 4)
#' print(composition)
#'
#' @noRd
#' @export
solve_composition_poly <- function(Y,
X,
ped = NULL,
Expand Down
200 changes: 164 additions & 36 deletions R/madc2gmat.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,17 @@
#' Scale and normalize MADC read count data and convert it to an additive genomic relationship matrix.
#'
#'@details
#' This function reads a MADC file, processes it to remove unnecessary columns, scales and normalizes the data, and
#' then converts it into an additive genomic relationship matrix using the `A.mat` function from the `rrBLUP` package.
#' This function reads a MADC file, processes it to remove unnecessary columns, and then
#' converts it into an additive genomic relationship matrix using the first method proposed by VanRaden (2008).
#' The resulting matrix can be used for genomic selection or other genetic analyses.
#'
#'@import dplyr
#'@importFrom utils read.csv write.csv
#'@importFrom rrBLUP A.mat
#'@import tibble stringr dplyr tidyr
#'
#'@param madc_file Path to the MADC file to be filtered
#'@param seed Optional seed for random number generation (default is NULL)
#'@param method Method to use for processing the MADC data. Options are "unique" or "collapsed". Default is "collapsed".
#'@param ploidy Numeric. Ploidy level of the samples (e.g., 2 for diploid, 4 for tetraploid)
#'@param output.file Path to save the filtered data (if NULL, data will not be saved)
#'
#'@return data.frame or saved csv file
Expand All @@ -27,14 +28,17 @@
#' # Converting to additive relationship matrix
#' gmat <- madc2gmat(madc_file,
#' seed = 123,
#' ploidy=2,
#' output.file = NULL)
#'
#'@references
#'Endelman, J. B. (2011). Ridge regression and other kernels for genomic selection with R package rrBLUP. The Plant Genome, 4(3).
#'@references VanRaden, P. M. (2008). Efficient methods to compute genomic predictions. Journal of Dairy Science, 91(11), 4414-4423
#'@author Alexander M. Sandercock
#'
#'@export
madc2gmat <- function(madc_file,
seed = NULL,
method = "collapsed",
ploidy,
Copy link

Copilot AI Aug 12, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ploidy parameter lacks a default value, making it a required parameter. This is a breaking change for existing code. Consider providing a sensible default (e.g., ploidy = 2) or document this as a breaking change in the function documentation.

Suggested change
ploidy,
ploidy = 2,

Copilot uses AI. Check for mistakes.
Copy link

Copilot AI Sep 9, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The ploidy parameter lacks a default value, which could break backward compatibility. Consider adding a default value (e.g., ploidy = 2) or implement a deprecation warning to guide users.

Suggested change
ploidy,
ploidy = 2,

Copilot uses AI. Check for mistakes.
output.file = NULL) {
#set seed if not null
if (!is.null(seed)) {
Expand Down Expand Up @@ -62,39 +66,163 @@ madc2gmat <- function(madc_file,
filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE)
}

#Remove extra text after Ref and Alt (_001 or _002)
filtered_df$AlleleID <- sub("\\|Ref_001*", "|Ref", filtered_df$AlleleID)
filtered_df$AlleleID <- sub("\\|Alt_002", "|Alt", filtered_df$AlleleID)
#Convert to data.table
#filtered_df <- as.data.table(filtered_df)

if (method == "unique") {
#Get allele ratios
# --- Step 1: Calculate frequency of each specific allele using total locus count ---
# The denominator MUST be the sum of all reads at the locus.
allele_freq_df <- filtered_df %>%
group_by(CloneID) %>%
mutate(across(
.cols = where(is.numeric),
# The calculation is count / total_locus_count
.fns = ~ . / sum(.),
.names = "{.col}_freq"
)) %>%
ungroup()

#Rm object
rm(filtered_df)

# --- Step 2: Prepare data for reshaping ---
# Filter out the reference rows and create a unique marker name for each alternative allele
markers_to_pivot <- allele_freq_df %>%
filter(!str_ends(AlleleID, "\\|Ref_0001")) %>%
# Create a new, unique column for each marker (e.g., "chr1.1_000194324_RefMatch_0001")
# This makes each alternative allele its own column in the final matrix.
mutate(MarkerID = str_replace(AlleleID, "\\|", "_")) %>%
select(MarkerID, ends_with("_freq")) %>%
tibble::column_to_rownames(var = "MarkerID")
names(markers_to_pivot) <- str_remove(names(markers_to_pivot), "_freq$")
markers_to_pivot <- t(markers_to_pivot)
#Replace the NaN to NA
markers_to_pivot[is.nan(markers_to_pivot)] <- NA

#rm unneeded objects
rm(allele_freq_df)

#Step 3 is a robust transformation step, but probably not needed.
# --- Step 3: Reshape data into the Marker Matrix format ---
#marker_matrix_for_Amat <- markers_to_pivot %>%
# pivot_longer(
# cols = -MarkerID,
# names_to = "SampleID",
# values_to = "AlleleFreq"
# ) %>%
# mutate(SampleID = str_remove(SampleID, "_freq$")) %>%
# pivot_wider(
# names_from = MarkerID,
# values_from = AlleleFreq,
# If a sample has zero reads for an allele, it won't appear after the filter.
# values_fill = 0 ensures these are explicitly set to zero frequency.
# values_fill = 0
# ) %>%
# tibble::column_to_rownames("SampleID") %>%
# as.matrix()
} else if (method == "collapsed"){

# This single pipeline calculates the final matrix.
# The output is the marker dosage matrix (X) to be used as input for A.mat()
markers_to_pivot <- filtered_df %>%

# --- Part A: Calculate the frequency of each allele at each locus ---
group_by(CloneID) %>%
mutate(across(
.cols = where(is.numeric),
# Use if_else to prevent 0/0, which results in NaN
.fns = ~ . / sum(.),
.names = "{.col}_freq"
)) %>%
ungroup() %>%

# --- Part B: Isolate and sum all alternative allele frequencies ---
#Note, the RefMatch counts are being counted as Reference allele counts, and not
#Alternative allele counts
# Filter to keep only the alternative alleles using base R's endsWith()
filter(!grepl("\\|Ref", AlleleID)) %>%
# Group by the main locus ID
group_by(CloneID) %>%
# For each locus, sum the frequencies of all its alternative alleles
summarise(across(ends_with("_freq"), sum), .groups = 'drop') %>%

# --- Part C: Reshape the data into the final matrix format ---
# Convert from a "long" format to a "wide" format
pivot_longer(
cols = -CloneID,
names_to = "SampleID",
values_to = "Dosage"
) %>%
# Clean up the sample names using base R's sub()
mutate(SampleID = sub("_freq$", "", SampleID)) %>%
# Pivot to the final shape: samples in rows, markers in columns
pivot_wider(
names_from = CloneID,
values_from = Dosage,
# This ensures that if a sample/locus combination is missing,
# it gets a value of 0 instead of NA.
values_fill = 0
) %>%
# Convert the 'SampleID' column into the actual row names of the dataframe
column_to_rownames("SampleID") %>%
# Convert the final dataframe into a true matrix object
as.matrix()
#Replace the NaN to NA
markers_to_pivot[is.nan(markers_to_pivot)] <- NA
#rm unneeded objects
rm(filtered_df)

#Removing extra columns
row.names(filtered_df) <- filtered_df$AlleleID
filtered_df <- filtered_df %>%
select(-c(AlleleID, CloneID, AlleleSequence))


#Scale and normalized data
message("Scaling and normalizing data to be -1,1")
# Function to scale a matrix to be between -1 and 1 for rrBLUP
scale_matrix <- function(mat) {
min_val <- min(mat)
max_val <- max(mat)

# Normalize to [0, 1]
normalized <- (mat - min_val) / (max_val - min_val)

# Scale to [-1, 1]
scaled <- 2 * normalized - 1

return(scaled)
} else {
stop("Invalid method specified. Use 'unique' or 'collapsed'.")
}

# Apply the scaling function
filtered_df <- scale_matrix(filtered_df)

#Making additive relationship matrix
MADC.mat <- A.mat(t(filtered_df))

rm(filtered_df)
#VanRaden method
#This method is used to calculate the additive relationship matrix
compute_relationship_matrix <- function(mat,
ploidy,
method = "VanRaden",
impute = TRUE) {
### Prepare matrix
#Remove any SNPs that are all NAs
mat <- mat[, colSums(is.na(mat)) < nrow(mat)]
#impute the missing values with the mean of the column
if (impute) {
mat <- data.frame(mat, check.names = FALSE, check.rows = FALSE) %>%
mutate_all(~ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)) %>%
as.matrix()
}

if (method == "VanRaden") {
# Calculate the additive relationship matrix using VanRaden's method

#Calculate allele frequencies
p <- apply(mat, 2, mean, na.rm = TRUE)
q <- 1 - p
#Calculate the denominator
#Note: need to use 1/ploidy for ratios, where ploidy should be used for dosages.
#This is because the ratios are from 0-1, which is what you get when dosage/ploidy, whereas dosages are from 0 to ploidy
denominator <- (1/as.numeric(ploidy))*sum(p * q, na.rm = TRUE)
#Get the numerator
Z <- scale(mat, center = TRUE, scale = FALSE)
ZZ <- (Z %*% t(Z))
#Calculate the additive relationship matrix
A.mat <- ZZ / denominator

return(A.mat)

} else {
stop("Unsupported method. Only 'VanRaden' is currently implemented.")
}
}
#Calculate the additive relationship matrix
MADC.mat <- compute_relationship_matrix(markers_to_pivot,
ploidy = ploidy,
method = "VanRaden",
impute = TRUE)

#Remove the markers_to_pivot object to free up memory
rm(markers_to_pivot)

#Save the output to disk if file name provided
if (!is.null(output.file)) {
Expand Down
39 changes: 33 additions & 6 deletions R/madc2vcf_targets.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,28 +97,55 @@ madc2vcf_targets <- function(madc_file, output.file, botloci_file, get_REF_ALT =
botloci <- checked_botloci[[1]]
csv <- checked_botloci[[2]]

# FIXED: Store original sequences before any transformation
csv$OriginalAlleleSequence <- csv$AlleleSequence

# Apply reverse complement to sequences for bottom strand markers
idx <- which(csv$CloneID %in% botloci[,1])
csv$AlleleSequence[idx] <- sapply(csv$AlleleSequence[idx], function(sequence) as.character(reverseComplement(DNAString(sequence))))

ref_seq <- csv$AlleleSequence[grep("\\|Ref.*", csv$AlleleID)]
ref_ord <- csv$CloneID[grep("\\|Ref.*", csv$AlleleID)]
alt_seq <- csv$AlleleSequence[grep("\\|Alt.*", csv$AlleleID)]
alt_ord <- csv$CloneID[grep("\\|Alt.*", csv$AlleleID)]

# FIXED: Get original sequences for SNP calling
orig_ref_seq <- csv$OriginalAlleleSequence[grep("\\|Ref.*", csv$AlleleID)]
orig_alt_seq <- csv$OriginalAlleleSequence[grep("\\|Alt.*", csv$AlleleID)]

if(all(sort(ref_ord) == sort(alt_ord))){
# Order sequences consistently
ref_seq <- ref_seq[order(ref_ord)]
alt_seq <- alt_seq[order(alt_ord)]
orig_ref_seq <- orig_ref_seq[order(ref_ord)]
orig_alt_seq <- orig_alt_seq[order(alt_ord)]
ordered_clone_ids <- sort(ref_ord)

ref_base <- alt_base <- vector()
for(i in 1:length(ref_seq)){
temp_list <- strsplit(c(ref_seq[i], alt_seq[i]), "")
idx <- which(temp_list[[1]] != temp_list[[2]])
if(length(idx) >1) { # If finds more than one polymorphism between Ref and Alt sequences
for(i in 1:length(orig_ref_seq)){
# FIXED: Use original sequences for SNP calling
temp_list <- strsplit(c(orig_ref_seq[i], orig_alt_seq[i]), "")
idx_diff <- which(temp_list[[1]] != temp_list[[2]])

if(length(idx_diff) > 1) { # If finds more than one polymorphism between Ref and Alt sequences
ref_base[i] <- NA
alt_base[i] <- NA
} else if(length(idx_diff) == 1) {
orig_ref_base <- temp_list[[1]][idx_diff]
orig_alt_base <- temp_list[[2]][idx_diff]

# FIXED: Apply reverse complement to bases only if marker is in botloci
if(ordered_clone_ids[i] %in% botloci[,1]) {
ref_base[i] <- as.character(reverseComplement(DNAString(orig_ref_base)))
alt_base[i] <- as.character(reverseComplement(DNAString(orig_alt_base)))
} else {
ref_base[i] <- orig_ref_base
alt_base[i] <- orig_alt_base
}
} else {
ref_base[i] <- temp_list[[1]][idx]
alt_base[i] <- temp_list[[2]][idx]
# No differences found
ref_base[i] <- NA
alt_base[i] <- NA
}
}
} else {
Expand Down
Loading
Loading