Merged
Changes from all commits
42 commits
d2fc874
Added MADC Functions
alex-sandercock May 16, 2025
e3a312b
update description
alex-sandercock May 16, 2025
5d4097b
added madc2gmat test
alex-sandercock May 16, 2025
e0709ee
cleaned
alex-sandercock May 17, 2025
ca6aa80
filterMADC added
alex-sandercock May 17, 2025
e4873fa
added filterMADC tests
alex-sandercock May 17, 2025
ee6bc5e
Add example
alex-sandercock May 17, 2025
85f1210
Example updates
alex-sandercock May 19, 2025
0b54561
updated test
alex-sandercock May 19, 2025
c50d224
update documentation
alex-sandercock May 19, 2025
11877f9
Update Description
alex-sandercock May 19, 2025
9a9ac43
bug fix
Cristianetaniguti May 21, 2025
18ad68d
solve conflict
Cristianetaniguti May 21, 2025
1d4e3bb
removed ref and alt name changes
alex-sandercock May 21, 2025
73ec035
Merge pull request #37 from Breeding-Insight/madc2vcf_strawberry_fix
alex-sandercock May 21, 2025
0594201
Merge pull request #38 from Breeding-Insight/development
alex-sandercock May 21, 2025
1277f7a
Merge pull request #36 from Breeding-Insight/madc_filtering
alex-sandercock May 21, 2025
6ef6978
fix chr ID
Cristianetaniguti May 22, 2025
ffbbc98
Merge pull request #39 from Breeding-Insight/madc2vcf_strawberry_fix
alex-sandercock May 22, 2025
be4f2ab
Added thinSNP function
alex-sandercock Jun 4, 2025
23fc569
Update DESCRIPTION
alex-sandercock Jun 4, 2025
4d0fa76
update madc2gmat function
alex-sandercock Jun 11, 2025
9118cd0
test update
alex-sandercock Jun 11, 2025
f606048
updated test
alex-sandercock Jun 11, 2025
66dec47
added madc2gmat method
alex-sandercock Jun 11, 2025
494e60a
update documentation
alex-sandercock Jun 11, 2025
4436464
updated madc2gmat method
alex-sandercock Jun 13, 2025
deee40b
updated madc2gmat
alex-sandercock Jun 16, 2025
d732bc1
Update breedtools_functions.R
alex-sandercock Aug 12, 2025
1998376
madc2vcf exception added: presence of N
Cristianetaniguti Sep 4, 2025
7a22ee3
RefAlt updog2vcf support added
alex-sandercock Sep 9, 2025
f04dce8
Merge pull request #43 from Breeding-Insight/development
alex-sandercock Sep 9, 2025
4666a1f
updated docs
alex-sandercock Sep 9, 2025
21472ab
fix botloci base
alex-sandercock Sep 10, 2025
8e41b92
fixing test
alex-sandercock Sep 10, 2025
eaa1812
internalized Qsolve
alex-sandercock Sep 10, 2025
b98602e
version
alex-sandercock Sep 12, 2025
d613ef2
Merge pull request #40 from Breeding-Insight/thinning
alex-sandercock Sep 12, 2025
e02f1e6
email change
alex-sandercock Sep 12, 2025
2a5a1c8
citation update
alex-sandercock Sep 12, 2025
142dc95
CRAN notes
alex-sandercock Sep 18, 2025
a248e93
updates
alex-sandercock Sep 18, 2025
1 change: 1 addition & 0 deletions .gitignore
@@ -2,3 +2,4 @@
.Rhistory
.RData
.Ruserdata
revdep/
6 changes: 3 additions & 3 deletions CRAN-SUBMISSION
@@ -1,3 +1,3 @@
Version: 0.5.5
Date: 2025-05-14 16:17:56 UTC
SHA: 067acfa2a94f0baee7a151efd10fb13d5d50509b
Version: 0.6.2
Date: 2025-09-18 12:16:11 UTC
SHA: 142dc9524d88b47db88ddca2aa39cd729a8d5a0d
10 changes: 6 additions & 4 deletions DESCRIPTION
@@ -1,9 +1,9 @@
Package: BIGr
Title: Breeding Insight Genomics Functions for Polyploid and Diploid Species
Version: 0.5.5
Version: 0.6.2
Authors@R: c(person(given='Alexander M.',
family='Sandercock',
email='ams866@cornell.edu',
email='sandercock.alex@gmail.com',
role=c('cre','aut')),
person(given='Cristiane',
family='Taniguti',
@@ -26,7 +26,7 @@ Authors@R: c(person(given='Alexander M.',
person('Cornell', 'University',
role=c('cph'),
comment = "Breeding Insight"))
Maintainer: Alexander M. Sandercock <ams866@cornell.edu>
Maintainer: Alexander M. Sandercock <sandercock.alex@gmail.com>
Description: Functions developed within Breeding Insight to analyze
diploid and polyploid breeding and genetic data. 'BIGr' provides the
ability to filter variant call format (VCF) files, extract single nucleotide polymorphisms (SNPs)
@@ -53,14 +53,16 @@ Imports:
Rdpack (>= 0.7),
readr (>= 2.1.5),
reshape2 (>= 1.4.4),
rlang,
tidyr (>= 1.3.1),
vcfR (>= 1.15.0),
Rsamtools,
Biostrings,
pwalign,
janitor,
quadprog,
tibble
tibble,
stringr
Suggests:
covr,
spelling,
7 changes: 7 additions & 0 deletions NAMESPACE
@@ -1,24 +1,31 @@
# Generated by roxygen2: do not edit by hand

export(allele_freq_poly)
export(calculate_Het)
export(calculate_MAF)
export(check_homozygous_trios)
export(check_ped)
export(check_replicates)
export(dosage2vcf)
export(dosage_ratios)
export(filterMADC)
export(filterVCF)
export(flip_dosage)
export(get_countsMADC)
export(imputation_concordance)
export(madc2gmat)
export(madc2vcf_all)
export(madc2vcf_targets)
export(merge_MADCs)
export(solve_composition_poly)
export(thinSNP)
export(updog2vcf)
import(dplyr)
import(janitor)
import(parallel)
import(quadprog)
import(rlang)
import(stringr)
import(tibble)
import(tidyr)
import(vcfR)
59 changes: 35 additions & 24 deletions NEWS.md
@@ -1,43 +1,54 @@
# BIGr 0.3.3
# BIGr 0.6.2

- Adapt updog2vcf to model f1, f1pp, s1 and s1pp
- Fixed the doi and name list in the CITATION file

# BIGr 0.3.2
# BIGr 0.6.1

- updog2vcf function option to output compressed VCF (.vcf.gz) - set as default
- remove need for defining ploidy
- add metadata at the VCF header
- Added new functions for filtering MADC files and converting to relationship matrices
- Added function thinSNP to thin SNPs based on physical distance
- Added bug fixes and improvements to existing functions

# BIGr 0.5.0
# BIGr 0.5.5

- Add imputation_concordance function to estimate accuracy of imputed and original dataset
- Add get_OffTargets function to extract target and off-target SNPs from a MADC file
- Add merge_MADCs function to merge two or more MADC files together
- Improved documentation and examples for all functions
- Add tests for all functions
- Updated DESCRIPTION
- Added return value for merge_MADCs
- Added optional seed for check_ped
- Added verbose option

# BIGr 0.5.1
# BIGr 0.5.4

- Improvements of testthat tests
- Add check_replicates and check_homozygous_trios for pedigree relationship quality check
- Updated dosage2vcf example

# BIGr 0.5.3

- Updated madc2vcf_all example

# BIGr 0.5.2

- madc2vcf function changed to madc2vcf_targets
- get_OffTargets function changed to madc2vcf_all
- Updates to testthat tests and function examples

# BIGr 0.5.3
# BIGr 0.5.1

- Updated madc2vcf_all example
- Improvements of testthat tests
- Add check_replicates and check_homozygous_trios for pedigree relationship quality check

# BIGr 0.5.4
# BIGr 0.5.0

- Updated dosage2vcf example
- Add imputation_concordance function to estimate accuracy of imputed and original dataset
- Add get_OffTargets function to extract target and off-target SNPs from a MADC file
- Add merge_MADCs function to merge two or more MADC files together
- Improved documentation and examples for all functions
- Add tests for all functions

# BIGr 0.5.5
# BIGr 0.3.3

- Adapt updog2vcf to model f1, f1pp, s1 and s1pp

# BIGr 0.3.2

- updog2vcf function option to output compressed VCF (.vcf.gz) - set as default
- remove need for defining ploidy
- add metadata at the VCF header

- Updated DESCRIPTION
- Added return value for merge_MADCs
- Added optional seed for check_ped
- Added verbose option
4 changes: 2 additions & 2 deletions R/breedtools_functions.R
@@ -32,7 +32,7 @@
#' allele_freqs <- allele_freq_poly(geno = geno_matrix, populations = pop_list, ploidy = 4)
#' print(allele_freqs)
#'
#' @noRd
#' @export
allele_freq_poly <- function(geno, populations, ploidy = 2) {

# Initialize returned df
@@ -168,7 +168,7 @@ QPsolve <- function(Y, X) {
#' ploidy = 4)
#' print(composition)
#'
#' @noRd
#' @export
solve_composition_poly <- function(Y,
X,
ped = NULL,
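With both functions now exported, here is a minimal sketch of how they could be chained from user code, following the roxygen examples visible in the hunks above. The objects geno_matrix, pop_list, and test_geno are hypothetical placeholders, and treating Y as test genotypes and X as reference allele frequencies is an assumption, not something confirmed by this diff:

library(BIGr)

#Hypothetical inputs: geno_matrix holds reference dosages (0..ploidy), pop_list maps
#reference individuals to populations, test_geno holds dosages for individuals to classify
allele_freqs <- allele_freq_poly(geno = geno_matrix, populations = pop_list, ploidy = 4)

#Estimate population composition of the test individuals against the reference frequencies
composition <- solve_composition_poly(Y = test_geno, X = allele_freqs, ploidy = 4)
print(composition)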
215 changes: 215 additions & 0 deletions R/filterMADC.R
@@ -0,0 +1,215 @@
#' Filter MADC Files
#'
#' Filter and process MADC files to remove low-quality microhaplotypes
#'
#' @details
#' This function can filter raw MADC files or pre-processed MADC files with fixed allele IDs. Additionally,
#' it can filter based on mean read depth, the number of mhaps per target locus, and other criteria. Optionally, users
#' can save the filtered data to a file.
#'
#'@import dplyr
#'@importFrom utils read.csv
#'
#'@param madc_file Path to the MADC file to be filtered
#'@param min.mean.reads Minimum mean read depth for filtering
#'@param max.mean.reads Maximum mean read depth for filtering
#'@param max.mhaps.per.loci Maximum number of matching mhaps per target locus. Retains only the target Ref and Alt mhaps at sites that exceed the \code{max.mhaps.per.loci} threshold.
#'@param min.reads.per.site Minimum number of reads per site used together with \code{min.ind.with.reads}; ignored when \code{min.ind.with.reads} is NULL
#'@param min.ind.with.reads Minimum number of individuals with \code{min.reads.per.site} reads for filtering
#'@param target.only Logical indicating whether to filter for target loci only
#'@param n.summary.columns (optional) Number of summary columns to remove from the MADC file, not including the first three. If NULL, the known summary columns are detected and removed automatically.
#@param plot.summary Logical indicating whether to plot summary statistics
#'@param output.file Path to save the filtered data (if NULL, data will not be saved)
#'
#'@return data.frame or saved csv file
#'
#'@examples
#' #Example
#'
#' #Example MADC
#' madc_file <- system.file("example_MADC_FixedAlleleID.csv", package="BIGr")
#'
#' #Remove mhaps exceeding 3 per target region including the ref and alt target mhaps
#' filtered_df <- filterMADC(madc_file,
#' min.mean.reads = NULL,
#' max.mean.reads = NULL,
#' max.mhaps.per.loci = 3,
#' min.reads.per.site = 1,
#' min.ind.with.reads = NULL,
#' target.only = FALSE,
#' n.summary.columns = NULL,
#' output.file = NULL)
#'
#'
#'
#'@export
filterMADC <- function(madc_file,
min.mean.reads = NULL,
max.mean.reads = NULL,
max.mhaps.per.loci = NULL,
min.reads.per.site = 1,
min.ind.with.reads = NULL,
target.only = FALSE,
n.summary.columns = NULL,
#plot.summary = FALSE,
output.file = NULL) {


#Need to first inspect the first 7 rows of the MADC to see if it has been preprocessed or not
first_seven_rows <- read.csv(madc_file, header = FALSE, nrows = 7, colClasses = c(NA, "NULL"))

#Check if all entries in the first column are either blank or "*"
check_entries <- all(first_seven_rows[, 1] %in% c("", "*"))

#Check if the MADC file has the filler rows or is processed from updated fixed allele ID pipeline
if (check_entries) {
#Note: This assumes that the first 7 rows are placeholder info from DArT processing

warning("The MADC file has not been pre-processed for Fixed Allele IDs. The first 7 rows are placeholder info from DArT processing.")

#Read the madc file
filtered_df <- read.csv(madc_file, sep = ',', skip = 7, check.names = FALSE)

#Remove extra text after Ref and Alt (_001 or _002)
#filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID)
#filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID)

} else {

#Read the madc file
filtered_df <- read.csv(madc_file, sep = ',', check.names = FALSE)

#Remove extra text after Ref and Alt (_001 or _002)
#filtered_df$AlleleID <- sub("\\|Ref_.*", "|Ref", filtered_df$AlleleID)
#filtered_df$AlleleID <- sub("\\|Alt_.*", "|Alt", filtered_df$AlleleID)

}
#Check for extra columns
#Save the three columns for later adding to the output
saved_columns <- filtered_df[,1:3]

if (!is.null(n.summary.columns)) {
#Remove the first n.summary.columns columns
filtered_df <- filtered_df[,-c(4:n.summary.columns)]
}else{
rm.col <- c("ClusterConsensusSequence",
"CallRate", "OneRatioRef", "OneRatioSnp", "FreqHomRef", "FreqHomSnp",
"FreqHets", "PICRef", "PICSnp", "AvgPIC", "AvgCountRef", "AvgCountSnp","RatioAvgCountRefAvgCountSnp")

filtered_df <- filtered_df[, !(colnames(filtered_df) %in% rm.col)]
}

#Now add rownames
rownames(filtered_df) <- saved_columns[,1]

#Remove refmatch and altmatch if wanted
if (target.only) {
message("Retaining target markers only")
#Retain only the Ref and Alt haplotypes
filtered_df <- filtered_df[!grepl("\\|AltMatch|\\|RefMatch", filtered_df$AlleleID), ]
}

## Filtering

#Min mean reads
if (!is.null(min.mean.reads)) {
message("Filtering for minimum mean reads across all samples")
#Get the mean value for each row, and remove the rows below that threshold
filtered_df$MeanReads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE)
filtered_df <- filtered_df[filtered_df$MeanReads >= min.mean.reads, ]
#Remove the MeanReads column
filtered_df <- filtered_df[, -which(colnames(filtered_df) == "MeanReads")]
}

#Max mean reads
if (!is.null(max.mean.reads)) {
message("Filtering for maximum mean reads across all samples")
#Get the mean value for each row, and remove the rows above that threshold
filtered_df$MeanReads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE)
filtered_df <- filtered_df[filtered_df$MeanReads <= max.mean.reads, ]
#Remove the MeanReads column
filtered_df <- filtered_df[, -which(colnames(filtered_df) == "MeanReads")]
}

#Max mhaps per loci
if (!is.null(max.mhaps.per.loci)) {
message("Filtering for maximum number of matching mhaps per target loci")
#Get the number of matching mhaps for loci, and remove the mhaps at those loci that exceed the max number
mhap_counts <- filtered_df %>%
group_by(CloneID) %>%
summarise(Count = n(), .groups = 'drop') %>%
filter(Count > max.mhaps.per.loci)

patterns_to_search <- "\\|AltMatch|\\|RefMatch"
clone_ids_to_target <- mhap_counts$CloneID

filtered_df <- filtered_df %>%
filter(
!( # "keep rows that DO NOT match both conditions"
CloneID %in% clone_ids_to_target & # Condition 1: CloneID is one of the targeted IDs
grepl(patterns_to_search, AlleleID) # Condition 2: AlleleID contains one of the patterns
)
)
}

#Min individuals with reads
if (!is.null(min.ind.with.reads)) {
message("Filtering for minimum number of individuals with reads per site")
message(paste0("Minimum number of individuals with reads per site: ", min.ind.with.reads))
message(paste0("Minimum number of reads per site: ", min.reads.per.site))

#Getting colnames
cols_to_check <- colnames(filtered_df)[-(1:3)]

filtered_df <- filtered_df %>%
rowwise() %>% # Process data row by row
mutate(
# For each row, count how many of the 'cols_to_check' meet the criterion
qualifying_sites_count = sum(
c_across(all_of(cols_to_check)) >= min.reads.per.site,
na.rm = TRUE # Treats NAs in data as not meeting the criterion
)
) %>%
ungroup() %>% # Always ungroup after rowwise operations
# Filter rows where this count meets the 'min.ind.with.reads' threshold
filter(qualifying_sites_count >= min.ind.with.reads) %>%
# Optionally, remove the temporary count column if it's no longer needed
select(-qualifying_sites_count)
}

#Plots
#if (plot.summary) {
# message("Plotting summary statistics")
# #Plot mean read depth
# mean_reads <- rowMeans(filtered_df[, -c(1:3)], na.rm = TRUE)
# hist(mean_reads, main = "Mean Read Depth", xlab = "Mean Reads", ylab = "Frequency")

# #Plot number of Altmatch and Refmatch mhaps per target loci
# altmatch_counts <- filtered_df %>%
# filter(grepl("\\|AltMatch", AlleleID)) %>%
# group_by(CloneID) %>%
# summarise(Count = n(), .groups = 'drop')

# refmatch_counts <- filtered_df %>%
# filter(grepl("\\|RefMatch", AlleleID)) %>%
# group_by(CloneID) %>%
# summarise(Count = n(), .groups = 'drop')

# barplot(cbind(altmatch_counts$Count, refmatch_counts$Count), beside = TRUE,
# names.arg = altmatch_counts$CloneID, main = "Number of AltMatch and RefMatch Mhaps",
# xlab = "Clone ID", ylab = "Count")

#Plot density of number of CloneID per site on a marker distribution plot

#}

#Save the output to disk if file name provided
if (!is.null(output.file)) {
message("Saving filtered data to file")
write.csv(filtered_df, paste0(output.file,".csv"), row.names = FALSE)
} else {
message("No output file provided. Returning filtered data.")
return(filtered_df)
}

}
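A minimal usage sketch for the new function, assuming the example MADC file shipped with the package (the same file used in the roxygen example above); the threshold values are illustrative, not recommendations:

library(BIGr)

#Use the packaged example MADC file (pre-processed with fixed allele IDs)
madc_file <- system.file("example_MADC_FixedAlleleID.csv", package = "BIGr")

#Keep mhaps with a mean depth of at least 2 reads, at most 3 mhaps per target locus,
#and at least 2 individuals with 1+ reads; return the data.frame instead of writing a csv
filtered_df <- filterMADC(madc_file,
                          min.mean.reads = 2,
                          max.mean.reads = NULL,
                          max.mhaps.per.loci = 3,
                          min.reads.per.site = 1,
                          min.ind.with.reads = 2,
                          target.only = FALSE,
                          n.summary.columns = NULL,
                          output.file = NULL)

head(filtered_df)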