From 1b761b92586c5eca1521b52cb7e745edb7e173dc Mon Sep 17 00:00:00 2001 From: josuechinchilla Date: Tue, 4 Nov 2025 16:06:12 -0500 Subject: [PATCH 1/2] updated check_ped to save corrected dataframe and report --- .DS_Store | Bin 0 -> 8196 bytes BIGr.Rproj | 1 + DESCRIPTION | 2 +- R/check_ped.R | 267 +++++++++++++++++++++++++++++--------------------- 4 files changed, 157 insertions(+), 113 deletions(-) create mode 100644 .DS_Store diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..3b9d8426aa5a73e270c1816e6de3b764b9deb853 GIT binary patch literal 8196 zcmeHMU2GLa6rOKeV3!TB)B*)rduxSaxrVktrJ``TZ6iM|f48MS{M5a77rJ4)Tkh_? zr8d?2qyaVYMdSY)Q6EsFDEjJuVtgP-l$eMy@rf6W;Dd?r%+6k*Eq&Aw*hyy2oHOUl z^ql?9%$+G?4C#W|$XFF)Os2}Irc!g0!pC`?R-{N$P84L%{Nb$QWr;hR9iFBg3*rdG z5r`uYM<9+s9D!RQ0(55cBF}K{i_y4^BM?X6j*I}`9}-kKO+|D<&~WRZCcFX=O19Ce0V?}Tsw(I&v=O>yYtb#ENM*eYHl z$}-tu;myU8H_BX9ou~Ko_Vw>SFraA#-`ZLfdX=Gu}aB>AwwzfD<#(rFA=klK8?Dl*+usyf0H?WQee2h;tX{fi~_wqq&I9abv z^9!vCzQ(R$ZtU-~@;*^TIX!<#^?j=zXnOGBRQtBhJ?9r_3oF!0b#YJ8_U)`=b&uF) zFxY4KIm5REEQFQnW?AXYc|$|wj4xzqy-n7>mJjiTI}AVA zHEg-1KPjzmmF0ea(8yc6MhJ2xiw;>1KO@PxNxNwWUOtS*&Kh0s<5O)hP0J*wmsKsT z&@+0steqnp&3216ftYNCHR-399fU6aRu`tH9sRqGUmc$X|hnmhBSek8T zhuAP1WhdE->`nF|JHtL=U$F1k1@;sBjs4F4U{}~x_BYBg7nN9!6{y21+>a)##dirPV^%O6NhmG4hksZI7aaVp2jmcfs=Rvui#a@jyG@$@8UhYk5BO#KF62%3K#G* zF5wqk#uZ$}Uy>qKNXsQ%x<{&$R!Q~JYH5?SS=uV?k@}?pX;63|l@jgXWY45g;-hVY z5-oc6lTP0}(W)Qax^4T8j$1a-ew@!!>#oX0iHDhe5O>?=J2$Y&vWgXd(~=f7A1^eSJn|bo03JatCQ8$nnXD& z*oMYxRV$+mR6^VJNn#N(Ef1-xR!+Gn#y081vWQ8y(lk7?`G+2X={FCXD}s zGRRnh1XdzRnBIgG;kpg&gzMediwwGO5CbrfC6qgGF^pr#V+`YX5)*ieF#ar_!}E9v zFB8sRBcz|gTX-Aq-~*h&$2f~mLd1WI?{N{oPQh^JWDMia>liL(@~-2#M@Uja{AA_Z zk}DHlRq^|O`^@kEx25l~Q*i|12#5%vye-p~rv6NOAHQoSs6IxO7p^xUXlO#sq=x|F kul>W2`Uz6yHmQhC2uczv|NBG0^dH*s`5&MEcij2^FAa5Kvj6}9 literal 0 HcmV?d00001 diff --git a/BIGr.Rproj b/BIGr.Rproj index 69fafd4..5638e2e 100644 --- a/BIGr.Rproj +++ b/BIGr.Rproj @@ -1,4 +1,5 @@ Version: 1.0 +ProjectId: 0eeaab63-2615-4da7-b10a-927160fc78a3 RestoreWorkspace: No SaveWorkspace: No diff --git a/DESCRIPTION b/DESCRIPTION index 31ad1e1..cba831e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,7 +44,7 @@ URL: https://github.com/Breeding-Insight/BIGr BugReports: https://github.com/Breeding-Insight/BIGr/issues Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Depends: R (>= 4.4.0) biocViews: Imports: diff --git a/R/check_ped.R b/R/check_ped.R index 3f4831b..35b0ba9 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,99 +1,142 @@ -#' Evaluate Pedigree File for Accuracy +#' Check a pedigree file for accuracy and report/correct common errors #' -#' Check a pedigree file for accuracy and output suspected errors +#' `check_ped` reads a 3-column pedigree file (tab-separated, columns labeled `id`, `sire`, `dam` in any order) +#' and performs quality checks, optionally correcting or flagging errors. #' -#'check_ped takes a 3-column pedigree tab separated file with columns labeled as id sire dam in any order and checks for: -#'* Ids that appear more than once in the id column -#'* Ids that appear in both sire and dam columns -#'* Direct (e.g. parent is a offspring of his own daughter) and indirect (e.g. a great grandparent is son of its grandchild) dependencies within the pedigree. -#'* Individuals included in the pedigree as sire or dam but not on the id column and reports them back with unknown parents (0). +#' The function checks for: +#' * Exact duplicate rows and removes them (keeping one copy) +#' * IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") +#' * IDs that appear in both sire and dam columns +#' * Missing parents (IDs referenced as sire/dam but not in `id` column), adds them with sire/dam = "0" +#' * Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant #' -#'When using check_ped, do a first run to check for repeated ids and parents that appear as sire and dam. -#'Once these errors are cleaned run the function again to check for dependencies as this will provide the most accurate report. +#' After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. #' -#'Note: This function does not change the input file but prints any errors found in the console. +#' The function does **not** overwrite the input file. Instead, it prints findings to the console and optionally saves: +#' * Corrected pedigree as a dataframe in the global environment +#' * A report listing all detected issues +#' +#' @param ped.file Path to the pedigree text file. +#' @param seed Optional seed for reproducibility. +#' @param verbose Logical. If TRUE (default), prints errors and prompts for interactive saving. +#' +#' @return A list of data.frames containing detected issues: +#' * `exact_duplicates`: rows that were exact duplicates +#' * `repeated_ids_diff`: IDs appearing more than once with conflicting sire/dam +#' * `messy_parents`: IDs appearing as both sire and dam +#' * `missing_parents`: parents added to the pedigree with 0 as sire/dam +#' * `dependencies`: detected cycles in the pedigree #' -#' @param ped.file path to pedigree text file. The pedigree file is a -#' 3-column pedigree tab separated file with columns labeled as id sire dam in any order -#' @param seed Optional seed for reproducibility -#' @param verbose Logical. If TRUE, print the errors to the console. -#' @return A list of data.frames of error types, and the output printed to the console #' @examples -#' ##Get list with a dataframe for each error type -#' ped_file <- system.file("check_ped_test.txt", package="BIGr") -#' ped_errors <- check_ped(ped.file = ped_file, -#' seed = 101919) +#' ped_file <- system.file("check_ped_test.txt", package = "BIGr") +#' ped_errors <- check_ped(ped.file = ped_file, seed = 101919) #' -#' ##Access the "messy parents" dataframe result +#' # Access messy parents #' ped_errors$messy_parents #' -#' ##Get list of sample IDs with messy parents error -#' messy_parent_ids <- ped_errors$messy_parents$id -#' print(messy_parent_ids) +#' # IDs with messy parents +#' messy_ids <- ped_errors$messy_parents$id +#' print(messy_ids) +#' #' @import dplyr #' @import janitor #' @importFrom stats setNames #' @importFrom utils read.table #' @export check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { - #### Function to check for hierarchical errors missing parents and repeated ids #### - if(!is.null(seed)){ - set.seed(seed) - } - #### read in data #### - data = utils::read.table(ped.file, header = T) - data <- data %>% + + #### setup #### + if (!is.null(seed)) set.seed(seed) + + # Read and clean data + data <- utils::read.table(ped.file, header = TRUE) %>% janitor::clean_names() %>% mutate( id = as.character(id), sire = as.character(sire), dam = as.character(dam) - ) - #Missing parents dataframe initialize - missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) + ) + + original_data <- data errors <- list() - # repeated id checks - n_occur <- data.frame(table(data$id)) - repeated_ids = n_occur[n_occur$Freq > 1,] %>% - rename(id = Var1) - # Check for ids that appear as both sire and dam ###This is possible for plants so maybe do not control for this or do not delete these rows just print them - messy_parents <- as.data.frame(intersect(data$sire, data$dam)) %>% - rename(id = 1) %>% - filter(id != 0) - # Missing parents check + missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) + + #### check 1: exact duplicates #### + exact_duplicates <- data[duplicated(data), ] + if (nrow(exact_duplicates) > 0) { + data <- distinct(data) # remove exact duplicates + } + + #### check 2: repeated IDs with conflicting sire/dam #### + repeated_ids <- data %>% + group_by(id) %>% + filter(n() > 1) %>% + ungroup() + + # Only IDs with actual conflicting sire/dam + conflicting_ids <- repeated_ids %>% + group_by(id) %>% + filter(n_distinct(sire) > 1 | n_distinct(dam) > 1) %>% + ungroup() + + if (nrow(conflicting_ids) > 0) { + # Keep one row per ID, set sire/dam to "0" + data <- data %>% + group_by(id) %>% + summarize( + sire = if(n_distinct(sire) > 1) "0" else first(sire), + dam = if(n_distinct(dam) > 1) "0" else first(dam), + .groups = "drop" + ) + } + + repeated_ids_report <- conflicting_ids + + #### check 3: missing parents #### for (i in 1:nrow(data)) { id <- data$id[i] sire <- data$sire[i] dam <- data$dam[i] + if (sire != "0" && sire != id && !sire %in% data$id) { missing_parents <- rbind(missing_parents, data.frame(id = sire, sire = "0", dam = "0", stringsAsFactors = FALSE)) } if (dam != "0" && dam != id && !dam %in% data$id) { missing_parents <- rbind(missing_parents, data.frame(id = dam, sire = "0", dam = "0", stringsAsFactors = FALSE)) } + if (sire == id || dam == id) { errors <- append(errors, paste("Dependency: Individual", id, "cannot be its own parent")) } } - # Remove duplicates + missing_parents <- distinct(missing_parents) - # Combine original data with missing parents - corrected_data <- bind_rows(data, missing_parents) - # Function to detect cycles in the pedigree graph and identify the nodes involved + if (nrow(missing_parents) > 0) { + data <- bind_rows(data, missing_parents) + } + + #### check 4: messy parents #### + sire_ids <- unique(data$sire[data$sire != "0"]) + dam_ids <- unique(data$dam[data$dam != "0"]) + messy_ids <- intersect(sire_ids, dam_ids) + messy_parents <- data %>% filter(id %in% messy_ids) + + #### check 5: dependencies (cycles) #### detect_all_cycles <- function(data) { - # Create an adjacency list - adj_list <- list() - for (i in 1:nrow(data)) { - adj_list[[data$id[i]]] <- c(data$sire[i], data$dam[i]) - } - # Helper function to perform DFS and detect cycles + adj_list <- lapply(data$id, function(x) { + row <- data[data$id == x, ] + c(row$sire, row$dam) + }) + names(adj_list) <- data$id + dfs <- function(node, visited, rec_stack, path) { visited[node] <- TRUE rec_stack[node] <- TRUE path <- append(path, node) cycles <- list() + for (neighbor in adj_list[[node]]) { - if (neighbor != "0") { + if (neighbor %in% names(adj_list)) { if (!visited[neighbor]) { cycles <- append(cycles, dfs(neighbor, visited, rec_stack, path)) } else if (rec_stack[neighbor]) { @@ -102,14 +145,15 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } } } + rec_stack[node] <- FALSE return(cycles) } - # Initialize visited and recursion stack + visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() - # Check for cycles in the graph and return the nodes involved + for (node in names(adj_list)) { if (!visited[node]) { node_cycles <- dfs(node, visited, rec_stack, character()) @@ -120,75 +164,74 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } return(all_cycles) } - # Check for cycles in the corrected pedigree data - cycles <- detect_all_cycles(corrected_data) + + cycles <- detect_all_cycles(data) if (length(cycles) > 0) { - cycle_number <- 1 for (cycle_group in cycles) { cycle_ids <- unique(unlist(cycle_group)) errors <- append(errors, paste("Cycle detected involving nodes:", paste(cycle_ids, collapse = " -> "))) } } - results <- list(missing_parents = missing_parents, dependencies = data.frame(Dependency = unlist(errors)), repeated_ids = repeated_ids, messy_parents = messy_parents) - repeated_ids <- results$repeated_ids - missing_parents <- results$missing_parents - messy_parents <- results$messy_parents - errors <- results$dependencies - # Adding the dataframes as an output list - output.results <- list() - #### Print errors and cycles #### - # Print repeated ids if any - if (nrow(repeated_ids) > 0) { - if (verbose) { - cat("Repeated ids found:\n") - message(repeated_ids) - } - output.results$repeated_ids <- repeated_ids - } else { - if (verbose) { - cat("No repeated ids found.\n") - } - } - #Print parents that appear as male and female - if (nrow(messy_parents) > 0) { - if (verbose) { - cat("Ids found as male and female parent:\n") - message(messy_parents) - } - output.results$messy_parents <- messy_parents + #### compile findings #### + input_ped_report <- list( + exact_duplicates = exact_duplicates, + repeated_ids_diff = repeated_ids_report, + messy_parents = messy_parents, + missing_parents = missing_parents, + dependencies = data.frame(Dependency = unique(unlist(errors))) + ) - } else { - if (verbose) { - cat("No ids found as male and female parent.\n") - } - } - # Print missing parents if any - if (nrow(missing_parents) > 0) { - if (verbose) { - cat("Missing parents found:\n") - message(missing_parents) - } - output.results$missing_parents <- missing_parents + #### file names #### + file_base <- tools::file_path_sans_ext(basename(ped.file)) + corrected_name <- paste0(file_base, "_corrected") + report_name <- paste0(file_base, "_report") - } else { - if (verbose) { - cat("No missing parents found.\n") - } - } - # Print errors if any - if (nrow(errors) > 0) { - if (verbose) { - cat("Dependencies found:\n") - message(unique(errors$Dependency)) + #### output #### + if (verbose) { + cat("\n=== Pedigree Quality Check Report ===\n") + + if (nrow(exact_duplicates) > 0) { + cat("\nExact duplicate rows detected and removed (only one copy kept):\n") + print(exact_duplicates) + } else cat("\nNo exact duplicate rows found.\n") + + if (nrow(repeated_ids_report) > 0) { + cat("\nRepeated IDs with conflicting sire/dam (sire/dam set to 0 in corrected pedigree):\n") + print(repeated_ids_report) + } else cat("\nNo conflicting repeated IDs found.\n") + + if (nrow(messy_parents) > 0) { + cat("\nIDs found as both sire and dam:\n") + print(messy_parents) + } else cat("\nNo IDs found as both sire and dam.\n") + + if (nrow(missing_parents) > 0) { + cat("\nMissing parents were added to the pedigree with 0 as sire/dam:\n") + print(missing_parents) + } else cat("\nNo missing parents found.\n") + + if (nrow(input_ped_report$dependencies) > 0) { + cat("\nDependencies detected:\n") + print(input_ped_report$dependencies) + } else cat("\nNo dependencies detected.\n") + + #### interactive save #### + cat(paste0("\nDo you want to save the corrected pedigree as dataframe `", corrected_name, "`? (y/n): ")) + ans <- tolower(trimws(readline())) + if (ans == "y") { + assign(corrected_name, data, envir = .GlobalEnv) + assign("input_ped_report", input_ped_report, envir = .GlobalEnv) + cat(paste0("Saved corrected pedigree as `", corrected_name, "` and report as `input_ped_report`.\n")) + } else { + cat("No corrected pedigree was saved.\n") } - output.results$dependencies <- data.frame(Dependency = unlist(errors)) } else { - if (verbose) { - cat("No dependencies found.\n") - } + # Silent automatic mode + assign(corrected_name, data, envir = .GlobalEnv) + assign(report_name, input_ped_report, envir = .GlobalEnv) } - return(results) + invisible(input_ped_report) } From 743043a0753f35def5d52c441d077f07800adb1d Mon Sep 17 00:00:00 2001 From: josuechinchilla Date: Tue, 4 Nov 2025 16:23:14 -0500 Subject: [PATCH 2/2] reorganized report and fixed language --- R/check_ped.R | 21 +++++++++--------- man/check_ped.Rd | 56 ++++++++++++++++++++++++++++-------------------- 2 files changed, 44 insertions(+), 33 deletions(-) diff --git a/R/check_ped.R b/R/check_ped.R index 35b0ba9..a2277bc 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -169,7 +169,7 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { if (length(cycles) > 0) { for (cycle_group in cycles) { cycle_ids <- unique(unlist(cycle_group)) - errors <- append(errors, paste("Cycle detected involving nodes:", paste(cycle_ids, collapse = " -> "))) + errors <- append(errors, paste("Cycle detected involving IDs:", paste(cycle_ids, collapse = " -> "))) } } @@ -192,24 +192,25 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { cat("\n=== Pedigree Quality Check Report ===\n") if (nrow(exact_duplicates) > 0) { - cat("\nExact duplicate rows detected and removed (only one copy kept):\n") + cat("\n Exact duplicate trios detected (only one copy will be kept in corrected pedigree):\n") print(exact_duplicates) - } else cat("\nNo exact duplicate rows found.\n") + } else cat("\nNo exact duplicate trios found.\n") if (nrow(repeated_ids_report) > 0) { - cat("\nRepeated IDs with conflicting sire/dam (sire/dam set to 0 in corrected pedigree):\n") + cat("\nConflicting trios detected (sire/dam set to 0 in corrected pedigree):\n") print(repeated_ids_report) - } else cat("\nNo conflicting repeated IDs found.\n") + } else cat("\nNo conflicting repeated trios found.\n") + + if (nrow(missing_parents) > 0) { + cat("\n Parents missing as IDs found in the pedigree (will be added as founders in corrected pedigree):\n") + print(missing_parents) + } else cat("\nNo missing parents found.\n") if (nrow(messy_parents) > 0) { - cat("\nIDs found as both sire and dam:\n") + cat("\n IDs found as both sire and dam (is selfing or hermaphrodytism possible?):\n") print(messy_parents) } else cat("\nNo IDs found as both sire and dam.\n") - if (nrow(missing_parents) > 0) { - cat("\nMissing parents were added to the pedigree with 0 as sire/dam:\n") - print(missing_parents) - } else cat("\nNo missing parents found.\n") if (nrow(input_ped_report$dependencies) > 0) { cat("\nDependencies detected:\n") diff --git a/man/check_ped.Rd b/man/check_ped.Rd index 693bfe0..ea63de7 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -2,48 +2,58 @@ % Please edit documentation in R/check_ped.R \name{check_ped} \alias{check_ped} -\title{Evaluate Pedigree File for Accuracy} +\title{Check a pedigree file for accuracy and report/correct common errors} \usage{ check_ped(ped.file, seed = NULL, verbose = TRUE) } \arguments{ -\item{ped.file}{path to pedigree text file. The pedigree file is a -3-column pedigree tab separated file with columns labeled as id sire dam in any order} +\item{ped.file}{Path to the pedigree text file.} -\item{seed}{Optional seed for reproducibility} +\item{seed}{Optional seed for reproducibility.} -\item{verbose}{Logical. If TRUE, print the errors to the console.} +\item{verbose}{Logical. If TRUE (default), prints errors and prompts for interactive saving.} } \value{ -A list of data.frames of error types, and the output printed to the console +A list of data.frames containing detected issues: +\itemize{ +\item \code{exact_duplicates}: rows that were exact duplicates +\item \code{repeated_ids_diff}: IDs appearing more than once with conflicting sire/dam +\item \code{messy_parents}: IDs appearing as both sire and dam +\item \code{missing_parents}: parents added to the pedigree with 0 as sire/dam +\item \code{dependencies}: detected cycles in the pedigree +} } \description{ -Check a pedigree file for accuracy and output suspected errors +\code{check_ped} reads a 3-column pedigree file (tab-separated, columns labeled \code{id}, \code{sire}, \code{dam} in any order) +and performs quality checks, optionally correcting or flagging errors. } \details{ -check_ped takes a 3-column pedigree tab separated file with columns labeled as id sire dam in any order and checks for: +The function checks for: \itemize{ -\item Ids that appear more than once in the id column -\item Ids that appear in both sire and dam columns -\item Direct (e.g. parent is a offspring of his own daughter) and indirect (e.g. a great grandparent is son of its grandchild) dependencies within the pedigree. -\item Individuals included in the pedigree as sire or dam but not on the id column and reports them back with unknown parents (0). +\item Exact duplicate rows and removes them (keeping one copy) +\item IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") +\item IDs that appear in both sire and dam columns +\item Missing parents (IDs referenced as sire/dam but not in \code{id} column), adds them with sire/dam = "0" +\item Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant } -When using check_ped, do a first run to check for repeated ids and parents that appear as sire and dam. -Once these errors are cleaned run the function again to check for dependencies as this will provide the most accurate report. +After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. -Note: This function does not change the input file but prints any errors found in the console. +The function does \strong{not} overwrite the input file. Instead, it prints findings to the console and optionally saves: +\itemize{ +\item Corrected pedigree as a dataframe in the global environment +\item A report listing all detected issues +} } \examples{ -##Get list with a dataframe for each error type -ped_file <- system.file("check_ped_test.txt", package="BIGr") -ped_errors <- check_ped(ped.file = ped_file, - seed = 101919) +ped_file <- system.file("check_ped_test.txt", package = "BIGr") +ped_errors <- check_ped(ped.file = ped_file, seed = 101919) -##Access the "messy parents" dataframe result +# Access messy parents ped_errors$messy_parents -##Get list of sample IDs with messy parents error -messy_parent_ids <- ped_errors$messy_parents$id -print(messy_parent_ids) +# IDs with messy parents +messy_ids <- ped_errors$messy_parents$id +print(messy_ids) + }