diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000..3b9d842 Binary files /dev/null and b/.DS_Store differ diff --git a/BIGr.Rproj b/BIGr.Rproj index 69fafd4..5638e2e 100644 --- a/BIGr.Rproj +++ b/BIGr.Rproj @@ -1,4 +1,5 @@ Version: 1.0 +ProjectId: 0eeaab63-2615-4da7-b10a-927160fc78a3 RestoreWorkspace: No SaveWorkspace: No diff --git a/DESCRIPTION b/DESCRIPTION index 31ad1e1..cba831e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -44,7 +44,7 @@ URL: https://github.com/Breeding-Insight/BIGr BugReports: https://github.com/Breeding-Insight/BIGr/issues Encoding: UTF-8 Roxygen: list(markdown = TRUE) -RoxygenNote: 7.3.2 +RoxygenNote: 7.3.3 Depends: R (>= 4.4.0) biocViews: Imports: diff --git a/R/check_ped.R b/R/check_ped.R index 3f4831b..a2277bc 100644 --- a/R/check_ped.R +++ b/R/check_ped.R @@ -1,99 +1,142 @@ -#' Evaluate Pedigree File for Accuracy +#' Check a pedigree file for accuracy and report/correct common errors #' -#' Check a pedigree file for accuracy and output suspected errors +#' `check_ped` reads a 3-column pedigree file (tab-separated, columns labeled `id`, `sire`, `dam` in any order) +#' and performs quality checks, optionally correcting or flagging errors. #' -#'check_ped takes a 3-column pedigree tab separated file with columns labeled as id sire dam in any order and checks for: -#'* Ids that appear more than once in the id column -#'* Ids that appear in both sire and dam columns -#'* Direct (e.g. parent is a offspring of his own daughter) and indirect (e.g. a great grandparent is son of its grandchild) dependencies within the pedigree. -#'* Individuals included in the pedigree as sire or dam but not on the id column and reports them back with unknown parents (0). +#' The function checks for: +#' * Exact duplicate rows and removes them (keeping one copy) +#' * IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") +#' * IDs that appear in both sire and dam columns +#' * Missing parents (IDs referenced as sire/dam but not in `id` column), adds them with sire/dam = "0" +#' * Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant #' -#'When using check_ped, do a first run to check for repeated ids and parents that appear as sire and dam. -#'Once these errors are cleaned run the function again to check for dependencies as this will provide the most accurate report. +#' After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. #' -#'Note: This function does not change the input file but prints any errors found in the console. +#' The function does **not** overwrite the input file. Instead, it prints findings to the console and optionally saves: +#' * Corrected pedigree as a dataframe in the global environment +#' * A report listing all detected issues +#' +#' @param ped.file Path to the pedigree text file. +#' @param seed Optional seed for reproducibility. +#' @param verbose Logical. If TRUE (default), prints errors and prompts for interactive saving. +#' +#' @return A list of data.frames containing detected issues: +#' * `exact_duplicates`: rows that were exact duplicates +#' * `repeated_ids_diff`: IDs appearing more than once with conflicting sire/dam +#' * `messy_parents`: IDs appearing as both sire and dam +#' * `missing_parents`: parents added to the pedigree with 0 as sire/dam +#' * `dependencies`: detected cycles in the pedigree #' -#' @param ped.file path to pedigree text file. The pedigree file is a -#' 3-column pedigree tab separated file with columns labeled as id sire dam in any order -#' @param seed Optional seed for reproducibility -#' @param verbose Logical. If TRUE, print the errors to the console. -#' @return A list of data.frames of error types, and the output printed to the console #' @examples -#' ##Get list with a dataframe for each error type -#' ped_file <- system.file("check_ped_test.txt", package="BIGr") -#' ped_errors <- check_ped(ped.file = ped_file, -#' seed = 101919) +#' ped_file <- system.file("check_ped_test.txt", package = "BIGr") +#' ped_errors <- check_ped(ped.file = ped_file, seed = 101919) #' -#' ##Access the "messy parents" dataframe result +#' # Access messy parents #' ped_errors$messy_parents #' -#' ##Get list of sample IDs with messy parents error -#' messy_parent_ids <- ped_errors$messy_parents$id -#' print(messy_parent_ids) +#' # IDs with messy parents +#' messy_ids <- ped_errors$messy_parents$id +#' print(messy_ids) +#' #' @import dplyr #' @import janitor #' @importFrom stats setNames #' @importFrom utils read.table #' @export check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { - #### Function to check for hierarchical errors missing parents and repeated ids #### - if(!is.null(seed)){ - set.seed(seed) - } - #### read in data #### - data = utils::read.table(ped.file, header = T) - data <- data %>% + + #### setup #### + if (!is.null(seed)) set.seed(seed) + + # Read and clean data + data <- utils::read.table(ped.file, header = TRUE) %>% janitor::clean_names() %>% mutate( id = as.character(id), sire = as.character(sire), dam = as.character(dam) - ) - #Missing parents dataframe initialize - missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) + ) + + original_data <- data errors <- list() - # repeated id checks - n_occur <- data.frame(table(data$id)) - repeated_ids = n_occur[n_occur$Freq > 1,] %>% - rename(id = Var1) - # Check for ids that appear as both sire and dam ###This is possible for plants so maybe do not control for this or do not delete these rows just print them - messy_parents <- as.data.frame(intersect(data$sire, data$dam)) %>% - rename(id = 1) %>% - filter(id != 0) - # Missing parents check + missing_parents <- data.frame(id = character(), sire = character(), dam = character(), stringsAsFactors = FALSE) + + #### check 1: exact duplicates #### + exact_duplicates <- data[duplicated(data), ] + if (nrow(exact_duplicates) > 0) { + data <- distinct(data) # remove exact duplicates + } + + #### check 2: repeated IDs with conflicting sire/dam #### + repeated_ids <- data %>% + group_by(id) %>% + filter(n() > 1) %>% + ungroup() + + # Only IDs with actual conflicting sire/dam + conflicting_ids <- repeated_ids %>% + group_by(id) %>% + filter(n_distinct(sire) > 1 | n_distinct(dam) > 1) %>% + ungroup() + + if (nrow(conflicting_ids) > 0) { + # Keep one row per ID, set sire/dam to "0" + data <- data %>% + group_by(id) %>% + summarize( + sire = if(n_distinct(sire) > 1) "0" else first(sire), + dam = if(n_distinct(dam) > 1) "0" else first(dam), + .groups = "drop" + ) + } + + repeated_ids_report <- conflicting_ids + + #### check 3: missing parents #### for (i in 1:nrow(data)) { id <- data$id[i] sire <- data$sire[i] dam <- data$dam[i] + if (sire != "0" && sire != id && !sire %in% data$id) { missing_parents <- rbind(missing_parents, data.frame(id = sire, sire = "0", dam = "0", stringsAsFactors = FALSE)) } if (dam != "0" && dam != id && !dam %in% data$id) { missing_parents <- rbind(missing_parents, data.frame(id = dam, sire = "0", dam = "0", stringsAsFactors = FALSE)) } + if (sire == id || dam == id) { errors <- append(errors, paste("Dependency: Individual", id, "cannot be its own parent")) } } - # Remove duplicates + missing_parents <- distinct(missing_parents) - # Combine original data with missing parents - corrected_data <- bind_rows(data, missing_parents) - # Function to detect cycles in the pedigree graph and identify the nodes involved + if (nrow(missing_parents) > 0) { + data <- bind_rows(data, missing_parents) + } + + #### check 4: messy parents #### + sire_ids <- unique(data$sire[data$sire != "0"]) + dam_ids <- unique(data$dam[data$dam != "0"]) + messy_ids <- intersect(sire_ids, dam_ids) + messy_parents <- data %>% filter(id %in% messy_ids) + + #### check 5: dependencies (cycles) #### detect_all_cycles <- function(data) { - # Create an adjacency list - adj_list <- list() - for (i in 1:nrow(data)) { - adj_list[[data$id[i]]] <- c(data$sire[i], data$dam[i]) - } - # Helper function to perform DFS and detect cycles + adj_list <- lapply(data$id, function(x) { + row <- data[data$id == x, ] + c(row$sire, row$dam) + }) + names(adj_list) <- data$id + dfs <- function(node, visited, rec_stack, path) { visited[node] <- TRUE rec_stack[node] <- TRUE path <- append(path, node) cycles <- list() + for (neighbor in adj_list[[node]]) { - if (neighbor != "0") { + if (neighbor %in% names(adj_list)) { if (!visited[neighbor]) { cycles <- append(cycles, dfs(neighbor, visited, rec_stack, path)) } else if (rec_stack[neighbor]) { @@ -102,14 +145,15 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } } } + rec_stack[node] <- FALSE return(cycles) } - # Initialize visited and recursion stack + visited <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) rec_stack <- stats::setNames(rep(FALSE, length(adj_list)), names(adj_list)) all_cycles <- list() - # Check for cycles in the graph and return the nodes involved + for (node in names(adj_list)) { if (!visited[node]) { node_cycles <- dfs(node, visited, rec_stack, character()) @@ -120,75 +164,75 @@ check_ped <- function(ped.file, seed = NULL, verbose = TRUE) { } return(all_cycles) } - # Check for cycles in the corrected pedigree data - cycles <- detect_all_cycles(corrected_data) + + cycles <- detect_all_cycles(data) if (length(cycles) > 0) { - cycle_number <- 1 for (cycle_group in cycles) { cycle_ids <- unique(unlist(cycle_group)) - errors <- append(errors, paste("Cycle detected involving nodes:", paste(cycle_ids, collapse = " -> "))) + errors <- append(errors, paste("Cycle detected involving IDs:", paste(cycle_ids, collapse = " -> "))) } } - results <- list(missing_parents = missing_parents, dependencies = data.frame(Dependency = unlist(errors)), repeated_ids = repeated_ids, messy_parents = messy_parents) - repeated_ids <- results$repeated_ids - missing_parents <- results$missing_parents - messy_parents <- results$messy_parents - errors <- results$dependencies - # Adding the dataframes as an output list - output.results <- list() - #### Print errors and cycles #### - # Print repeated ids if any - if (nrow(repeated_ids) > 0) { - if (verbose) { - cat("Repeated ids found:\n") - message(repeated_ids) - } - output.results$repeated_ids <- repeated_ids - } else { - if (verbose) { - cat("No repeated ids found.\n") - } - } - #Print parents that appear as male and female - if (nrow(messy_parents) > 0) { - if (verbose) { - cat("Ids found as male and female parent:\n") - message(messy_parents) - } - output.results$messy_parents <- messy_parents + #### compile findings #### + input_ped_report <- list( + exact_duplicates = exact_duplicates, + repeated_ids_diff = repeated_ids_report, + messy_parents = messy_parents, + missing_parents = missing_parents, + dependencies = data.frame(Dependency = unique(unlist(errors))) + ) - } else { - if (verbose) { - cat("No ids found as male and female parent.\n") - } - } - # Print missing parents if any - if (nrow(missing_parents) > 0) { - if (verbose) { - cat("Missing parents found:\n") - message(missing_parents) - } - output.results$missing_parents <- missing_parents + #### file names #### + file_base <- tools::file_path_sans_ext(basename(ped.file)) + corrected_name <- paste0(file_base, "_corrected") + report_name <- paste0(file_base, "_report") - } else { - if (verbose) { - cat("No missing parents found.\n") - } - } - # Print errors if any - if (nrow(errors) > 0) { - if (verbose) { - cat("Dependencies found:\n") - message(unique(errors$Dependency)) + #### output #### + if (verbose) { + cat("\n=== Pedigree Quality Check Report ===\n") + + if (nrow(exact_duplicates) > 0) { + cat("\n Exact duplicate trios detected (only one copy will be kept in corrected pedigree):\n") + print(exact_duplicates) + } else cat("\nNo exact duplicate trios found.\n") + + if (nrow(repeated_ids_report) > 0) { + cat("\nConflicting trios detected (sire/dam set to 0 in corrected pedigree):\n") + print(repeated_ids_report) + } else cat("\nNo conflicting repeated trios found.\n") + + if (nrow(missing_parents) > 0) { + cat("\n Parents missing as IDs found in the pedigree (will be added as founders in corrected pedigree):\n") + print(missing_parents) + } else cat("\nNo missing parents found.\n") + + if (nrow(messy_parents) > 0) { + cat("\n IDs found as both sire and dam (is selfing or hermaphrodytism possible?):\n") + print(messy_parents) + } else cat("\nNo IDs found as both sire and dam.\n") + + + if (nrow(input_ped_report$dependencies) > 0) { + cat("\nDependencies detected:\n") + print(input_ped_report$dependencies) + } else cat("\nNo dependencies detected.\n") + + #### interactive save #### + cat(paste0("\nDo you want to save the corrected pedigree as dataframe `", corrected_name, "`? (y/n): ")) + ans <- tolower(trimws(readline())) + if (ans == "y") { + assign(corrected_name, data, envir = .GlobalEnv) + assign("input_ped_report", input_ped_report, envir = .GlobalEnv) + cat(paste0("Saved corrected pedigree as `", corrected_name, "` and report as `input_ped_report`.\n")) + } else { + cat("No corrected pedigree was saved.\n") } - output.results$dependencies <- data.frame(Dependency = unlist(errors)) } else { - if (verbose) { - cat("No dependencies found.\n") - } + # Silent automatic mode + assign(corrected_name, data, envir = .GlobalEnv) + assign(report_name, input_ped_report, envir = .GlobalEnv) } - return(results) + invisible(input_ped_report) } diff --git a/man/check_ped.Rd b/man/check_ped.Rd index 693bfe0..ea63de7 100644 --- a/man/check_ped.Rd +++ b/man/check_ped.Rd @@ -2,48 +2,58 @@ % Please edit documentation in R/check_ped.R \name{check_ped} \alias{check_ped} -\title{Evaluate Pedigree File for Accuracy} +\title{Check a pedigree file for accuracy and report/correct common errors} \usage{ check_ped(ped.file, seed = NULL, verbose = TRUE) } \arguments{ -\item{ped.file}{path to pedigree text file. The pedigree file is a -3-column pedigree tab separated file with columns labeled as id sire dam in any order} +\item{ped.file}{Path to the pedigree text file.} -\item{seed}{Optional seed for reproducibility} +\item{seed}{Optional seed for reproducibility.} -\item{verbose}{Logical. If TRUE, print the errors to the console.} +\item{verbose}{Logical. If TRUE (default), prints errors and prompts for interactive saving.} } \value{ -A list of data.frames of error types, and the output printed to the console +A list of data.frames containing detected issues: +\itemize{ +\item \code{exact_duplicates}: rows that were exact duplicates +\item \code{repeated_ids_diff}: IDs appearing more than once with conflicting sire/dam +\item \code{messy_parents}: IDs appearing as both sire and dam +\item \code{missing_parents}: parents added to the pedigree with 0 as sire/dam +\item \code{dependencies}: detected cycles in the pedigree +} } \description{ -Check a pedigree file for accuracy and output suspected errors +\code{check_ped} reads a 3-column pedigree file (tab-separated, columns labeled \code{id}, \code{sire}, \code{dam} in any order) +and performs quality checks, optionally correcting or flagging errors. } \details{ -check_ped takes a 3-column pedigree tab separated file with columns labeled as id sire dam in any order and checks for: +The function checks for: \itemize{ -\item Ids that appear more than once in the id column -\item Ids that appear in both sire and dam columns -\item Direct (e.g. parent is a offspring of his own daughter) and indirect (e.g. a great grandparent is son of its grandchild) dependencies within the pedigree. -\item Individuals included in the pedigree as sire or dam but not on the id column and reports them back with unknown parents (0). +\item Exact duplicate rows and removes them (keeping one copy) +\item IDs that appear more than once with conflicting sire/dam assignments (sets sire/dam to "0") +\item IDs that appear in both sire and dam columns +\item Missing parents (IDs referenced as sire/dam but not in \code{id} column), adds them with sire/dam = "0" +\item Direct and indirect pedigree dependencies (cycles), such as a parent being its own descendant } -When using check_ped, do a first run to check for repeated ids and parents that appear as sire and dam. -Once these errors are cleaned run the function again to check for dependencies as this will provide the most accurate report. +After an initial run to clean exact duplicates and repeated IDs, you can run the function again to detect cycles more accurately. -Note: This function does not change the input file but prints any errors found in the console. +The function does \strong{not} overwrite the input file. Instead, it prints findings to the console and optionally saves: +\itemize{ +\item Corrected pedigree as a dataframe in the global environment +\item A report listing all detected issues +} } \examples{ -##Get list with a dataframe for each error type -ped_file <- system.file("check_ped_test.txt", package="BIGr") -ped_errors <- check_ped(ped.file = ped_file, - seed = 101919) +ped_file <- system.file("check_ped_test.txt", package = "BIGr") +ped_errors <- check_ped(ped.file = ped_file, seed = 101919) -##Access the "messy parents" dataframe result +# Access messy parents ped_errors$messy_parents -##Get list of sample IDs with messy parents error -messy_parent_ids <- ped_errors$messy_parents$id -print(messy_parent_ids) +# IDs with messy parents +messy_ids <- ped_errors$messy_parents$id +print(messy_ids) + }