diff --git a/NEWS.md b/NEWS.md index c561e732..6737cbd6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,9 +1,10 @@ -# BGmisc beta 1.5.2 +# BGmisc 1.5.2 * More flexible ID generation for simulatePedigree * Created ped2gen function to extract generation information from pedigree data.frames * Added tests for ped2gen * Fixed handling of character ID variables leading to a warning in ped2fam * Added famIDs to phantom parents +* Tweaked how sex coding is handled to allow for unknown sex # BGmisc 1.5.1 ## CRAN submission diff --git a/R/checkParents.R b/R/checkParents.R index 5ee927f8..060553c3 100644 --- a/R/checkParents.R +++ b/R/checkParents.R @@ -12,11 +12,12 @@ #' @param repairsex A logical flag indicating whether to attempt repairs on sex of the parents #' @param addphantoms A logical flag indicating whether to add phantom parents for missing parent IDs. #' @param parentswithoutrow A logical flag indicating whether to add parents without a row in the pedigree. -#' @param famID Character. Column name for family IDs. -#' @param personID Character. Column name for individual IDs. +#' @param famID Character. Column name for family IDs. +#' @param personID Character. Column name for individual IDs. #' @param momID Character. Column name for maternal IDs. #' @param dadID Character. Column name for paternal IDs. -#' +#' @param code_male The code value used to represent male sex in the 'sex' column of \code{ped}. +#' @param code_female The code value used to represent female sex in the 'sex' column of \code{ped}. #' #' @return Depending on the value of `repair`, either a list containing validation results or a repaired dataframe is returned. #' @examples @@ -32,7 +33,10 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, famID = "famID", personID = "ID", momID = "momID", - dadID = "dadID") { + dadID = "dadID", + code_male = NULL, + code_female = NULL + ) { # Standardize column names in the input dataframe ped <- standardizeColnames(ped, verbose = verbose) @@ -87,18 +91,28 @@ checkParentIDs <- function(ped, verbose = FALSE, repair = FALSE, cat("Step 2: Determining the if moms are the same sex and dads are same sex\n") } # Determine modal sex values for moms and dads + + + mom_results <- checkParentSex(ped, parent_col = "momID", verbose = verbose) dad_results <- checkParentSex(ped, parent_col = "dadID", verbose = verbose) validation_results$mom_sex <- mom_results$unique_sexes validation_results$dad_sex <- dad_results$unique_sexes - validation_results$female_var <- mom_results$modal_sex - validation_results$male_var <- dad_results$modal_sex + validation_results$wrong_sex_moms <- mom_results$inconsistent_parents validation_results$wrong_sex_dads <- dad_results$inconsistent_parents validation_results$female_moms <- mom_results$all_same_sex validation_results$male_dads <- dad_results$all_same_sex - + if (!is.null(code_male) && !is.null(code_female)) { + validation_results$male_var <- code_male + validation_results$female_var <- code_female + validation_results$sex_code_source <- "user_provided_codes" + } else { + validation_results$female_var <- mom_results$modal_sex + validation_results$male_var <- dad_results$modal_sex + validation_results$sex_code_source <- "modal_parent_sex" + } # Are any parents in both momID and dadID? momdad <- intersect(ped$dadID, ped$momID) if (length(momdad) > 0 && !is.na(momdad)) { diff --git a/R/checkSex.R b/R/checkSex.R index d7b3f8ab..9eb8181e 100644 --- a/R/checkSex.R +++ b/R/checkSex.R @@ -17,13 +17,14 @@ #' #' @details This function uses the terms 'male' and 'female' in a biological context, referring to chromosomal and other biologically-based characteristics necessary for constructing genetic pedigrees. The biological aspect of sex used in genetic analysis (genotype) is distinct from the broader, richer concept of gender identity (phenotype). #' -#' We recognize the importance of using language and methodologies that affirm and respect the full spectrum of gender identities. +#' We recognize the importance of using language and methodologies that affirm and respect the full spectrum of gender identities. #' The developers of this package express unequivocal support for folx in the transgender #' and LGBTQ+ communities. #' #' @param ped A dataframe representing the pedigree data with a 'sex' column. #' @param code_male The current code used to represent males in the 'sex' column. #' @param code_female The current code used to represent females in the 'sex' column. If both are NULL, no recoding is performed. +#' @param code_unknown The current code used to represent unknown or ambiguous sex in the 'sex' column. Can be NA to indicate that missing values should be treated as unknown. If NULL and both code_male and code_female are provided, values not matching either will be inferred as unknown. #' @param verbose A logical flag indicating whether to print progress and validation messages to the console. #' @param repair A logical flag indicating whether to attempt repairs on the sex coding. #' @param momID The column name for maternal IDs. Default is "momID". @@ -37,7 +38,10 @@ #' } #' @export #' -checkSex <- function(ped, code_male = NULL, code_female = NULL, verbose = FALSE, repair = FALSE, +checkSex <- function(ped, code_male = NULL, + code_female = NULL, + code_unknown = NULL, + verbose = FALSE, repair = FALSE, momID = "momID", dadID = "dadID") { # Standardize column names in the input dataframe @@ -61,7 +65,6 @@ checkSex <- function(ped, code_male = NULL, code_female = NULL, verbose = FALSE, } - # Are there multiple sexes/genders in the list of dads and moms? dad_results <- checkParentSex(ped, parent_col = dadID, verbose = verbose) @@ -92,7 +95,11 @@ checkSex <- function(ped, code_male = NULL, code_female = NULL, verbose = FALSE, if (validation_results$sex_length == 2) { # Recode all dads to the most frequent male value - ped <- recodeSex(ped, code_male = validation_results$most_frequent_sex_dad) + ped <- recodeSex(ped, + code_male = validation_results$most_frequent_sex_dad, + code_female = validation_results$most_frequent_sex_mom, + code_unknown = code_unknown + ) # Count and record the change num_changes <- sum(original_ped$sex != ped$sex) # Record the change and the count @@ -128,8 +135,16 @@ checkSex <- function(ped, code_male = NULL, code_female = NULL, verbose = FALSE, #' @export #' #' @seealso \code{\link{checkSex}} -repairSex <- function(ped, verbose = FALSE, code_male = NULL, code_female = NULL) { - checkSex(ped = ped, verbose = verbose, repair = TRUE, code_male = code_male, code_female = code_female) +repairSex <- function(ped, verbose = FALSE, + code_male = NULL, + code_female = NULL, + code_unknown = NULL) { + checkSex( + ped = ped, verbose = verbose, repair = TRUE, + code_male = code_male, + code_female = code_female, + code_unknown = code_unknown + ) } #' Recodes Sex Variable in a Pedigree Dataframe @@ -142,51 +157,76 @@ repairSex <- function(ped, verbose = FALSE, code_male = NULL, code_female = NULL #' @param recode_na The value to use for missing values. Default is NA_character_ #' @param recode_male The value to use for males. Default is "M" #' @param recode_female The value to use for females. Default is "F" +#' @param recode_unknown The value to use for unknown values. Default is "U" #' @inherit checkSex details #' @return A modified version of the input data.frame \code{ped}, containing an additional or modified 'sex_recode' column where the 'sex' values are recoded according to \code{code_male}. NA values in the 'sex' column are preserved. #' @export recodeSex <- function( - ped, verbose = FALSE, code_male = NULL, code_na = NULL, code_female = NULL, - recode_male = "M", recode_female = "F", recode_na = NA_character_) { + ped, verbose = FALSE, code_male = NULL, code_na = NULL, code_female = NULL, + code_unknown = NULL, + recode_male = "M", + recode_female = "F", + recode_unknown = "U", + recode_na = NA_character_ +) { + if (is.null(code_male) && is.null(code_female)) { + if (verbose == TRUE) { + warning("Both code male and code female are empty. No recoding was done.") + } + return(ped) + } + # First, set any code_na values to NA if (!is.null(code_na)) { ped$sex[ped$sex == code_na] <- NA } - # Recode as "F" or "M" based on code_male, preserving NAs - if (!is.null(code_male) && !is.null(code_female)) { - # Initialize sex_recode as NA, preserving the length of the 'sex' column - ped$sex_recode <- recode_na - ped$sex_recode[ped$sex == code_female] <- recode_female - ped$sex_recode[ped$sex == code_male] <- recode_male - # Overwriting temp recode variable - ped$sex <- ped$sex_recode - ped$sex_recode <- NULL - } else if (!is.null(code_male) && is.null(code_female)) { - # Initialize sex_recode as NA, preserving the length of the 'sex' column - ped$sex_recode <- recode_na - ped$sex_recode[ped$sex != code_male & !is.na(ped$sex)] <- recode_female + + # Initialize sex_recode as NA, preserving the length of the 'sex' column + ped$sex_recode <- recode_na + + + if (!is.null(code_male)) { ped$sex_recode[ped$sex == code_male] <- recode_male - # Overwriting temp recode variable - ped$sex <- ped$sex_recode - ped$sex_recode <- NULL - } else if (is.null(code_male) && !is.null(code_female)) { - # Initialize sex_recode as NA, preserving the length of the 'sex' column - ped$sex_recode <- recode_na - ped$sex_recode[ped$sex != code_female & !is.na(ped$sex)] <- recode_male + } + if (!is.null(code_female)) { ped$sex_recode[ped$sex == code_female] <- recode_female - # Overwriting temp recode variable - ped$sex <- ped$sex_recode - ped$sex_recode <- NULL - } else { - if (verbose == TRUE) { - warning("Both code male and code female are empty. No recoding was done.") + } + + # handle unknown codes + if (!is.null(code_unknown) && !is.na(code_unknown)) { + ped$sex_recode[ped$sex == code_unknown] <- recode_unknown + } else if (!is.null(code_unknown) && is.na(code_unknown)) { + ped$sex_recode[is.na(ped$sex)] <- recode_unknown + } else if (!is.null(code_male) && !is.null(code_female)) { + ped$sex_recode[!ped$sex %in% c(code_male, code_female) & !is.na(ped$sex)] <- recode_unknown + } + + + # Handle cases where only one of code + # just male + if (!is.null(code_male) && is.null(code_female)) { + if (!is.null(code_unknown)) { + ped$sex_recode[ped$sex != code_male & !is.na(ped$sex) & ped$sex != code_unknown] <- recode_female + } else if (is.null(code_unknown)) { + ped$sex_recode[ped$sex != code_male & !is.na(ped$sex)] <- recode_female } } + # just female + if (is.null(code_male) && !is.null(code_female)) { + if (!is.null(code_unknown)) { + ped$sex_recode[ped$sex != code_female & !is.na(ped$sex) & ped$sex != code_unknown] <- recode_male + } else if (is.null(code_unknown)) { + ped$sex_recode[ped$sex != code_female & !is.na(ped$sex)] <- recode_male + } + } + + # Overwriting temp recode variable + ped$sex <- ped$sex_recode + ped$sex_recode <- NULL return(ped) } - #' Check Parental Role Sex Consistency #' #' Validates sex coding consistency for a given parental role (momID or dadID). @@ -225,9 +265,15 @@ checkParentSex <- function(ped, parent_col, sex_col = "sex", verbose = FALSE) { # Store the most frequent sex for moms and dads modal_sex <- names(sort(table(parent_sexes), decreasing = TRUE))[1] + if (all(is.na(modal_sex)) && verbose == TRUE) { + cat(paste0("All parents in role ", parent_col, " have missing sex values.\n")) + } + # Type coercion based on ped$sex type if (is.numeric(ped[[sex_col]])) { modal_sex <- as.numeric(modal_sex) + } else if (is.character(ped[[sex_col]])) { + modal_sex <- as.character(modal_sex) } # List ids for dads that are female, moms that are male diff --git a/man/checkParentIDs.Rd b/man/checkParentIDs.Rd index d7386b10..c175f4e8 100644 --- a/man/checkParentIDs.Rd +++ b/man/checkParentIDs.Rd @@ -14,7 +14,9 @@ checkParentIDs( famID = "famID", personID = "ID", momID = "momID", - dadID = "dadID" + dadID = "dadID", + code_male = NULL, + code_female = NULL ) } \arguments{ @@ -37,6 +39,10 @@ checkParentIDs( \item{momID}{Character. Column name for maternal IDs.} \item{dadID}{Character. Column name for paternal IDs.} + +\item{code_male}{The code value used to represent male sex in the 'sex' column of \code{ped}.} + +\item{code_female}{The code value used to represent female sex in the 'sex' column of \code{ped}.} } \value{ Depending on the value of `repair`, either a list containing validation results or a repaired dataframe is returned. diff --git a/man/checkSex.Rd b/man/checkSex.Rd index 866ee531..fb1cfe24 100644 --- a/man/checkSex.Rd +++ b/man/checkSex.Rd @@ -8,6 +8,7 @@ checkSex( ped, code_male = NULL, code_female = NULL, + code_unknown = NULL, verbose = FALSE, repair = FALSE, momID = "momID", @@ -21,6 +22,8 @@ checkSex( \item{code_female}{The current code used to represent females in the 'sex' column. If both are NULL, no recoding is performed.} +\item{code_unknown}{The current code used to represent unknown or ambiguous sex in the 'sex' column. Can be NA to indicate that missing values should be treated as unknown. If NULL and both code_male and code_female are provided, values not matching either will be inferred as unknown.} + \item{verbose}{A logical flag indicating whether to print progress and validation messages to the console.} \item{repair}{A logical flag indicating whether to attempt repairs on the sex coding.} @@ -50,7 +53,7 @@ If `repair = TRUE`, the function standardizes sex coding by: This function uses the terms 'male' and 'female' in a biological context, referring to chromosomal and other biologically-based characteristics necessary for constructing genetic pedigrees. The biological aspect of sex used in genetic analysis (genotype) is distinct from the broader, richer concept of gender identity (phenotype). -We recognize the importance of using language and methodologies that affirm and respect the full spectrum of gender identities. +We recognize the importance of using language and methodologies that affirm and respect the full spectrum of gender identities. The developers of this package express unequivocal support for folx in the transgender and LGBTQ+ communities. } diff --git a/man/recodeSex.Rd b/man/recodeSex.Rd index 076ffa10..3da597bf 100644 --- a/man/recodeSex.Rd +++ b/man/recodeSex.Rd @@ -10,8 +10,10 @@ recodeSex( code_male = NULL, code_na = NULL, code_female = NULL, + code_unknown = NULL, recode_male = "M", recode_female = "F", + recode_unknown = "U", recode_na = NA_character_ ) } @@ -26,10 +28,14 @@ recodeSex( \item{code_female}{The current code used to represent females in the 'sex' column. If both are NULL, no recoding is performed.} +\item{code_unknown}{The current code used to represent unknown or ambiguous sex in the 'sex' column. Can be NA to indicate that missing values should be treated as unknown. If NULL and both code_male and code_female are provided, values not matching either will be inferred as unknown.} + \item{recode_male}{The value to use for males. Default is "M"} \item{recode_female}{The value to use for females. Default is "F"} +\item{recode_unknown}{The value to use for unknown values. Default is "U"} + \item{recode_na}{The value to use for missing values. Default is NA_character_} } \value{ @@ -51,7 +57,7 @@ If `repair = TRUE`, the function standardizes sex coding by: This function uses the terms 'male' and 'female' in a biological context, referring to chromosomal and other biologically-based characteristics necessary for constructing genetic pedigrees. The biological aspect of sex used in genetic analysis (genotype) is distinct from the broader, richer concept of gender identity (phenotype). -We recognize the importance of using language and methodologies that affirm and respect the full spectrum of gender identities. +We recognize the importance of using language and methodologies that affirm and respect the full spectrum of gender identities. The developers of this package express unequivocal support for folx in the transgender and LGBTQ+ communities. } diff --git a/man/repairSex.Rd b/man/repairSex.Rd index 5c8e1ae0..2f79b2b6 100644 --- a/man/repairSex.Rd +++ b/man/repairSex.Rd @@ -4,7 +4,13 @@ \alias{repairSex} \title{Repairs Sex Coding in a Pedigree Dataframe} \usage{ -repairSex(ped, verbose = FALSE, code_male = NULL, code_female = NULL) +repairSex( + ped, + verbose = FALSE, + code_male = NULL, + code_female = NULL, + code_unknown = NULL +) } \arguments{ \item{ped}{A dataframe representing the pedigree data with a 'sex' column.} @@ -14,6 +20,8 @@ repairSex(ped, verbose = FALSE, code_male = NULL, code_female = NULL) \item{code_male}{The current code used to represent males in the 'sex' column.} \item{code_female}{The current code used to represent females in the 'sex' column. If both are NULL, no recoding is performed.} + +\item{code_unknown}{The current code used to represent unknown or ambiguous sex in the 'sex' column. Can be NA to indicate that missing values should be treated as unknown. If NULL and both code_male and code_female are provided, values not matching either will be inferred as unknown.} } \value{ A modified version of the input data.frame \code{ped}, containing an additional or modified 'sex_recode' column where the 'sex' values are recoded according to \code{code_male}. NA values in the 'sex' column are preserved. @@ -34,7 +42,7 @@ If `repair = TRUE`, the function standardizes sex coding by: This function uses the terms 'male' and 'female' in a biological context, referring to chromosomal and other biologically-based characteristics necessary for constructing genetic pedigrees. The biological aspect of sex used in genetic analysis (genotype) is distinct from the broader, richer concept of gender identity (phenotype). -We recognize the importance of using language and methodologies that affirm and respect the full spectrum of gender identities. +We recognize the importance of using language and methodologies that affirm and respect the full spectrum of gender identities. The developers of this package express unequivocal support for folx in the transgender and LGBTQ+ communities. } diff --git a/tests/testthat/test-checkSex.R b/tests/testthat/test-checkSex.R index 335aee62..427d37b4 100644 --- a/tests/testthat/test-checkSex.R +++ b/tests/testthat/test-checkSex.R @@ -86,3 +86,185 @@ test_that("Functions handle missing values gracefully", { expect_silent(repairSex(ped_with_na, verbose = FALSE, code_male = "M")) expect_silent(recodeSex(ped_with_na, verbose = FALSE, code_male = "M", code_female = "F")) }) + + +# Test Case 5: Handle code_unknown parameter with explicit value +test_that("recodeSex handles code_unknown parameter when explicitly provided", { + # Create pedigree with unknown sex codes + ped <- data.frame( + ID = c(1, 2, 3, 4, 5, 6), + sex = c("M", "F", "M", "F", "U", "U"), + dadID = c(NA, NA, 1, 1, NA, NA), + momID = c(NA, NA, 2, 2, NA, NA) + ) + + # Test with code_unknown = "U" + recoded_ped <- recodeSex(ped, + code_male = "M", + code_female = "F", + code_unknown = "U", + recode_male = "Male", + recode_female = "Female", + recode_unknown = "Unknown" + ) + + # Check that unknown codes are recoded correctly + expect_equal(recoded_ped$sex[5], "Unknown") + expect_equal(recoded_ped$sex[6], "Unknown") + expect_equal(recoded_ped$sex[1], "Male") + expect_equal(recoded_ped$sex[2], "Female") +}) + + +# Test Case 6: Handle code_unknown when it's NA +test_that("recodeSex handles code_unknown = NA correctly", { + # Create pedigree where NA represents unknown sex + ped <- data.frame( + ID = c(1, 2, 3, 4, 5), + sex = c("M", "F", "M", "F", NA), + dadID = c(NA, NA, 1, 1, NA), + momID = c(NA, NA, 2, 2, NA) + ) + + # Test with code_unknown = NA + recoded_ped <- recodeSex(ped, + code_male = "M", + code_female = "F", + code_unknown = NA, + recode_male = "Male", + recode_female = "Female", + recode_unknown = "Unknown" + ) + + # Check that NA values are recoded to "Unknown" + expect_equal(recoded_ped$sex[5], "Unknown") + expect_equal(recoded_ped$sex[1], "Male") + expect_equal(recoded_ped$sex[2], "Female") +}) + + +# Test Case 7: Infer unknown values from data when code_unknown not provided +test_that("recodeSex infers unknown values when code_unknown is not provided", { + # Create pedigree with values that are neither male nor female + ped <- data.frame( + ID = c(1, 2, 3, 4, 5, 6), + sex = c("M", "F", "M", "F", "X", "?"), + dadID = c(NA, NA, 1, 1, NA, NA), + momID = c(NA, NA, 2, 2, NA, NA) + ) + + # Test without code_unknown - should infer "X" and "?" as unknown + recoded_ped <- recodeSex(ped, + code_male = "M", + code_female = "F", + recode_male = "Male", + recode_female = "Female", + recode_unknown = "Unknown" + ) + + # Check that values not in code_male/code_female are recoded to unknown + expect_equal(recoded_ped$sex[5], "Unknown") + expect_equal(recoded_ped$sex[6], "Unknown") + expect_equal(recoded_ped$sex[1], "Male") + expect_equal(recoded_ped$sex[2], "Female") +}) + + +# Test Case 8: Test recode_unknown parameter variations +test_that("recodeSex respects recode_unknown parameter", { + ped <- data.frame( + ID = c(1, 2, 3, 4, 5), + sex = c("M", "F", "M", "F", "U"), + dadID = c(NA, NA, 1, 1, NA), + momID = c(NA, NA, 2, 2, NA) + ) + + # Test with custom recode_unknown value + recoded_ped <- recodeSex(ped, + code_male = "M", + code_female = "F", + code_unknown = "U", + recode_male = "1", + recode_female = "0", + recode_unknown = "9" + ) + + expect_equal(recoded_ped$sex[5], "9") + expect_equal(recoded_ped$sex[1], "1") + expect_equal(recoded_ped$sex[2], "0") +}) + + +# Test Case 9: Test code_unknown with only code_male provided +test_that("recodeSex handles code_unknown with only code_male", { + ped <- data.frame( + ID = c(1, 2, 3, 4), + sex = c("M", "F", "M", "U"), + dadID = c(NA, NA, 1, NA), + momID = c(NA, NA, 2, NA) + ) + + # Test with only code_male and code_unknown + recoded_ped <- recodeSex(ped, + code_male = "M", + code_unknown = "U", + recode_male = "Male", + recode_female = "Female", + recode_unknown = "Unknown" + ) + + # Check recoding: M->Male, F->Female (inferred), U->Unknown + expect_equal(recoded_ped$sex[1], "Male") + expect_equal(recoded_ped$sex[2], "Female") + expect_equal(recoded_ped$sex[4], "Unknown") +}) + + +# Test Case 10: Test code_unknown with only code_female provided +test_that("recodeSex handles code_unknown with only code_female", { + ped <- data.frame( + ID = c(1, 2, 3, 4), + sex = c("M", "F", "F", "U"), + dadID = c(NA, NA, NA, NA), + momID = c(NA, NA, NA, NA) + ) + + # Test with only code_female and code_unknown + recoded_ped <- recodeSex(ped, + code_female = "F", + code_unknown = "U", + recode_male = "Male", + recode_female = "Female", + recode_unknown = "Unknown" + ) + + # Check recoding: F->Female, M->Male (inferred), U->Unknown + expect_equal(recoded_ped$sex[1], "Male") + expect_equal(recoded_ped$sex[2], "Female") + expect_equal(recoded_ped$sex[4], "Unknown") +}) + + +# Test Case 11: Test numeric codes with code_unknown +test_that("recodeSex handles numeric code_unknown values", { + ped <- data.frame( + ID = c(1, 2, 3, 4, 5), + sex = c(1, 0, 1, 0, 9), + dadID = c(NA, NA, 1, 1, NA), + momID = c(NA, NA, 2, 2, NA) + ) + + # Test with numeric codes + recoded_ped <- recodeSex(ped, + code_male = 1, + code_female = 0, + code_unknown = 9, + recode_male = "M", + recode_female = "F", + recode_unknown = "U" + ) + + expect_equal(recoded_ped$sex[5], "U") + expect_equal(recoded_ped$sex[1], "M") + expect_equal(recoded_ped$sex[2], "F") +})