Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .github/scripts/check_style.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# CI style check: exits with status 1 when any package file deviates from the
# project's styler configuration (tidyverse style, 4-space indentation).
# styler runs in dry mode ("on"), so no file on disk is rewritten.
files <- styler::style_pkg(
    transformers = styler::tidyverse_style(indent_by = 4),
    dry = "on"
)

# NOTE(review): styler appears to report `changed = NA` for files it failed to
# process (e.g. parse errors) -- confirm against the installed styler version.
# Handling NA explicitly avoids a cryptic "missing value where TRUE/FALSE
# needed" failure from `if (any(...))`.
unprocessed <- files$file[is.na(files$changed)]
unstyled <- files$file[which(files$changed)]

if (length(unprocessed) > 0) {
    message("styler could not process the following files:")
    message(paste0("  ", unprocessed, collapse = "\n"))
}

if (length(unstyled) > 0) {
    # Actually list the offending files, as the message promises.
    message("The following files need styling:")
    message(paste0("  ", unstyled, collapse = "\n"))
    message("Please run:")
    message(" styler::style_pkg(transformers = styler::tidyverse_style(indent_by = 4))")
}

if (length(unprocessed) > 0 || length(unstyled) > 0) {
    quit(status = 1)
}
7 changes: 7 additions & 0 deletions .github/workflows/dry-run-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@ jobs:
steps:
- name: Checkout repository
uses: actions/checkout@v4
- name: Setup R for linting
uses: r-lib/actions/setup-r@v2
- name: Check style
run: |
Rscript -e "install.packages('styler')"
Rscript -e "install.packages('roxygen2')"
Rscript .github/scripts/check_style.R
- name: Setup R and Bioconductor
uses: grimbough/bioc-actions/setup-bioc@v1
with:
Expand Down
227 changes: 114 additions & 113 deletions R/annotateProteinInfoFromIndra.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,11 @@
#'
#' This function annotates a data frame with protein information from Indra.
#'
#' @param df output of \code{\link[MSstats]{groupComparison}} function's
#' comparisonResult table, which contains a list of proteins and their
#' corresponding p-values, logFCs, along with additional HGNC ID and HGNC
#' name columns
#' @param proteinIdType A character string specifying the type of protein ID.
#' It can be either "Uniprot", "Uniprot_Mnemonic", or "Hgnc_Name".
#' @return A data frame with the following columns:
#' \describe{
Expand All @@ -24,14 +24,14 @@
#' head(annotated_df)
#' @export
annotateProteinInfoFromIndra <- function(df, proteinIdType) {
    # Fail fast when the required 'Protein' column is missing.
    .validateAnnotateProteinInfoFromIndraInput(df)

    # Order matters: each step reads a column written by an earlier one
    # (UniprotId -> HgncId -> HgncName -> the three Is* classification flags).
    df <- .populateUniprotIdsInDataFrame(df, proteinIdType)
    df <- .populateHgncIdsInDataFrame(df, proteinIdType)
    df <- .populateHgncNamesInDataFrame(df)
    df <- .populateTranscriptionFactorInfoInDataFrame(df)
    df <- .populateKinaseInfoInDataFrame(df)
    df <- .populatePhophataseInfoInDataFrame(df)
    return(df)
}

#' Validate Annotate Protein Info Input
Expand All @@ -41,86 +41,87 @@ annotateProteinInfoFromIndra <- function(df, proteinIdType) {
#' @param df A data frame containing protein information.
#' @return None. Throws an error if validation fails.
.validateAnnotateProteinInfoFromIndraInput <- function(df) {
    # Single precondition: the input must carry a 'Protein' column.
    if (!"Protein" %in% colnames(df)) {
        stop("Input dataframe must contain 'Protein' column.")
    }
}

#' Populate Uniprot IDs in Data Frame
#'
#' This function populates the Uniprot IDs in the data frame based on the protein ID type.
#'
#' @param df A data frame containing protein information.
#' @param proteinIdType A character string specifying the type of protein ID.
#' It can be either "Uniprot", "Uniprot_Mnemonic", or "Hgnc_Name" (for
#' "Hgnc_Name" the UniprotId column is left as NA).
#' @return A data frame with populated Uniprot IDs.
.populateUniprotIdsInDataFrame <- function(df, proteinIdType) {
    # Derive the protein-level identifier unless the caller already supplied
    # a GlobalProtein column.
    if (!"GlobalProtein" %in% colnames(df)) {
        df$Protein <- as.character(df$Protein)
        # Drop a trailing "_<uppercase letter><digit>..." suffix. Because the
        # pattern ends in ".*", one substitution removes everything from the
        # first match onward; strings without a match pass through unchanged.
        df$GlobalProtein <- sub("_[A-Z][0-9].*", "", df$Protein, perl = TRUE)
    }
    protein_ids <- unique(as.character(df$GlobalProtein))

    # Default for "Hgnc_Name" (and any other type): no Uniprot ID is known.
    df$UniprotId <- NA

    if (proteinIdType == "Uniprot") {
        # The protein identifier already is the Uniprot ID.
        df$UniprotId <- as.character(df$GlobalProtein)
    } else if (proteinIdType == "Uniprot_Mnemonic") {
        if (length(protein_ids) > 0) {
            uniprotMapping <- .callGetUniprotIdsFromUniprotMnemonicIdsApi(as.list(protein_ids))
            for (mnemonicId in names(uniprotMapping)) {
                mappedId <- uniprotMapping[[mnemonicId]]
                # NULL entries are skipped, leaving those rows as NA.
                if (!is.null(mappedId)) {
                    # %in% never yields NA, so rows with a missing
                    # GlobalProtein are simply not selected.
                    df$UniprotId[df$GlobalProtein %in% mnemonicId] <- mappedId
                }
            }
        }
    }
    return(df)
}

#' Populate HGNC IDs in Data Frame
#'
#' This function populates the HGNC IDs in the data frame based on the Uniprot IDs.
#'
#' @param df A data frame containing protein information.
#' @param proteinIdType A character string specifying the type of protein ID.
#' It can be either "Uniprot", "Uniprot_Mnemonic", or "Hgnc_Name".
#' @return A data frame with populated HGNC IDs.
.populateHgncIdsInDataFrame <- function(df, proteinIdType) {
    df$HgncId <- NA
    if (proteinIdType == "Uniprot" || proteinIdType == "Uniprot_Mnemonic") {
        # Map via the Uniprot IDs written by the previous annotation step;
        # rows without a Uniprot ID are never sent to the API.
        validUniprots <- unique(df$UniprotId[!is.na(df$UniprotId)])
        if (length(validUniprots) > 0) {
            hgncMapping <- .callGetHgncIdsFromUniprotIdsApi(as.list(validUniprots))
            for (uniprotId in names(hgncMapping)) {
                mappedId <- hgncMapping[[uniprotId]]
                if (!is.null(mappedId)) {
                    # %in% instead of == so NA Uniprot IDs cannot put NA
                    # into the assignment mask.
                    df$HgncId[df$UniprotId %in% uniprotId] <- mappedId
                }
            }
        }
    } else {
        # Any other ID type: treat GlobalProtein as a gene name and resolve
        # it through the Gilda helper.
        hgncNames <- unique(df$GlobalProtein)
        if (length(hgncNames) > 0) {
            hgncMapping <- .callGetHgncIdsFromGildaApi(as.list(hgncNames))
            for (hgncName in names(hgncMapping)) {
                mappedId <- hgncMapping[[hgncName]]
                if (!is.null(mappedId)) {
                    df$HgncId[df$GlobalProtein %in% hgncName] <- mappedId
                }
            }
        }
    }

    return(df)
}

#' Populate HGNC Names in Data Frame
Expand All @@ -130,18 +131,18 @@ annotateProteinInfoFromIndra <- function(df, proteinIdType) {
#' @param df A data frame containing protein information.
#' @return A data frame with populated HGNC names.
.populateHgncNamesInDataFrame <- function(df) {
    df$HgncName <- NA
    # Only rows with a resolved HGNC ID are looked up; the API helper is not
    # called at all when none exist.
    validHgncs <- unique(df$HgncId[!is.na(df$HgncId)])
    if (length(validHgncs) > 0) {
        nameMapping <- .callGetHgncNamesFromHgncIdsApi(as.list(validHgncs))
        for (hgncId in names(nameMapping)) {
            mappedName <- nameMapping[[hgncId]]
            # NULL entries are skipped, leaving those rows as NA.
            if (!is.null(mappedName)) {
                # %in% keeps rows with a missing HgncId out of the mask.
                df$HgncName[df$HgncId %in% hgncId] <- mappedName
            }
        }
    }
    return(df)
}

#' Populate Transcription Factor Info in Data Frame
Expand All @@ -151,19 +152,19 @@ annotateProteinInfoFromIndra <- function(df, proteinIdType) {
#' @param df A data frame containing protein information.
#' @return A data frame with populated transcription factor information.
.populateTranscriptionFactorInfoInDataFrame <- function(df) {
    df$IsTranscriptionFactor <- NA
    # Query only the distinct, non-missing HGNC names; skip the API helper
    # entirely when there are none.
    validNames <- unique(df$HgncName[!is.na(df$HgncName)])
    if (length(validNames) > 0) {
        # NOTE(review): the mapping is presumably name -> logical flag;
        # confirm against .callIsTranscriptionFactorApi.
        charMapping <- .callIsTranscriptionFactorApi(as.list(validNames))
        for (hgncName in names(charMapping)) {
            flag <- charMapping[[hgncName]]
            # NULL entries are skipped, leaving those rows as NA.
            if (!is.null(flag)) {
                # %in% keeps rows with a missing HgncName out of the mask.
                df$IsTranscriptionFactor[df$HgncName %in% hgncName] <- flag
            }
        }
    }
    return(df)
}

#' Populate Kinase Info in Data Frame
Expand All @@ -173,19 +174,19 @@ annotateProteinInfoFromIndra <- function(df, proteinIdType) {
#' @param df A data frame containing protein information.
#' @return A data frame with populated kinase information.
.populateKinaseInfoInDataFrame <- function(df) {
    df$IsKinase <- NA
    # Query only the distinct, non-missing HGNC names; skip the API helper
    # entirely when there are none.
    validNames <- unique(df$HgncName[!is.na(df$HgncName)])
    if (length(validNames) > 0) {
        # NOTE(review): the mapping is presumably name -> logical flag;
        # confirm against .callIsKinaseApi.
        charMapping <- .callIsKinaseApi(as.list(validNames))
        for (hgncName in names(charMapping)) {
            flag <- charMapping[[hgncName]]
            # NULL entries are skipped, leaving those rows as NA.
            if (!is.null(flag)) {
                # %in% keeps rows with a missing HgncName out of the mask.
                df$IsKinase[df$HgncName %in% hgncName] <- flag
            }
        }
    }
    return(df)
}

#' Populate Phosphatase Info in Data Frame
Expand All @@ -195,17 +196,17 @@ annotateProteinInfoFromIndra <- function(df, proteinIdType) {
#' @param df A data frame containing protein information.
#' @return A data frame with populated phosphatase information.
.populatePhophataseInfoInDataFrame <- function(df) {
    df$IsPhosphatase <- NA
    # Query only the distinct, non-missing HGNC names; skip the API helper
    # entirely when there are none.
    validNames <- unique(df$HgncName[!is.na(df$HgncName)])
    if (length(validNames) > 0) {
        # NOTE(review): the mapping is presumably name -> logical flag;
        # confirm against .callIsPhosphataseApi.
        charMapping <- .callIsPhosphataseApi(as.list(validNames))
        for (hgncName in names(charMapping)) {
            flag <- charMapping[[hgncName]]
            # NULL entries are skipped, leaving those rows as NA.
            if (!is.null(flag)) {
                # %in% keeps rows with a missing HgncName out of the mask.
                df$IsPhosphatase[df$HgncName %in% hgncName] <- flag
            }
        }
    }
    return(df)
}
Loading