diff --git a/DESCRIPTION b/DESCRIPTION index aa696c5..35fc5b7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -23,7 +23,7 @@ Authors@R: c( License: Artistic-2.0 Depends: R (>= 4.3) Imports: dplyr, gridExtra, stringr, stats, ggplot2, stringi, - grDevices, MSstatsTMT, MSstatsConvert, MSstats, + grDevices, MSstatsTMT, MSstatsConvert (>= 1.19.1), MSstats, data.table, Rcpp, Biostrings, checkmate, ggrepel, plotly, htmltools, rlang Suggests: @@ -32,6 +32,7 @@ Suggests: tinytest, covr, mockery, + arrow, testthat (>= 3.0.0) LazyData: true LinkingTo: Rcpp diff --git a/R/converters.R b/R/converters.R index 108cc41..294d03e 100644 --- a/R/converters.R +++ b/R/converters.R @@ -7,10 +7,8 @@ #' #' @importFrom data.table as.data.table #' @importFrom MSstatsConvert DIANNtoMSstatsFormat +#' @inheritParams MSstatsConvert::DIANNtoMSstatsFormat #' -#' @param input data.frame of `report.tsv` file produced by Philosopher -#' @param annotation annotation with Run, Fraction, TechRepMixture, Mixture, Channel, -#' BioReplicate, Condition columns or a path to file. Refer to the example 'annotation' for the meaning of each column. #' @param input_protein same as `input` for global profiling run. Default is NULL. #' @param annotation_protein same as `annotation` for global profiling run. Default is NULL. #' @param fasta_path A string of path to a FASTA file, used to match PTM peptides. @@ -22,25 +20,6 @@ #' in `protein_id_col`. The protein names in these two columns must match in #' order to join the FASTA file with the DIA-NN output. Default is "uniprot_ac" #' for uniprot ID. For uniprot mnemonic ID, use "entry_name" -#' @param global_qvalue_cutoff The global qvalue cutoff. Default is 0.01. -#' @param qvalue_cutoff local qvalue cutoff for library. Default is 0.01. -#' @param pg_qvalue_cutoff local qvalue cutoff for protein groups Run should be -#' the same as filename. Default is 0.01. 
-#' @param useUniquePeptide logical, if TRUE (default) removes peptides that are assigned for more than one proteins. -#' We assume to use unique peptide for each protein. -#' @param removeFewMeasurements TRUE (default) will remove the features that have 1 or 2 measurements within each Run. -#' @param removeOxidationMpeptides TRUE (default) will remove the peptides including oxidation (M) sequence. -#' @param removeProtein_with1Feature TRUE will remove the proteins which have only 1 peptide and charge. Defaut is FALSE. -#' @param MBR If analaysis was done with match between runs or not. Default is TRUE. -#' @param use_log_file logical. If TRUE, information about data processing will -#' be saved to a file. -#' @param append logical. If TRUE, information about data processing will be -#' added to an existing log file. -#' @param verbose logical. If TRUE, information about data processing wil be -#' printed to the console. -#' @param log_file_path character. Path to a file to which information about -#' data processing will be saved. If not provided, such a file will be created -#' automatically. If 'append = TRUE', has to be a valid path to a file. 
#' #' @return `list` of one or two `data.frame` of class `MSstatsTMT`, named `PTM` and `PROTEIN` #' @@ -68,6 +47,28 @@ #' #' head(msstatsptm_format$PTM) #' +#' # Example DIANN 2.0 +#' input = system.file("tinytest/raw_data/DIANN/diann_2_ptm.parquet", +#' package = "MSstatsPTM") +#' input = arrow::read_parquet(input) +#' annot = system.file("tinytest/raw_data/DIANN/annotation_diann_2.0_ptm.csv", +#' package = "MSstatsPTM") +#' annot = data.table::fread(annot) +#' fasta_path = system.file("extdata", "diann.fasta", +#' package="MSstatsPTM") +#' +#' msstatsptm_format = DIANNtoMSstatsPTMFormat( +#' input, +#' annot, +#' protein_id_col = "Protein.Names", +#' fasta_path = fasta_path, +#' fasta_protein_name = "entry_name", +#' use_log_file = FALSE, +#' quantificationColumn = "auto" +#' ) +#' +#' head(msstatsptm_format$PTM) +#' DIANNtoMSstatsPTMFormat = function(input, annotation, input_protein=NULL, @@ -84,6 +85,7 @@ DIANNtoMSstatsPTMFormat = function(input, removeOxidationMpeptides = TRUE, removeProtein_with1Feature = FALSE, MBR=TRUE, + quantificationColumn = "FragmentQuantCorrected", use_log_file = TRUE, append = FALSE, verbose = TRUE, @@ -120,20 +122,21 @@ DIANNtoMSstatsPTMFormat = function(input, input = input[input[,..protein_id_col][[1]] != input$ProteinNameUnmod] } - ptm_input = DIANNtoMSstatsFormat(input, - annotation, - global_qvalue_cutoff, - qvalue_cutoff, - pg_qvalue_cutoff, - useUniquePeptide, - removeFewMeasurements, - removeOxidationMpeptides, - removeProtein_with1Feature, - use_log_file, - append, - verbose, - log_file_path, - MBR) + ptm_input = DIANNtoMSstatsFormat(input = input, + annotation = annotation, + global_qvalue_cutoff = global_qvalue_cutoff, + qvalue_cutoff = qvalue_cutoff, + pg_qvalue_cutoff = pg_qvalue_cutoff, + useUniquePeptide = useUniquePeptide, + removeFewMeasurements = removeFewMeasurements, + removeOxidationMpeptides = removeOxidationMpeptides, + removeProtein_with1Feature = removeProtein_with1Feature, + use_log_file = use_log_file, + 
append = append, + verbose = verbose, + log_file_path = log_file_path, + MBR = MBR, + quantificationColumn = quantificationColumn) msstats_format = list(PTM=ptm_input, PROTEIN=NULL) @@ -141,20 +144,21 @@ DIANNtoMSstatsPTMFormat = function(input, checkmate::assertTRUE(!is.null(input_protein) & !is.null(annotation_protein)) - protein_input = DIANNtoMSstatsFormat(input_protein, - annotation_protein, - global_qvalue_cutoff, - qvalue_cutoff, - pg_qvalue_cutoff, - useUniquePeptide, - removeFewMeasurements, - removeOxidationMpeptides, - removeProtein_with1Feature, - use_log_file, - append, - verbose, - log_file_path, - MBR) + protein_input = DIANNtoMSstatsFormat(input = input_protein, + annotation = annotation_protein, + global_qvalue_cutoff = global_qvalue_cutoff, + qvalue_cutoff = qvalue_cutoff, + pg_qvalue_cutoff = pg_qvalue_cutoff, + useUniquePeptide = useUniquePeptide, + removeFewMeasurements = removeFewMeasurements, + removeOxidationMpeptides = removeOxidationMpeptides, + removeProtein_with1Feature = removeProtein_with1Feature, + use_log_file = use_log_file, + append = append, + verbose = verbose, + log_file_path = log_file_path, + MBR = MBR, + quantificationColumn = quantificationColumn) msstats_format = list(PTM=ptm_input, PROTEIN=protein_input) diff --git a/inst/tinytest/raw_data/DIANN/annotation_diann_2.0_ptm.csv b/inst/tinytest/raw_data/DIANN/annotation_diann_2.0_ptm.csv new file mode 100644 index 0000000..d876c3b --- /dev/null +++ b/inst/tinytest/raw_data/DIANN/annotation_diann_2.0_ptm.csv @@ -0,0 +1,9 @@ +Run,BioReplicate,Condition +Run1,1,Control +Run2,2,Control +Run3,3,Control +Run4,4,Control +Run5,5,Treatment +Run6,6,Treatment +Run7,7,Treatment +Run8,8,Treatment diff --git a/inst/tinytest/raw_data/DIANN/diann_2_ptm.parquet b/inst/tinytest/raw_data/DIANN/diann_2_ptm.parquet new file mode 100644 index 0000000..4366097 Binary files /dev/null and b/inst/tinytest/raw_data/DIANN/diann_2_ptm.parquet differ diff --git a/man/DIANNtoMSstatsPTMFormat.Rd 
b/man/DIANNtoMSstatsPTMFormat.Rd index b7a5c79..e2139c0 100644 --- a/man/DIANNtoMSstatsPTMFormat.Rd +++ b/man/DIANNtoMSstatsPTMFormat.Rd @@ -21,6 +21,7 @@ DIANNtoMSstatsPTMFormat( removeOxidationMpeptides = TRUE, removeProtein_with1Feature = FALSE, MBR = TRUE, + quantificationColumn = "FragmentQuantCorrected", use_log_file = TRUE, append = FALSE, verbose = TRUE, @@ -28,10 +29,10 @@ DIANNtoMSstatsPTMFormat( ) } \arguments{ -\item{input}{data.frame of \code{report.tsv} file produced by Philosopher} +\item{input}{name of MSstats input report from Diann, which includes fragment-level data. +Output fragment data with --export-quant flag in DIA-NN 2.0} -\item{annotation}{annotation with Run, Fraction, TechRepMixture, Mixture, Channel, -BioReplicate, Condition columns or a path to file. Refer to the example 'annotation' for the meaning of each column.} +\item{annotation}{name of 'annotation.txt' data which includes Condition, BioReplicate, Run.} \item{input_protein}{same as \code{input} for global profiling run. Default is NULL.} @@ -50,36 +51,46 @@ in \code{protein_id_col}. The protein names in these two columns must match in order to join the FASTA file with the DIA-NN output. Default is "uniprot_ac" for uniprot ID. For uniprot mnemonic ID, use "entry_name"} -\item{global_qvalue_cutoff}{The global qvalue cutoff. Default is 0.01.} +\item{global_qvalue_cutoff}{The qvalue cutoff for the Q.Value column, i.e. +the run-specific precursor q-value. Default is 0.01.} -\item{qvalue_cutoff}{local qvalue cutoff for library. Default is 0.01.} +\item{qvalue_cutoff}{If MBR is false, the qvalue cutoff for the Global.Q.Value +column, i.e. global precursor q-value. If MBR is true, the qvalue cutoff for the +Lib.Q.Value column, i.e. the q-value for the library created after the first MBR pass. +Default is 0.01.} -\item{pg_qvalue_cutoff}{local qvalue cutoff for protein groups Run should be -the same as filename. 
Default is 0.01.} +\item{pg_qvalue_cutoff}{If MBR is false, the qvalue cutoff for the Global.PG.Q.Value +column, i.e. the global q-value for the protein group. If MBR is true, the +qvalue cutoff for the Lib.PG.Q.Value column, i.e. the protein group q-value for +the library created after the first MBR pass. Default is 0.01.} -\item{useUniquePeptide}{logical, if TRUE (default) removes peptides that are assigned for more than one proteins. -We assume to use unique peptide for each protein.} +\item{useUniquePeptide}{should peptides that are assigned to more than one protein be removed (i.e. keep only unique peptides)} -\item{removeFewMeasurements}{TRUE (default) will remove the features that have 1 or 2 measurements within each Run.} +\item{removeFewMeasurements}{should features with few measurements be removed} -\item{removeOxidationMpeptides}{TRUE (default) will remove the peptides including oxidation (M) sequence.} +\item{removeOxidationMpeptides}{should peptides with oxidation be removed} -\item{removeProtein_with1Feature}{TRUE will remove the proteins which have only 1 peptide and charge. Defaut is FALSE.} +\item{removeProtein_with1Feature}{should proteins with a single feature be removed} -\item{MBR}{If analaysis was done with match between runs or not. Default is TRUE.} +\item{MBR}{True if analysis was done with match between runs} -\item{use_log_file}{logical. If TRUE, information about data processing will -be saved to a file.} +\item{quantificationColumn}{Use 'FragmentQuantCorrected' (default) column for quantified intensities for DIANN 1.8.x. +Use 'FragmentQuantRaw' for quantified intensities for DIANN 1.9.x. +Use 'auto' for quantified intensities for DIANN 2.x where each fragment intensity is a separate column, e.g. Fr0Quantity.} -\item{append}{logical. If TRUE, information about data processing will be -added to an existing log file.} +\item{use_log_file}{logical. If TRUE, information about data processing +will be saved to a file.} -\item{verbose}{logical. 
If TRUE, information about data processing wil be -printed to the console.} +\item{append}{logical. If TRUE, information about data processing will be added +to an existing log file.} + +\item{verbose}{logical. If TRUE, information about data processing will be printed +to the console.} \item{log_file_path}{character. Path to a file to which information about -data processing will be saved. If not provided, such a file will be created -automatically. If 'append = TRUE', has to be a valid path to a file.} +data processing will be saved. +If not provided, such a file will be created automatically. +If \code{append = TRUE}, has to be a valid path to a file.} } \value{ \code{list} of one or two \code{data.frame} of class \code{MSstatsTMT}, named \code{PTM} and \code{PROTEIN} @@ -112,4 +123,26 @@ msstatsptm_format = DIANNtoMSstatsPTMFormat( head(msstatsptm_format$PTM) +# Example DIANN 2.0 +input = system.file("tinytest/raw_data/DIANN/diann_2_ptm.parquet", + package = "MSstatsPTM") +input = arrow::read_parquet(input) +annot = system.file("tinytest/raw_data/DIANN/annotation_diann_2.0_ptm.csv", + package = "MSstatsPTM") +annot = data.table::fread(annot) +fasta_path = system.file("extdata", "diann.fasta", + package="MSstatsPTM") + +msstatsptm_format = DIANNtoMSstatsPTMFormat( + input, + annot, + protein_id_col = "Protein.Names", + fasta_path = fasta_path, + fasta_protein_name = "entry_name", + use_log_file = FALSE, + quantificationColumn = "auto" +) + +head(msstatsptm_format$PTM) + }