diff --git a/scRNA-seq-advanced/04-gene_set_enrichment_analysis.Rmd b/scRNA-seq-advanced/04-gene_set_enrichment_analysis.Rmd index e22afb81..513c4ea7 100644 --- a/scRNA-seq-advanced/04-gene_set_enrichment_analysis.Rmd +++ b/scRNA-seq-advanced/04-gene_set_enrichment_analysis.Rmd @@ -69,9 +69,11 @@ fs::dir_create(results_dir) ```{r input_files} -input_file <- file.path(rms_analysis_dir, - "deseq", - "rms_myoblast_deseq_results.tsv") +input_file <- file.path( + rms_analysis_dir, + "deseq", + "rms_myoblast_deseq_results.tsv" +) ``` #### Output files @@ -79,8 +81,10 @@ input_file <- file.path(rms_analysis_dir, We'll save our table of GSEA results as a TSV. ```{r output_files} -output_file <- file.path(results_dir, - "rms_myoblast_gsea_results.tsv") +output_file <- file.path( + results_dir, + "rms_myoblast_gsea_results.tsv" +) ``` ## Gene sets @@ -113,11 +117,13 @@ Here's an excerpt of the [collection description](https://www.gsea-msigdb.org/gs Notably, there are only 50 gene sets included in this collection. The fewer gene sets we test, the lower our multiple hypothesis testing burden. -We can retrieve only the Hallmark gene sets by specifying `category = "H"` to the `msigdbr()` function. +We can retrieve only the Hallmark gene sets by specifying `collection = "H"` to the `msigdbr()` function. ```{r immunologic_sets, live = TRUE} -hs_hallmarks_df <- msigdbr(species = "Homo sapiens", - category = "H") +hs_hallmarks_df <- msigdbr( + species = "Homo sapiens", + collection = "H" +) ``` ## Gene Set Enrichment Analysis @@ -211,14 +217,16 @@ Now for the analysis! We can use the `GSEA()` function to perform GSEA with any generic set of gene sets, but there are several functions for using specific, commonly used gene sets (e.g., `gseKEGG()`). ```{r run_gsea} -gsea_results <- GSEA(geneList = lfc_vector, # ordered ranked gene list - minGSSize = 25, # minimum gene set size - maxGSSize = 500, # maximum gene set set - pvalueCutoff = 0.05, - pAdjustMethod = "BH", # correction for multiple hypothesis testing - TERM2GENE = dplyr::select(hs_hallmarks_df, - gs_name, - ensembl_gene)) # pass the correct identifier column +gsea_results <- GSEA( + geneList = lfc_vector, # ordered ranked gene list + minGSSize = 25, # minimum gene set size + maxGSSize = 500, # maximum gene set set + pvalueCutoff = 0.05, + pAdjustMethod = "BH", # correction for multiple hypothesis testing + TERM2GENE = dplyr::select(hs_hallmarks_df, + gs_name, + ensembl_gene) # pass the correct identifier column +) ``` Let's take a look at the GSEA results. @@ -250,10 +258,12 @@ Let's take a look at 3 different pathways -- one with a highly positive NES, one Let's take look at a pathway with a highly positive NES (`HALLMARK_MYC_TARGETS_V2`) using a GSEA plot. ```{r highly_pos} -enrichplot::gseaplot(gsea_results, - geneSetID = "HALLMARK_MYC_TARGETS_V2", - title = "HALLMARK_MYC_TARGETS_V2", - color.line = "#0066FF") +enrichplot::gseaplot( + gsea_results, + geneSetID = "HALLMARK_MYC_TARGETS_V2", + title = "HALLMARK_MYC_TARGETS_V2", + color.line = "#0066FF" +) ``` Notice how the genes that are in the gene set, indicated by the black bars, tend to be on the left side of the graph indicating that they have positive gene-level scores. @@ -263,10 +273,12 @@ Notice how the genes that are in the gene set, indicated by the black bars, tend The gene set `HALLMARK_MYOGENESIS` had a highly negative NES. ```{r highly_neg} -enrichplot::gseaplot(gsea_results, - geneSetID = "HALLMARK_MYOGENESIS", - title = "HALLMARK_MYOGENESIS", - color.line = "#0066FF") +enrichplot::gseaplot( + gsea_results, + geneSetID = "HALLMARK_MYOGENESIS", + title = "HALLMARK_MYOGENESIS", + color.line = "#0066FF" +) ``` This gene set shows the opposite pattern -- genes in the pathway tend to be on the right side of the graph. @@ -277,10 +289,12 @@ The `@results` slot will only show gene sets that pass the `pvalueCutoff` thresh Let's look at `HALLMARK_P53_PATHWAY`, which was not in the results we viewed earlier. ```{r p53, live = TRUE} -enrichplot::gseaplot(gsea_results, - geneSetID = "HALLMARK_P53_PATHWAY", - title = "HALLMARK_P53_PATHWAY", - color.line = "#0066FF") +enrichplot::gseaplot( + gsea_results, + geneSetID = "HALLMARK_P53_PATHWAY", + title = "HALLMARK_P53_PATHWAY", + color.line = "#0066FF" +) ``` Genes in the pathway are distributed more evenly throughout the ranked list, resulting in a more "middling" score. diff --git a/scRNA-seq-advanced/05-aucell.Rmd b/scRNA-seq-advanced/05-aucell.Rmd index 4f185439..55d3891b 100644 --- a/scRNA-seq-advanced/05-aucell.Rmd +++ b/scRNA-seq-advanced/05-aucell.Rmd @@ -18,7 +18,7 @@ date: 2024 --- -In this notebook, we'll demonstrate how to use the `AUCell` method, introduced in [Aibar _et al_. 2017.](https://doi.org/10.1038/nmeth.4463). +In this notebook, we'll demonstrate how to use the `AUCell` method, introduced in [Aibar _et al_. 2017](https://doi.org/10.1038/nmeth.4463). We can use `AUCell` when we are interested in a gene set's relative expression or activity in an individual cell. Gene sets can come from a curated collection of prior knowledge, like the Hallmark collection we used in the last notebook, or we can use our own custom gene sets (e.g., a set of marker genes for a cell type of interest). @@ -77,16 +77,20 @@ fs::dir_create(analysis_dir) The input will be a `SingleCellExperiment` for an individual Ewing sarcoma library. ```{r setup_input_files} -sce_file <- fs::path(processed_dir, - "SCPCS000490", - "SCPCL000822_processed.rds") +sce_file <- fs::path( + processed_dir, + "SCPCS000490", + "SCPCL000822_processed.rds" +) ``` We will save the `AUCell` results as a table in the analysis directory. ```{r setup_output_files, live = TRUE} -output_file <- fs::path(analysis_dir, - "ewing_sarcoma_aucell_results.tsv") +output_file <- fs::path( + analysis_dir, + "ewing_sarcoma_aucell_results.tsv" +) ``` @@ -112,8 +116,10 @@ We would expect both of these gene sets to have high expression in tumor cells. ```{r genesets} # Create a named vector with the relevant gene set names -ewing_gene_set_names <- c(zhang = "ZHANG_TARGETS_OF_EWSR1_FLI1_FUSION", - riggi = "RIGGI_EWING_SARCOMA_PROGENITOR_UP") +ewing_gene_set_names <- c( + zhang = "ZHANG_TARGETS_OF_EWSR1_FLI1_FUSION", + riggi = "RIGGI_EWING_SARCOMA_PROGENITOR_UP" +) ewing_gene_set_names ``` @@ -122,32 +128,25 @@ These gene sets come from the C2 gene set collection from MSigDB. Let's retrieve them using `msigdbr()`. ```{r extract_genesets, live = TRUE} -ewing_gene_sets_df <- msigdbr(species = "Homo sapiens", - category = "C2", - subcategory = "CGP") |> - dplyr::filter(gs_name %in% ewing_gene_set_names) +ewing_gene_sets_df <- msigdbr( + species = "Homo sapiens", + collection = "C2", + subcollection = "CGP" +) |> + dplyr::filter(gs_name %in% ewing_gene_set_names) |> + dplyr::select(gs_name, ensembl_gene) ``` -`AUCell` uses gene sets in a particular format that comes from the `GSEABase` package. -We need to create a `GeneSetCollection`. +We can provide these gene sets as a named list to `AUCell`, so let's create that now. -```{r gene_set_collection} -ewing_gene_set_collection <- ewing_gene_set_names |> - purrr::map( - # For each gene set - \(gene_set_name) { - ewing_gene_sets_df |> - # Subset to the rows in that gene set - dplyr::filter(gs_name == gene_set_name) |> - # Grab the Ensembl gene identifiers - dplyr::pull(ensembl_gene) |> - # Create a GeneSet object - GeneSet(setName = gene_set_name, - geneIdType = ENSEMBLIdentifier()) - } - ) |> - # Turn the list of GeneSet objects into a GeneSet collection - GeneSetCollection() +```{r create_gene_list} +ewing_gene_set_list <- split( + ewing_gene_sets_df$ensembl_gene, + ewing_gene_sets_df$gs_name +) + +# briefly, show the resulting list +purrr::map(ewing_gene_set_list, head) ``` ## Read in and prepare the `SingleCellExperiment` @@ -176,13 +175,16 @@ We can extract a counts matrix in sparse format for use with `AUCell`. counts_matrix <- counts(sce) ``` -There may be genes in our gene set that do not appear in the `SingleCellExperiment` object. -We can remove them using the `subsetGeneSets()` function. +There may be genes in our gene set that do not appear in the `SingleCellExperiment` object which we should remove before proceeding. ```{r subset_gene_sets, live = TRUE} -# Remove genes from gene sets if they are not in the SCE -ewing_gene_set_collection <- subsetGeneSets(ewing_gene_set_collection, - rownames(counts_matrix)) +# Remove genes from list if they are not in the SCE +ewing_gene_set_list <- ewing_gene_set_list |> + purrr::map( + \(gene_list) { + intersect(gene_list, rownames(sce)) # keep only the intersection with our genes + } +) ``` ## `AUCell` @@ -248,11 +250,13 @@ First, we'll start with a cell with a high AUC. We picked this barcode ahead of time when we wrote the notebook. ```{r high_recovery_curve} -plot_recovery_curve(cell_rankings, - ewing_gene_set_collection, - gene_set_name = "ZHANG_TARGETS_OF_EWSR1_FLI1_FUSION", - barcode = "CTGAGCGGTCTTTATC", - auc_max_rank = auc_max_rank) # 1% threshold +plot_recovery_curve( + cell_rankings, + ewing_gene_set_list, + gene_set_name = "ZHANG_TARGETS_OF_EWSR1_FLI1_FUSION", + barcode = "CTGAGCGGTCTTTATC", + auc_max_rank = auc_max_rank # 1% threshold +) ``` The x-axis is the gene ranks for all genes. @@ -262,11 +266,13 @@ The AUC is the area under this recovery curve at the max rank threshold chosen f Now, let's look at an example with a low AUC. ```{r low_recovery_curve} -plot_recovery_curve(cell_rankings, - ewing_gene_set_collection, - gene_set_name = "ZHANG_TARGETS_OF_EWSR1_FLI1_FUSION", - barcode = "AGATAGAGTCACAATC", - auc_max_rank = auc_max_rank) # 1% threshold +plot_recovery_curve( + cell_rankings, + ewing_gene_set_list, + gene_set_name = "ZHANG_TARGETS_OF_EWSR1_FLI1_FUSION", + barcode = "AGATAGAGTCACAATC", + auc_max_rank = auc_max_rank # 1% threshold +) ``` Far fewer genes in the gene set are ranked above the threshold, yielding a lower AUC value. @@ -276,9 +282,11 @@ Far fewer genes in the gene set are ranked above the threshold, yielding a lower Once we have the rankings, we can calculate the AUC scores for both gene sets in all cells with the `AUCell_calcAUC()` function. ```{r calc_auc, live = TRUE} -cell_auc <- AUCell_calcAUC(geneSets = ewing_gene_set_collection, - rankings = cell_rankings, - aucMaxRank = auc_max_rank) +cell_auc <- AUCell_calcAUC( + geneSets = ewing_gene_set_list, + rankings = cell_rankings, + aucMaxRank = auc_max_rank +) ``` This function returns an `aucellResults` object. @@ -309,18 +317,22 @@ head(auc_df) We'll explore these in a later plot, but for now, let's calculate the threshold and assign cells with `AUCell_exploreThresholds()`. ```{r auc_assignments, live = TRUE} -auc_assignments <- AUCell_exploreThresholds(cell_auc, - plotHist = FALSE, - assignCells = TRUE) +auc_assignments <- AUCell_exploreThresholds( + cell_auc, + plotHist = FALSE, + assignCells = TRUE +) ``` We're going to plot the distribution of AUC values with `ggplot2`, so we will want the AUC values in a longer format. ```{r auc_plotting_df} auc_plotting_df <- auc_df |> - tidyr::pivot_longer(!barcodes, - names_to = "gene_set", - values_to = "auc") |> + tidyr::pivot_longer( + !barcodes, + names_to = "gene_set", + values_to = "auc" + ) |> dplyr::mutate( # Create a new logical column called assigned assigned = dplyr::case_when( @@ -396,9 +408,10 @@ First, let's rename the gene set columns to something more easily typed. ```{r rename_gene_set} auc_df <- auc_df |> # Use shorter names - dplyr::rename(zhang_auc = ewing_gene_set_names[["zhang"]], - riggi_auc = ewing_gene_set_names[["riggi"]]) - + dplyr::rename( + zhang_auc = ewing_gene_set_names[["zhang"]], + riggi_auc = ewing_gene_set_names[["riggi"]] + ) ``` And join it to the existing `colData`. @@ -431,18 +444,26 @@ We can use the `plotUMAP()` function from the `scater` package to plot a UMAP wi ```{r plot_umap_zhang} scater::plotUMAP(sce, colour_by = "zhang_auc") + # Use the gene set name, replacing underscores with spaces - ggplot2::ggtitle(stringr::str_replace_all(ewing_gene_set_names[["zhang"]], - "\\_", - " ")) + ggplot2::ggtitle( + stringr::str_replace_all( + ewing_gene_set_names[["zhang"]], + "\\_", + " " + ) + ) ``` Let's color the points by the AUC values for the other gene set. ```{r plot_umap_riggi, live = TRUE} scater::plotUMAP(sce, colour_by = "riggi_auc") + - ggplot2::ggtitle(stringr::str_replace_all(ewing_gene_set_names[["riggi"]], - "\\_", - " ")) + ggplot2::ggtitle( + stringr::str_replace_all( + ewing_gene_set_names[["riggi"]], + "\\_", + " " + ) + ) ``` We would want to do something more formal to confirm, but it seems like the same cells have high AUC values for both gene sets! diff --git a/scRNA-seq-advanced/util/aucell_functions.R b/scRNA-seq-advanced/util/aucell_functions.R index 16f3482f..e4bbfa98 100644 --- a/scRNA-seq-advanced/util/aucell_functions.R +++ b/scRNA-seq-advanced/util/aucell_functions.R @@ -3,8 +3,8 @@ #' Adapted from https://github.com/aertslab/AUCell/blob/91753b327a39dc7a4bbed46408ec2271c485f2f0/vignettes/AUCell.Rmd#L295-L316 #' #' @param cell_rankings Cell rankings; the output of AUCell::AUCell_buildRankings() -#' @param gene_set_collection A GSEABase GeneSetCollection object -#' @param gene_set_name The name of the gene set from the GeneSetCollection +#' @param gene_set_list A list of gene sets +#' @param gene_set_name The name of the gene set the gene_set_list #' that you would like to plot a recovery curve for #' @param barcode The cell barcode that you would like to plot a recovery #' curve for @@ -14,7 +14,7 @@ #' #' @return Outputs a recovery curve plot with the plot_recovery_curve <- function(cell_rankings, - gene_set_collection, + gene_set_list, gene_set_name, barcode, auc_max_rank, @@ -24,8 +24,8 @@ plot_recovery_curve <- function(cell_rankings, # Pull out the gene set and identify where in the cell ranks those genes # lie - gene_set <- gene_set_collection[[gene_set_name]] - gene_set_ranks <- cell_rankings[geneIds(gene_set), ] + gene_set <- gene_set_list[[gene_set_name]] + gene_set_ranks <- cell_rankings[gene_set, ] # Index of the cell with the barcode barcode_index <- which(colnames(gene_set_ranks) == barcode) @@ -35,7 +35,7 @@ plot_recovery_curve <- function(cell_rankings, plot(aucCurve, type="s", col="darkblue", lwd=1, xlab="Gene rank", ylab="# genes in the gene set", - xlim=c(0, auc_max_rank*3), ylim=c(0, nGenes(gene_set)), + xlim=c(0, auc_max_rank*3), ylim=c(0, length(gene_set)), main="Recovery curve", sub=paste("Cell:", colnames(gene_set_ranks)[barcode_index])) aucShade <- aucCurve[which(aucCurve[,1] < auc_max_rank),] diff --git a/scripts/render-live.sh b/scripts/render-live.sh index 4149f84a..fe0cd425 100644 --- a/scripts/render-live.sh +++ b/scripts/render-live.sh @@ -35,15 +35,15 @@ files=( scRNA-seq-advanced/01-read_filter_normalize_scRNA.Rmd scRNA-seq-advanced/02-dataset_integration.Rmd scRNA-seq-advanced/03-differential_expression.Rmd - # scRNA-seq-advanced/04-gene_set_enrichment_analysis.Rmd # TEMPORARY - # scRNA-seq-advanced/05-aucell.Rmd # TEMPORARY + scRNA-seq-advanced/04-gene_set_enrichment_analysis.Rmd + scRNA-seq-advanced/05-aucell.Rmd # machine-learning/01-openpbta_heatmap.Rmd # machine-learning/02-openpbta_consensus_clustering.Rmd # machine-learning/03-openpbta_PLIER.Rmd # machine-learning/04-openpbta_plot_LV.Rmd - # pathway-analysis/01-overrepresentation_analysis.Rmd # TEMPORARY - # pathway-analysis/02-gene_set_enrichment_analysis.Rmd # TEMPORARY - # pathway-analysis/03-gene_set_variation_analysis.Rmd # TEMPORARY + # pathway-analysis/01-overrepresentation_analysis.Rmd # TODO: Uncomment for https://github.com/AlexsLemonade/training-modules/issues/911 + # pathway-analysis/02-gene_set_enrichment_analysis.Rmd # TODO: Uncomment for https://github.com/AlexsLemonade/training-modules/issues/911 + # pathway-analysis/03-gene_set_variation_analysis.Rmd # TODO: Uncomment for https://github.com/AlexsLemonade/training-modules/issues/911 ) for file in ${files[@]} do