Single-cell Differential Expression and Functional Enrichment Analysis Tutorial
Introduce differences and enrichment analysis
This tutorial provides a comprehensive guide for performing differential expression analysis and enrichment analysis on single-cell RNA sequencing data. The analysis includes:
- Differential Expression Analysis: Identifying genes that are differentially expressed between different cell clusters or experimental conditions
- Enrichment Analysis: Understanding the biological significance of differentially expressed genes through pathway and functional analysis
Kernel Configuration
Important: This tutorial should be run using the "common_r" kernel.
Required packages:
- Seurat (>= 4.0)
- clusterProfiler, msigdbr, org.Hs.eg.db, org.Mm.eg.db
- dplyr, tidyr, readr
- enrichplot, ggplot2, ggrepel, RColorBrewer, viridis, ComplexHeatmap and patchwork
Libraries loading and Parameter Configuration
Load Libraries
Load all required libraries.
# Load required libraries
suppressMessages({
library(Seurat)
library(dplyr)
library(tidyr)
library(ggplot2)
library(patchwork)
library(clusterProfiler)
library(org.Hs.eg.db)
library(org.Mm.eg.db)
library(enrichplot)
library(msigdbr)
library(viridis)
library(RColorBrewer)
library(ComplexHeatmap)
library(readr)
library(ggrepel)
})
# Set random seed for reproducibility
set.seed(42)
print("Libraries loaded successfully!")Configure Analysis Parameters
Set up the parameters for the analysis including file paths and visualization settings.
# Analysis Parameters
params <- list(
# Data paths
input_path = "/home/demo-seekgene-com/workspace/project/demo-seekgene-com/CellAnnotation/result/Step4_Final_Cell_Annotation.rds",
output_path = "/home/demo-seekgene-com/workspace/project/demo-seekgene-com/DiffEnrich/result",
# Species information
species = "human", # Options: "human", "mouse"
# Visualization parameters
discrete_colors = "Paired", # For discrete variables
continuous_colors = viridis(100), # For continuous variables
plot_dpi = 300, # Plot resolution
# Differential expression parameters
differential_analysis_type = "group_comparison", # Options: "clusters" OR "group_comparison"
cluster_by = "RNA_snn_res.0.5", # Column name for clustering or cell type (e.g., resolution.0.5_d30, celltype)
select_clusters = c("0", "5", "6"), # Optional subset of "cluster_by" to analyze; set NULL to include all clusters
group_by = "sample", # Column name for grouping (e.g., sample, condition)
compare_groups = c("S133TvsS133A"), # Optional subset of "group_by" to analyze, pairwise comparison spec (e.g., "TreatedvsControl","DiseasevsHealth"); set NULL to auto-generate all pairs
de_method = "wilcox", # Options: "wilcox", "bimod", "roc", "t", "negbinom", "poisson", "LR", "MAST"
min_pct = 0.1, # Minimum percentage of cells expressing the gene
logfc_threshold = 0.5, # Log fold change threshold used by Seurat when finding markers
p_adj_cutoff = 0.05, # adjusted P-value cutoff for keeping rows in DEG table
abs_log2fc_cutoff = 0.5, # absolute log2FC cutoff for keeping rows in DEG table
# Enrichment analysis parameters
enrichment_analysis_type = "ORA", # Options: "ORA" OR "GSEA"
enrichment_databases = "GO,KEGG,HALLMARK,REACTOME", # Comma-separated
pvalue_cutoff = 0.05, # P-value cutoff for enrichment
qvalue_cutoff = 0.2, # Q-value cutoff for enrichment
minGSSize = 10, # minimum gene set size in enrichment
maxGSSize = 500 # maximum gene set size in enrichment
)
# Create output directory if it doesn't exist
if (!dir.exists(params$output_path)) {
dir.create(params$output_path, recursive = TRUE)
}
cat("Analysis configuration:")
cat("\n- Output directory:", params$output_path, "\n")
if (params$differential_analysis_type == "clusters") {
if (!is.null(params$select_clusters)) {
cat("- differential analysis among clusters: ",
paste(params$select_clusters, collapse = ", "), "\n", sep = "")
} else {
cat("- pairwise differential analysis across all clusters\n")
}
} else if (params$differential_analysis_type == "group_comparison") {
comp_str <- if (is.null(params$compare_groups)) "auto-generate all pairwise groups" else paste(params$compare_groups, collapse = ", ")
cl_str <- if (is.null(params$select_clusters)) "all clusters" else paste(params$select_clusters, collapse = ", ")
cat("- group comparison ", comp_str, " within clusters: ", cl_str, "\n", sep = "")
}
cat("\n- Differential analysis type:", params$differential_analysis_type)
cat("\n- Enrichment analysis type:", params$enrichment_analysis_type)- Output directory: /home/demo-seekgene-com/workspace/project/demo-seekgene-com/DiffEnrich/result
- group comparison S133TvsS133A within clusters: 0, 5, 6
- Enrichment analysis type: ORA
Load and Prepare Data
Load the single-cell data and prepare it for differential expression analysis.
# Load the Seurat object
cat("Loading data from:", params$input_path, "\n")
seurat_obj <- readRDS(params$input_path)
# Basic data information
cat("\nData Summary:")
cat("\n- Number of cells:", ncol(seurat_obj))
cat("\n- Number of genes:", nrow(seurat_obj))
cat("\n- Available metadata columns:", paste(colnames(seurat_obj@meta.data), collapse = ", "))
# Check available clusters
if (params$cluster_by %in% colnames(seurat_obj@meta.data)) {
cat("\n- Available clusters:", paste(unique(seurat_obj[[params$cluster_by]]), collapse = ", "))
}
# Check available groups
if (params$group_by %in% colnames(seurat_obj@meta.data)) {
cat("\n- Available groups:", paste(unique(seurat_obj@meta.data[[params$group_by]]), collapse = ", "))
}
# Set default assay to RNA if not already set
DefaultAssay(seurat_obj) <- "RNA"
# Set cluster colors
n_clusters <- length(table(seurat_obj[[params$cluster_by]]))
clusters_colors <- brewer.pal(min(n_clusters, 12), params$discrete_colors)
if (n_clusters > 12) {
clusters_colors <- colorRampPalette(clusters_colors)(n_clusters)
}
# Scale data
if (dim(seurat_obj@assays$RNA@scale.data)[1] > 1) {
cat("\nThe scale.data slot exists.")
} else {
cat("\nScaling data...")
seurat_obj <- ScaleData(seurat_obj)
}
# If select_clusters is specified, subset data first
if (!is.null(params$select_clusters)) {
cat("\nFiltering clusters:", paste(params$select_clusters, collapse = ", "))
# Ensure select_clusters is character to match metadata type
select_clusters <- as.character(params$select_clusters)
# Verify requested clusters exist
available_clusters <- unique(seurat_obj@meta.data[[params$cluster_by]])
missing_clusters <- setdiff(select_clusters, available_clusters)
if (length(missing_clusters) > 0) {
warning("The following clusters do not exist and will be ignored: ", paste(missing_clusters, collapse = ", "))
}
# Keep only valid clusters
valid_clusters <- intersect(select_clusters, available_clusters)
if (length(valid_clusters) == 0) {
stop("No valid clusters found for differential analysis")
}
# Subset data to selected clusters
seurat_obj <- subset(seurat_obj,
cells = rownames(seurat_obj@meta.data)[
as.character(seurat_obj@meta.data[[params$cluster_by]]) %in% valid_clusters
])
cat("\nCells after filtering:", ncol(seurat_obj))
cat("\nCluster distribution after filtering:\n")
print(table(seurat_obj@meta.data[[params$cluster_by]]))
}
cat("\nData loaded successfully!")Data Summary:
- Number of cells: 16924
- Number of genes: 24091
- Available metadata columns: orig.ident, nCount_RNA, nFeature_RNA, sample, species, percent.mitochondrial, filter_status, pANN_0.05_0.3_1372, DF.classifications_0.05_0.3_1372, RNA_snn_res.0.5, seurat_clusters, SingleR_HumanPrimaryCellAtlas, SingleR_score_HumanPrimaryCellAtlas, SingleR_BlueprintEncode, SingleR_score_BlueprintEncode, Lymphoid_B_cells_Score, Lymphoid_Plasma_cells_Score, Lymphoid_T_cells_Score, Lymphoid_NK_cells_Score, Myeloid_Monocytes_Score, Myeloid_Macrophages_Score, Myeloid_DCs_Score, Myeloid_pDCs_Score, Myeloid_Granulocytes_Score, Myeloid_Neutrophils_Score, Myeloid_Mast_cells_Score, Myeloid_Megakaryocytes_Score, Myeloid_Erythrocytes_Score, Stromal_Epithelial_cells_Score, Stromal_Endothelial_cells_Score, Stromal_Fibroblasts_Score, Stromal_Proliferating_cells_Score, celltype
- Available clusters: c(1, 2, 7, 10, 5, 3, 6, 11, 8, 4, 16, 15, 9, 13, 12, 17, 14, 18)
- Available groups: S133A, S133T
The scale.data slot exists.
Filtering clusters: 0, 5, 6
Cells after filtering: 6395
Cluster distribution after filtering:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
4645 0 0 0 0 892 858 0 0 0 0 0 0 0 0 0
16 17
0 0
Data loaded successfully!
Differential Expression and Enrichment Analysis
Differential Expression Analysis and Visualization
Clusters comparison: Compare and visualize cell clusters against each other
Group comparison within clusters: Compare and visualize different experimental groups within specific clusters
Define functions for differential expression analysis and visualization
# Function 1: Clusters differential expression analysis
perform_clusters_de <- function(seurat_obj, params) {
cat("Performing differential expression analysis across all clusters...\n")
# Find markers for all clusters
all_markers <- FindAllMarkers(
seurat_obj,
only.pos = TRUE,
min.pct = params$min_pct,
logfc.threshold = params$logfc_threshold,
test.use = params$de_method
)
# Save results
base_dir <- file.path(params$output_path, "cluster_DE")
if (!dir.exists(base_dir)) dir.create(base_dir, recursive = TRUE)
output_file <- file.path(base_dir, "clusters_markers_all.csv")
write.csv(all_markers, output_file, row.names = FALSE)
cat("Results saved to:", output_file, "\n")
return(all_markers)
}
# Function 2: parse comparison pairs
parse_compare_groups <- function(params, available_groups) {
parsed <- list()
for (comp in params$compare_groups) {
# Ensure string contains "vs"
if (!grepl("vs", comp)) {
warning("Invalid comparison format:", comp, "- should contain 'vs'")
next
}
# Split the string
parts <- strsplit(comp, "vs")[[1]]
if (length(parts) != 2) {
warning("Invalid comparison format:", comp, "- should have exactly one 'vs'")
next
}
ident1 <- trimws(parts[1])
ident2 <- trimws(parts[2])
# Validate group existence
if (!(ident1 %in% available_groups)) {
warning("Group", ident1, "not found in available groups")
next
}
if (!(ident2 %in% available_groups)) {
warning("Group", ident2, "not found in available groups")
next
}
parsed[[length(parsed) + 1]] <- list(
ident1 = ident1,
ident2 = ident2,
comparison = comp
)
}
return(parsed)
}
# Function 3: Group comparison differential expression analysis
perform_group_comparison_de <- function(seurat_obj, params) {
# Verify the group column exists
if (!params$group_by %in% colnames(seurat_obj@meta.data)) {
stop(paste("Group column", params$group_by, "not found in metadata"))
}
# Get all available groups
available_groups <- unique(seurat_obj@meta.data[[params$group_by]])
cat("Available groups:", paste(available_groups, collapse = ", "), "\n")
# Auto-generate all pairwise comparisons if not specified
if (is.null(params$compare_groups)) {
if (length(available_groups) >= 2) {
params$compare_groups <- combn(available_groups, 2, function(x) paste(x[1], "vs", x[2], sep = ""))
cat("Auto-generated comparisons:", paste(params$compare_groups, collapse = ", "), "\n")
} else {
stop("Need at least 2 groups for comparison")
}
}
# Parse comparison pairs
parsed_comparisons <- parse_compare_groups(params, available_groups)
# Get available clusters
available_clusters <- unique(seurat_obj@meta.data[[params$cluster_by]])
cat("Available clusters:", paste(available_clusters, collapse = ", "), "\n")
# Store all results
group_markers_list <- list()
# Compare groups within each cluster
for (cluster in available_clusters) {
cat("\nAnalyzing cluster:", cluster, "\n")
# Subset to the specific cluster
cells_in_cluster <- colnames(seurat_obj)[seurat_obj@meta.data[[params$cluster_by]] == cluster]
cluster_subset <- subset(seurat_obj, cells = cells_in_cluster)
# Ensure this cluster has at least two groups
cluster_groups <- unique(cluster_subset@meta.data[[params$group_by]])
if (length(cluster_groups) < 2) {
cat("Skipping cluster", cluster, "- insufficient groups\n")
next
}
# Perform DE for each comparison pair
for (comp in parsed_comparisons) {
ident1 <- comp$ident1
ident2 <- comp$ident2
# Check if both groups exist in this cluster
if (!(ident1 %in% cluster_groups) || !(ident2 %in% cluster_groups)) {
cat("Skipping comparison", comp$comparison, "- groups not found in cluster", cluster, "\n")
next
}
cat(" Comparing", ident1, "vs", ident2, "\n")
# Set group identities
Idents(cluster_subset) <- params$group_by
# Run differential expression
tryCatch({
markers <- FindMarkers(
cluster_subset,
ident.1 = ident1,
ident.2 = ident2,
only.pos = FALSE,
min.pct = params$min_pct,
logfc.threshold = params$logfc_threshold,
test.use = params$de_method
)
# Add metadata
markers$gene <- rownames(markers)
markers$cluster <- cluster
markers$ident1 <- ident1
markers$ident2 <- ident2
markers$comparison <- comp$comparison
# Store results
result_name <- paste0("cluster_", cluster, "_", comp$comparison)
group_markers_list[[result_name]] <- markers
cat(" Found", nrow(markers), "differentially expressed genes\n")
}, error = function(e) {
cat(" Error in comparison", comp$comparison, ":", e$message, "\n")
})
}
}
# Combine all results
if (length(group_markers_list) > 0) {
group_markers <- do.call(rbind, group_markers_list)
# Save results
base_dir <- file.path(params$output_path, "group_DE")
if (!dir.exists(base_dir)) dir.create(base_dir, recursive = TRUE)
output_file <- file.path(base_dir, "group_comparison_markers_all.csv")
write.csv(group_markers, output_file, row.names = FALSE)
cat("\nGroup comparison results saved to:", output_file, "\n")
return(group_markers)
} else {
cat("\nNo valid group comparisons found.\n")
return(NULL)
}
}
# Function 4: Create visualizations for clusters analysis
create_clusters_plots <- function(seurat_obj, all_markers, params, clusters_colors) {
# 1. Top 3 genes dotplot plot for each cluster
top_genes_per_cluster <- all_markers %>%
group_by(cluster) %>%
top_n(3, avg_log2FC) %>%
pull(gene)
dotplot_plot <- DotPlot(seurat_obj, features = unique(top_genes_per_cluster)) +
scale_color_gradientn(colors = params$continuous_colors) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Top 3 Marker Genes per Cluster",
subtitle = "DotPlot")
# Save dotplot plot
ggsave(file.path(params$output_path, "cluster_DE", "clusters_dotplot.pdf"),
dotplot_plot, width = 10, height = 8, dpi = params$plot_dpi)
# 2. Heatmap of top genes
top_genes_heatmap <- all_markers %>%
group_by(cluster) %>%
top_n(5, avg_log2FC) %>%
pull(gene)
# Check if genes exist in scale.data slot
available_genes_heatmap <- intersect(unique(top_genes_heatmap), rownames(GetAssayData(seurat_obj, slot = "scale.data")))
if (length(available_genes_heatmap) > 0) {
heatmap_plot <- DoHeatmap(seurat_obj, features = available_genes_heatmap,
slot = "scale.data",
group.colors = clusters_colors) +
scale_fill_gradientn(colors = params$continuous_colors) +
labs(title = "Top 5 Marker Genes per Cluster",
subtitle = "Heatmap")
# Save heatmap
ggsave(file.path(params$output_path, "cluster_DE", "clusters_heatmap.pdf"),
heatmap_plot, width = 12, height = 10, dpi = params$plot_dpi)
} else {
cat("Warning: No requested features found in the scale.data slot for the RNA assay. Skipping heatmap generation.\n")
}
# 3. Violin plots for top genes
top_genes_violin <- all_markers %>%
group_by(cluster) %>%
top_n(3, avg_log2FC) %>%
pull(gene)
violin_plot <- VlnPlot(seurat_obj, features = unique(top_genes_violin),
stack = TRUE, flip = TRUE,
fill.by = "ident",cols = clusters_colors) +
theme(axis.text.x = element_text(angle = 0, hjust = 1))+
labs(title = "Top 3 Marker Genes per Cluster",
subtitle = "Violin Plot")
# Save violin plot
ggsave(file.path(params$output_path, "cluster_DE", "clusters_violin_plot.pdf"),
violin_plot, width = 12, height = 8, dpi = params$plot_dpi)
cat("\nAll clusters visualization plots saved to:", params$output_path)
}
# Function 5: Create visualizations for group comparison analysis
create_group_comparison_plots <- function(seurat_obj, group_markers, params) {
if (is.null(group_markers) || nrow(group_markers) == 0) {
cat("No group comparison results to visualize.\n")
return(invisible(NULL))
}
group_col <- params$group_by
cluster_col <- params$cluster_by
# 1. Stacked bar chart for group proportions by cluster
cat("Creating stacked bar chart for group proportions by cluster...\n")
prop_data <- seurat_obj@meta.data %>%
dplyr::group_by(!!rlang::sym(cluster_col), !!rlang::sym(group_col)) %>%
dplyr::summarise(count = dplyr::n(), .groups = 'drop') %>%
dplyr::group_by(!!rlang::sym(cluster_col)) %>%
dplyr::mutate(proportion = count / sum(count)) %>%
dplyr::ungroup()
stacked_bar <- ggplot(prop_data, aes(x = !!rlang::sym(cluster_col), y = proportion, fill = !!rlang::sym(group_col))) +
geom_bar(stat = "identity", position = "stack") +
scale_fill_brewer(palette = params$discrete_colors) +
labs(title = "Group Proportions by Cluster",
subtitle = "Stacked Bar Chart",
x = "Cluster", y = "Proportion", fill = "Group") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_y_continuous(labels = scales::percent_format())
ggsave(file.path(params$output_path, "group_DE", "group_comparison_group_proportions_by_cluster.pdf"),
stacked_bar, width = 10, height = 6, dpi = params$plot_dpi)
# 2. Each cluster outputs: DotPlot, Violin, Heatmap, Volcano
cat("Creating per-cluster Dot/Violin/Heatmap/Volcano...\n")
clusters_all <- sort(unique(as.character(seurat_obj@meta.data[[cluster_col]])))
n_clusters <- length(clusters_all)
clusters_colors <- RColorBrewer::brewer.pal(min(max(3, n_clusters), 12), params$discrete_colors)
if (n_clusters > length(clusters_colors)) {
clusters_colors <- grDevices::colorRampPalette(clusters_colors)(n_clusters)
}
out_dir <- file.path(params$output_path, "group_DE")
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)
for (cluster in clusters_all) {
cluster_data <- group_markers[group_markers$cluster == cluster, , drop = FALSE]
if (nrow(cluster_data) == 0) next
up_genes <- cluster_data %>%
dplyr::filter(avg_log2FC > 0) %>%
dplyr::top_n(5, avg_log2FC) %>%
dplyr::pull(gene)
down_genes <- cluster_data %>%
dplyr::filter(avg_log2FC < 0) %>%
dplyr::top_n(5, abs(avg_log2FC)) %>%
dplyr::pull(gene)
cluster_genes <- unique(c(up_genes, down_genes))
cluster_genes <- cluster_genes[cluster_genes %in% rownames(seurat_obj)]
if (length(cluster_genes) == 0) next
cells_in_cluster <- rownames(seurat_obj@meta.data)[as.character(seurat_obj@meta.data[[cluster_col]]) == cluster]
cluster_subset <- subset(seurat_obj, cells = cells_in_cluster)
# 2.1 DotPlot
cluster_dot <- DotPlot(cluster_subset, features = cluster_genes, cols = c("lightgrey", "red"),
group.by = group_col) +
scale_color_gradientn(colors = params$continuous_colors) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = paste("Cluster", cluster, "DE Genes"),
subtitle = "Top 5 Up + Top 5 Down") +
theme_bw()
ggsave(file.path(out_dir, paste0("cluster_", cluster, "_de_genes_dotplot.pdf")),
cluster_dot, width = 10, height = 6, dpi = params$plot_dpi)
# 2.2 Violin
cluster_violin <- VlnPlot(cluster_subset, features = cluster_genes,
stack = TRUE, flip = TRUE,
fill.by = "ident",
group.by = group_col,
cols = brewer.pal(8, params$discrete_colors)[1:2]) +
NoLegend() +
labs(title = paste("Cluster", cluster, "DE Genes"),
subtitle = "Top 5 Up + Top 5 Down")
ggsave(file.path(out_dir, paste0("cluster_", cluster, "_de_genes_violin.pdf")),
cluster_violin, width = 6, height = 10, dpi = params$plot_dpi)
# 2.3 Heatmap
available_genes_heatmap <- intersect(cluster_genes, rownames(GetAssayData(cluster_subset, slot = "scale.data")))
if (length(available_genes_heatmap) > 1) {
heatmap_cluster <- DoHeatmap(cluster_subset, features = available_genes_heatmap,
slot = "scale.data",
group.colors = clusters_colors,
group.by = group_col, draw.lines = TRUE) +
scale_fill_gradientn(colors = params$continuous_colors) +
labs(title = paste("Cluster", cluster, "DE Genes"),
subtitle = "Top 5 Up + Top 5 Down")
ggsave(file.path(out_dir, paste0("cluster_", cluster, "_de_genes_heatmap.pdf")),
heatmap_cluster, width = 10, height = 3, dpi = params$plot_dpi)
} else {
cat("No genes found in scale.data for cluster", cluster, "heatmap generation.\n")
}
# 2.4 Volcano
cluster_data$top_genes <- ifelse(cluster_data$gene %in% cluster_genes, cluster_data$gene, "")
volcano_plot <- ggplot(cluster_data, aes(x = avg_log2FC, y = -log10(p_val_adj))) +
geom_point(aes(color = ifelse(p_val_adj < 0.05 & abs(avg_log2FC) > 0.5,
ifelse(avg_log2FC > 0, "Up", "Down"), "NS")),
alpha = 0.6, size = 1.5) +
scale_color_manual(values = c("Down" = "blue", "NS" = "grey", "Up" = "red")) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red", alpha = 0.7) +
geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "red", alpha = 0.7) +
ggrepel::geom_text_repel(aes(label = top_genes),
size = 3, max.overlaps = 20,
box.padding = 0.5, point.padding = 0.3,
segment.color = "black", segment.size = 0.3) +
labs(title = paste("Volcano Plot - Cluster", cluster),
subtitle = "Top 5 Up + Top 5 Down Genes Marked",
x = "Average Log2 Fold Change",
y = "-log10(Adjusted P-value)",
color = "Regulation") +
theme_bw() +
theme(legend.position = "bottom",
plot.title = element_text(hjust = 0.5, size = 14),
plot.subtitle = element_text(hjust = 0.5, size = 12))
ggsave(file.path(out_dir, paste0("cluster_", cluster, "_de_genes_volcano.pdf")),
volcano_plot, width = 6, height = 6, dpi = params$plot_dpi)
}
cat("\nGroup comparison visualization plots saved to:", params$output_path, "\n")
invisible(NULL)
}Perform differential analysis and visualization
if (params$differential_analysis_type == "clusters") {
cat("Performing CLUSTERS differential expression analysis...\n")
# Perform analysis (ALL genes from Seurat DE)
clusters_markers_all <- perform_clusters_de(seurat_obj, params)
group_markers <- NULL
# FILTERED version for reporting/plots
if (!is.null(clusters_markers_all)) {
clusters_markers <- clusters_markers_all %>%
dplyr::filter(
is.finite(p_val_adj) & p_val_adj < params$p_adj_cutoff
) %>%
dplyr::filter(
is.finite(avg_log2FC) & abs(avg_log2FC) >= params$abs_log2fc_cutoff
)
write.csv(clusters_markers, file.path(params$output_path, "cluster_DE", "clusters_markers_filtered.csv"), row.names = FALSE)
cat("Filtering results:\n")
cat("- Total genes before filtering:", nrow(clusters_markers_all), "\n")
cat("- Total genes after filtering:", nrow(clusters_markers), "\n")
cat("- P-value cutoff:", params$p_adj_cutoff, "\n")
cat("- Log2FC cutoff:", params$abs_log2fc_cutoff, "\n")
} else {
clusters_markers <- NULL
}
# Visualizations use FILTERED
create_clusters_plots(seurat_obj, clusters_markers, params, clusters_colors)
# Summary (FILTERED)
cat("\nSummary of all clusters differential expression (filtered):")
cat("\n- Total differentially expressed genes:", nrow(clusters_markers))
cat("\n- Average genes per cluster:", ifelse(length(unique(clusters_markers$cluster))>0, round(nrow(clusters_markers) / length(unique(clusters_markers$cluster)), 1), 0))
cat("\n- Top 5 genes per cluster:")
if (nrow(clusters_markers) > 0) print(clusters_markers %>% group_by(cluster) %>% top_n(5, avg_log2FC))
} else if (params$differential_analysis_type == "group_comparison") {
cat("Performing GROUP COMPARISON differential expression analysis...\n")
# Perform analysis (ALL)
group_markers_all <- perform_group_comparison_de(seurat_obj, params)
clusters_markers <- NULL
# FILTERED
if (!is.null(group_markers_all)) {
group_markers <- group_markers_all %>%
dplyr::filter(
is.finite(p_val_adj) & p_val_adj < params$p_adj_cutoff
) %>%
dplyr::filter(
is.finite(avg_log2FC) & abs(avg_log2FC) >= params$abs_log2fc_cutoff
)
write.csv(group_markers, file.path(params$output_path, "group_DE", "group_comparison_markers_filtered.csv"), row.names = FALSE)
cat("Group comparison filtering results:\n")
cat("- Total genes before filtering:", nrow(group_markers_all), "\n")
cat("- Total genes after filtering:", nrow(group_markers), "\n")
cat("- P-value cutoff:", params$p_adj_cutoff, "\n")
cat("- Log2FC cutoff:", params$abs_log2fc_cutoff, "\n")
} else {
group_markers <- NULL
}
# Visualizations use FILTERED
create_group_comparison_plots(seurat_obj, group_markers, params)
# Summary (FILTERED)
if (!is.null(group_markers)) {
cat("\nSummary of group comparison differential expression (filtered):")
cat("\n- Total differentially expressed genes:", nrow(group_markers))
cat("\n- Comparisons performed:", paste(unique(group_markers$comparison), collapse = ", "))
cat("\n- Clusters analyzed:", paste(unique(group_markers$cluster), collapse = ", "))
}
}Available clusters: 0, 6, 5
Analyzing cluster: 0
Comparing S133T vs S133A
Found 293 differentially expressed genes
Analyzing cluster: 6
Comparing S133T vs S133A
Found 548 differentially expressed genes
Analyzing cluster: 5
Comparing S133T vs S133A
Found 855 differentially expressed genes
Group comparison results saved to: /home/demo-seekgene-com/workspace/project/demo-seekgene-com/DiffEnrich/result/group_DE/group_comparison_markers_all.csv
Group comparison filtering results:
- Total genes before filtering: 1696
- Total genes after filtering: 933
- P-value cutoff: 0.05
- Log2FC cutoff: 0.5
Creating stacked bar chart for group proportions by cluster...n Creating per-cluster Dot/Violin/Heatmap/Volcano...n
Warning message:
“Scaling data with a low number of groups may produce misleading results”
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.
Warning message:
“Scaling data with a low number of groups may produce misleading results”
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.
Warning message:
“Scaling data with a low number of groups may produce misleading results”
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.
Group comparison visualization plots saved to: /home/demo-seekgene-com/workspace/project/demo-seekgene-com/DiffEnrich/result
Summary of group comparison differential expression (filtered):
- Total differentially expressed genes: 933
- Comparisons performed: S133TvsS133A
- Clusters analyzed: 0, 6, 5
Enrichment Analysis
Perform ORA (Over-Representation Analysis) or GSEA (Gene Set Enrichment Analysis) to understand the biological significance of differentially expressed genes.
Prepare the necessary databases for enrichment analysis.
# Setup enrichment analysis databases
setup_enrichment_databases <- function(species) {
if (species == "human") {
org_db <- org.Hs.eg.db
kegg_org <- "hsa"
} else if (species == "mouse") {
org_db <- org.Mm.eg.db
kegg_org <- "mmu"
} else {
stop("Species must be 'human' or 'mouse'")
}
# Parse enrichment databases
databases <- strsplit(params$enrichment_databases, ",")[[1]]
return(list(
org_db = org_db,
kegg_org = kegg_org,
databases = databases
))
}
# Setup databases
db_setup <- setup_enrichment_databases(params$species)
cat("Enrichment databases setup for species:", params$species, "\n")
cat("Available databases:", paste(db_setup$databases, collapse = ", "), "\n")Available databases: GO, KEGG, HALLMARK, REACTOME
Define functions for enrichment analysis and visualization
# Function 1: read GMT file
read_gmt_file <- function(file_path) {
if (!file.exists(file_path)) {
stop("GMT file not found:", file_path)
}
# Read GMT file
gmt_data <- readLines(file_path)
# Parse GMT format
genesets <- list()
for (line in gmt_data) {
parts <- strsplit(line, "\t")[[1]]
if (length(parts) >= 3) {
term_name <- parts[1]
term_description <- parts[2]
genes <- parts[3:length(parts)]
genes <- genes[genes != ""] # Remove empty strings
if (length(genes) > 0) {
genesets[[term_name]] <- data.frame(
term = term_name,
gene = genes,
stringsAsFactors = FALSE
)
}
}
}
# Combine all gene sets
if (length(genesets) > 0) {
combined_genesets <- do.call(rbind, genesets)
return(combined_genesets)
} else {
stop("No valid genesets found in GMT file")
}
}
# Function 2: ORA enrichment analysis
perform_ora_analysis <- function(markers_data, db_setup, params) {
cat("Performing Over-Representation Analysis (ORA) with local data (split Up/Down)...\n")
# Prepare gene lists by (cluster or comparison) AND direction
if ("cluster" %in% colnames(markers_data)) {
split_key <- markers_data$cluster
} else {
split_key <- markers_data$comparison
}
markers_data$direction <- ifelse(is.finite(markers_data$avg_log2FC) & markers_data$avg_log2FC > 0, "Up",
ifelse(is.finite(markers_data$avg_log2FC) & markers_data$avg_log2FC < 0, "Down", NA))
markers_data <- markers_data[!is.na(markers_data$direction), , drop = FALSE]
index_keys <- unique(split_key)
directions <- c("Up","Down")
# convert symbol to entrez
convert_to_entrez <- function(genes) {
m <- AnnotationDbi::select(db_setup$org_db, keys = genes, columns = "ENTREZID", keytype = "SYMBOL")
unique(m$ENTREZID[!is.na(m$ENTREZID)])
}
enrichment_results <- list()
combined_results_list <- list()
for (db in db_setup$databases) {
cat("Analyzing database:", db, "\n")
for (k in index_keys) {
for (dirc in directions) {
sub <- markers_data[split_key == k & markers_data$direction == dirc, , drop = FALSE]
if (nrow(sub) == 0) next
genes <- unique(sub$gene)
eg <- convert_to_entrez(genes)
if (length(eg) == 0) next
if (db == "GO") {
ego <- enrichGO(gene = eg,
OrgDb = db_setup$org_db,
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = params$pvalue_cutoff,
qvalueCutoff = params$qvalue_cutoff,
minGSSize = params$minGSSize,
maxGSSize = params$maxGSSize)
if (nrow(ego) > 0) {
ego_df <- as.data.frame(ego)
ego_df$cluster <- k
ego_df$database <- db
ego_df$direction <- dirc
enrichment_results[[paste(db, k, dirc, sep = "_")]] <- ego
combined_results_list[[paste(db, k, dirc, sep = "_")]] <- ego_df
}
} else if (db == "KEGG") {
ekegg <- enrichKEGG(gene = eg,
organism = db_setup$kegg_org,
pAdjustMethod = "BH",
pvalueCutoff = params$pvalue_cutoff,
qvalueCutoff = params$qvalue_cutoff,
minGSSize = params$minGSSize,
maxGSSize = params$maxGSSize)
if (nrow(ekegg) > 0) {
ekegg_df <- as.data.frame(ekegg)
ekegg_df$cluster <- k
ekegg_df$database <- db
ekegg_df$direction <- dirc
enrichment_results[[paste(db, k, dirc, sep = "_")]] <- ekegg
combined_results_list[[paste(db, k, dirc, sep = "_")]] <- ekegg_df
}
} else if (db == "HALLMARK") {
data_dir <- "/jp_envs/Enrichment/"
hallmark_file <- file.path(data_dir, "h.all.v7.5.1.entrez.gmt")
if (file.exists(hallmark_file)) {
hallmark_genesets <- read_gmt_file(hallmark_file)
emsig <- tryCatch({
enricher(gene = eg,
TERM2GENE = hallmark_genesets,
pAdjustMethod = "BH",
pvalueCutoff = params$pvalue_cutoff,
qvalueCutoff = params$qvalue_cutoff,
minGSSize = params$minGSSize,
maxGSSize = params$maxGSSize)
}, error = function(e) NULL)
if (!is.null(emsig) && nrow(emsig) > 0) {
emsig_df <- as.data.frame(emsig)
emsig_df$cluster <- k
emsig_df$database <- db
emsig_df$direction <- dirc
enrichment_results[[paste(db, k, dirc, sep = "_")]] <- emsig
combined_results_list[[paste(db, k, dirc, sep = "_")]] <- emsig_df
}
} else {
cat("Warning: HALLMARK file not found, skipping...\n")
}
} else if (db == "REACTOME") {
data_dir <- "/jp_envs/Enrichment/"
reactome_file <- file.path(data_dir, "c2.cp.reactome.v7.5.1.entrez.gmt")
if (file.exists(reactome_file)) {
reactome_genesets <- read_gmt_file(reactome_file)
emsig <- tryCatch({
enricher(gene = eg,
TERM2GENE = reactome_genesets,
pAdjustMethod = "BH",
pvalueCutoff = params$pvalue_cutoff,
qvalueCutoff = params$qvalue_cutoff,
minGSSize = params$minGSSize,
maxGSSize = params$maxGSSize)
}, error = function(e) NULL)
if (!is.null(emsig) && nrow(emsig) > 0) {
emsig_df <- as.data.frame(emsig)
emsig_df$cluster <- k
emsig_df$database <- db
emsig_df$direction <- dirc
enrichment_results[[paste(db, k, dirc, sep = "_")]] <- emsig
combined_results_list[[paste(db, k, dirc, sep = "_")]] <- emsig_df
}
} else {
cat("Warning: REACTOME file not found, skipping...\n")
}
}
}
}
}
# Combine results
if (length(combined_results_list) > 0) {
all_columns <- unique(unlist(lapply(combined_results_list, colnames)))
cat("All columns found:", paste(all_columns, collapse = ", "), "\n")
standardized_results <- lapply(combined_results_list, function(df) {
for (col in all_columns) {
if (!col %in% colnames(df)) df[[col]] <- NA
}
df <- df[, all_columns, drop = FALSE]
df
})
combined_results <- do.call(rbind, standardized_results)
# Save results (with direction)
base_dir <- file.path(params$output_path, "ORA")
if (!dir.exists(base_dir)) dir.create(base_dir, recursive = TRUE)
output_file <- file.path(base_dir, "ora_enrichment_results.csv")
write.csv(combined_results, output_file, row.names = FALSE)
cat("ORA results saved to:", output_file, "\n")
return(list(results = combined_results, enrichment_objects = enrichment_results))
} else {
cat("No significant enrichment results found.\n")
return(NULL)
}
}
# Function 3: GSEA enrichment analysis
perform_gsea_analysis <- function(markers_data, db_setup, params) {
cat("Performing Gene Set Enrichment Analysis (GSEA) with local data...\n")
# Prepare ranked gene lists
if ("cluster" %in% colnames(markers_data)) {
gene_lists <- split(markers_data, markers_data$cluster)
} else {
gene_lists <- split(markers_data, markers_data$comparison)
}
# Convert gene symbols to Entrez IDs and create ranked lists
ranked_lists <- list()
for (name in names(gene_lists)) {
cluster_data <- gene_lists[[name]]
# Create ranked list based on log fold change (no filtering)
ranked_genes <- cluster_data$avg_log2FC
names(ranked_genes) <- cluster_data$gene
# Sort by fold change
ranked_genes <- sort(ranked_genes, decreasing = TRUE)
# Convert to Entrez IDs
entrez_mapping <- AnnotationDbi::select(db_setup$org_db,
keys = names(ranked_genes),
columns = "ENTREZID",
keytype = "SYMBOL")
# Create ranked list with Entrez IDs
entrez_ranked <- ranked_genes[match(entrez_mapping$SYMBOL, names(ranked_genes))]
names(entrez_ranked) <- entrez_mapping$ENTREZID
entrez_ranked <- entrez_ranked[!is.na(names(entrez_ranked))]
ranked_lists[[name]] <- entrez_ranked
}
# Perform GSEA for each database
gsea_results <- list()
combined_results_list <- list()
for (db in db_setup$databases) {
cat("Analyzing database:", db, "\n")
if (db == "GO") {
# GO GSEA
for (name in names(ranked_lists)) {
if (length(ranked_lists[[name]]) > 0) {
gsea_go <- gseGO(geneList = ranked_lists[[name]],
OrgDb = db_setup$org_db,
ont = "ALL",
pAdjustMethod = "BH",
pvalueCutoff = params$pvalue_cutoff,
minGSSize = params$minGSSize,
maxGSSize = params$maxGSSize,
nPerm = 1000)
if (nrow(gsea_go) > 0) {
gsea_go_df <- as.data.frame(gsea_go)
gsea_go_df$cluster <- name
gsea_go_df$database <- db
gsea_results[[paste(db, name, sep = "_")]] <- gsea_go
combined_results_list[[paste(db, name, sep = "_")]] <- gsea_go_df
}
}
}
} else if (db == "KEGG") {
# KEGG GSEA
for (name in names(ranked_lists)) {
if (length(ranked_lists[[name]]) > 0) {
gsea_kegg <- gseKEGG(geneList = ranked_lists[[name]],
organism = db_setup$kegg_org,
pAdjustMethod = "BH",
pvalueCutoff = params$pvalue_cutoff,
minGSSize = params$minGSSize,
maxGSSize = params$maxGSSize,
nPerm = 1000)
if (nrow(gsea_kegg) > 0) {
gsea_kegg_df <- as.data.frame(gsea_kegg)
gsea_kegg_df$cluster <- name
gsea_kegg_df$database <- db
gsea_results[[paste(db, name, sep = "_")]] <- gsea_kegg
combined_results_list[[paste(db, name, sep = "_")]] <- gsea_kegg_df
}
}
}
} else if (db == "HALLMARK") {
# HALLMARK GSEA using local data
data_dir <- "/jp_envs/Enrichment/"
hallmark_file <- file.path(data_dir, "h.all.v7.5.1.entrez.gmt")
if (file.exists(hallmark_file)) {
hallmark_genesets <- read_gmt_file(hallmark_file)
for (name in names(ranked_lists)) {
if (length(ranked_lists[[name]]) > 0) {
tryCatch({
gsea_msig <- GSEA(geneList = ranked_lists[[name]],
TERM2GENE = hallmark_genesets,
pAdjustMethod = "BH",
pvalueCutoff = params$pvalue_cutoff,
minGSSize = params$minGSSize,
maxGSSize = params$maxGSSize,
nPerm = 1000)
if (nrow(gsea_msig) > 0) {
gsea_msig_df <- as.data.frame(gsea_msig)
gsea_msig_df$cluster <- name
gsea_msig_df$database <- db
gsea_results[[paste(db, name, sep = "_")]] <- gsea_msig
combined_results_list[[paste(db, name, sep = "_")]] <- gsea_msig_df
}
}, error = function(e) {
cat("Warning: Failed to analyze HALLMARK for", name, ":", e$message, "\n")
})
}
}
} else {
cat("Warning: HALLMARK file not found, skipping...\n")
}
} else if (db == "REACTOME") {
# REACTOME GSEA using local data
data_dir <- "/jp_envs/Enrichment/"
reactome_file <- file.path(data_dir, "c2.cp.reactome.v7.5.1.entrez.gmt")
if (file.exists(reactome_file)) {
reactome_genesets <- read_gmt_file(reactome_file)
for (name in names(ranked_lists)) {
if (length(ranked_lists[[name]]) > 0) {
tryCatch({
gsea_msig <- GSEA(geneList = ranked_lists[[name]],
TERM2GENE = reactome_genesets,
pAdjustMethod = "BH",
pvalueCutoff = params$pvalue_cutoff,
minGSSize = params$minGSSize,
maxGSSize = params$maxGSSize,
nPerm = 1000)
if (nrow(gsea_msig) > 0) {
gsea_msig_df <- as.data.frame(gsea_msig)
gsea_msig_df$cluster <- name
gsea_msig_df$database <- db
gsea_results[[paste(db, name, sep = "_")]] <- gsea_msig
combined_results_list[[paste(db, name, sep = "_")]] <- gsea_msig_df
}
}, error = function(e) {
cat("Warning: Failed to analyze REACTOME for", name, ":", e$message, "\n")
})
}
}
} else {
cat("Warning: REACTOME file not found, skipping...\n")
}
}
}
# Combine results
if (length(combined_results_list) > 0) {
all_columns <- unique(unlist(lapply(combined_results_list, colnames)))
cat("All columns found:", paste(all_columns, collapse = ", "), "\n")
standardized_results <- lapply(combined_results_list, function(df) {
for (col in all_columns) {
if (!col %in% colnames(df)) df[[col]] <- NA
}
df <- df[, all_columns, drop = FALSE]
df
})
combined_results <- do.call(rbind, standardized_results)
# Save results
base_dir <- file.path(params$output_path, "GSEA")
if (!dir.exists(base_dir)) dir.create(base_dir, recursive = TRUE)
output_file <- file.path(base_dir, "gsea_enrichment_results.csv")
write.csv(combined_results, output_file, row.names = FALSE)
cat("GSEA results saved to:", output_file, "\n")
return(list(results = combined_results, gsea_objects = gsea_results, ranked_lists = ranked_lists))
} else {
cat("No significant GSEA results found.\n")
return(NULL)
}
}
# Function 4: create ORA visualization plots
create_ora_plots <- function(ora_results, params) {
if (is.null(ora_results)) { cat("No ORA results to visualize.\n"); return(invisible(NULL)) }
df <- ora_results$results
if (is.null(df) || nrow(df) == 0) { cat("Empty ORA results.\n"); return(invisible(NULL)) }
if (!"direction" %in% names(df)) {
df$direction <- "All"
} else {
df$direction <- ifelse(is.na(df$direction) | df$direction == "", "All", as.character(df$direction))
}
# generation
if (!"generation" %in% names(df)) {
gen <- rep(NA_real_, nrow(df))
if ("GeneRatio" %in% names(df)) {
gr <- as.character(df$GeneRatio)
gen_from_gr <- suppressWarnings(vapply(gr, function(x) {
if (is.na(x) || x == "") return(NA_real_)
if (grepl("/", x, fixed = TRUE)) {
p <- strsplit(x, "/", fixed = TRUE)[[1]]
as.numeric(p[1]) / as.numeric(p[2])
} else as.numeric(x)
}, numeric(1)))
gen <- gen_from_gr
}
if (all(is.na(gen)) && "Count" %in% names(df)) gen <- suppressWarnings(as.numeric(df$Count))
df$generation <- gen
}
# GO
df$ont <- NA_character_
if ("ONTOLOGY" %in% names(df)) df$ont <- as.character(df$ONTOLOGY)
else if ("gs_subcat" %in% names(df)) df$ont <- toupper(as.character(df$gs_subcat))
else if ("subcategory" %in% names(df)) df$ont <- toupper(as.character(df$subcategory))
else if ("category" %in% names(df)) df$ont <- toupper(as.character(df$category))
df$ont[grepl("BP", df$ont, ignore.case = TRUE)] <- "BP"
df$ont[grepl("CC", df$ont, ignore.case = TRUE)] <- "CC"
df$ont[grepl("MF", df$ont, ignore.case = TRUE)] <- "MF"
df$ont[is.na(df$ont) | df$ont == ""] <- "Other"
clusters <- sort(unique(df$cluster))
databases <- sort(unique(df$database))
directions <- sort(unique(df$direction))
for (db in databases) {
for (dirc in directions) {
db_dot_dir <- file.path(params$output_path, "ORA", db, dirc, "dotplot")
db_bar_dir <- file.path(params$output_path, "ORA", db, dirc, "bar")
if (!dir.exists(db_dot_dir)) dir.create(db_dot_dir, recursive = TRUE)
if (!dir.exists(db_bar_dir)) dir.create(db_bar_dir, recursive = TRUE)
for (cl in clusters) {
cl_df <- df[df$database == db & df$cluster == cl & df$direction == dirc, , drop = FALSE]
# filter
cl_df <- cl_df[is.finite(cl_df$generation) & !is.na(cl_df$Description), , drop = FALSE]
if (nrow(cl_df) == 0) next
# Top10
cl_top <- cl_df[order(-cl_df$generation, cl_df$p.adjust), , drop = FALSE]
if (nrow(cl_top) > 10) cl_top <- cl_top[seq_len(10), , drop = FALSE]
cl_top$neglogp <- -log10(cl_top$p.adjust)
# 1. dotplot
cl_top$Description <- forcats::fct_reorder(cl_top$Description, cl_top$generation, .desc = FALSE)
p_dot <- ggplot(cl_top, aes(x = generation, y = Description, size = Count, color = neglogp))
if (identical(db, "GO")) {
p_dot <- p_dot + geom_point(aes(shape = ont)) +
scale_shape_manual(values = c(BP = 18, CC = 17, MF = 15, Other = 16), na.translate = FALSE)
} else {
p_dot <- p_dot + geom_point()
}
p_dot <- p_dot +
scale_color_gradientn(colors = params$continuous_colors) +
labs(title = paste0("ORA dotplot - ", dirc, " - Cluster ", cl, " (", db, ")"),
x = "generation", y = "Enriched Terms",
size = "Gene Count", color = "-log10(Adj.P)") +
theme_bw() + theme(axis.text.y = element_text(size = 8))
ggsave(file.path(db_dot_dir, paste0("cluster_", cl, ".pdf")),
p_dot, width = 12, height = 8, dpi = params$plot_dpi)
# 2. barplot
p_bar <- ggplot(cl_top, aes(
x = generation,
y = forcats::fct_reorder(Description, generation, .desc = FALSE),
fill = neglogp
)) +
geom_col() +
scale_fill_gradientn(colors = params$continuous_colors) +
labs(
title = paste0("ORA Bar - ", dirc, " - Cluster ", cl, " (", db, ")"),
x = "generation", y = "Enriched Terms", fill = "-log10(Adj.P)"
) +
theme_bw() + theme(axis.text.y = element_text(size = 8))
ggsave(file.path(db_bar_dir, paste0("cluster_", cl, ".pdf")),
p_bar, width = 12, height = 8, dpi = params$plot_dpi)
}
}
}
cat("\nORA plots saved (split by Up/Down).\n")
invisible(NULL)
}
# Function 5: create GSEA visualization plots
create_gsea_plots <- function(gsea_results, params) {
if (is.null(gsea_results)) {
cat("No GSEA results to visualize.\n")
return(invisible(NULL))
}
df <- gsea_results$results
gsea_objects <- gsea_results$gsea_objects
if (is.null(df) || nrow(df) == 0) {
cat("Empty GSEA results.\n")
return(invisible(NULL))
}
df$neglogp <- -log10(df$p.adjust)
df$ont <- NA_character_
if ("ONTOLOGY" %in% names(df)) df$ont <- as.character(df$ONTOLOGY)
df$ont[grepl("BP", df$ont, ignore.case = TRUE)] <- "BP"
df$ont[grepl("CC", df$ont, ignore.case = TRUE)] <- "CC"
df$ont[grepl("MF", df$ont, ignore.case = TRUE)] <- "MF"
df$ont[is.na(df$ont) | df$ont == ""] <- "Other"
clusters <- sort(unique(df$cluster))
databases <- sort(unique(df$database))
base_dir <- file.path(params$output_path, "GSEA")
if (!dir.exists(base_dir)) dir.create(base_dir, recursive = TRUE)
for (db in databases) {
db_dot_dir <- file.path(base_dir, db, "dotplot")
db_bar_dir <- file.path(base_dir, db, "bar")
db_curve_dir <- file.path(base_dir, db, "curve")
if (!dir.exists(db_dot_dir)) dir.create(db_dot_dir, recursive = TRUE)
if (!dir.exists(db_bar_dir)) dir.create(db_bar_dir, recursive = TRUE)
if (!dir.exists(db_curve_dir)) dir.create(db_curve_dir, recursive = TRUE)
for (cl in clusters) {
cl_df <- df[df$database == db & df$cluster == cl, , drop = FALSE]
cl_df <- cl_df[is.finite(cl_df$NES) & !is.na(cl_df$Description), , drop = FALSE]
if (nrow(cl_df) == 0) next
# Top10 by |NES| desc, then by p.adjust asc
cl_top <- cl_df[order(-(cl_df$NES), cl_df$p.adjust), , drop = FALSE]
if (nrow(cl_top) > 10) cl_top <- cl_top[seq_len(10), , drop = FALSE]
# 1.dotplot
cl_top$Description <- forcats::fct_reorder(cl_top$Description, cl_top$NES, .desc = FALSE)
p_dot <- ggplot(cl_top, aes(x = NES, y = Description, size = setSize, color = neglogp))
if (identical(db, "GO")) {
p_dot <- p_dot + geom_point(aes(shape = ont), alpha = 0.85) +
scale_shape_manual(values = c(BP = 16, CC = 17, MF = 15, Other = 18), name = "GO")
} else {
p_dot <- p_dot + geom_point(alpha = 0.85)
}
p_dot <- p_dot +
scale_size_continuous(name = "Set Size") +
scale_color_gradientn(colors = params$continuous_colors) +
labs(title = paste0("GSEA dotplot - Cluster ", cl, " (", db, ")"),
x = "NES", y = "Enriched Terms", color = "-log10(Adj.P)") +
theme_minimal() + theme(axis.text.y = element_text(size = 8))
ggsave(file.path(db_dot_dir, paste0("cluster_", cl, ".pdf")),
p_dot, width = 12, height = 8, dpi = params$plot_dpi)
# 2.bar
p_bar <- ggplot(cl_top, aes(x = NES,
y = forcats::fct_reorder(Description, NES, .desc = FALSE),
fill = neglogp)) +
geom_col() +
scale_fill_gradientn(colors = params$continuous_colors) +
labs(title = paste0("GSEA Bar - Cluster ", cl, " (", db, ")"),
x = "NES", y = "Enriched Terms", fill = "-log10(Adj.P)") +
theme_minimal() + theme(axis.text.y = element_text(size = 8))
ggsave(file.path(db_bar_dir, paste0("cluster_", cl, ".pdf")),
p_bar, width = 12, height = 8, dpi = params$plot_dpi)
# 3.curve
k_curve <- 5
top_terms <- cl_df[order(cl_df$NES), , drop = FALSE]
top_terms <- top_terms[seq_len(min(k_curve, nrow(top_terms))), , drop = FALSE]
obj_keys <- names(gsea_objects)
if (length(obj_keys) > 0) {
cand <- obj_keys[grepl(paste0("^", db, "_"), obj_keys)]
cand <- cand[grepl(paste0("_", cl, "$"), cand)]
} else cand <- character(0)
if (length(cand) > 0) {
gobj <- gsea_objects[[cand[1]]]
if (inherits(gobj, "gseaResult") && nrow(gobj@result) > 0) {
ids_in_obj <- gobj@result$ID
if ("ID" %in% names(top_terms)) {
term_ids <- intersect(top_terms$ID, ids_in_obj)
} else {
term_ids <- gobj@result$ID[match(top_terms$Description, gobj@result$Description)]
term_ids <- term_ids[!is.na(term_ids)]
}
# Draw up to first k_curve curves
if (length(term_ids) > 0) {
# Individual curves
for (tid in term_ids) {
tid_idx <- which(gobj@result$ID == tid)[1]
if (length(tid_idx) == 1 && !is.na(tid_idx)) {
gp <- enrichplot::gseaplot2(gobj, geneSetID = tid_idx,
title = paste0("GSEA - ", gobj@result$Description[tid_idx],
" (Cluster ", cl, ", ", db, ")"))
safe_name <- gsub("[^A-Za-z0-9]+", "_", gobj@result$Description[tid_idx])
ggsave(file.path(db_curve_dir, paste0("cluster_", cl, "_", safe_name, ".pdf")),
gp, width = 10, height = 8, dpi = params$plot_dpi)
}
}
# Multi-curve
idx_vec <- which(gobj@result$ID %in% term_ids)
if (length(idx_vec) > 1) {
idx_vec <- idx_vec[seq_len(min(k_curve, length(idx_vec)))]
gp_multi <- enrichplot::gseaplot2(gobj, geneSetID = idx_vec,
title = paste0("GSEA Top Terms - Cluster ", cl, " (", db, ")"))
ggsave(file.path(db_curve_dir, paste0("cluster_", cl, "_top_terms.pdf")),
gp_multi, width = 12, height = 8, dpi = params$plot_dpi)
}
}
}
}
}
}
cat("\nGSEA plots saved (simple mode) to:", base_dir, "\n")
invisible(NULL)
}Perform enrichment analysis and visualization
if (params$enrichment_analysis_type == "ORA") {
cat("Performing Over-Representation Analysis (ORA)...\n")
# Perform ORA analysis on available markers
if (!is.null(clusters_markers)) {
ora_results_clusters <- perform_ora_analysis(markers_data = clusters_markers, db_setup, params)
ora_results_group <- NULL
# Create ORA visualizations
create_ora_plots(ora_results_clusters, params)
} else if (!is.null(group_markers)) {
ora_results_group <- perform_ora_analysis(markers_data = group_markers, db_setup, params)
ora_results_clusters <- NULL
# Create ORA visualizations
create_ora_plots(ora_results_group, params)
} else {
ora_results_clusters <- NULL
ora_results_group <- NULL
cat("No differential expression results available for ORA analysis.\n")
}
# Set GSEA results to NULL since we're not doing GSEA
gsea_results_clusters <- NULL
gsea_results_group <- NULL
} else if (params$enrichment_analysis_type == "GSEA") {
cat("Performing Gene Set Enrichment Analysis (GSEA)...\n")
# Perform GSEA analysis on available markers
if (!is.null(clusters_markers_all)) {
gsea_results_clusters <- perform_gsea_analysis(clusters_markers_all, db_setup, params)
saveRDS(gsea_results_clusters, file.path(params$output_path, "GSEA", "gsea_results.rds"))
gsea_results_group <- NULL
# Create GSEA visualizations
create_gsea_plots(gsea_results_clusters, params)
} else if (!is.null(group_markers_all)) {
gsea_results_group <- perform_gsea_analysis(group_markers_all, db_setup, params)
saveRDS(gsea_results_group, file.path(params$output_path, "GSEA", "gsea_results.rds"))
gsea_results_clusters <- NULL
# Create GSEA visualizations
create_gsea_plots(gsea_results_group, params)
} else {
gsea_results_clusters <- NULL
gsea_results_group <- NULL
cat("No differential expression results available for GSEA analysis.\n")
}
# Set ORA results to NULL since we're not doing ORA
ora_results_clusters <- NULL
ora_results_group <- NULL
}'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
Analyzing database: KEGG
'select()' returned 1:1 mapping between keys and columns
Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...n
Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...n
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
Analyzing database: HALLMARK
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
Analyzing database: REACTOME
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
'select()' returned 1:1 mapping between keys and columns
All columns found: ONTOLOGY, ID, Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue, geneID, Count, cluster, database, direction, category, subcategory
ORA results saved to: /home/demo-seekgene-com/workspace/project/demo-seekgene-com/DiffEnrich/result/ORA/ora_enrichment_results.csv
ORA plots saved (split by Up/Down).
Visualize custom differentially expressed genes and enriched pathways
Visualization based on custom differentially expressed genes and enriched pathways.
Configure custom gene and pathway visualization parameters
custom_plot <- FALSE # enable/disable custom gene/pathway visualizations
custom_genes <- c("C1QA","C1QB","C1QC", "CD3D", "CD3E") # genes of interest to highlight in custom plots (must exist in the Seurat object)
terms <- c("regulation of T cell activation",
"positive regulation of lymphocyte activation",
"positive regulation of leukocyte activation",
"leukocyte cell-cell adhesion",
"positive regulation of cell activation") # pathway terms in ORA/GSEA results to plotDefine custom gene and pathway visualization functions
# Function 1: Custom gene plots for clusters DE
plot_custom_genes_clusters <- function(seurat_obj, custom_genes, params, clusters_colors) {
if (length(custom_genes) == 0) {
cat("No custom genes provided.\n"); return(invisible(NULL))
}
out_dir <- file.path(params$output_path, "custom_cluster_DE")
if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)
# Keep only genes present in the object
genes_in_obj <- intersect(unique(custom_genes), rownames(seurat_obj))
if (length(genes_in_obj) == 0) {
cat("Custom genes not found in object; skip all plots.\n"); return(invisible(NULL))
}
# 1. DotPlot
cluster_col <- params$cluster_by
if (!(cluster_col %in% colnames(seurat_obj@meta.data))) {
cluster_col <- "ident"
seurat_obj@meta.data$ident <- as.character(Idents(seurat_obj))
cat("Using Idents as cluster column for custom genes plots\n")
} else {
cat("Using cluster column for custom genes plots:", cluster_col, "\n")
}
cat("Creating DotPlot for custom genes...\n")
p_dot <- DotPlot(seurat_obj, features = genes_in_obj, group.by = cluster_col) +
scale_color_gradientn(colors = params$continuous_colors) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = "Custom Genes - DotPlot (All Clusters)",
x = "Genes", y = "Clusters")
ggsave(file.path(out_dir, "custom_genes_clusters_dotplot.pdf"),
p_dot, width = 12, height = 6, dpi = params$plot_dpi)
# 2. Violin
cat("Creating Violin for custom genes...\n")
mat <- FetchData(seurat_obj, vars = genes_in_obj, slot = "data")
var_ok <- sapply(as.data.frame(mat), function(x) stats::var(x, na.rm = TRUE) > 0)
genes_var <- genes_in_obj[var_ok]
if (length(genes_var) == 0) {
cat("All requested genes have zero variance; skip Violin.\n")
} else {
if (length(genes_var) >= 2) {
p_vln <- VlnPlot(seurat_obj, features = genes_var,
stack = TRUE, flip = TRUE, group.by = cluster_col, cols = clusters_colors, fill.by = "ident") +
labs(title = "Custom Genes - Stacked Violin (All Clusters)",
x = cluster_col, y = "Expression")
} else {
p_vln <- VlnPlot(seurat_obj, features = genes_var,
stack = FALSE, flip = FALSE, group.by = cluster_col, cols = clusters_colors, fill.by = "ident") +
labs(title = "Custom Gene - Violin (All Clusters)",
x = cluster_col, y = "Expression")
}
ggsave(file.path(out_dir, "custom_genes_clusters_violin.pdf"),
p_vln, width = 12, height = 8, dpi = params$plot_dpi)
}
# 3. Heatmap
cat("Creating Heatmap for custom genes...\n")
genes_in_scale <- intersect(genes_in_obj, rownames(GetAssayData(seurat_obj, slot = "scale.data")))
if (length(genes_in_scale) < 2) {
cat("No or insufficient custom genes in scale.data; skip Heatmap.\n")
} else {
heat_obj <- seurat_obj
max_cells <- 1200
if (ncol(heat_obj) > max_cells) {
set.seed(42)
heat_obj <- subset(heat_obj, cells = sample(colnames(heat_obj), max_cells))
}
p_heat <- DoHeatmap(heat_obj, features = genes_in_scale, slot = "scale.data",
draw.lines = TRUE, group.colors = clusters_colors) +
scale_fill_gradientn(colors = params$continuous_colors) +
labs(title = "Custom Genes - Heatmap (All Clusters)")
ggsave(file.path(out_dir, "custom_genes_clusters_heatmap.pdf"),
p_heat, width = 12, height = 8, dpi = params$plot_dpi)
}
invisible(list(dotplot = p_dot, violin = if (exists("p_vln")) p_vln else NULL))
}
# Function 2: Custom gene plots for group-comparison DE
plot_custom_genes_group_comparison <- function(seurat_obj, group_markers, custom_genes, params, clusters_colors) {
if (is.null(group_markers) || nrow(group_markers) == 0) {
cat("No group comparison markers; skip custom plots.\n")
return()
}
if (!requireNamespace("ggrepel", quietly = TRUE)) {
cat("Package 'ggrepel' not available; labels will be limited.\n")
}
# Determine grouping column and cluster column
group_col <- params$group_by
# Locate cluster column by params$cluster_by; fallback to Idents if missing
cluster_col <- params$cluster_by
if (!(cluster_col %in% colnames(seurat_obj@meta.data))) {
cluster_col <- "ident"
seurat_obj@meta.data$ident <- as.character(Idents(seurat_obj))
cat("Using Idents as cluster column for custom genes plots\n")
} else {
cat("Using cluster column for custom genes plots:", cluster_col, "\n")
}
# 1. Volcano
for (cl in unique(group_markers$cluster)) {
df <- group_markers[group_markers$cluster == cl, , drop = FALSE]
if (nrow(df) == 0) next
df$highlight <- ifelse(df$gene %in% custom_genes, "Custom", "Other")
p_vol <- ggplot(df, aes(x = avg_log2FC, y = -log10(p_val_adj))) +
geom_point(aes(color = highlight), alpha = 0.7, size = 1.5) +
scale_color_manual(values = c("Custom" = "red", "Other" = "grey70")) +
geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") +
geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "red") +
labs(title = paste0("Volcano (highlight custom) - Cluster ", cl),
x = "Average Log2 Fold Change", y = "-log10(Adjusted P-value)", color = "") +
theme_bw()
if (requireNamespace("ggrepel", quietly = TRUE)) {
lab_df <- df[df$highlight == "Custom" & df$p_val_adj < 0.05 & abs(df$avg_log2FC) > 0.5, ]
if (nrow(lab_df) > 0) {
p_vol <- p_vol + ggrepel::geom_text_repel(data = lab_df,
aes(label = gene), size = 3, max.overlaps = 20)
}
}
if (!dir.exists(file.path(params$output_path, "custom_group_DE"))) dir.create(file.path(params$output_path, "custom_group_DE"), recursive = TRUE)
ggsave(file.path(params$output_path, "custom_group_DE", paste0("custom_genes_cluster_", cl, "_volcano.pdf")),
p_vol, width = 8, height = 6, dpi = params$plot_dpi)
}
# 2. For each cluster, create group-wise custom gene visualizations
clusters <- unique(group_markers$cluster)
genes_available <- intersect(custom_genes, rownames(seurat_obj))
if (length(genes_available) == 0) {
cat("No custom genes found in object; skip cluster-specific plots.\n")
return()
}
for (cl in clusters) {
cat("Creating custom gene plots for cluster", cl, "...\n")
# Subset to the specific cluster
cluster_subset <- subset(seurat_obj,
cells = rownames(seurat_obj@meta.data)[
as.character(seurat_obj@meta.data[[cluster_col]]) == cl
])
if (ncol(cluster_subset) == 0) {
cat("No cells found in cluster", cl, "; skip.\n")
next
}
# Check available groups in this cluster
cluster_groups <- unique(cluster_subset@meta.data[[group_col]])
if (length(cluster_groups) < 2) {
cat("Cluster", cl, "has insufficient groups; skip.\n")
next
}
# 2.1 DotPlot
p_dot <- DotPlot(cluster_subset, features = genes_available,
group.by = group_col) +
scale_color_gradientn(colors = params$continuous_colors) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
labs(title = paste("Custom Genes - Cluster", cl),
subtitle = paste("Groups:", paste(cluster_groups, collapse = " vs ")),
x = "Genes", y = "Groups") +
theme_bw()
ggsave(file.path(params$output_path, "custom_group_DE", paste0("custom_genes_cluster_", cl, "_dotplot.pdf")),
p_dot, width = 12, height = 6, dpi = params$plot_dpi)
# 2.2 VlnPlot
mat <- FetchData(cluster_subset, vars = genes_available, slot = "data")
var_ok <- sapply(as.data.frame(mat), function(x) stats::var(x, na.rm = TRUE) > 0)
genes_var <- genes_available[var_ok]
if (length(genes_var) == 0) {
cat("All custom genes have zero variance in cluster", cl, "; skip violin.\n")
} else {
if (length(cluster_groups) >= 2) {
if (length(genes_var) >= 2) {
p_vln <- VlnPlot(cluster_subset, features = genes_var,
stack = TRUE, flip = TRUE,
fill.by = "ident",
group.by = group_col,
cols = brewer.pal(8, params$discrete_colors)[1:2]) +
labs(title = paste("Custom Genes - Cluster", cl),
subtitle = paste("Groups:", paste(cluster_groups, collapse = " vs ")))
} else {
p_vln <- VlnPlot(cluster_subset, features = genes_var,
stack = FALSE, flip = FALSE,
fill.by = "ident",
group.by = group_col,
cols = brewer.pal(8, params$discrete_colors)[1:2]) +
labs(title = paste("Custom Gene - Cluster", cl),
subtitle = paste("Groups:", paste(cluster_groups, collapse = " vs ")))
}
ggsave(file.path(params$output_path, "custom_group_DE", paste0("custom_genes_cluster_", cl, "_violin.pdf")),
p_vln, width = 6, height = 10, dpi = params$plot_dpi)
} else {
cat("Cluster", cl, "has only one group; skip violin.\n")
}
}
# 2.3 heatmap
genes_heat <- intersect(genes_available, rownames(GetAssayData(seurat_obj, slot = "scale.data")))
if (length(genes_heat) > 0) {
if (length(genes_heat) > 20) {
genes_heat <- genes_heat[1:20]
}
if (length(cluster_groups) >= 2) {
p_heat <- DoHeatmap(cluster_subset, features = genes_heat,
slot = "scale.data",
group.colors = brewer.pal(8, params$discrete_colors),
group.by = group_col) +
scale_fill_gradientn(colors = params$continuous_colors) +
labs(title = paste("Custom Genes - Cluster", cl),
subtitle = paste("Groups:", paste(cluster_groups, collapse = " vs ")))
} else {
p_heat <- DoHeatmap(cluster_subset, features = genes_heat,
slot = "scale.data") +
scale_fill_gradientn(colors = params$continuous_colors) +
labs(title = paste("Custom Genes - Cluster", cl),
subtitle = paste("Groups:", paste(cluster_groups, collapse = " vs ")))
}
ggsave(file.path(params$output_path, "custom_group_DE", paste0("custom_genes_cluster_", cl, "_heatmap.pdf")),
p_heat, width = 12, height = 8, dpi = params$plot_dpi)
} else {
cat("No custom genes found in scale.data for cluster", cl, "; skip heatmap.\n")
}
}
}
# Function 3: Custom ORA plots
plot_custom_from_ora_csv <- function(
csv_path,
terms,
output_dir,
plot_dpi = 300
) {
if (!file.exists(csv_path)) {
cat("ORA CSV not found:", csv_path, "\n"); return(invisible(NULL))
}
if (length(terms) == 0) {
cat("No terms provided. Nothing to plot.\n"); return(invisible(NULL))
}
df <- suppressMessages(readr::read_csv(csv_path, show_col_types = FALSE))
need_cols <- c("Description", "p.adjust", "Count", "cluster", "database")
miss <- setdiff(need_cols, colnames(df))
if (length(miss) > 0) {
cat("CSV missing columns:", paste(miss, collapse = ", "), "\n")
return(invisible(NULL))
}
if (!"direction" %in% colnames(df)) df$direction <- "All"
# terms
sel <- tolower(df$Description) %in% tolower(terms)
sub <- df[sel, , drop = FALSE]
if (nrow(sub) == 0) {
cat("No matched pathways in CSV (case-insensitive).\n"); return(invisible(NULL))
}
out_dir_base <- file.path(output_dir, "custom_ORA")
if (!dir.exists(out_dir_base)) dir.create(out_dir_base, recursive = TRUE)
# generation and -log10(padj)
if (!"generation" %in% colnames(sub)) {
if ("GeneRatio" %in% colnames(sub)) {
gr <- as.character(sub$GeneRatio)
sub$generation <- suppressWarnings(vapply(gr, function(x) {
if (is.na(x) || x == "") return(NA_real_)
if (grepl("/", x, fixed = TRUE)) {
p <- strsplit(x, "/", fixed = TRUE)[[1]]
as.numeric(p[1]) / as.numeric(p[2])
} else as.numeric(x)
}, numeric(1)))
} else if ("Count" %in% colnames(sub)) {
sub$generation <- suppressWarnings(as.numeric(sub$Count))
} else {
sub$generation <- NA_real_
}
}
sub$neglogp <- -log10(sub$p.adjust)
# GO
if (!"ONTOLOGY" %in% colnames(sub)) sub$ONTOLOGY <- NA_character_
# plot
for (db in sort(unique(sub$database))) {
for (dirc in sort(unique(sub$direction))) {
sub_db_dir <- file.path(out_dir_base, db, dirc)
dir_dot <- file.path(sub_db_dir, "dotplot")
dir_bar <- file.path(sub_db_dir, "bar")
for (d in c(dir_dot, dir_bar)) if (!dir.exists(d)) dir.create(d, recursive = TRUE)
sub_db_dirc <- sub[sub$database == db & sub$direction == dirc, , drop = FALSE]
if (nrow(sub_db_dirc) == 0) next
for (cl in sort(unique(sub_db_dirc$cluster))) {
sub_cl <- sub_db_dirc[sub_db_dirc$cluster == cl, , drop = FALSE]
if (nrow(sub_cl) == 0) next
# sort
sub_cl <- sub_cl[order(sub_cl$generation, sub_cl$p.adjust), , drop = FALSE]
sub_cl$Description <- forcats::fct_reorder(sub_cl$Description, sub_cl$generation, .desc = FALSE)
# GO
sub_cl$go_type <- "Other"
if ("ONTOLOGY" %in% colnames(sub_cl)) {
sub_cl$go_type <- toupper(ifelse(is.na(sub_cl$ONTOLOGY), "Other", as.character(sub_cl$ONTOLOGY)))
sub_cl$go_type[!sub_cl$go_type %in% c("BP","CC","MF")] <- "Other"
}
# 1. dotplot
p_dot <- ggplot(sub_cl, aes(x = generation, y = Description, size = Count, color = neglogp)) +
geom_point(aes(shape = go_type)) +
scale_shape_manual(values = c(BP = 16, CC = 17, MF = 15, Other = 19), na.translate = FALSE) +
scale_color_gradientn(colors = params$continuous_colors) +
labs(title = paste0("ORA dotplot (Selected Terms) - ", dirc, " - Cluster ", cl, " (", db, ")"),
x = "generation", y = "Enriched Terms",
size = "Gene Count", color = "-log10(Adj.P)", shape = "GO Type") +
theme_minimal() + theme(axis.text.y = element_text(size = 8))
ggsave(file.path(dir_dot, paste0("cluster_", cl, "_", dirc, ".pdf")),
p_dot, width = 12, height = 8, dpi = plot_dpi)
# 2. barplot
p_bar <- ggplot(sub_cl, aes(x = generation,
y = forcats::fct_reorder(Description, generation, .desc = FALSE),
fill = neglogp)) +
geom_col() +
scale_fill_gradientn(colors = params$continuous_colors) +
labs(title = paste0("ORA Bar (Selected Terms) - ", dirc, " - Cluster ", cl, " (", db, ")"),
x = "generation", y = "Enriched Terms", fill = "-log10(Adj.P)") +
theme_minimal() + theme(axis.text.y = element_text(size = 8))
ggsave(file.path(dir_bar, paste0("cluster_", cl, "_", dirc, ".pdf")),
p_bar, width = 12, height = 8, dpi = plot_dpi)
}
}
}
cat("Custom ORA plots saved to:", out_dir_base, "\n")
invisible(NULL)
}
# Function 4: Custom GSEA plots
plot_custom_from_gsea_csv <- function(csv_path,
terms,
output_dir,
plot_dpi = 300,
discrete_palette = "Paired",
gsea_rds_path
) {
if (!file.exists(csv_path)) stop("GSEA CSV not found: ", csv_path)
if (!file.exists(gsea_rds_path)) stop("GSEA RDS not found: ", gsea_rds_path)
# Read GSEA object
gsea_res <- readRDS(gsea_rds_path)
gsea_objects <- gsea_res$gsea_objects
if (is.null(gsea_objects) || !length(gsea_objects)) {
stop("gsea_objects not found inside RDS. Ensure you saved the full result of perform_gsea_analysis().")
}
df <- suppressMessages(readr::read_csv(csv_path, show_col_types = FALSE))
need_cols <- c("ID","Description","setSize","NES","p.adjust","cluster","database")
miss <- setdiff(need_cols, colnames(df))
if (length(miss) > 0) stop("CSV missing required columns: ", paste(miss, collapse=", "))
if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)
if (length(terms) == 0) stop("No terms provided.")
desc_norm <- tolower(df$Description)
terms_norm <- tolower(terms)
sub <- df[desc_norm %in% terms_norm, , drop = FALSE]
if (nrow(sub) == 0) stop("No matched pathways in CSV for given terms.")
# GO type
if ("ONTOLOGY" %in% colnames(sub)) {
sub$go_type <- toupper(as.character(sub$ONTOLOGY))
sub$go_type[!sub$go_type %in% c("BP","CC","MF")] <- "Other"
} else {
sub$go_type <- "Other"
}
sub$go_type <- factor(sub$go_type, levels = c("BP","CC","MF","Other"))
base_dir <- file.path(output_dir, "custom_GSEA")
if (!dir.exists(base_dir)) dir.create(base_dir, recursive = TRUE)
for (db in unique(sub$database)) {
db_dotplot_dir <- file.path(base_dir, db, "dotplot")
db_bar_dir <- file.path(base_dir, db, "bar")
db_curve_dir <- file.path(base_dir, db, "curve")
for (d in c(db_dotplot_dir, db_bar_dir, db_curve_dir)) if (!dir.exists(d)) dir.create(d, recursive = TRUE)
for (cl in unique(sub$cluster)) {
cl_df <- sub[sub$cluster == cl & sub$database == db, , drop = FALSE]
if (nrow(cl_df) == 0) next
# 1. barplot
p_bar <- ggplot(cl_df,
aes(x = reorder(Description, NES),
y = NES,
fill = -log10(p.adjust))) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_gradientn(colors = params$continuous_colors, name = "-log10(Adj.P)") +
labs(title = paste0("GSEA Bar (Selected Terms) - Cluster ", cl, " (", db, ")"),
x = "Enriched Terms", y = "NES") +
theme_minimal() + theme(axis.text.y = element_text(size = 8))
ggsave(file.path(db_bar_dir, paste0("cluster_", cl, "_custom_bar.pdf")),
p_bar, width = 12, height = 8, dpi = plot_dpi)
# 2. dotplot
p_dot <- ggplot(cl_df,
aes(x = NES, y = reorder(Description, NES),
size = setSize, color = -log10(p.adjust),
shape = go_type)) +
scale_color_gradientn(colors = params$continuous_colors, name = "-log10(Adj.P)") +
scale_size_continuous(name = "Set Size", range = c(2, 8)) +
scale_shape_manual(values = c("BP"=18, "CC"=17, "MF"=15, "Other"=16), name = "ont") +
geom_point(alpha = 0.8) +
labs(title = paste0("GSEA DotPlot (Selected Terms) - Cluster ", cl, " (", db, ")"),
x = "NES", y = "Enriched Terms") +
theme_minimal() + theme(axis.text.y = element_text(size = 8)) +
geom_vline(xintercept = 0, linetype = "dashed", color = "grey50")
ggsave(file.path(db_dotplot_dir, paste0("cluster_", cl, "_custom_dotplot.pdf")),
p_dot, width = 12, height = 8, dpi = plot_dpi)
# 3. curve
keys <- names(gsea_objects)
keys <- keys[grepl(paste0("^", db, "_"), keys)]
keys <- keys[grepl(paste0("_", cl, "$"), keys)]
if (length(keys) == 0) {
warning("No gseaResult object found for ", db, " cluster=", cl, ". Skip curves."); next
}
gobj <- gsea_objects[[keys[1]]]
if (!(inherits(gobj, "gseaResult") && nrow(gobj) > 0)) {
warning("gsea_object is not a valid gseaResult or empty for ", db, " cluster=", cl); next
}
for (i in seq_len(nrow(cl_df))) {
term_id <- cl_df$ID[i]
term_desc <- cl_df$Description[i]
clean_nm <- gsub("[^A-Za-z0-9]", "_", term_desc)
idx <- which(gobj@result$ID == term_id)
if (length(idx) == 0) next
gp <- gseaplot2(gobj, geneSetID = idx,
title = paste0("GSEA - ", term_desc, " (Cluster ", cl, ")"))
ggsave(file.path(db_curve_dir, paste0("cluster_", cl, "_", clean_nm, ".pdf")),
gp, width = 10, height = 8, dpi = plot_dpi)
}
}
}
cat("Custom GSEA plots (bar, dot, curves) saved to:", base_dir, "\n")
invisible(TRUE)
}perform custom genes and pathways visualization
# Custom plots
if (custom_plot) {
if (params$differential_analysis_type == "clusters") {
# Visualize custom genes across clusters
cat("Performing ALL CLUSTERS differential expression analysis...\n")
plot_custom_genes_clusters(seurat_obj, custom_genes, params, clusters_colors)
} else if (params$differential_analysis_type == "group_comparison") {
# Visualize custom genes inside clusters split by groups
cat("Performing GROUP COMPARISON differential expression analysis...\n")
plot_custom_genes_group_comparison(seurat_obj, group_markers, custom_genes, params, clusters_colors)
}
if (params$enrichment_analysis_type == "ORA") {
# Plot selected pathways from ORA CSV
file_path <- file.path(params$output_path, "ORA", "ora_enrichment_results.csv")
plot_custom_from_ora_csv(
csv_path = file_path,
terms = terms,
output_dir = params$output_path,
plot_dpi = params$plot_dpi
)
} else if (params$enrichment_analysis_type == "GSEA") {
# Plot selected pathways from GSEA CSV
file_path <- file.path(params$output_path, "GSEA", "gsea_enrichment_results.csv")
plot_custom_from_gsea_csv(
csv_path = file_path,
terms = terms,
output_dir = params$output_path,
plot_dpi = params$plot_dpi,
discrete_palette = params$discrete_colors,
gsea_rds_path = file.path(params$output_path, "GSEA", "gsea_results.rds")
)
}
}Template Information
Template Launch Date: September 16, 2025
Last Updated: September 16, 2025
Contact: For questions, issues, or suggestions regarding this tutorial, please contact us through the "Intelligent Consultation" service.
References
[1] Seurat Package: https://github.com/satijalab/Seurat
[2] clusterProfiler: https://github.com/YuLab-SMU/clusterProfiler
[3] Gene Ontology: http://geneontology.org/
[4] KEGG: https://www.genome.jp/kegg/
[5] MSigDB: https://www.gsea-msigdb.org/
[6] Reactome: https://reactome.org/ Liu X., Wu C., Pan L., et al. (2025). SeekSoul Online: A user-friendly bioinformatics platform focused on single-cell multi-omics analysis. The Innovation Life 3:100156. https://doi.org/10.59717/j.xinn-life.2025.100156
