Skip to content

单细胞差异表达与功能富集分析全流程教程

作者: SeekGene
时长: 55 分钟
字数: 11.6k 字
更新: 2026-02-27
阅读: 0 次
3' 转录组 5' + 免疫组库 ATAC + RNA 双组学 FFPE 单细胞转录组 Notebooks 全序列转录组 分析指南 差异富集分析 甲基化 + RNA 双组学 空间转录组

差异表达与富集分析介绍

本教程提供了单细胞 RNA 测序数据差异表达分析和富集分析的综合指南。分析包括:

  • 差异表达分析:识别不同细胞簇或实验条件之间差异表达的基因
  • 富集分析:通过通路和功能分析了解差异表达基因的生物学意义

内核配置

重要提示:本教程应使用 "common_r" 内核运行。

所需包:

  • Seurat (>= 4.0)
  • clusterProfiler, msigdbr, org.Hs.eg.db, org.Mm.eg.db
  • dplyr, tidyr, readr
  • enrichplot, ggplot2, ggrepel, RColorBrewer, viridis, ComplexHeatmap and patchwork

加载库和参数配置

加载库

加载所有必需的库。

R
# 加载所需的库
suppressMessages({
    library(Seurat)
    library(dplyr)
    library(tidyr)
    library(ggplot2)
    library(patchwork)
    library(clusterProfiler)
    library(org.Hs.eg.db)
    library(org.Mm.eg.db)
    library(enrichplot)
    library(msigdbr)
    library(viridis)
    library(RColorBrewer)
    library(ComplexHeatmap)
    library(readr)
    library(ggrepel)
})

# 设置随机种子以保证可重复性
set.seed(42)

print("Libraries loaded successfully!")
output
[1] "Libraries loaded successfully!"

配置分析参数

设置分析参数,包括文件路径和可视化设置。

R
# 分析参数
params <- list(
  # 数据路径
  input_path = "/home/demo-seekgene-com/workspace/project/demo-seekgene-com/CellAnnotation/result/Step4_Final_Cell_Annotation.rds",
  output_path = "/home/demo-seekgene-com/workspace/project/demo-seekgene-com/DiffEnrich/result",
  
  # 物种信息
  species = "human",  # Options: "human", "mouse"
  
  # 可视化参数
  discrete_colors = "Paired",  # 用于离散变量
  continuous_colors = viridis(100),  # 用于连续变量
  plot_dpi = 300,   # 绘图分辨率
  
  # 差异表达参数
  differential_analysis_type = "group_comparison",  # 选项:"clusters" 或 "group_comparison"
  cluster_by = "RNA_snn_res.0.5",  # 聚类或细胞类型的列名(例如 resolution.0.5_d30, celltype)
  select_clusters = c("0", "5", "6"),  # 要分析的 "cluster_by" 的可选子集;设为 NULL 以包含所有簇
  group_by = "sample",  # 分组的列名(例如 sample, condition)
  compare_groups = c("S133TvsS133A"),  # 要分析的 "group_by" 的可选子集,成对比较规范(例如 "TreatedvsControl","DiseasevsHealth");设为 NULL 以自动生成所有对
  de_method = "wilcox",  # 选项:"wilcox", "bimod", "roc", "t", "negbinom", "poisson", "LR", "MAST"
  min_pct = 0.1,  # 表达该基因的细胞的最小百分比
  logfc_threshold = 0.5,  # Seurat 寻找标记物时使用的对数倍数变化阈值
  p_adj_cutoff = 0.05,          # 保留 DEG 表中行的校正后 P 值截止值
  abs_log2fc_cutoff = 0.5,       # 保留 DEG 表中行的绝对 log2FC 截止值
  
  # 富集分析参数
  enrichment_analysis_type = "ORA",  # 选项:"ORA" 或 "GSEA"
  enrichment_databases = "GO,KEGG,HALLMARK,REACTOME",  # 逗号分隔
  pvalue_cutoff = 0.05,  # 富集分析的 P 值截止值
  qvalue_cutoff = 0.2,   # 富集分析的 Q 值截止值
  minGSSize = 10,        # 富集分析中的最小基因集大小
  maxGSSize = 500       # 富集分析中的最大基因集大小
)

# 如果输出目录不存在,则创建它
if (!dir.exists(params$output_path)) {
  dir.create(params$output_path, recursive = TRUE)
}

cat("Analysis configuration:")
cat("\n- Output directory:", params$output_path, "\n")
if (params$differential_analysis_type == "clusters") {
  if (!is.null(params$select_clusters)) {
    cat("- differential analysis among clusters: ",
        paste(params$select_clusters, collapse = ", "), "\n", sep = "")
  } else {
    cat("- pairwise differential analysis across all clusters\n")
  }
} else if (params$differential_analysis_type == "group_comparison") {
  comp_str <- if (is.null(params$compare_groups)) "auto-generate all pairwise groups" else paste(params$compare_groups, collapse = ", ")
  cl_str <- if (is.null(params$select_clusters)) "all clusters" else paste(params$select_clusters, collapse = ", ")
  cat("- group comparison ", comp_str, " within clusters: ", cl_str, "\n", sep = "")
}
cat("\n- Differential analysis type:", params$differential_analysis_type)
cat("\n- Enrichment analysis type:", params$enrichment_analysis_type)
output
Analysis configuration:
- Output directory: /home/demo-seekgene-com/workspace/project/demo-seekgene-com/DiffEnrich/result
- group comparison S133TvsS133A within clusters: 0, 5, 6

- Enrichment analysis type: ORA

加载和准备数据

加载单细胞数据并为差异表达分析做好准备。

R
# 加载 Seurat 对象
cat("Loading data from:", params$input_path, "\n")
seurat_obj <- readRDS(params$input_path)

# 基本数据信息
cat("\nData Summary:")
cat("\n- Number of cells:", ncol(seurat_obj))
cat("\n- Number of genes:", nrow(seurat_obj))
cat("\n- Available metadata columns:", paste(colnames(seurat_obj@meta.data), collapse = ", "))

# 检查可用簇
if (params$cluster_by %in% colnames(seurat_obj@meta.data)) {
  cat("\n- Available clusters:", paste(unique(seurat_obj[[params$cluster_by]]), collapse = ", "))
}

# 检查可用分组
if (params$group_by %in% colnames(seurat_obj@meta.data)) {
  cat("\n- Available groups:", paste(unique(seurat_obj@meta.data[[params$group_by]]), collapse = ", "))
}

# 如果尚未设置,则将默认测定设置为 RNA
DefaultAssay(seurat_obj) <- "RNA"

# 设置簇颜色
n_clusters <- length(table(seurat_obj[[params$cluster_by]]))
clusters_colors <- brewer.pal(min(n_clusters, 12), params$discrete_colors)
if (n_clusters > 12) {
    clusters_colors <- colorRampPalette(clusters_colors)(n_clusters)
}

# 缩放数据
if (dim(seurat_obj@assays$RNA@scale.data)[1] > 1) {
  cat("\nThe scale.data slot exists.")
} else {
  cat("\nScaling data...")
  seurat_obj <- ScaleData(seurat_obj)
}

# 如果指定了 select_clusters,先对数据进行子集化
if (!is.null(params$select_clusters)) {
  cat("\nFiltering clusters:", paste(params$select_clusters, collapse = ", "))
  
  # 确保 select_clusters 为字符型以匹配元数据类型
  select_clusters <- as.character(params$select_clusters)
  
  # 验证请求的簇是否存在
  available_clusters <- unique(seurat_obj@meta.data[[params$cluster_by]])
  missing_clusters <- setdiff(select_clusters, available_clusters)
  if (length(missing_clusters) > 0) {
    warning("The following clusters do not exist and will be ignored: ", paste(missing_clusters, collapse = ", "))
  }
  
  # 仅保留有效簇
  valid_clusters <- intersect(select_clusters, available_clusters)
  if (length(valid_clusters) == 0) {
    stop("No valid clusters found for differential analysis")
  }
  
  # 将数据子集化为选定的簇
  seurat_obj <- subset(seurat_obj, 
                          cells = rownames(seurat_obj@meta.data)[
                            as.character(seurat_obj@meta.data[[params$cluster_by]]) %in% valid_clusters
                          ])
  
  cat("\nCells after filtering:", ncol(seurat_obj))
  cat("\nCluster distribution after filtering:\n")
  print(table(seurat_obj@meta.data[[params$cluster_by]]))
}

cat("\nData loaded successfully!")
output
Loading data from: /home/demo-seekgene-com/workspace/project/demo-seekgene-com/CellAnnotation/result/Step4_Final_Cell_Annotation.rds

Data Summary:
- Number of cells: 16924
- Number of genes: 24091
- Available metadata columns: orig.ident, nCount_RNA, nFeature_RNA, sample, species, percent.mitochondrial, filter_status, pANN_0.05_0.3_1372, DF.classifications_0.05_0.3_1372, RNA_snn_res.0.5, seurat_clusters, SingleR_HumanPrimaryCellAtlas, SingleR_score_HumanPrimaryCellAtlas, SingleR_BlueprintEncode, SingleR_score_BlueprintEncode, Lymphoid_B_cells_Score, Lymphoid_Plasma_cells_Score, Lymphoid_T_cells_Score, Lymphoid_NK_cells_Score, Myeloid_Monocytes_Score, Myeloid_Macrophages_Score, Myeloid_DCs_Score, Myeloid_pDCs_Score, Myeloid_Granulocytes_Score, Myeloid_Neutrophils_Score, Myeloid_Mast_cells_Score, Myeloid_Megakaryocytes_Score, Myeloid_Erythrocytes_Score, Stromal_Epithelial_cells_Score, Stromal_Endothelial_cells_Score, Stromal_Fibroblasts_Score, Stromal_Proliferating_cells_Score, celltype
- Available clusters: c(1, 2, 7, 10, 5, 3, 6, 11, 8, 4, 16, 15, 9, 13, 12, 17, 14, 18)
- Available groups: S133A, S133T
The scale.data slot exists.
Filtering clusters: 0, 5, 6
Cells after filtering: 6395
Cluster distribution after filtering:

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
4645 0 0 0 0 892 858 0 0 0 0 0 0 0 0 0
16 17
0 0

Data loaded successfully!

差异表达与富集分析

差异表达分析与可视化

簇间比较:比较并可视化细胞簇之间的差异
簇内分组比较:比较并可视化特定簇内不同实验组之间的差异

定义差异表达分析和可视化的函数

R
# 函数 1:簇差异表达分析
perform_clusters_de <- function(seurat_obj, params) {
  cat("Performing differential expression analysis across all clusters...\n")  

  # 查找所有簇的标记物
  all_markers <- FindAllMarkers(
    seurat_obj,
    only.pos = TRUE,
    min.pct = params$min_pct,
    logfc.threshold = params$logfc_threshold,
    test.use = params$de_method
  )
  
  # 保存结果
  base_dir <- file.path(params$output_path, "cluster_DE")
  if (!dir.exists(base_dir)) dir.create(base_dir, recursive = TRUE)
  output_file <- file.path(base_dir, "clusters_markers_all.csv")
  write.csv(all_markers, output_file, row.names = FALSE)
  cat("Results saved to:", output_file, "\n")
  
  return(all_markers)
}

# 函数 2:解析比较对
parse_compare_groups <- function(params, available_groups) {
  parsed <- list()
  
  for (comp in params$compare_groups) {
    # 确保字符串包含 "vs"
    if (!grepl("vs", comp)) {
      warning("Invalid comparison format:", comp, "- should contain 'vs'")
      next
    }
    
    # 拆分字符串
    parts <- strsplit(comp, "vs")[[1]]
    if (length(parts) != 2) {
      warning("Invalid comparison format:", comp, "- should have exactly one 'vs'")
      next
    }
    
    ident1 <- trimws(parts[1])
    ident2 <- trimws(parts[2])
    
    # 验证分组是否存在
    if (!(ident1 %in% available_groups)) {
      warning("Group", ident1, "not found in available groups")
      next
    }
    if (!(ident2 %in% available_groups)) {
      warning("Group", ident2, "not found in available groups")
      next
    }
    
    parsed[[length(parsed) + 1]] <- list(
      ident1 = ident1,
      ident2 = ident2,
      comparison = comp
    )
  }
  
  return(parsed)
}

# 函数 3:分组比较差异表达分析
perform_group_comparison_de <- function(seurat_obj, params) {
  # 验证分组列是否存在
  if (!params$group_by %in% colnames(seurat_obj@meta.data)) {
    stop(paste("Group column", params$group_by, "not found in metadata"))
  }
  
  # 获取所有可用分组
  available_groups <- unique(seurat_obj@meta.data[[params$group_by]])
  cat("Available groups:", paste(available_groups, collapse = ", "), "\n")
  
  # 如果未指定,自动生成所有成对比较
  if (is.null(params$compare_groups)) {
    if (length(available_groups) >= 2) {
      params$compare_groups <- combn(available_groups, 2, function(x) paste(x[1], "vs", x[2], sep = ""))
      cat("Auto-generated comparisons:", paste(params$compare_groups, collapse = ", "), "\n")
    } else {
      stop("Need at least 2 groups for comparison")
    }
  }
  
  # 解析比较对
  parsed_comparisons <- parse_compare_groups(params, available_groups)
  
  # 获取可用簇
  available_clusters <- unique(seurat_obj@meta.data[[params$cluster_by]])
  cat("Available clusters:", paste(available_clusters, collapse = ", "), "\n")
  
  # 存储所有结果
  group_markers_list <- list()
  
  # 比较每个簇内的分组
  for (cluster in available_clusters) {
    cat("\nAnalyzing cluster:", cluster, "\n")
    
    # 子集化为特定簇
    cells_in_cluster <- colnames(seurat_obj)[seurat_obj@meta.data[[params$cluster_by]] == cluster]
    cluster_subset <- subset(seurat_obj, cells = cells_in_cluster)

    # 确保该簇至少有两个分组
    cluster_groups <- unique(cluster_subset@meta.data[[params$group_by]])
    if (length(cluster_groups) < 2) {
      cat("Skipping cluster", cluster, "- insufficient groups\n")
      next
    }

    # 对每个比较对执行 DE
    for (comp in parsed_comparisons) {
      ident1 <- comp$ident1
      ident2 <- comp$ident2
      
      # 检查该簇中是否存在这两个分组
      if (!(ident1 %in% cluster_groups) || !(ident2 %in% cluster_groups)) {
        cat("Skipping comparison", comp$comparison, "- groups not found in cluster", cluster, "\n")
        next
      }
      
      cat("  Comparing", ident1, "vs", ident2, "\n")
      
      # 设置分组标识
      Idents(cluster_subset) <- params$group_by
      
      # 运行差异表达
      tryCatch({
        markers <- FindMarkers(
          cluster_subset,
          ident.1 = ident1,
          ident.2 = ident2,
          only.pos = FALSE,
          min.pct = params$min_pct,
          logfc.threshold = params$logfc_threshold,
          test.use = params$de_method
        )
        
        # 添加元数据
        markers$gene <- rownames(markers)
        markers$cluster <- cluster
        markers$ident1 <- ident1
        markers$ident2 <- ident2
        markers$comparison <- comp$comparison
        
        # Store results
        result_name <- paste0("cluster_", cluster, "_", comp$comparison)
        group_markers_list[[result_name]] <- markers
        
        cat("    Found", nrow(markers), "differentially expressed genes\n")
        
      }, error = function(e) {
        cat("    Error in comparison", comp$comparison, ":", e$message, "\n")
      })
    }
  }
  
  # 合并所有结果
  if (length(group_markers_list) > 0) {
    group_markers <- do.call(rbind, group_markers_list)
    
    # 保存结果
    base_dir <- file.path(params$output_path, "group_DE")
    if (!dir.exists(base_dir)) dir.create(base_dir, recursive = TRUE)
    output_file <- file.path(base_dir, "group_comparison_markers_all.csv")
    write.csv(group_markers, output_file, row.names = FALSE)
    cat("\nGroup comparison results saved to:", output_file, "\n")
    
    return(group_markers)
  } else {
    cat("\nNo valid group comparisons found.\n")
    return(NULL)
  }
}

# 函数 4:创建簇分析的可视化
create_clusters_plots <- function(seurat_obj, all_markers, params, clusters_colors) {
  
  # 1. 每个簇的前 3 个基因点图
  top_genes_per_cluster <- all_markers %>% 
    group_by(cluster) %>% 
    top_n(3, avg_log2FC) %>% 
    pull(gene)
  
  dotplot_plot <- DotPlot(seurat_obj, features = unique(top_genes_per_cluster)) +
    scale_color_gradientn(colors = params$continuous_colors) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(title = "Top 3 Marker Genes per Cluster", 
         subtitle = "DotPlot")
  
  # 保存点图
  ggsave(file.path(params$output_path, "cluster_DE", "clusters_dotplot.pdf"), 
         dotplot_plot, width = 10, height = 8, dpi = params$plot_dpi)
  
  # 2. 热门基因热图
  top_genes_heatmap <- all_markers %>% 
    group_by(cluster) %>% 
    top_n(5, avg_log2FC) %>% 
    pull(gene)

  # 检查基因是否存在于 scale.data 插槽中
  available_genes_heatmap <- intersect(unique(top_genes_heatmap), rownames(GetAssayData(seurat_obj, slot = "scale.data")))
  
  if (length(available_genes_heatmap) > 0) {
    heatmap_plot <- DoHeatmap(seurat_obj, features = available_genes_heatmap, 
                              slot = "scale.data", 
                              group.colors = clusters_colors) +
      scale_fill_gradientn(colors = params$continuous_colors) +
      labs(title = "Top 5 Marker Genes per Cluster", 
          subtitle = "Heatmap")
    
    # 保存热图
    ggsave(file.path(params$output_path, "cluster_DE", "clusters_heatmap.pdf"), 
          heatmap_plot, width = 12, height = 10, dpi = params$plot_dpi)
  } else {
    cat("Warning: No requested features found in the scale.data slot for the RNA assay. Skipping heatmap generation.\n")
  }
  
  # 3. 热门基因小提琴图
  top_genes_violin <- all_markers %>% 
    group_by(cluster) %>% 
    top_n(3, avg_log2FC) %>% 
    pull(gene)

  violin_plot <- VlnPlot(seurat_obj, features = unique(top_genes_violin), 
                         stack = TRUE, flip = TRUE, 
                         fill.by = "ident",cols = clusters_colors) +
    theme(axis.text.x = element_text(angle = 0, hjust = 1))+
    labs(title = "Top 3 Marker Genes per Cluster", 
         subtitle = "Violin Plot")
  
  # 保存小提琴图
  ggsave(file.path(params$output_path, "cluster_DE", "clusters_violin_plot.pdf"), 
         violin_plot, width = 12, height = 8, dpi = params$plot_dpi)
  
  cat("\nAll clusters visualization plots saved to:", params$output_path)
}

# 函数 5:创建分组比较分析的可视化
create_group_comparison_plots <- function(seurat_obj, group_markers, params) {
  if (is.null(group_markers) || nrow(group_markers) == 0) {
    cat("No group comparison results to visualize.\n")
    return(invisible(NULL))
  }

  group_col   <- params$group_by
  cluster_col <- params$cluster_by

  # 1. 按簇分组比例的堆叠条形图
  cat("Creating stacked bar chart for group proportions by cluster...\n")
  prop_data <- seurat_obj@meta.data %>%
    dplyr::group_by(!!rlang::sym(cluster_col), !!rlang::sym(group_col)) %>%
    dplyr::summarise(count = dplyr::n(), .groups = 'drop') %>%
    dplyr::group_by(!!rlang::sym(cluster_col)) %>%
    dplyr::mutate(proportion = count / sum(count)) %>%
    dplyr::ungroup()

  stacked_bar <- ggplot(prop_data, aes(x = !!rlang::sym(cluster_col), y = proportion, fill = !!rlang::sym(group_col))) +
    geom_bar(stat = "identity", position = "stack") +
    scale_fill_brewer(palette = params$discrete_colors) +
    labs(title = "Group Proportions by Cluster",
         subtitle = "Stacked Bar Chart",
         x = "Cluster", y = "Proportion", fill = "Group") +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_y_continuous(labels = scales::percent_format())
  ggsave(file.path(params$output_path, "group_DE", "group_comparison_group_proportions_by_cluster.pdf"),
         stacked_bar, width = 10, height = 6, dpi = params$plot_dpi)

  # 2. 每个簇输出:点图、小提琴图、热图、火山图
  cat("Creating per-cluster Dot/Violin/Heatmap/Volcano...\n")

  clusters_all <- sort(unique(as.character(seurat_obj@meta.data[[cluster_col]])))
  n_clusters   <- length(clusters_all)
  clusters_colors <- RColorBrewer::brewer.pal(min(max(3, n_clusters), 12), params$discrete_colors)
  if (n_clusters > length(clusters_colors)) {
    clusters_colors <- grDevices::colorRampPalette(clusters_colors)(n_clusters)
  }

  out_dir <- file.path(params$output_path, "group_DE")
  if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)

  for (cluster in clusters_all) {
    cluster_data <- group_markers[group_markers$cluster == cluster, , drop = FALSE]
    if (nrow(cluster_data) == 0) next

    up_genes <- cluster_data %>%
      dplyr::filter(avg_log2FC > 0) %>%
      dplyr::top_n(5, avg_log2FC) %>%
      dplyr::pull(gene)

    down_genes <- cluster_data %>%
      dplyr::filter(avg_log2FC < 0) %>%
      dplyr::top_n(5, abs(avg_log2FC)) %>%
      dplyr::pull(gene)

    cluster_genes <- unique(c(up_genes, down_genes))
    cluster_genes <- cluster_genes[cluster_genes %in% rownames(seurat_obj)]
    if (length(cluster_genes) == 0) next

    cells_in_cluster <- rownames(seurat_obj@meta.data)[as.character(seurat_obj@meta.data[[cluster_col]]) == cluster]
    cluster_subset <- subset(seurat_obj, cells = cells_in_cluster)

    # 2.1 点图
    cluster_dot <- DotPlot(cluster_subset, features = cluster_genes, cols = c("lightgrey", "red"),
                           group.by = group_col) +
      scale_color_gradientn(colors = params$continuous_colors) +
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      labs(title = paste("Cluster", cluster, "DE Genes"),
           subtitle = "Top 5 Up + Top 5 Down") +
      theme_bw()
    ggsave(file.path(out_dir, paste0("cluster_", cluster, "_de_genes_dotplot.pdf")),
           cluster_dot, width = 10, height = 6, dpi = params$plot_dpi)

    # 2.2 小提琴图
    cluster_violin <- VlnPlot(cluster_subset, features = cluster_genes,
                              stack = TRUE, flip = TRUE,
                              fill.by = "ident",
                              group.by = group_col,
                              cols = brewer.pal(8, params$discrete_colors)[1:2]) +
      NoLegend() +
      labs(title = paste("Cluster", cluster, "DE Genes"),
           subtitle = "Top 5 Up + Top 5 Down")
    ggsave(file.path(out_dir, paste0("cluster_", cluster, "_de_genes_violin.pdf")),
           cluster_violin, width = 6, height = 10, dpi = params$plot_dpi)

    # 2.3 热图
    available_genes_heatmap <- intersect(cluster_genes, rownames(GetAssayData(cluster_subset, slot = "scale.data")))
    if (length(available_genes_heatmap) > 1) {
      heatmap_cluster <- DoHeatmap(cluster_subset, features = available_genes_heatmap,
                                   slot = "scale.data",
                                   group.colors = clusters_colors,
                                   group.by = group_col, draw.lines = TRUE) +
        scale_fill_gradientn(colors = params$continuous_colors) +
        labs(title = paste("Cluster", cluster, "DE Genes"),
             subtitle = "Top 5 Up + Top 5 Down")
      ggsave(file.path(out_dir, paste0("cluster_", cluster, "_de_genes_heatmap.pdf")),
             heatmap_cluster, width = 10, height = 3, dpi = params$plot_dpi)
    } else {
      cat("No genes found in scale.data for cluster", cluster, "heatmap generation.\n")
    }

    # 2.4 火山图
    cluster_data$top_genes <- ifelse(cluster_data$gene %in% cluster_genes, cluster_data$gene, "")
    volcano_plot <- ggplot(cluster_data, aes(x = avg_log2FC, y = -log10(p_val_adj))) +
      geom_point(aes(color = ifelse(p_val_adj < 0.05 & abs(avg_log2FC) > 0.5,
                                    ifelse(avg_log2FC > 0, "Up", "Down"), "NS")),
                 alpha = 0.6, size = 1.5) +
      scale_color_manual(values = c("Down" = "blue", "NS" = "grey", "Up" = "red")) +
      geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red", alpha = 0.7) +
      geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "red", alpha = 0.7) +
      ggrepel::geom_text_repel(aes(label = top_genes),
                               size = 3, max.overlaps = 20,
                               box.padding = 0.5, point.padding = 0.3,
                               segment.color = "black", segment.size = 0.3) +
      labs(title = paste("Volcano Plot - Cluster", cluster),
           subtitle = "Top 5 Up + Top 5 Down Genes Marked",
           x = "Average Log2 Fold Change",
           y = "-log10(Adjusted P-value)",
           color = "Regulation") +
      theme_bw() +
      theme(legend.position = "bottom",
            plot.title = element_text(hjust = 0.5, size = 14),
            plot.subtitle = element_text(hjust = 0.5, size = 12))
    ggsave(file.path(out_dir, paste0("cluster_", cluster, "_de_genes_volcano.pdf")),
           volcano_plot, width = 6, height = 6, dpi = params$plot_dpi)
  }

  cat("\nGroup comparison visualization plots saved to:", params$output_path, "\n")
  invisible(NULL)
}

执行差异分析和可视化

R
if (params$differential_analysis_type == "clusters") {
  cat("Performing CLUSTERS differential expression analysis...\n")
  
  # 执行分析(Seurat DE 的所有基因)
  clusters_markers_all <- perform_clusters_de(seurat_obj, params)
  group_markers <- NULL
  
  # 用于报告/绘图的过滤版本
  if (!is.null(clusters_markers_all)) {
    clusters_markers <- clusters_markers_all %>%
      dplyr::filter(
        is.finite(p_val_adj) & p_val_adj < params$p_adj_cutoff
      ) %>%
      dplyr::filter(
        is.finite(avg_log2FC) & abs(avg_log2FC) >= params$abs_log2fc_cutoff
      )
    write.csv(clusters_markers, file.path(params$output_path, "cluster_DE", "clusters_markers_filtered.csv"), row.names = FALSE)
    
    cat("Filtering results:\n")
    cat("- Total genes before filtering:", nrow(clusters_markers_all), "\n")
    cat("- Total genes after filtering:", nrow(clusters_markers), "\n")
    cat("- P-value cutoff:", params$p_adj_cutoff, "\n")
    cat("- Log2FC cutoff:", params$abs_log2fc_cutoff, "\n")
  } else {
    clusters_markers <- NULL
  }

  # 可视化使用过滤后的数据
  create_clusters_plots(seurat_obj, clusters_markers, params, clusters_colors)

  # 摘要(过滤后)
  cat("\nSummary of all clusters differential expression (filtered):")
  cat("\n- Total differentially expressed genes:", nrow(clusters_markers))
  cat("\n- Average genes per cluster:", ifelse(length(unique(clusters_markers$cluster))>0, round(nrow(clusters_markers) / length(unique(clusters_markers$cluster)), 1), 0))
  cat("\n- Top 5 genes per cluster:")
  if (nrow(clusters_markers) > 0) print(clusters_markers %>% group_by(cluster) %>% top_n(5, avg_log2FC))
  
} else if (params$differential_analysis_type == "group_comparison") {
  cat("Performing GROUP COMPARISON differential expression analysis...\n")
  
  # 执行分析(所有)
  group_markers_all <- perform_group_comparison_de(seurat_obj, params)
  clusters_markers <- NULL
  
  # 过滤后
  if (!is.null(group_markers_all)) {
    group_markers <- group_markers_all %>%
      dplyr::filter(
        is.finite(p_val_adj) & p_val_adj < params$p_adj_cutoff
      ) %>%
      dplyr::filter(
        is.finite(avg_log2FC) & abs(avg_log2FC) >= params$abs_log2fc_cutoff
      )
    write.csv(group_markers, file.path(params$output_path, "group_DE", "group_comparison_markers_filtered.csv"), row.names = FALSE)
    
    cat("Group comparison filtering results:\n")
    cat("- Total genes before filtering:", nrow(group_markers_all), "\n")
    cat("- Total genes after filtering:", nrow(group_markers), "\n")
    cat("- P-value cutoff:", params$p_adj_cutoff, "\n")
    cat("- Log2FC cutoff:", params$abs_log2fc_cutoff, "\n")
  } else {
    group_markers <- NULL
  }
  
  # 可视化使用过滤后的数据
  create_group_comparison_plots(seurat_obj, group_markers, params)
  
  # 摘要(过滤后)
  if (!is.null(group_markers)) {
    cat("\nSummary of group comparison differential expression (filtered):")
    cat("\n- Total differentially expressed genes:", nrow(group_markers))
    cat("\n- Comparisons performed:", paste(unique(group_markers$comparison), collapse = ", "))
    cat("\n- Clusters analyzed:", paste(unique(group_markers$cluster), collapse = ", "))
  }
}
output
Performing GROUP COMPARISON differential expression analysis...n Available groups: S133A, S133T
Available clusters: 0, 6, 5

Analyzing cluster: 0
Comparing S133T vs S133A
Found 293 differentially expressed genes

Analyzing cluster: 6
Comparing S133T vs S133A
Found 548 differentially expressed genes

Analyzing cluster: 5
Comparing S133T vs S133A
Found 855 differentially expressed genes

Group comparison results saved to: /home/demo-seekgene-com/workspace/project/demo-seekgene-com/DiffEnrich/result/group_DE/group_comparison_markers_all.csv
Group comparison filtering results:
- Total genes before filtering: 1696
- Total genes after filtering: 933
- P-value cutoff: 0.05
- Log2FC cutoff: 0.5
Creating stacked bar chart for group proportions by cluster...n Creating per-cluster Dot/Violin/Heatmap/Volcano...n

Warning message:
“Scaling data with a low number of groups may produce misleading results”
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.
Warning message:
“Scaling data with a low number of groups may produce misleading results”
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.
Warning message:
“Scaling data with a low number of groups may produce misleading results”
Scale for colour is already present.
Adding another scale for colour, which will replace the existing scale.
Scale for fill is already present.
Adding another scale for fill, which will replace the existing scale.



Group comparison visualization plots saved to: /home/demo-seekgene-com/workspace/project/demo-seekgene-com/DiffEnrich/result

Summary of group comparison differential expression (filtered):
- Total differentially expressed genes: 933
- Comparisons performed: S133TvsS133A
- Clusters analyzed: 0, 6, 5

富集分析

执行 ORA(过表达分析)或 GSEA(基因集富集分析)以了解差异表达基因的生物学意义。

准备富集分析所需的数据库。

R
# 设置富集分析数据库
setup_enrichment_databases <- function(species) {
  
  if (species == "human") {
    org_db <- org.Hs.eg.db
    kegg_org <- "hsa"
  } else if (species == "mouse") {
    org_db <- org.Mm.eg.db
    kegg_org <- "mmu"
  } else {
    stop("Species must be 'human' or 'mouse'")
  }
  
  # 解析富集数据库
  databases <- strsplit(params$enrichment_databases, ",")[[1]]
  
  return(list(
    org_db = org_db,
    kegg_org = kegg_org,
    databases = databases
  ))
}

# 设置数据库
db_setup <- setup_enrichment_databases(params$species)
cat("Enrichment databases setup for species:", params$species, "\n")
cat("Available databases:", paste(db_setup$databases, collapse = ", "), "\n")
output
Enrichment databases setup for species: human
Available databases: GO, KEGG, HALLMARK, REACTOME

定义富集分析和可视化的函数

R
# 函数 1:读取 GMT 文件
read_gmt_file <- function(file_path) {
  if (!file.exists(file_path)) {
    stop("GMT file not found:", file_path)
  }
  
  # 读取 GMT 文件
  gmt_data <- readLines(file_path)
  
  # 解析 GMT 格式
  genesets <- list()
  for (line in gmt_data) {
    parts <- strsplit(line, "\t")[[1]]
    if (length(parts) >= 3) {
      term_name <- parts[1]
      term_description <- parts[2]
      genes <- parts[3:length(parts)]
      genes <- genes[genes != ""]  # 移除空字符串
      
      if (length(genes) > 0) {
        genesets[[term_name]] <- data.frame(
          term = term_name,
          gene = genes,
          stringsAsFactors = FALSE
        )
      }
    }
  }
  
  # 合并所有基因集
  if (length(genesets) > 0) {
    combined_genesets <- do.call(rbind, genesets)
    return(combined_genesets)
  } else {
    stop("No valid genesets found in GMT file")
  }
}

# 函数 2:ORA 富集分析
perform_ora_analysis <- function(markers_data, db_setup, params) {
  cat("Performing Over-Representation Analysis (ORA) with local data (split Up/Down)...\n")
  
  # 按(簇或比较)和方向准备基因列表
  if ("cluster" %in% colnames(markers_data)) {
    split_key <- markers_data$cluster
  } else {
    split_key <- markers_data$comparison
  }
  
  markers_data$direction <- ifelse(is.finite(markers_data$avg_log2FC) & markers_data$avg_log2FC > 0, "Up",
                                   ifelse(is.finite(markers_data$avg_log2FC) & markers_data$avg_log2FC < 0, "Down", NA))
  markers_data <- markers_data[!is.na(markers_data$direction), , drop = FALSE]

  index_keys <- unique(split_key)
  directions <- c("Up","Down")

  # 将 symbol 转换为 entrez
  convert_to_entrez <- function(genes) {
    m <- AnnotationDbi::select(db_setup$org_db, keys = genes, columns = "ENTREZID", keytype = "SYMBOL")
    unique(m$ENTREZID[!is.na(m$ENTREZID)])
  }

  enrichment_results <- list()
  combined_results_list <- list()

  for (db in db_setup$databases) {
    cat("Analyzing database:", db, "\n")

    for (k in index_keys) {
      for (dirc in directions) {
        sub <- markers_data[split_key == k & markers_data$direction == dirc, , drop = FALSE]
        if (nrow(sub) == 0) next
        genes <- unique(sub$gene)
        eg <- convert_to_entrez(genes)
        if (length(eg) == 0) next

        if (db == "GO") {
          ego <- enrichGO(gene = eg,
                          OrgDb = db_setup$org_db,
                          ont = "ALL",
                          pAdjustMethod = "BH",
                          pvalueCutoff = params$pvalue_cutoff,
                          qvalueCutoff = params$qvalue_cutoff,
                          minGSSize = params$minGSSize,
                          maxGSSize = params$maxGSSize)
          if (nrow(ego) > 0) {
            ego_df <- as.data.frame(ego)
            ego_df$cluster   <- k
            ego_df$database  <- db
            ego_df$direction <- dirc
            enrichment_results[[paste(db, k, dirc, sep = "_")]] <- ego
            combined_results_list[[paste(db, k, dirc, sep = "_")]] <- ego_df
          }

        } else if (db == "KEGG") {
          ekegg <- enrichKEGG(gene = eg,
                              organism = db_setup$kegg_org,
                              pAdjustMethod = "BH",
                              pvalueCutoff = params$pvalue_cutoff,
                              qvalueCutoff = params$qvalue_cutoff,
                              minGSSize = params$minGSSize,
                              maxGSSize = params$maxGSSize)
          if (nrow(ekegg) > 0) {
            ekegg_df <- as.data.frame(ekegg)
            ekegg_df$cluster   <- k
            ekegg_df$database  <- db
            ekegg_df$direction <- dirc
            enrichment_results[[paste(db, k, dirc, sep = "_")]] <- ekegg
            combined_results_list[[paste(db, k, dirc, sep = "_")]] <- ekegg_df
          }

        } else if (db == "HALLMARK") {
          data_dir <- "/jp_envs/Enrichment/"
          hallmark_file <- file.path(data_dir, "h.all.v7.5.1.entrez.gmt")
          if (file.exists(hallmark_file)) {
            hallmark_genesets <- read_gmt_file(hallmark_file)
            emsig <- tryCatch({
              enricher(gene = eg,
                       TERM2GENE = hallmark_genesets,
                       pAdjustMethod = "BH",
                       pvalueCutoff = params$pvalue_cutoff,
                       qvalueCutoff = params$qvalue_cutoff,
                       minGSSize = params$minGSSize,
                       maxGSSize = params$maxGSSize)
            }, error = function(e) NULL)
            if (!is.null(emsig) && nrow(emsig) > 0) {
              emsig_df <- as.data.frame(emsig)
              emsig_df$cluster   <- k
              emsig_df$database  <- db
              emsig_df$direction <- dirc
              enrichment_results[[paste(db, k, dirc, sep = "_")]] <- emsig
              combined_results_list[[paste(db, k, dirc, sep = "_")]] <- emsig_df
            }
          } else {
            cat("Warning: HALLMARK file not found, skipping...\n")
          }

        } else if (db == "REACTOME") {
          data_dir <- "/jp_envs/Enrichment/"
          reactome_file <- file.path(data_dir, "c2.cp.reactome.v7.5.1.entrez.gmt")
          if (file.exists(reactome_file)) {
            reactome_genesets <- read_gmt_file(reactome_file)
            emsig <- tryCatch({
              enricher(gene = eg,
                       TERM2GENE = reactome_genesets,
                       pAdjustMethod = "BH",
                       pvalueCutoff = params$pvalue_cutoff,
                       qvalueCutoff = params$qvalue_cutoff,
                       minGSSize = params$minGSSize,
                       maxGSSize = params$maxGSSize)
            }, error = function(e) NULL)
            if (!is.null(emsig) && nrow(emsig) > 0) {
              emsig_df <- as.data.frame(emsig)
              emsig_df$cluster   <- k
              emsig_df$database  <- db
              emsig_df$direction <- dirc
              enrichment_results[[paste(db, k, dirc, sep = "_")]] <- emsig
              combined_results_list[[paste(db, k, dirc, sep = "_")]] <- emsig_df
            }
          } else {
            cat("Warning: REACTOME file not found, skipping...\n")
          }
        }
      }
    }
  }
  
  # 合并结果
  if (length(combined_results_list) > 0) {
    all_columns <- unique(unlist(lapply(combined_results_list, colnames)))
    cat("All columns found:", paste(all_columns, collapse = ", "), "\n")
    
    standardized_results <- lapply(combined_results_list, function(df) {
      for (col in all_columns) {
        if (!col %in% colnames(df)) df[[col]] <- NA
      }
      df <- df[, all_columns, drop = FALSE]
      df
    })
    combined_results <- do.call(rbind, standardized_results)

    # 保存结果(带方向)
    base_dir <- file.path(params$output_path, "ORA")
    if (!dir.exists(base_dir)) dir.create(base_dir, recursive = TRUE)
    output_file <- file.path(base_dir, "ora_enrichment_results.csv")
    write.csv(combined_results, output_file, row.names = FALSE)
    cat("ORA results saved to:", output_file, "\n")

    return(list(results = combined_results, enrichment_objects = enrichment_results))
  } else {
    cat("No significant enrichment results found.\n")
    return(NULL)
  }
}

# 函数 3:GSEA 富集分析
perform_gsea_analysis <- function(markers_data, db_setup, params) {
  cat("Performing Gene Set Enrichment Analysis (GSEA) with local data...\n")
  
  # 准备排序的基因列表 
  if ("cluster" %in% colnames(markers_data)) {
    gene_lists <- split(markers_data, markers_data$cluster)
  } else {
    gene_lists <- split(markers_data, markers_data$comparison)
  }
  
  # 将基因符号转换为 Entrez ID 并创建排序列表
  ranked_lists <- list()
  
  for (name in names(gene_lists)) {
    cluster_data <- gene_lists[[name]]
    
    # 基于对数倍数变化创建排序列表(无过滤)
    ranked_genes <- cluster_data$avg_log2FC
    names(ranked_genes) <- cluster_data$gene
    
    # 按倍数变化排序
    ranked_genes <- sort(ranked_genes, decreasing = TRUE)
    
    # 转换为 Entrez ID
    entrez_mapping <- AnnotationDbi::select(db_setup$org_db, 
                                           keys = names(ranked_genes), 
                                           columns = "ENTREZID", 
                                           keytype = "SYMBOL")
    
    # 使用 Entrez ID 创建排序列表
    entrez_ranked <- ranked_genes[match(entrez_mapping$SYMBOL, names(ranked_genes))]
    names(entrez_ranked) <- entrez_mapping$ENTREZID
    entrez_ranked <- entrez_ranked[!is.na(names(entrez_ranked))]
    
    ranked_lists[[name]] <- entrez_ranked
  }
  
  # 对每个数据库执行 GSEA
  gsea_results <- list()
  combined_results_list <- list()
  
  for (db in db_setup$databases) {
    cat("Analyzing database:", db, "\n")
    
    if (db == "GO") {
      # GO GSEA
      for (name in names(ranked_lists)) {
        if (length(ranked_lists[[name]]) > 0) {
          gsea_go <- gseGO(geneList = ranked_lists[[name]],
                           OrgDb = db_setup$org_db,
                           ont = "ALL",
                           pAdjustMethod = "BH",
                           pvalueCutoff = params$pvalue_cutoff,
                           minGSSize = params$minGSSize,
                           maxGSSize = params$maxGSSize,
                           nPerm = 1000)
          
          if (nrow(gsea_go) > 0) {
            gsea_go_df <- as.data.frame(gsea_go)
            gsea_go_df$cluster <- name
            gsea_go_df$database <- db
            
            gsea_results[[paste(db, name, sep = "_")]] <- gsea_go
            combined_results_list[[paste(db, name, sep = "_")]] <- gsea_go_df
          }
        }
      }
      
    } else if (db == "KEGG") {
      # KEGG GSEA
      for (name in names(ranked_lists)) {
        if (length(ranked_lists[[name]]) > 0) {
          gsea_kegg <- gseKEGG(geneList = ranked_lists[[name]],
                               organism = db_setup$kegg_org,
                               pAdjustMethod = "BH",
                               pvalueCutoff = params$pvalue_cutoff,
                               minGSSize = params$minGSSize,
                               maxGSSize = params$maxGSSize,
                               nPerm = 1000)
          
          if (nrow(gsea_kegg) > 0) {
            gsea_kegg_df <- as.data.frame(gsea_kegg)
            gsea_kegg_df$cluster <- name
            gsea_kegg_df$database <- db
            
            gsea_results[[paste(db, name, sep = "_")]] <- gsea_kegg
            combined_results_list[[paste(db, name, sep = "_")]] <- gsea_kegg_df
          }
        }
      }
      
    } else if (db == "HALLMARK") {
      # 使用本地数据的 HALLMARK GSEA
      data_dir <- "/jp_envs/Enrichment/"
      hallmark_file <- file.path(data_dir, "h.all.v7.5.1.entrez.gmt")
      
      if (file.exists(hallmark_file)) {
        hallmark_genesets <- read_gmt_file(hallmark_file)
        
        for (name in names(ranked_lists)) {
          if (length(ranked_lists[[name]]) > 0) {
            tryCatch({
              gsea_msig <- GSEA(geneList = ranked_lists[[name]],
                                TERM2GENE = hallmark_genesets,
                                pAdjustMethod = "BH",
                                pvalueCutoff = params$pvalue_cutoff,
                                minGSSize = params$minGSSize,
                                maxGSSize = params$maxGSSize,
                                nPerm = 1000)
              
              if (nrow(gsea_msig) > 0) {
                gsea_msig_df <- as.data.frame(gsea_msig)
                gsea_msig_df$cluster <- name
                gsea_msig_df$database <- db
                
                gsea_results[[paste(db, name, sep = "_")]] <- gsea_msig
                combined_results_list[[paste(db, name, sep = "_")]] <- gsea_msig_df
              }
            }, error = function(e) {
              cat("Warning: Failed to analyze HALLMARK for", name, ":", e$message, "\n")
            })
          }
        }
      } else {
        cat("Warning: HALLMARK file not found, skipping...\n")
      }
      
    } else if (db == "REACTOME") {
      # 使用本地数据的 REACTOME GSEA
      data_dir <- "/jp_envs/Enrichment/"
      reactome_file <- file.path(data_dir, "c2.cp.reactome.v7.5.1.entrez.gmt")
      
      if (file.exists(reactome_file)) {
        reactome_genesets <- read_gmt_file(reactome_file)
        
        for (name in names(ranked_lists)) {
          if (length(ranked_lists[[name]]) > 0) {
            tryCatch({
              gsea_msig <- GSEA(geneList = ranked_lists[[name]],
                                TERM2GENE = reactome_genesets,
                                pAdjustMethod = "BH",
                                pvalueCutoff = params$pvalue_cutoff,
                                minGSSize = params$minGSSize,
                                maxGSSize = params$maxGSSize,
                                nPerm = 1000)
              
              if (nrow(gsea_msig) > 0) {
                gsea_msig_df <- as.data.frame(gsea_msig)
                gsea_msig_df$cluster <- name
                gsea_msig_df$database <- db
                
                gsea_results[[paste(db, name, sep = "_")]] <- gsea_msig
                combined_results_list[[paste(db, name, sep = "_")]] <- gsea_msig_df
              }
            }, error = function(e) {
              cat("Warning: Failed to analyze REACTOME for", name, ":", e$message, "\n")
            })
          }
        }
      } else {
        cat("Warning: REACTOME file not found, skipping...\n")
      }
    }
  }
  
  # 合并结果
  if (length(combined_results_list) > 0) {
    all_columns <- unique(unlist(lapply(combined_results_list, colnames)))
    cat("All columns found:", paste(all_columns, collapse = ", "), "\n")
    
    standardized_results <- lapply(combined_results_list, function(df) {
      for (col in all_columns) {
        if (!col %in% colnames(df)) df[[col]] <- NA
      }
      df <- df[, all_columns, drop = FALSE]
      df
    })
    
    combined_results <- do.call(rbind, standardized_results)
    
    # 保存结果
    base_dir <- file.path(params$output_path, "GSEA")
    if (!dir.exists(base_dir)) dir.create(base_dir, recursive = TRUE)
    output_file <- file.path(base_dir, "gsea_enrichment_results.csv")
    write.csv(combined_results, output_file, row.names = FALSE)
    cat("GSEA results saved to:", output_file, "\n")
    
    return(list(results = combined_results, gsea_objects = gsea_results, ranked_lists = ranked_lists))
  } else {
    cat("No significant GSEA results found.\n")
    return(NULL)
  }
}

# 函数 4:创建 ORA 可视化图
create_ora_plots <- function(ora_results, params) {
  if (is.null(ora_results)) { cat("No ORA results to visualize.\n"); return(invisible(NULL)) }
  df <- ora_results$results
  if (is.null(df) || nrow(df) == 0) { cat("Empty ORA results.\n"); return(invisible(NULL)) }

  if (!"direction" %in% names(df)) {
    df$direction <- "All"
  } else {
    df$direction <- ifelse(is.na(df$direction) | df$direction == "", "All", as.character(df$direction))
  }

  # 生成
  if (!"generation" %in% names(df)) {
    gen <- rep(NA_real_, nrow(df))
    if ("GeneRatio" %in% names(df)) {
      gr <- as.character(df$GeneRatio)
      gen_from_gr <- suppressWarnings(vapply(gr, function(x) {
        if (is.na(x) || x == "") return(NA_real_)
        if (grepl("/", x, fixed = TRUE)) {
          p <- strsplit(x, "/", fixed = TRUE)[[1]]
          as.numeric(p[1]) / as.numeric(p[2])
        } else as.numeric(x)
      }, numeric(1)))
      gen <- gen_from_gr
    }
    if (all(is.na(gen)) && "Count" %in% names(df)) gen <- suppressWarnings(as.numeric(df$Count))
    df$generation <- gen
  }

  # GO
  df$ont <- NA_character_
  if ("ONTOLOGY" %in% names(df)) df$ont <- as.character(df$ONTOLOGY)
  else if ("gs_subcat" %in% names(df)) df$ont <- toupper(as.character(df$gs_subcat))
  else if ("subcategory" %in% names(df)) df$ont <- toupper(as.character(df$subcategory))
  else if ("category" %in% names(df)) df$ont <- toupper(as.character(df$category))
  df$ont[grepl("BP", df$ont, ignore.case = TRUE)] <- "BP"
  df$ont[grepl("CC", df$ont, ignore.case = TRUE)] <- "CC"
  df$ont[grepl("MF", df$ont, ignore.case = TRUE)] <- "MF"
  df$ont[is.na(df$ont) | df$ont == ""] <- "Other"

  clusters   <- sort(unique(df$cluster))
  databases  <- sort(unique(df$database))
  directions <- sort(unique(df$direction))

  for (db in databases) {
    for (dirc in directions) {
      db_dot_dir <- file.path(params$output_path, "ORA", db, dirc, "dotplot")
      db_bar_dir <- file.path(params$output_path, "ORA", db, dirc, "bar")
      if (!dir.exists(db_dot_dir)) dir.create(db_dot_dir, recursive = TRUE)
      if (!dir.exists(db_bar_dir)) dir.create(db_bar_dir, recursive = TRUE)

      for (cl in clusters) {
        cl_df <- df[df$database == db & df$cluster == cl & df$direction == dirc, , drop = FALSE]
        # 过滤
        cl_df <- cl_df[is.finite(cl_df$generation) & !is.na(cl_df$Description), , drop = FALSE]
        if (nrow(cl_df) == 0) next

        # Top10
        cl_top <- cl_df[order(-cl_df$generation, cl_df$p.adjust), , drop = FALSE]
        if (nrow(cl_top) > 10) cl_top <- cl_top[seq_len(10), , drop = FALSE]
        cl_top$neglogp <- -log10(cl_top$p.adjust)

        # 1. 点图
        cl_top$Description <- forcats::fct_reorder(cl_top$Description, cl_top$generation, .desc = FALSE)
        p_dot <- ggplot(cl_top, aes(x = generation, y = Description, size = Count, color = neglogp))
        if (identical(db, "GO")) {
          p_dot <- p_dot + geom_point(aes(shape = ont)) +
            scale_shape_manual(values = c(BP = 18, CC = 17, MF = 15, Other = 16), na.translate = FALSE)
        } else {
          p_dot <- p_dot + geom_point()
        }
        p_dot <- p_dot +
          scale_color_gradientn(colors = params$continuous_colors) +
          labs(title = paste0("ORA dotplot - ", dirc, " - Cluster ", cl, " (", db, ")"),
               x = "generation", y = "Enriched Terms",
               size = "Gene Count", color = "-log10(Adj.P)") +
          theme_bw() + theme(axis.text.y = element_text(size = 8))
        ggsave(file.path(db_dot_dir, paste0("cluster_", cl, ".pdf")),
               p_dot, width = 12, height = 8, dpi = params$plot_dpi)

        # 2. 条形图
        p_bar <- ggplot(cl_top, aes(
          x = generation,
          y = forcats::fct_reorder(Description, generation, .desc = FALSE),
          fill = neglogp
        )) +
          geom_col() +
          scale_fill_gradientn(colors = params$continuous_colors) +
          labs(
            title = paste0("ORA Bar - ", dirc, " - Cluster ", cl, " (", db, ")"),
            x = "generation", y = "Enriched Terms", fill = "-log10(Adj.P)"
          ) +
          theme_bw() + theme(axis.text.y = element_text(size = 8))
        ggsave(file.path(db_bar_dir, paste0("cluster_", cl, ".pdf")),
               p_bar, width = 12, height = 8, dpi = params$plot_dpi)
      }
    }
  }
  cat("\nORA plots saved (split by Up/Down).\n")
  invisible(NULL)
}

# 函数 5:创建 GSEA 可视化图
create_gsea_plots <- function(gsea_results, params) {
  if (is.null(gsea_results)) {
    cat("No GSEA results to visualize.\n")
    return(invisible(NULL))
  }
  df <- gsea_results$results
  gsea_objects <- gsea_results$gsea_objects
  if (is.null(df) || nrow(df) == 0) {
    cat("Empty GSEA results.\n")
    return(invisible(NULL))
  }

  df$neglogp <- -log10(df$p.adjust)

  df$ont <- NA_character_
  if ("ONTOLOGY" %in% names(df)) df$ont <- as.character(df$ONTOLOGY)
  df$ont[grepl("BP", df$ont, ignore.case = TRUE)] <- "BP"
  df$ont[grepl("CC", df$ont, ignore.case = TRUE)] <- "CC"
  df$ont[grepl("MF", df$ont, ignore.case = TRUE)] <- "MF"
  df$ont[is.na(df$ont) | df$ont == ""] <- "Other"

  clusters  <- sort(unique(df$cluster))
  databases <- sort(unique(df$database))

  base_dir <- file.path(params$output_path, "GSEA")
  if (!dir.exists(base_dir)) dir.create(base_dir, recursive = TRUE)

  for (db in databases) {
    db_dot_dir   <- file.path(base_dir, db, "dotplot")
    db_bar_dir   <- file.path(base_dir, db, "bar")
    db_curve_dir <- file.path(base_dir, db, "curve")
    if (!dir.exists(db_dot_dir))   dir.create(db_dot_dir, recursive = TRUE)
    if (!dir.exists(db_bar_dir))   dir.create(db_bar_dir, recursive = TRUE)
    if (!dir.exists(db_curve_dir)) dir.create(db_curve_dir, recursive = TRUE)

    for (cl in clusters) {
      cl_df <- df[df$database == db & df$cluster == cl, , drop = FALSE]
      cl_df <- cl_df[is.finite(cl_df$NES) & !is.na(cl_df$Description), , drop = FALSE]
      if (nrow(cl_df) == 0) next

      # 按 |NES| 降序,然后按 p.adjust 升序的前 10 个
      cl_top <- cl_df[order(-(cl_df$NES), cl_df$p.adjust), , drop = FALSE]
      if (nrow(cl_top) > 10) cl_top <- cl_top[seq_len(10), , drop = FALSE]

      # 1. 点图
      cl_top$Description <- forcats::fct_reorder(cl_top$Description, cl_top$NES, .desc = FALSE)
      p_dot <- ggplot(cl_top, aes(x = NES, y = Description, size = setSize, color = neglogp))
      if (identical(db, "GO")) {
        p_dot <- p_dot + geom_point(aes(shape = ont), alpha = 0.85) +
          scale_shape_manual(values = c(BP = 16, CC = 17, MF = 15, Other = 18), name = "GO")
      } else {
        p_dot <- p_dot + geom_point(alpha = 0.85)
      }
      p_dot <- p_dot +
        scale_size_continuous(name = "Set Size") +
        scale_color_gradientn(colors = params$continuous_colors) +
        labs(title = paste0("GSEA dotplot - Cluster ", cl, " (", db, ")"),
             x = "NES", y = "Enriched Terms", color = "-log10(Adj.P)") +
        theme_minimal() + theme(axis.text.y = element_text(size = 8))
      ggsave(file.path(db_dot_dir, paste0("cluster_", cl, ".pdf")),
             p_dot, width = 12, height = 8, dpi = params$plot_dpi)

      # 2. 条形图
      p_bar <- ggplot(cl_top, aes(x = NES,
                                  y = forcats::fct_reorder(Description, NES, .desc = FALSE),
                                  fill = neglogp)) +
        geom_col() +
        scale_fill_gradientn(colors = params$continuous_colors) +
        labs(title = paste0("GSEA Bar - Cluster ", cl, " (", db, ")"),
             x = "NES", y = "Enriched Terms", fill = "-log10(Adj.P)") +
        theme_minimal() + theme(axis.text.y = element_text(size = 8))
      ggsave(file.path(db_bar_dir, paste0("cluster_", cl, ".pdf")),
             p_bar, width = 12, height = 8, dpi = params$plot_dpi)

      # 3. 曲线图
      k_curve <- 5
      top_terms <- cl_df[order(cl_df$NES), , drop = FALSE]
      top_terms <- top_terms[seq_len(min(k_curve, nrow(top_terms))), , drop = FALSE]

      obj_keys <- names(gsea_objects)
      if (length(obj_keys) > 0) {
        cand <- obj_keys[grepl(paste0("^", db, "_"), obj_keys)]
        cand <- cand[grepl(paste0("_", cl, "$"), cand)]
      } else cand <- character(0)

      if (length(cand) > 0) {
        gobj <- gsea_objects[[cand[1]]]
        if (inherits(gobj, "gseaResult") && nrow(gobj@result) > 0) {
          ids_in_obj <- gobj@result$ID
          if ("ID" %in% names(top_terms)) {
            term_ids <- intersect(top_terms$ID, ids_in_obj)
          } else {
            term_ids <- gobj@result$ID[match(top_terms$Description, gobj@result$Description)]
            term_ids <- term_ids[!is.na(term_ids)]
          }
          # Draw up to first k_curve curves
          if (length(term_ids) > 0) {
            # 单独曲线
            for (tid in term_ids) {
              tid_idx <- which(gobj@result$ID == tid)[1]
              if (length(tid_idx) == 1 && !is.na(tid_idx)) {
                gp <- enrichplot::gseaplot2(gobj, geneSetID = tid_idx,
                                            title = paste0("GSEA - ", gobj@result$Description[tid_idx],
                                                           " (Cluster ", cl, ", ", db, ")"))
                safe_name <- gsub("[^A-Za-z0-9]+", "_", gobj@result$Description[tid_idx])
                ggsave(file.path(db_curve_dir, paste0("cluster_", cl, "_", safe_name, ".pdf")),
                       gp, width = 10, height = 8, dpi = params$plot_dpi)
              }
            }
            # 多重曲线 
            idx_vec <- which(gobj@result$ID %in% term_ids)
            if (length(idx_vec) > 1) {
              idx_vec <- idx_vec[seq_len(min(k_curve, length(idx_vec)))]
              gp_multi <- enrichplot::gseaplot2(gobj, geneSetID = idx_vec,
                                                title = paste0("GSEA Top Terms - Cluster ", cl, " (", db, ")"))
              ggsave(file.path(db_curve_dir, paste0("cluster_", cl, "_top_terms.pdf")),
                     gp_multi, width = 12, height = 8, dpi = params$plot_dpi)
            }
          }
        }
      }
    }
  }

  cat("\nGSEA plots saved (simple mode) to:", base_dir, "\n")
  invisible(NULL)
}

执行富集分析和可视化

R
if (params$enrichment_analysis_type == "ORA") {
  cat("Performing Over-Representation Analysis (ORA)...\n")
  
  # 对可用标记物执行 ORA 分析
  if (!is.null(clusters_markers)) {
    ora_results_clusters <- perform_ora_analysis(markers_data = clusters_markers, db_setup, params)
    ora_results_group <- NULL
    
    # 创建 ORA 可视化
    create_ora_plots(ora_results_clusters, params)
    
  } else if (!is.null(group_markers)) {
    ora_results_group <- perform_ora_analysis(markers_data = group_markers, db_setup, params)
    ora_results_clusters <- NULL
    
    # 创建 ORA 可视化
    create_ora_plots(ora_results_group, params)
    
  } else {
    ora_results_clusters <- NULL
    ora_results_group <- NULL
    cat("No differential expression results available for ORA analysis.\n")
  }
  
  # 由于未执行 GSEA,将 GSEA 结果设置为 NULL
  gsea_results_clusters <- NULL
  gsea_results_group <- NULL
  
} else if (params$enrichment_analysis_type == "GSEA") {
  cat("Performing Gene Set Enrichment Analysis (GSEA)...\n")
  
  # 对可用标记物执行 GSEA 分析
  if (!is.null(clusters_markers_all)) {
    gsea_results_clusters <- perform_gsea_analysis(clusters_markers_all, db_setup, params)
    saveRDS(gsea_results_clusters, file.path(params$output_path, "GSEA", "gsea_results.rds"))
    gsea_results_group <- NULL
    
    # 创建 GSEA 可视化
    create_gsea_plots(gsea_results_clusters, params)
    
  } else if (!is.null(group_markers_all)) {
    gsea_results_group <- perform_gsea_analysis(group_markers_all, db_setup, params)
    saveRDS(gsea_results_group, file.path(params$output_path, "GSEA", "gsea_results.rds"))
    gsea_results_clusters <- NULL
    
    # 创建 GSEA 可视化
    create_gsea_plots(gsea_results_group, params)
    
  } else {
    gsea_results_clusters <- NULL
    gsea_results_group <- NULL
    cat("No differential expression results available for GSEA analysis.\n")
  }
  
  # 由于未执行 ORA,将 ORA 结果设置为 NULL
  ora_results_clusters <- NULL
  ora_results_group <- NULL
}
output
Performing Over-Representation Analysis (ORA)...n Performing Over-Representation Analysis (ORA) with local data (split Up/Down)...n Analyzing database: GO


'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns



Analyzing database: KEGG


'select()' returned 1:1 mapping between keys and columns

Reading KEGG annotation online: "https://rest.kegg.jp/link/hsa/pathway"...n
Reading KEGG annotation online: "https://rest.kegg.jp/list/pathway/hsa"...n
'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns



Analyzing database: HALLMARK


'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns



Analyzing database: REACTOME


'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns



All columns found: ONTOLOGY, ID, Description, GeneRatio, BgRatio, pvalue, p.adjust, qvalue, geneID, Count, cluster, database, direction, category, subcategory
ORA results saved to: /home/demo-seekgene-com/workspace/project/demo-seekgene-com/DiffEnrich/result/ORA/ora_enrichment_results.csv

ORA plots saved (split by Up/Down).

可视化自定义差异表达基因和富集通路

基于自定义差异表达基因和富集通路的可视化。

配置自定义基因和通路可视化参数

R
custom_plot <- FALSE # 启用/禁用自定义基因/通路可视化
custom_genes <- c("C1QA","C1QB","C1QC", "CD3D", "CD3E") # 要在自定义图中突出显示的感兴趣基因(必须存在于 Seurat 对象中)
terms <- c("regulation of T cell activation", 
        "positive regulation of lymphocyte activation", 
        "positive regulation of leukocyte activation",
        "leukocyte cell-cell adhesion",
        "positive regulation of cell activation") # 要在 ORA/GSEA 结果中绘制的通路术语

定义自定义基因和通路可视化函数

R
# 函数 1:簇 DE 的自定义基因图
plot_custom_genes_clusters <- function(seurat_obj, custom_genes, params, clusters_colors) {
  if (length(custom_genes) == 0) {
    cat("No custom genes provided.\n"); return(invisible(NULL))
  }
  out_dir <- file.path(params$output_path, "custom_cluster_DE")
  if (!dir.exists(out_dir)) dir.create(out_dir, recursive = TRUE)

  # 仅保留对象中存在的基因
  genes_in_obj <- intersect(unique(custom_genes), rownames(seurat_obj))
  if (length(genes_in_obj) == 0) {
    cat("Custom genes not found in object; skip all plots.\n"); return(invisible(NULL))
  }

  # 1. 点图
  cluster_col <- params$cluster_by
  if (!(cluster_col %in% colnames(seurat_obj@meta.data))) {
    cluster_col <- "ident"
    seurat_obj@meta.data$ident <- as.character(Idents(seurat_obj))
    cat("Using Idents as cluster column for custom genes plots\n")
  } else {
    cat("Using cluster column for custom genes plots:", cluster_col, "\n")
  }

  cat("Creating DotPlot for custom genes...\n")
  p_dot <- DotPlot(seurat_obj, features = genes_in_obj, group.by = cluster_col) +
    scale_color_gradientn(colors = params$continuous_colors) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(title = "Custom Genes - DotPlot (All Clusters)",
         x = "Genes", y = "Clusters")
  ggsave(file.path(out_dir, "custom_genes_clusters_dotplot.pdf"),
         p_dot, width = 12, height = 6, dpi = params$plot_dpi)

  # 2. 小提琴图
  cat("Creating Violin for custom genes...\n")
  mat <- FetchData(seurat_obj, vars = genes_in_obj, slot = "data")
  var_ok <- sapply(as.data.frame(mat), function(x) stats::var(x, na.rm = TRUE) > 0)
  genes_var <- genes_in_obj[var_ok]
  if (length(genes_var) == 0) {
    cat("All requested genes have zero variance; skip Violin.\n")
  } else {
    if (length(genes_var) >= 2) {
      p_vln <- VlnPlot(seurat_obj, features = genes_var,
                       stack = TRUE, flip = TRUE, group.by = cluster_col, cols = clusters_colors, fill.by = "ident") +
        labs(title = "Custom Genes - Stacked Violin (All Clusters)",
             x = cluster_col, y = "Expression")
    } else {
      p_vln <- VlnPlot(seurat_obj, features = genes_var,
                       stack = FALSE, flip = FALSE, group.by = cluster_col, cols = clusters_colors, fill.by = "ident") +
        labs(title = "Custom Gene - Violin (All Clusters)",
             x = cluster_col, y = "Expression")
    }
    ggsave(file.path(out_dir, "custom_genes_clusters_violin.pdf"),
           p_vln, width = 12, height = 8, dpi = params$plot_dpi)
  }

  # 3. 热图 
  cat("Creating Heatmap for custom genes...\n")
  genes_in_scale <- intersect(genes_in_obj, rownames(GetAssayData(seurat_obj, slot = "scale.data")))
  if (length(genes_in_scale) < 2) {
    cat("No or insufficient custom genes in scale.data; skip Heatmap.\n")
  } else {
    heat_obj <- seurat_obj
    max_cells <- 1200
    if (ncol(heat_obj) > max_cells) {
      set.seed(42)
      heat_obj <- subset(heat_obj, cells = sample(colnames(heat_obj), max_cells))
    }
    p_heat <- DoHeatmap(heat_obj, features = genes_in_scale, slot = "scale.data",
                        draw.lines = TRUE, group.colors = clusters_colors) +
      scale_fill_gradientn(colors = params$continuous_colors) +
      labs(title = "Custom Genes - Heatmap (All Clusters)")
    ggsave(file.path(out_dir, "custom_genes_clusters_heatmap.pdf"),
           p_heat, width = 12, height = 8, dpi = params$plot_dpi)
  }
  invisible(list(dotplot = p_dot, violin = if (exists("p_vln")) p_vln else NULL))
}

# 函数 2:分组比较 DE 的自定义基因图
plot_custom_genes_group_comparison <- function(seurat_obj, group_markers, custom_genes, params, clusters_colors) {
  if (is.null(group_markers) || nrow(group_markers) == 0) {
    cat("No group comparison markers; skip custom plots.\n")
    return()
  }
  if (!requireNamespace("ggrepel", quietly = TRUE)) {
    cat("Package 'ggrepel' not available; labels will be limited.\n")
  }

  # 确定分组列和簇列
  group_col <- params$group_by
  
  # 通过 params$cluster_by 定位簇列;如果缺失则回退到 Idents
  cluster_col <- params$cluster_by
  if (!(cluster_col %in% colnames(seurat_obj@meta.data))) {
    cluster_col <- "ident"
    seurat_obj@meta.data$ident <- as.character(Idents(seurat_obj))
    cat("Using Idents as cluster column for custom genes plots\n")
  } else {
    cat("Using cluster column for custom genes plots:", cluster_col, "\n")
  }

  # 1. 火山图
  for (cl in unique(group_markers$cluster)) {
    df <- group_markers[group_markers$cluster == cl, , drop = FALSE]
    if (nrow(df) == 0) next
    df$highlight <- ifelse(df$gene %in% custom_genes, "Custom", "Other")

    p_vol <- ggplot(df, aes(x = avg_log2FC, y = -log10(p_val_adj))) +
      geom_point(aes(color = highlight), alpha = 0.7, size = 1.5) +
      scale_color_manual(values = c("Custom" = "red", "Other" = "grey70")) +
      geom_hline(yintercept = -log10(0.05), linetype = "dashed", color = "red") +
      geom_vline(xintercept = c(-0.5, 0.5), linetype = "dashed", color = "red") +
      labs(title = paste0("Volcano (highlight custom) - Cluster ", cl),
           x = "Average Log2 Fold Change", y = "-log10(Adjusted P-value)", color = "") +
      theme_bw()

    if (requireNamespace("ggrepel", quietly = TRUE)) {
      lab_df <- df[df$highlight == "Custom" & df$p_val_adj < 0.05 & abs(df$avg_log2FC) > 0.5, ]
      if (nrow(lab_df) > 0) {
        p_vol <- p_vol + ggrepel::geom_text_repel(data = lab_df,
                  aes(label = gene), size = 3, max.overlaps = 20)
      }
    }
    if (!dir.exists(file.path(params$output_path, "custom_group_DE"))) dir.create(file.path(params$output_path, "custom_group_DE"), recursive = TRUE)
    ggsave(file.path(params$output_path, "custom_group_DE", paste0("custom_genes_cluster_", cl, "_volcano.pdf")),
           p_vol, width = 8, height = 6, dpi = params$plot_dpi)
  }

  # 2. 为每个簇创建分组自定义基因可视化
  clusters <- unique(group_markers$cluster)
  genes_available <- intersect(custom_genes, rownames(seurat_obj))
  
  if (length(genes_available) == 0) {
    cat("No custom genes found in object; skip cluster-specific plots.\n")
    return()
  }

  for (cl in clusters) {
    cat("Creating custom gene plots for cluster", cl, "...\n")
    
    # 子集化为特定簇
    cluster_subset <- subset(seurat_obj, 
                            cells = rownames(seurat_obj@meta.data)[
                              as.character(seurat_obj@meta.data[[cluster_col]]) == cl
                            ])
    
    if (ncol(cluster_subset) == 0) {
      cat("No cells found in cluster", cl, "; skip.\n")
      next
    }
    
    # 检查该簇中的可用分组
    cluster_groups <- unique(cluster_subset@meta.data[[group_col]])
    if (length(cluster_groups) < 2) {
      cat("Cluster", cl, "has insufficient groups; skip.\n")
      next
    }
    
    # 2.1 点图
    p_dot <- DotPlot(cluster_subset, features = genes_available, 
                     group.by = group_col) +
      scale_color_gradientn(colors = params$continuous_colors) +
      theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
      labs(title = paste("Custom Genes - Cluster", cl),
           subtitle = paste("Groups:", paste(cluster_groups, collapse = " vs ")),
           x = "Genes", y = "Groups") +
      theme_bw()
    
    ggsave(file.path(params$output_path, "custom_group_DE", paste0("custom_genes_cluster_", cl, "_dotplot.pdf")),
           p_dot, width = 12, height = 6, dpi = params$plot_dpi)
    
    # 2.2 小提琴图
    mat <- FetchData(cluster_subset, vars = genes_available, slot = "data")
    var_ok <- sapply(as.data.frame(mat), function(x) stats::var(x, na.rm = TRUE) > 0)
    genes_var <- genes_available[var_ok]
    
    if (length(genes_var) == 0) {
      cat("All custom genes have zero variance in cluster", cl, "; skip violin.\n")
    } else {
      if (length(cluster_groups) >= 2) {
        if (length(genes_var) >= 2) {
          p_vln <- VlnPlot(cluster_subset, features = genes_var, 
                           stack = TRUE, flip = TRUE, 
                           fill.by = "ident",
                           group.by = group_col,
                           cols = brewer.pal(8, params$discrete_colors)[1:2]) +
            labs(title = paste("Custom Genes - Cluster", cl),
                 subtitle = paste("Groups:", paste(cluster_groups, collapse = " vs ")))
        } else {
          p_vln <- VlnPlot(cluster_subset, features = genes_var, 
                           stack = FALSE, flip = FALSE, 
                           fill.by = "ident",
                           group.by = group_col,
                           cols = brewer.pal(8, params$discrete_colors)[1:2]) +
            labs(title = paste("Custom Gene - Cluster", cl),
                 subtitle = paste("Groups:", paste(cluster_groups, collapse = " vs ")))
        }
        
        ggsave(file.path(params$output_path, "custom_group_DE", paste0("custom_genes_cluster_", cl, "_violin.pdf")),
               p_vln, width = 6, height = 10, dpi = params$plot_dpi)
      } else {
        cat("Cluster", cl, "has only one group; skip violin.\n")
      }
    }
    
    # 2.3 热图
    genes_heat <- intersect(genes_available, rownames(GetAssayData(seurat_obj, slot = "scale.data")))
    if (length(genes_heat) > 0) {
      if (length(genes_heat) > 20) {
        genes_heat <- genes_heat[1:20]
      }
      
      if (length(cluster_groups) >= 2) {
        p_heat <- DoHeatmap(cluster_subset, features = genes_heat, 
                            slot = "scale.data", 
                            group.colors = brewer.pal(8, params$discrete_colors),
                            group.by = group_col) +
          scale_fill_gradientn(colors = params$continuous_colors) +
          labs(title = paste("Custom Genes - Cluster", cl),
               subtitle = paste("Groups:", paste(cluster_groups, collapse = " vs ")))
      } else {
        p_heat <- DoHeatmap(cluster_subset, features = genes_heat, 
                            slot = "scale.data") +
          scale_fill_gradientn(colors = params$continuous_colors) +
          labs(title = paste("Custom Genes - Cluster", cl),
               subtitle = paste("Groups:", paste(cluster_groups, collapse = " vs ")))
      }
      
      ggsave(file.path(params$output_path, "custom_group_DE", paste0("custom_genes_cluster_", cl, "_heatmap.pdf")),
             p_heat, width = 12, height = 8, dpi = params$plot_dpi)
    } else {
      cat("No custom genes found in scale.data for cluster", cl, "; skip heatmap.\n")
    }
  }
}

# 函数 3:自定义 ORA 图
plot_custom_from_ora_csv <- function(
  csv_path,
  terms,
  output_dir,
  plot_dpi = 300
) {
  if (!file.exists(csv_path)) {
    cat("ORA CSV not found:", csv_path, "\n"); return(invisible(NULL))
  }
  if (length(terms) == 0) {
    cat("No terms provided. Nothing to plot.\n"); return(invisible(NULL))
  }

  df <- suppressMessages(readr::read_csv(csv_path, show_col_types = FALSE))

  need_cols <- c("Description", "p.adjust", "Count", "cluster", "database")
  miss <- setdiff(need_cols, colnames(df))
  if (length(miss) > 0) {
    cat("CSV missing columns:", paste(miss, collapse = ", "), "\n")
    return(invisible(NULL))
  }
  if (!"direction" %in% colnames(df)) df$direction <- "All"

  # 术语
  sel <- tolower(df$Description) %in% tolower(terms)
  sub <- df[sel, , drop = FALSE]
  if (nrow(sub) == 0) {
    cat("No matched pathways in CSV (case-insensitive).\n"); return(invisible(NULL))
  }

  out_dir_base <- file.path(output_dir, "custom_ORA")
  if (!dir.exists(out_dir_base)) dir.create(out_dir_base, recursive = TRUE)

  # 生成和 -log10(padj)
  if (!"generation" %in% colnames(sub)) {
    if ("GeneRatio" %in% colnames(sub)) {
      gr <- as.character(sub$GeneRatio)
      sub$generation <- suppressWarnings(vapply(gr, function(x) {
        if (is.na(x) || x == "") return(NA_real_)
        if (grepl("/", x, fixed = TRUE)) {
          p <- strsplit(x, "/", fixed = TRUE)[[1]]
          as.numeric(p[1]) / as.numeric(p[2])
        } else as.numeric(x)
      }, numeric(1)))
    } else if ("Count" %in% colnames(sub)) {
      sub$generation <- suppressWarnings(as.numeric(sub$Count))
    } else {
      sub$generation <- NA_real_
    }
  }
  sub$neglogp <- -log10(sub$p.adjust)

  # GO
  if (!"ONTOLOGY" %in% colnames(sub)) sub$ONTOLOGY <- NA_character_

  # 绘图
  for (db in sort(unique(sub$database))) {
    for (dirc in sort(unique(sub$direction))) {
      sub_db_dir <- file.path(out_dir_base, db, dirc)
      dir_dot <- file.path(sub_db_dir, "dotplot")
      dir_bar <- file.path(sub_db_dir, "bar")
      for (d in c(dir_dot, dir_bar)) if (!dir.exists(d)) dir.create(d, recursive = TRUE)

      sub_db_dirc <- sub[sub$database == db & sub$direction == dirc, , drop = FALSE]
      if (nrow(sub_db_dirc) == 0) next

      for (cl in sort(unique(sub_db_dirc$cluster))) {
        sub_cl <- sub_db_dirc[sub_db_dirc$cluster == cl, , drop = FALSE]
        if (nrow(sub_cl) == 0) next

        # sort
        sub_cl <- sub_cl[order(sub_cl$generation, sub_cl$p.adjust), , drop = FALSE]
        sub_cl$Description <- forcats::fct_reorder(sub_cl$Description, sub_cl$generation, .desc = FALSE)

        # GO
        sub_cl$go_type <- "Other"
        if ("ONTOLOGY" %in% colnames(sub_cl)) {
          sub_cl$go_type <- toupper(ifelse(is.na(sub_cl$ONTOLOGY), "Other", as.character(sub_cl$ONTOLOGY)))
          sub_cl$go_type[!sub_cl$go_type %in% c("BP","CC","MF")] <- "Other"
        }

        # 1. 点图
        p_dot <- ggplot(sub_cl, aes(x = generation, y = Description, size = Count, color = neglogp)) +
          geom_point(aes(shape = go_type)) +
          scale_shape_manual(values = c(BP = 16, CC = 17, MF = 15, Other = 19), na.translate = FALSE) +
          scale_color_gradientn(colors = params$continuous_colors) +
          labs(title = paste0("ORA dotplot (Selected Terms) - ", dirc, " - Cluster ", cl, " (", db, ")"),
               x = "generation", y = "Enriched Terms",
               size = "Gene Count", color = "-log10(Adj.P)", shape = "GO Type") +
          theme_minimal() + theme(axis.text.y = element_text(size = 8))
        ggsave(file.path(dir_dot, paste0("cluster_", cl, "_", dirc, ".pdf")),
               p_dot, width = 12, height = 8, dpi = plot_dpi)

        # 2. 条形图
        p_bar <- ggplot(sub_cl, aes(x = generation,
                                    y = forcats::fct_reorder(Description, generation, .desc = FALSE),
                                    fill = neglogp)) +
          geom_col() +
          scale_fill_gradientn(colors = params$continuous_colors) +
          labs(title = paste0("ORA Bar (Selected Terms) - ", dirc, " - Cluster ", cl, " (", db, ")"),
               x = "generation", y = "Enriched Terms", fill = "-log10(Adj.P)") +
          theme_minimal() + theme(axis.text.y = element_text(size = 8))
        ggsave(file.path(dir_bar, paste0("cluster_", cl, "_", dirc, ".pdf")),
               p_bar, width = 12, height = 8, dpi = plot_dpi)
      }
    }
  }

  cat("Custom ORA plots saved to:", out_dir_base, "\n")
  invisible(NULL)
}

# 函数 4:自定义 GSEA 图
plot_custom_from_gsea_csv <- function(csv_path,
                                      terms,      
                                      output_dir,         
                                      plot_dpi = 300,
                                      discrete_palette = "Paired",
                                      gsea_rds_path
) {
  if (!file.exists(csv_path)) stop("GSEA CSV not found: ", csv_path)
  if (!file.exists(gsea_rds_path)) stop("GSEA RDS not found: ", gsea_rds_path)

  # 读取 GSEA 对象
  gsea_res <- readRDS(gsea_rds_path)
  gsea_objects <- gsea_res$gsea_objects
  if (is.null(gsea_objects) || !length(gsea_objects)) {
    stop("gsea_objects not found inside RDS. Ensure you saved the full result of perform_gsea_analysis().")
  }

  df <- suppressMessages(readr::read_csv(csv_path, show_col_types = FALSE))
  need_cols <- c("ID","Description","setSize","NES","p.adjust","cluster","database")
  miss <- setdiff(need_cols, colnames(df))
  if (length(miss) > 0) stop("CSV missing required columns: ", paste(miss, collapse=", "))

  if (!dir.exists(output_dir)) dir.create(output_dir, recursive = TRUE)

  if (length(terms) == 0) stop("No terms provided.")
  desc_norm  <- tolower(df$Description)
  terms_norm <- tolower(terms)
  sub <- df[desc_norm %in% terms_norm, , drop = FALSE]
  if (nrow(sub) == 0) stop("No matched pathways in CSV for given terms.")

  # GO 类型
  if ("ONTOLOGY" %in% colnames(sub)) {
    sub$go_type <- toupper(as.character(sub$ONTOLOGY))
    sub$go_type[!sub$go_type %in% c("BP","CC","MF")] <- "Other"
  } else {
    sub$go_type <- "Other"
  }
  sub$go_type <- factor(sub$go_type, levels = c("BP","CC","MF","Other"))

  base_dir <- file.path(output_dir, "custom_GSEA")
  if (!dir.exists(base_dir)) dir.create(base_dir, recursive = TRUE)

  for (db in unique(sub$database)) {
    db_dotplot_dir <- file.path(base_dir, db, "dotplot")
    db_bar_dir    <- file.path(base_dir, db, "bar")
    db_curve_dir  <- file.path(base_dir, db, "curve")
    for (d in c(db_dotplot_dir, db_bar_dir, db_curve_dir)) if (!dir.exists(d)) dir.create(d, recursive = TRUE)

    for (cl in unique(sub$cluster)) {
      cl_df <- sub[sub$cluster == cl & sub$database == db, , drop = FALSE]
      if (nrow(cl_df) == 0) next

      # 1. barplot
      p_bar <- ggplot(cl_df,
                      aes(x = reorder(Description, NES),
                          y = NES,
                          fill = -log10(p.adjust))) +
        geom_bar(stat = "identity") +
        coord_flip() +
        scale_fill_gradientn(colors = params$continuous_colors, name = "-log10(Adj.P)") +
        labs(title = paste0("GSEA Bar (Selected Terms) - Cluster ", cl, " (", db, ")"),
             x = "Enriched Terms", y = "NES") +
        theme_minimal() + theme(axis.text.y = element_text(size = 8))
      ggsave(file.path(db_bar_dir, paste0("cluster_", cl, "_custom_bar.pdf")),
             p_bar, width = 12, height = 8, dpi = plot_dpi)

      # 2. dotplot
      p_dot <- ggplot(cl_df,
                      aes(x = NES, y = reorder(Description, NES),
                          size = setSize, color = -log10(p.adjust),
                          shape = go_type)) +
        scale_color_gradientn(colors = params$continuous_colors, name = "-log10(Adj.P)") +
        scale_size_continuous(name = "Set Size", range = c(2, 8)) +
        scale_shape_manual(values = c("BP"=18, "CC"=17, "MF"=15, "Other"=16), name = "ont") +
        geom_point(alpha = 0.8) +
        labs(title = paste0("GSEA DotPlot (Selected Terms) - Cluster ", cl, " (", db, ")"),
             x = "NES", y = "Enriched Terms") +
        theme_minimal() + theme(axis.text.y = element_text(size = 8)) +
        geom_vline(xintercept = 0, linetype = "dashed", color = "grey50")
      ggsave(file.path(db_dotplot_dir, paste0("cluster_", cl, "_custom_dotplot.pdf")),
             p_dot, width = 12, height = 8, dpi = plot_dpi)

      # 3. curve
      keys <- names(gsea_objects)
      keys <- keys[grepl(paste0("^", db, "_"), keys)]
      keys <- keys[grepl(paste0("_", cl, "$"), keys)]
      if (length(keys) == 0) {
        warning("No gseaResult object found for ", db, " cluster=", cl, ". Skip curves."); next
      }
      gobj <- gsea_objects[[keys[1]]]
      if (!(inherits(gobj, "gseaResult") && nrow(gobj) > 0)) {
        warning("gsea_object is not a valid gseaResult or empty for ", db, " cluster=", cl); next
      }

      for (i in seq_len(nrow(cl_df))) {
        term_id   <- cl_df$ID[i]
        term_desc <- cl_df$Description[i]
        clean_nm  <- gsub("[^A-Za-z0-9]", "_", term_desc)
        idx <- which(gobj@result$ID == term_id)
        if (length(idx) == 0) next
        gp <- gseaplot2(gobj, geneSetID = idx,
                        title = paste0("GSEA - ", term_desc, " (Cluster ", cl, ")"))
        ggsave(file.path(db_curve_dir, paste0("cluster_", cl, "_", clean_nm, ".pdf")),
               gp, width = 10, height = 8, dpi = plot_dpi)
      }
    }
  }
  cat("Custom GSEA plots (bar, dot, curves) saved to:", base_dir, "\n")
  invisible(TRUE)
}

执行自定义基因和通路可视化

R
# 自定义绘图
if (custom_plot) {
  if (params$differential_analysis_type == "clusters") {
      # 跨簇可视化自定义基因
      cat("Performing ALL CLUSTERS differential expression analysis...\n")
      plot_custom_genes_clusters(seurat_obj, custom_genes, params, clusters_colors)
  } else if (params$differential_analysis_type == "group_comparison") {
      # 在簇内按分组拆分可视化自定义基因
      cat("Performing GROUP COMPARISON differential expression analysis...\n")
      plot_custom_genes_group_comparison(seurat_obj, group_markers, custom_genes, params, clusters_colors)
  }

  if (params$enrichment_analysis_type == "ORA") {
    # 绘制来自 ORA CSV 的选定通路
    file_path <- file.path(params$output_path, "ORA", "ora_enrichment_results.csv")
    plot_custom_from_ora_csv(
      csv_path       = file_path,
      terms          = terms,
      output_dir     = params$output_path,
      plot_dpi       = params$plot_dpi
    )
  } else if (params$enrichment_analysis_type == "GSEA") {
    # 绘制来自 GSEA CSV 的选定通路
    file_path <- file.path(params$output_path, "GSEA", "gsea_enrichment_results.csv")
    plot_custom_from_gsea_csv(
      csv_path       = file_path,
      terms          = terms,
      output_dir     = params$output_path,
      plot_dpi       = params$plot_dpi,
      discrete_palette = params$discrete_colors,
      gsea_rds_path = file.path(params$output_path, "GSEA", "gsea_results.rds")
    )
  }
}

模版信息

模版发布日期:2025年9月16日
最后更新:2025年9月16日

联系方式:有关本教程的问题、问题或建议,请通过 “智能咨询” 服务联系我们。

参考资料

[1] Seurat Package: https://github.com/satijalab/Seurat

[2] clusterProfiler: https://github.com/YuLab-SMU/clusterProfiler

[3] Gene Ontology: http://geneontology.org/

[4] KEGG: https://www.genome.jp/kegg/

[5] MSigDB: https://www.gsea-msigdb.org/

[6] Reactome: https://reactome.org/ ### 本教程引用 Liu X., Wu C., Pan L., et al. (2025). SeekSoul Online: A user-friendly bioinformatics platform focused on single-cell multi-omics analysis. The Innovation Life 3:100156. https://doi.org/10.59717/j.xinn-life.2025.100156

0 条评论·0 条回复