Skip to content

Visualization of Single-cell Gene Expression Correlation Scatter Plots

Author: SeekGene
Time: 11 min
Words: 2.2k words
Updated: 2026-02-27
Reads: 0 times
3' scRNA-seq 5' + Immune Profiling FFPE scRNA-seq Notebooks Plotting Spatial-seq scATAC + RNA-seq scFAST-seq scMethyl + RNA-seq

Environment Setup

R
# Load required R packages
library(Seurat)      # For single-cell data analysis
library(ggplot2)     # For data visualization
library(ggpubr)      # For adding statistical info to plots
library(ggExtra)     # For adding marginal distribution plots
library(dplyr)       # For data processing and pipeline operations

Data Loading and Preprocessing

R
# Read data and preprocessing
seurat.obj <- readRDS("data/AY1739512568405/input.rds")  # Read Seurat object
meta <- read.table("data/AY1739512568405/meta.tsv", header=T, sep="\t", row.names = 1)  # Read metadata
obj <- AddMetaData(seurat.obj, meta)  # Add metadata to Seurat object
DefaultAssay(obj) = "RNA"  # Set default assay to RNA expression data
head(obj@meta.data)  # View first few rows of metadata
R
# View sample and cell type information
unique(obj@meta.data$Sample)  # View all unique sample names
unique(obj@meta.data$celltype)  # View all unique cell types
R
# Set target cell type and sample
target_celltypes <- c('T cell')  # Specify target cell type as T cell
target_samples1 <- c('S133T')    # Specify single target sample
# Create subset objects
sub_obj1 <- subset(obj, subset = Sample %in% target_samples1 & celltype %in% target_celltypes)  # Create single sample T cell subset
sub_obj2 <- subset(obj, subset = celltype %in% target_celltypes)  # Create all samples T cell subset

# Verify subset data
unique(sub_obj1@meta.data$Sample)    # Check sample of subset 1
unique(sub_obj1@meta.data$celltype)  # Check cell type of subset 1
unique(sub_obj2@meta.data$Sample)    # Check sample of subset 2
unique(sub_obj2@meta.data$celltype)  # Check cell type of subset 2

Single Sample Correlation Analysis

R
# Extract single sample gene expression data
plot_data1 <- FetchData(sub_obj1, 
                      vars = c("PPT1", "TRAF1"),
                      slot = "data")  # Extract PPT1 and TRAF1 expression values from normalized data

# Data quality check
print("Data summary:")
summary(plot_data1)  # Show statistical summary of data
print("Number of NA values:")
print(colSums(is.na(plot_data1)))  # Check number of NA values
print("Number of 0 values:")
print(colSums(plot_data1 == 0))  # Check number of 0 values
print("Number of cells with both genes as 0:")
sum(plot_data1$PPT1== 0 & plot_data1$TRAF1 == 0)  # Check number of cells with double zero expression
R
# Filter data
plot_data1_filtered <- plot_data1[plot_data1$PPT1 > 0 & plot_data1$TRAF1 > 0, ]  # Filter double gene expressing cells
print("Data summary after filtering:")
summary(plot_data1_filtered)  # Show statistical summary after filtering
print(paste0("Cell count after filtering:", nrow(plot_data1_filtered)))  # Show cell count after filtering
R
# Create single sample correlation scatter plot
options(repr.plot.height=8, repr.plot.width=8)  # Set figure size
p1 <- ggplot(plot_data1, aes(x = PPT1, y = TRAF1)) +
    geom_point(size = 1, alpha = 0.5, color = "grey30") +  # Add points
    geom_smooth(method = "lm", color = "blue", se = TRUE, fill = "grey90") +  # Add fitted line
    labs(x = "PPT1 expression", y = "TRAF1 expression") +  # Set axis labels
    theme_bw() +  # Set theme
    # Custom theme details
    theme(
        panel.grid.major = element_line(color = "grey90", linewidth = 0.3),
        panel.grid.minor = element_line(color = "grey90", linewidth = 0.3),
        panel.background = element_rect(fill = "white"),
        panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5),
        axis.ticks = element_line(color = "black", linewidth = 0.3),
        axis.text = element_text(color = "black", size = 10),
        axis.title = element_text(color = "black", size = 16),
        axis.ticks.length = unit(0.2, "cm"),
        plot.background = element_rect(fill = "white")
    ) +
    # Set x-axis ticks
    scale_x_continuous(
        breaks = seq(0, max(plot_data1$PPT1), by = 1),
        minor_breaks = seq(0, max(plot_data1$TRAF1), by = 0.5)
    ) +
    # Set y-axis ticks
    scale_y_continuous(
        limits = c(0, max(plot_data1$TRAF1)),
        breaks = seq(0, max(plot_data1$TRAF1), by = 1),
        minor_breaks = seq(0, max(plot_data1$TRAF1), by = 0.5)
    ) +
    # Add correlation statistics
    stat_cor(method = "pearson",
             label.x.npc = "left",
             label.y.npc = "top",
             size = 4)
# Add marginal distribution plots
p1_with_marginal <- ggMarginal(p1,
                             type = "density",  # Set marginal plot type to density
                             margins = "both",  # Show marginal distribution for both x and y axes
                             size = 5,
                             xparams = list(fill = "orange", alpha = 1),  # Set x-axis marginal plot style
                             yparams = list(fill = "blue", alpha = 1))    # Set y-axis marginal plot style
p1_with_marginal  # Show plot
R
# Create scatter plot for filtered data
options(repr.plot.height=8, repr.plot.width=8)  # Set figure size
p1_filtered <- ggplot(plot_data1_filtered, aes(x = PPT1, y = TRAF1)) +
    geom_point(size = 1, alpha = 0.5, color = "grey30") +  # Add points
    geom_smooth(method = "lm", color = "blue", se = TRUE, fill = "grey90") +  # Add fitted line and confidence interval
    labs(x = "PPT1 expression", y = "TRAF1 expression") +  # Set axis labels
    # Set theme and style
    theme_bw() +
    theme(
        panel.grid.major = element_line(color = "grey90", linewidth = 0.3),  # Major grid lines
        panel.grid.minor = element_line(color = "grey90", linewidth = 0.3),  # Minor grid lines
        panel.background = element_rect(fill = "white"),  # Background color
        panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5),  # Border
        axis.ticks = element_line(color = "black", linewidth = 0.3),  # Ticks
        axis.text = element_text(color = "black", size = 10),  # Axis text
        axis.title = element_text(color = "black", size = 16),  # Axis title
        axis.ticks.length = unit(0.2, "cm"),  # Tick length
        plot.background = element_rect(fill = "white")  # Plot background
    ) +
    # Set x-axis ticks
    scale_x_continuous(
        breaks = seq(0, max(plot_data1_filtered$PPT1), by = 1),
        minor_breaks = seq(0, max(plot_data1_filtered$TRAF1), by = 0.5)
    ) +
    # Set y-axis ticks
    scale_y_continuous(
        limits = c(0, max(plot_data1_filtered$TRAF1)),
        breaks = seq(0, max(plot_data1_filtered$TRAF1), by = 1),
        minor_breaks = seq(0, max(plot_data1_filtered$TRAF1), by = 0.5)
    ) +
    # Add correlation statistics
    stat_cor(method = "pearson",
             label.x.npc = "left",
             label.y.npc = "top",
             size = 4)

# Add marginal distribution to filtered scatter plot
p1_filtered_with_marginal <- ggMarginal(p1_filtered,
                             type = "density",  # Set marginal plot type to density
                             margins = "both",  # Show marginal distribution for both axes
                             size = 5,
                             xparams = list(fill = "orange", alpha = 1),  # x-axis marginal plot style
                             yparams = list(fill = "blue", alpha = 1))    # y-axis marginal plot style
p1_filtered_with_marginal  # Show plot

Figure Legend

This is a gene expression correlation scatter plot showing the relationship between PPT1 and TRAF1 expression levels in cells:

(1) Main Plot

  • X-axis: Represents PPT1 gene expression level
  • Y-axis: Represents TRAF1 gene expression level
  • Points: Each gray dot represents a cell
  • Fitted Line: Blue line is the fitted line, indicating the overall trend of expression
  • Confidence Interval: Light gray area is the confidence interval of the fitted line

(2) Statistics in the plot

  • R = 0.72: Correlation coefficient, ranging from -1 to 1. 0.72 indicates a strong positive correlation
  • p = 2.5e-16: Extremely small p-value, indicating statistical significance

(3) Marginal Distribution Plots

  • Top orange curve: Shows distribution of PPT1 expression
  • Right blue curve: Shows distribution of TRAF1 expression

Biological Significance PPT1 and TRAF1 expression show significant positive correlation; when one increases, the other tends to increase. This correlation may suggest functional linkage or shared regulatory mechanisms.

Multi-sample Correlation Analysis

R
# Extract multi-sample expression data and group info
plot_data2 <- FetchData(sub_obj2, 
                      vars = c("PPT1", "TRAF1", "Sample"),  # Extract gene expression values and sample info
                      slot = "data")  # Use normalized data

# Data quality check
print("Data summary:")
summary(plot_data2)  # Show data summary
print("Number of NA values:")
print(colSums(is.na(plot_data2)))  # Check NA values
print("Number of 0 values:")
print(colSums(plot_data2 == 0))  # Check 0 values
print("Number of cells with both genes as 0:")
sum(plot_data2$PPT1== 0 & plot_data2$TRAF1 == 0)  # Check double zero expressing cells
R
# Filter multi-sample data
plot_data2_filtered <- plot_data2[plot_data2$PPT1 > 0 & plot_data2$TRAF1 > 0, ]  # Filter double gene expressing cells
print("Data summary after filtering:")
summary(plot_data2_filtered)  # Show statistical summary after filtering
print(paste0("Cell count after filtering:", nrow(plot_data2_filtered)))  # Show cell count after filtering
R
# Create multi-sample scatter plot
options(repr.plot.height=8, repr.plot.width=8)  # Set figure size
p2 <- ggplot(plot_data2, aes(x = PPT1, y = TRAF1, color = Sample)) +  # Color by sample
    geom_point(size = 1.5, alpha = 0.8) +  # Add points
    labs(title = "PPT1 vs TRAF1 Correlation",  # Set title
         x = "PPT1 expression",  # x-axis label
         y = "TRAF1 expression") +  # y-axis label
    # Set theme and style
    theme_bw() +
    theme(
        panel.grid.major = element_line(color = "grey90", linewidth = 0.3),  # Major grid lines
        panel.grid.minor = element_line(color = "grey95", linewidth = 0.3),  # Minor grid lines
        panel.background = element_rect(fill = "white"),  # Background
        panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5),  # Border
        axis.ticks = element_line(color = "black", linewidth = 0.3),  # Ticks
        axis.text = element_text(color = "black", size = 16),  # Axis text
        axis.title = element_text(color = "black", size = 16),  # Axis title
        legend.title = element_text(size = 16),  # Legend title
        legend.text = element_text(size = 16),  # Legend text
        legend.position = "right",  # Legend position
        plot.title = element_text(size = 16, hjust = 0)  # Plot title
    ) +
    # Set x-axis ticks
    scale_x_continuous(
        breaks = seq(0, max(plot_data2$PPT1), by = 1),
        minor_breaks = seq(0, max(plot_data2$TRAF1), by = 0.5)
    ) +
    # Set y-axis ticks
    scale_y_continuous(
        limits = c(0, max(plot_data2$TRAF1)),
        breaks = seq(0, max(plot_data2$TRAF1), by = 1),
        minor_breaks = seq(0, max(plot_data2$TRAF1), by = 0.5)
    )
p2  # Show plot

Figure Legend

This is a multi-sample gene expression correlation scatter plot showing the relationship between PPT1 and TRAF1 expression in different samples (Correlation coefficients and P-values for each sample can be found in the output correlation_results.txt below):

(1) Basic Structure

  • X-axis: PPT1 gene expression level
  • Y-axis: TRAF1 gene expression level
  • Legend: Right legend, different colors represent different samples (S133A to S159T; 'A' suffix for adjacent normal, 'T' for tumor)
  • Points: Each dot represents a cell

(2) Data Features

  • Positive correlation between PPT1 and TRAF1 is observed in multiple samples
  • Many points near 0 on x/y axes indicate low expression of both genes in some cells
  • Correlation patterns may vary slightly between samples

Biological Significance

  • Compare expression relationship differences between samples (esp. tumor vs adjacent)
  • Helps understand regulatory relationships in different pathological states
  • May reveal potential sample-specific expression patterns
R
# Calculate correlation statistics for each sample
correlation_results <- plot_data2 %>%
  group_by(Sample) %>%  # Group by sample
  summarise(
    Correlation = cor(PPT1, TRAF1, method = "pearson"),  # Calculate Pearson correlation
    P_value = cor.test(PPT1, TRAF1, method = "pearson")$p.value,  # Calculate p-value
    Sample_size = n()  # Calculate sample size
  ) %>%
  mutate(
    P_value = format(P_value, scientific = TRUE, digits = 3)  # Format p-value to scientific notation
  )

# Save correlation results
write.table(correlation_results, 
            file = "Tcell_allsampes_PPT1_TRAF1_correlation_results.txt", 
            sep = "\t",             # Tab separated
            row.names = FALSE,      # Exclude row names
            quote = FALSE)          # Do not quote
R
# Create filtered multi-sample scatter plot
options(repr.plot.height=8, repr.plot.width=8)  # Set figure size
p2_filtered <- ggplot(plot_data2_filtered, aes(x = PPT1, y = TRAF1, color = Sample)) +  # Color by sample
    geom_point(size = 1.5, alpha = 0.8) +  # Add points
    labs(title = "PPT1 vs TRAF1 Correlation",  # Set title
         x = "PPT1 expression",  # x-axis label
         y = "TRAF1 expression") +  # y-axis label
    # Set theme and style
    theme_bw() +
    theme(
        panel.grid.major = element_line(color = "grey90", linewidth = 0.3),  # Major grid lines
        panel.grid.minor = element_line(color = "grey95", linewidth = 0.3),  # Minor grid lines
        panel.background = element_rect(fill = "white"),  # Background
        panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5),  # Border
        axis.ticks = element_line(color = "black", linewidth = 0.3),# Axis ticks
        axis.text = element_text(color = "black", size = 16),  # Axis text
        axis.title = element_text(color = "black", size = 16),  # Axis title
        legend.title = element_text(size = 16),  # Legend title
        legend.text = element_text(size = 16),  # Legend text
        legend.position = "right",  # Legend position
        plot.title = element_text(size = 16, hjust = 0)  # Plot title
    ) +
    # Set x-axis ticks
    scale_x_continuous(
        breaks = seq(0, max(plot_data2_filtered$PPT1), by = 1),
        minor_breaks = seq(0, max(plot_data2_filtered$TRAF1), by = 0.5)
    ) +
    # Set y-axis ticks
    scale_y_continuous(
        limits = c(0, max(plot_data2_filtered$TRAF1)),
        breaks = seq(0, max(plot_data2_filtered$TRAF1), by = 1),
        minor_breaks = seq(0, max(plot_data2_filtered$TRAF1), by = 0.5)
    )
p2_filtered  # Show plot
R
# Calculate correlation statistics for each sample after filtering
correlation_results_filtered <- plot_data2_filtered %>%
  group_by(Sample) %>%  # Group by sample
  summarise(
    Correlation = cor(PPT1, TRAF1, method = "pearson"),  # Calculate Pearson correlation
    # Calculate p-value
    P_value = cor.test(PPT1, TRAF1, method = "pearson")$p.value,  # Calculate p-value
    Sample_size = n()  # Calculate sample size
  ) %>%
  # Convert values to scientific notation
  mutate(
    P_value = format(P_value, scientific = TRUE, digits = 3)
  )

# Save correlation results
write.table(correlation_results_filtered,  # Format p-values as scientific notation
            file = "Tcell_allsampes_PPT1_TRAF1_filtered_correlation_results.txt", 
            sep = "\t",             # Tab separated
            row.names = FALSE,      # Exclude row names
            quote = FALSE)          # Do not quote strings
R
# Save plot
ggsave("Tcell_S133T_PPT1_TRAF1_filtered_correlation_with_marginal.pdf", 
       plot = p1_filtered_with_marginal, width = 8, height = 8)  # Save filtered single-sample plot
ggsave("Tcell_allsamples_PPT1_TRAF1_correlation.pdf", 
       plot = p2, width = 8, height = 8)  # Save original multi-sample plot
ggsave("Tcell_allsamples_PPT1_TRAF1_filtered_correlation.pdf", 
       plot = p2_filtered, width = 8, height = 8)  # Save filtered multi-sample plot
0 comments·0 replies