Visualization of Single-cell Gene Expression Correlation Scatter Plots
Time: 11 min
Words: 2.2k words
Updated: 2026-02-27
Reads: 0 times
Environment Setup
R
# Load required R packages
library(Seurat) # For single-cell data analysis
library(ggplot2) # For data visualization
library(ggpubr) # For adding statistical info to plots
library(ggExtra) # For adding marginal distribution plots
library(dplyr) # For data processing and pipeline operationsData Loading and Preprocessing
R
# Read data and preprocessing
seurat.obj <- readRDS("data/AY1739512568405/input.rds") # Read Seurat object
meta <- read.table("data/AY1739512568405/meta.tsv", header=T, sep="\t", row.names = 1) # Read metadata
obj <- AddMetaData(seurat.obj, meta) # Add metadata to Seurat object
DefaultAssay(obj) = "RNA" # Set default assay to RNA expression data
head(obj@meta.data) # View first few rows of metadataR
# View sample and cell type information
unique(obj@meta.data$Sample) # View all unique sample names
unique(obj@meta.data$celltype) # View all unique cell typesR
# Set target cell type and sample
target_celltypes <- c('T cell') # Specify target cell type as T cell
target_samples1 <- c('S133T') # Specify single target sample
# Create subset objects
sub_obj1 <- subset(obj, subset = Sample %in% target_samples1 & celltype %in% target_celltypes) # Create single sample T cell subset
sub_obj2 <- subset(obj, subset = celltype %in% target_celltypes) # Create all samples T cell subset
# Verify subset data
unique(sub_obj1@meta.data$Sample) # Check sample of subset 1
unique(sub_obj1@meta.data$celltype) # Check cell type of subset 1
unique(sub_obj2@meta.data$Sample) # Check sample of subset 2
unique(sub_obj2@meta.data$celltype) # Check cell type of subset 2Single Sample Correlation Analysis
R
# Extract single sample gene expression data
plot_data1 <- FetchData(sub_obj1,
vars = c("PPT1", "TRAF1"),
slot = "data") # Extract PPT1 and TRAF1 expression values from normalized data
# Data quality check
print("Data summary:")
summary(plot_data1) # Show statistical summary of data
print("Number of NA values:")
print(colSums(is.na(plot_data1))) # Check number of NA values
print("Number of 0 values:")
print(colSums(plot_data1 == 0)) # Check number of 0 values
print("Number of cells with both genes as 0:")
sum(plot_data1$PPT1== 0 & plot_data1$TRAF1 == 0) # Check number of cells with double zero expressionR
# Filter data
plot_data1_filtered <- plot_data1[plot_data1$PPT1 > 0 & plot_data1$TRAF1 > 0, ] # Filter double gene expressing cells
print("Data summary after filtering:")
summary(plot_data1_filtered) # Show statistical summary after filtering
print(paste0("Cell count after filtering:", nrow(plot_data1_filtered))) # Show cell count after filteringR
# Create single sample correlation scatter plot
options(repr.plot.height=8, repr.plot.width=8) # Set figure size
p1 <- ggplot(plot_data1, aes(x = PPT1, y = TRAF1)) +
geom_point(size = 1, alpha = 0.5, color = "grey30") + # Add points
geom_smooth(method = "lm", color = "blue", se = TRUE, fill = "grey90") + # Add fitted line
labs(x = "PPT1 expression", y = "TRAF1 expression") + # Set axis labels
theme_bw() + # Set theme
# Custom theme details
theme(
panel.grid.major = element_line(color = "grey90", linewidth = 0.3),
panel.grid.minor = element_line(color = "grey90", linewidth = 0.3),
panel.background = element_rect(fill = "white"),
panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5),
axis.ticks = element_line(color = "black", linewidth = 0.3),
axis.text = element_text(color = "black", size = 10),
axis.title = element_text(color = "black", size = 16),
axis.ticks.length = unit(0.2, "cm"),
plot.background = element_rect(fill = "white")
) +
# Set x-axis ticks
scale_x_continuous(
breaks = seq(0, max(plot_data1$PPT1), by = 1),
minor_breaks = seq(0, max(plot_data1$TRAF1), by = 0.5)
) +
# Set y-axis ticks
scale_y_continuous(
limits = c(0, max(plot_data1$TRAF1)),
breaks = seq(0, max(plot_data1$TRAF1), by = 1),
minor_breaks = seq(0, max(plot_data1$TRAF1), by = 0.5)
) +
# Add correlation statistics
stat_cor(method = "pearson",
label.x.npc = "left",
label.y.npc = "top",
size = 4)
# Add marginal distribution plots
p1_with_marginal <- ggMarginal(p1,
type = "density", # Set marginal plot type to density
margins = "both", # Show marginal distribution for both x and y axes
size = 5,
xparams = list(fill = "orange", alpha = 1), # Set x-axis marginal plot style
yparams = list(fill = "blue", alpha = 1)) # Set y-axis marginal plot style
p1_with_marginal # Show plotR
# Create scatter plot for filtered data
options(repr.plot.height=8, repr.plot.width=8) # Set figure size
p1_filtered <- ggplot(plot_data1_filtered, aes(x = PPT1, y = TRAF1)) +
geom_point(size = 1, alpha = 0.5, color = "grey30") + # Add points
geom_smooth(method = "lm", color = "blue", se = TRUE, fill = "grey90") + # Add fitted line and confidence interval
labs(x = "PPT1 expression", y = "TRAF1 expression") + # Set axis labels
# Set theme and style
theme_bw() +
theme(
panel.grid.major = element_line(color = "grey90", linewidth = 0.3), # Major grid lines
panel.grid.minor = element_line(color = "grey90", linewidth = 0.3), # Minor grid lines
panel.background = element_rect(fill = "white"), # Background color
panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5), # Border
axis.ticks = element_line(color = "black", linewidth = 0.3), # Ticks
axis.text = element_text(color = "black", size = 10), # Axis text
axis.title = element_text(color = "black", size = 16), # Axis title
axis.ticks.length = unit(0.2, "cm"), # Tick length
plot.background = element_rect(fill = "white") # Plot background
) +
# Set x-axis ticks
scale_x_continuous(
breaks = seq(0, max(plot_data1_filtered$PPT1), by = 1),
minor_breaks = seq(0, max(plot_data1_filtered$TRAF1), by = 0.5)
) +
# Set y-axis ticks
scale_y_continuous(
limits = c(0, max(plot_data1_filtered$TRAF1)),
breaks = seq(0, max(plot_data1_filtered$TRAF1), by = 1),
minor_breaks = seq(0, max(plot_data1_filtered$TRAF1), by = 0.5)
) +
# Add correlation statistics
stat_cor(method = "pearson",
label.x.npc = "left",
label.y.npc = "top",
size = 4)
# Add marginal distribution to filtered scatter plot
p1_filtered_with_marginal <- ggMarginal(p1_filtered,
type = "density", # Set marginal plot type to density
margins = "both", # Show marginal distribution for both axes
size = 5,
xparams = list(fill = "orange", alpha = 1), # x-axis marginal plot style
yparams = list(fill = "blue", alpha = 1)) # y-axis marginal plot style
p1_filtered_with_marginal # Show plotFigure Legend
This is a gene expression correlation scatter plot showing the relationship between PPT1 and TRAF1 expression levels in cells:
(1) Main Plot
- X-axis: Represents PPT1 gene expression level
- Y-axis: Represents TRAF1 gene expression level
- Points: Each gray dot represents a cell
- Fitted Line: Blue line is the fitted line, indicating the overall trend of expression
- Confidence Interval: Light gray area is the confidence interval of the fitted line
(2) Statistics in the plot
- R = 0.72: Correlation coefficient, ranging from -1 to 1. 0.72 indicates a strong positive correlation
- p = 2.5e-16: Extremely small p-value, indicating statistical significance
(3) Marginal Distribution Plots
- Top orange curve: Shows distribution of PPT1 expression
- Right blue curve: Shows distribution of TRAF1 expression
Biological Significance PPT1 and TRAF1 expression show significant positive correlation; when one increases, the other tends to increase. This correlation may suggest functional linkage or shared regulatory mechanisms.
Multi-sample Correlation Analysis
R
# Extract multi-sample expression data and group info
plot_data2 <- FetchData(sub_obj2,
vars = c("PPT1", "TRAF1", "Sample"), # Extract gene expression values and sample info
slot = "data") # Use normalized data
# Data quality check
print("Data summary:")
summary(plot_data2) # Show data summary
print("Number of NA values:")
print(colSums(is.na(plot_data2))) # Check NA values
print("Number of 0 values:")
print(colSums(plot_data2 == 0)) # Check 0 values
print("Number of cells with both genes as 0:")
sum(plot_data2$PPT1== 0 & plot_data2$TRAF1 == 0) # Check double zero expressing cellsR
# Filter multi-sample data
plot_data2_filtered <- plot_data2[plot_data2$PPT1 > 0 & plot_data2$TRAF1 > 0, ] # Filter double gene expressing cells
print("Data summary after filtering:")
summary(plot_data2_filtered) # Show statistical summary after filtering
print(paste0("Cell count after filtering:", nrow(plot_data2_filtered))) # Show cell count after filteringR
# Create multi-sample scatter plot
options(repr.plot.height=8, repr.plot.width=8) # Set figure size
p2 <- ggplot(plot_data2, aes(x = PPT1, y = TRAF1, color = Sample)) + # Color by sample
geom_point(size = 1.5, alpha = 0.8) + # Add points
labs(title = "PPT1 vs TRAF1 Correlation", # Set title
x = "PPT1 expression", # x-axis label
y = "TRAF1 expression") + # y-axis label
# Set theme and style
theme_bw() +
theme(
panel.grid.major = element_line(color = "grey90", linewidth = 0.3), # Major grid lines
panel.grid.minor = element_line(color = "grey95", linewidth = 0.3), # Minor grid lines
panel.background = element_rect(fill = "white"), # Background
panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5), # Border
axis.ticks = element_line(color = "black", linewidth = 0.3), # Ticks
axis.text = element_text(color = "black", size = 16), # Axis text
axis.title = element_text(color = "black", size = 16), # Axis title
legend.title = element_text(size = 16), # Legend title
legend.text = element_text(size = 16), # Legend text
legend.position = "right", # Legend position
plot.title = element_text(size = 16, hjust = 0) # Plot title
) +
# Set x-axis ticks
scale_x_continuous(
breaks = seq(0, max(plot_data2$PPT1), by = 1),
minor_breaks = seq(0, max(plot_data2$TRAF1), by = 0.5)
) +
# Set y-axis ticks
scale_y_continuous(
limits = c(0, max(plot_data2$TRAF1)),
breaks = seq(0, max(plot_data2$TRAF1), by = 1),
minor_breaks = seq(0, max(plot_data2$TRAF1), by = 0.5)
)
p2 # Show plotFigure Legend
This is a multi-sample gene expression correlation scatter plot showing the relationship between PPT1 and TRAF1 expression in different samples (Correlation coefficients and P-values for each sample can be found in the output correlation_results.txt below):
(1) Basic Structure
- X-axis: PPT1 gene expression level
- Y-axis: TRAF1 gene expression level
- Legend: Right legend, different colors represent different samples (S133A to S159T; 'A' suffix for adjacent normal, 'T' for tumor)
- Points: Each dot represents a cell
(2) Data Features
- Positive correlation between PPT1 and TRAF1 is observed in multiple samples
- Many points near 0 on x/y axes indicate low expression of both genes in some cells
- Correlation patterns may vary slightly between samples
Biological Significance
- Compare expression relationship differences between samples (esp. tumor vs adjacent)
- Helps understand regulatory relationships in different pathological states
- May reveal potential sample-specific expression patterns
R
# Calculate correlation statistics for each sample
correlation_results <- plot_data2 %>%
group_by(Sample) %>% # Group by sample
summarise(
Correlation = cor(PPT1, TRAF1, method = "pearson"), # Calculate Pearson correlation
P_value = cor.test(PPT1, TRAF1, method = "pearson")$p.value, # Calculate p-value
Sample_size = n() # Calculate sample size
) %>%
mutate(
P_value = format(P_value, scientific = TRUE, digits = 3) # Format p-value to scientific notation
)
# Save correlation results
write.table(correlation_results,
file = "Tcell_allsampes_PPT1_TRAF1_correlation_results.txt",
sep = "\t", # Tab separated
row.names = FALSE, # Exclude row names
quote = FALSE) # Do not quoteR
# Create filtered multi-sample scatter plot
options(repr.plot.height=8, repr.plot.width=8) # Set figure size
p2_filtered <- ggplot(plot_data2_filtered, aes(x = PPT1, y = TRAF1, color = Sample)) + # Color by sample
geom_point(size = 1.5, alpha = 0.8) + # Add points
labs(title = "PPT1 vs TRAF1 Correlation", # Set title
x = "PPT1 expression", # x-axis label
y = "TRAF1 expression") + # y-axis label
# Set theme and style
theme_bw() +
theme(
panel.grid.major = element_line(color = "grey90", linewidth = 0.3), # Major grid lines
panel.grid.minor = element_line(color = "grey95", linewidth = 0.3), # Minor grid lines
panel.background = element_rect(fill = "white"), # Background
panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5), # Border
axis.ticks = element_line(color = "black", linewidth = 0.3),# Axis ticks
axis.text = element_text(color = "black", size = 16), # Axis text
axis.title = element_text(color = "black", size = 16), # Axis title
legend.title = element_text(size = 16), # Legend title
legend.text = element_text(size = 16), # Legend text
legend.position = "right", # Legend position
plot.title = element_text(size = 16, hjust = 0) # Plot title
) +
# Set x-axis ticks
scale_x_continuous(
breaks = seq(0, max(plot_data2_filtered$PPT1), by = 1),
minor_breaks = seq(0, max(plot_data2_filtered$TRAF1), by = 0.5)
) +
# Set y-axis ticks
scale_y_continuous(
limits = c(0, max(plot_data2_filtered$TRAF1)),
breaks = seq(0, max(plot_data2_filtered$TRAF1), by = 1),
minor_breaks = seq(0, max(plot_data2_filtered$TRAF1), by = 0.5)
)
p2_filtered # Show plotR
# Calculate correlation statistics for each sample after filtering
correlation_results_filtered <- plot_data2_filtered %>%
group_by(Sample) %>% # Group by sample
summarise(
Correlation = cor(PPT1, TRAF1, method = "pearson"), # Calculate Pearson correlation
# Calculate p-value
P_value = cor.test(PPT1, TRAF1, method = "pearson")$p.value, # Calculate p-value
Sample_size = n() # Calculate sample size
) %>%
# Convert values to scientific notation
mutate(
P_value = format(P_value, scientific = TRUE, digits = 3)
)
# Save correlation results
write.table(correlation_results_filtered, # Format p-values as scientific notation
file = "Tcell_allsampes_PPT1_TRAF1_filtered_correlation_results.txt",
sep = "\t", # Tab separated
row.names = FALSE, # Exclude row names
quote = FALSE) # Do not quote stringsR
# Save plot
ggsave("Tcell_S133T_PPT1_TRAF1_filtered_correlation_with_marginal.pdf",
plot = p1_filtered_with_marginal, width = 8, height = 8) # Save filtered single-sample plot
ggsave("Tcell_allsamples_PPT1_TRAF1_correlation.pdf",
plot = p2, width = 8, height = 8) # Save original multi-sample plot
ggsave("Tcell_allsamples_PPT1_TRAF1_filtered_correlation.pdf",
plot = p2_filtered, width = 8, height = 8) # Save filtered multi-sample plot