SeekSpace 高级分析:空间基因表达密度分布图绘制
时长: 17 分钟
字数: 3.4k 字
更新: 2026-02-28
阅读: 0 次
python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde
from mpl_toolkits.mplot3d import Axes3D数据加载
先利用 R 从 rds 中保存空间坐标和注释结果
读取基因表达矩阵
python
celltype_df_all1 = pd.read_csv("/PROJ2/FLOAT/weichendan/seekspace/SGS2401017/updata_allsamples/10Density/26samples_IGF2_marker.csv")python
celltype_df_all1| cell_id | sample_id | IGF2 | IGF1R | IGF2R | IGF2_marker | IGF1R_marker | IGF2R_marker | |
|---|---|---|---|---|---|---|---|---|
| 0 | AAGTACCATGTATGATGTAGTCATACG | WTH1051 | 0.000000 | 1.337767 | 0.000000 | non_IGF2 | IGF1R | non_IGF2R |
| 1 | AAGTTCGATGCGCATCCAAGGCACTAT | WTH1051 | 0.000000 | 0.000000 | 0.000000 | non_IGF2 | non_IGF1R | non_IGF2R |
| 2 | ACAAGCTATGGTCATCTGGACTGTGGA | WTH1051 | 0.000000 | 0.000000 | 0.000000 | non_IGF2 | non_IGF1R | non_IGF2R |
| 3 | ACCCTTGTACAAGTCATCTAAGTACGA | WTH1051 | 0.000000 | 0.000000 | 0.000000 | non_IGF2 | non_IGF1R | non_IGF2R |
| 4 | ACTACGATACCGAGTGCGCTCTGCTTC | WTH1051 | 0.000000 | 1.340361 | 0.000000 | non_IGF2 | IGF1R | non_IGF2R |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 394417 | GTATTTCGCTGTAGAATGCTCTGCTTC | Normal_5 | 0.000000 | 0.249231 | 0.000000 | non_IGF2 | IGF1R | non_IGF2R |
| 394418 | TGAGGTTGCTCGAGTTTGCTCTGCTTC | Normal_5 | 4.161886 | 0.000000 | 0.445782 | IGF2 | non_IGF1R | IGF2R |
| 394419 | ATTCCCGATGCAACTGATAGTCATACG | Normal_5 | 3.798390 | 0.000000 | 0.192101 | IGF2 | non_IGF1R | IGF2R |
| 394420 | TTGCATTATGCGCTTCGTTCTGCGCTC | Normal_5 | 3.657890 | 0.000000 | 0.119740 | IGF2 | non_IGF1R | IGF2R |
| 394421 | CAAGCTAGCTGAGAATTCCTAATAACG | Normal_5 | 3.714930 | 0.108064 | 0.108064 | IGF2 | IGF1R | IGF2R |
394422 rows × 8 columns
python
celltype_df_all = celltype_df_all1.drop(columns=celltype_df_all1.columns[5:8])python
celltype_df_all| cell_id | sample_id | IGF2 | IGF1R | IGF2R | |
|---|---|---|---|---|---|
| 0 | AAGTACCATGTATGATGTAGTCATACG | WTH1051 | 0.000000 | 1.337767 | 0.000000 |
| 1 | AAGTTCGATGCGCATCCAAGGCACTAT | WTH1051 | 0.000000 | 0.000000 | 0.000000 |
| 2 | ACAAGCTATGGTCATCTGGACTGTGGA | WTH1051 | 0.000000 | 0.000000 | 0.000000 |
| 3 | ACCCTTGTACAAGTCATCTAAGTACGA | WTH1051 | 0.000000 | 0.000000 | 0.000000 |
| 4 | ACTACGATACCGAGTGCGCTCTGCTTC | WTH1051 | 0.000000 | 1.340361 | 0.000000 |
| ... | ... | ... | ... | ... | ... |
| 394417 | GTATTTCGCTGTAGAATGCTCTGCTTC | Normal_5 | 0.000000 | 0.249231 | 0.000000 |
| 394418 | TGAGGTTGCTCGAGTTTGCTCTGCTTC | Normal_5 | 4.161886 | 0.000000 | 0.445782 |
| 394419 | ATTCCCGATGCAACTGATAGTCATACG | Normal_5 | 3.798390 | 0.000000 | 0.192101 |
| 394420 | TTGCATTATGCGCTTCGTTCTGCGCTC | Normal_5 | 3.657890 | 0.000000 | 0.119740 |
| 394421 | CAAGCTAGCTGAGAATTCCTAATAACG | Normal_5 | 3.714930 | 0.108064 | 0.108064 |
394422 rows × 5 columns
python
celltype_df_all = celltype_df_all.set_index('cell_id')提取单样本的基因表达情况
python
samples = celltype_df_all["sample_id"].unique()python
samplesoutput
array(['WTH1051', 'LGG_3', 'LGG_4', 'LGG_6_1', 'LGG_6_2', 'LGG_7_2',
'LGG_7_1', 'LGG_8', 'WTH973', 'WTH1029', 'WTH1030', 'ndGBM_4_2',
'ndGBM_5_1', 'ndGBM_5_2', 'ndGBM_6_2', 'ndGBM_6_3', 'ndGBM_5_3',
'ndGBM_7', 'ndGBM_8', 'nd_GBM_8', 'WTH1052', 'WTH1068', 'WTH974',
'rGBM_2', 'rGBM_3', 'Normal_5'], dtype=object)
'LGG_7_1', 'LGG_8', 'WTH973', 'WTH1029', 'WTH1030', 'ndGBM_4_2',
'ndGBM_5_1', 'ndGBM_5_2', 'ndGBM_6_2', 'ndGBM_6_3', 'ndGBM_5_3',
'ndGBM_7', 'ndGBM_8', 'nd_GBM_8', 'WTH1052', 'WTH1068', 'WTH974',
'rGBM_2', 'rGBM_3', 'Normal_5'], dtype=object)
python
sample = samples[13]
sampleoutput
'ndGBM_5_2'
python
celltype_df = celltype_df_all[celltype_df_all['sample_id'].str.contains(sample)]python
celltype_df| sample_id | IGF2 | IGF1R | IGF2R | |
|---|---|---|---|---|
| cell_id | ||||
| AGAGCCCCGATAGCTACCCTAATAACG | ndGBM_5_2 | 0.000000 | 0.000000 | 0.000000 |
| CACCGTTATGGGAAACGTCCTATTAGG | ndGBM_5_2 | 0.000000 | 0.000000 | 0.000000 |
| GCCATTCTACGCCTTTGCTAAGTACGA | ndGBM_5_2 | 0.000000 | 0.000000 | 0.000000 |
| TAAGTCGCGACATCCGGCTAAGTACGA | ndGBM_5_2 | 0.000000 | 0.000000 | 0.000000 |
| TACGTCCTACGAAGTGGGGACTGTGGA | ndGBM_5_2 | 0.000000 | 0.000000 | 0.000000 |
| ... | ... | ... | ... | ... |
| CGCGTGACGATGCAAGGAAGGCACTAT | ndGBM_5_2 | 0.020862 | 0.080957 | 0.225520 |
| GCCAGGTATGAGTCCGGGGACTGTGGA | ndGBM_5_2 | 0.020770 | 0.137069 | 0.273685 |
| TGGGTTAATGACGTCGAAAGGCACTAT | ndGBM_5_2 | 0.020676 | 0.060789 | 0.303916 |
| CCCGGAACGAGTCAGCCCCTAATAACG | ndGBM_5_2 | 0.000000 | 0.091217 | 0.323553 |
| TCAGGGCGCTTTCTATCAGCGACGGAT | ndGBM_5_2 | 0.000000 | 0.156162 | 0.262727 |
14055 rows × 4 columns
读取空间坐标
python
spatial_df = pd.read_csv(f"/PROJ2/FLOAT/weichendan/seekspace/SGS2401017/allsamples/00data/down_oss/{sample}/{sample}_barcode_centers.csv",index_col=0,sep=',')
spatial_df.drop(columns=['RNA_snn_res.0.8'], inplace=True)
spatial_df = spatial_df[spatial_df.index.isin(celltype_df.index)]python
spatial_df| x | y | |
|---|---|---|
| barcode | ||
| AAACCCAATGACAGCGTGCTCTGCTTC | 1110 | 9227 |
| AAACCCAATGACTACGATTCTGCGCTC | 11863 | 17525 |
| AAACCCAATGGGCTTCTAAGGCACTAT | 36326 | 10478 |
| AAACCCAATGTCAATCCGGACTGTGGA | 4020 | 17168 |
| AAACCCACGAAACGGTGGGACTGTGGA | 35176 | 2370 |
| ... | ... | ... |
| TTTGTTGATGACAGCTGTTCTGCGCTC | 34850 | 15295 |
| TTTGTTGGCTCGGAAGTTTCTGCGCTC | 2542 | 4716 |
| TTTGTTGTACAAACCATTTCTGCGCTC | 38155 | 7606 |
| TTTGTTGTACCGTGCTCTTCTGCGCTC | 25502 | 14087 |
| TTTGTTGTACGTTCTATTAGTCATACG | 48764 | 7264 |
14055 rows × 2 columns
合并数据
python
merged_df = pd.merge(celltype_df, spatial_df, left_index=True, right_index=True,how="inner")python
merged_df| sample_id | IGF2 | IGF1R | IGF2R | x | y | |
|---|---|---|---|---|---|---|
| AGAGCCCCGATAGCTACCCTAATAACG | ndGBM_5_2 | 0.000000 | 0.000000 | 0.000000 | 27913 | 15055 |
| CACCGTTATGGGAAACGTCCTATTAGG | ndGBM_5_2 | 0.000000 | 0.000000 | 0.000000 | 29920 | 4106 |
| GCCATTCTACGCCTTTGCTAAGTACGA | ndGBM_5_2 | 0.000000 | 0.000000 | 0.000000 | 34538 | 1969 |
| TAAGTCGCGACATCCGGCTAAGTACGA | ndGBM_5_2 | 0.000000 | 0.000000 | 0.000000 | 27680 | 10535 |
| TACGTCCTACGAAGTGGGGACTGTGGA | ndGBM_5_2 | 0.000000 | 0.000000 | 0.000000 | 43135 | 11168 |
| ... | ... | ... | ... | ... | ... | ... |
| CGCGTGACGATGCAAGGAAGGCACTAT | ndGBM_5_2 | 0.020862 | 0.080957 | 0.225520 | 33528 | 12544 |
| GCCAGGTATGAGTCCGGGGACTGTGGA | ndGBM_5_2 | 0.020770 | 0.137069 | 0.273685 | 40550 | 7503 |
| TGGGTTAATGACGTCGAAAGGCACTAT | ndGBM_5_2 | 0.020676 | 0.060789 | 0.303916 | 20952 | 13660 |
| CCCGGAACGAGTCAGCCCCTAATAACG | ndGBM_5_2 | 0.000000 | 0.091217 | 0.323553 | 41631 | 16713 |
| TCAGGGCGCTTTCTATCAGCGACGGAT | ndGBM_5_2 | 0.000000 | 0.156162 | 0.262727 | 34379 | 11688 |
14055 rows × 6 columns
绘制 3D 空间密度图
以细胞类型 AC 为例,绘制其 3D 空间密度图
python
for gene in ["IGF2","IGF1R","IGF2R"]:
# 提取坐标和基因表达值
xy = merged_df[['x', 'y']].values.T # 形状为 (2, N)
expression = merged_df[gene].values # 基因表达值
# 新增:判断数据点数和表达值有效性
if xy.shape[1] <= 1 or np.all(np.isnan(expression)) or np.all(expression == 0):
print(f"{sample} {gene} 数据点不足或表达值无效,跳过。")
continue
# 使用基因表达值作为权重进行核密度估计
# 注意:这里直接使用表达值作为 z 轴高度
kde_weights = gaussian_kde(xy, weights=expression, bw_method=0.2)
# 创建网格
xmin, xmax = merged_df.x.min(), merged_df.x.max()
ymin, ymax = merged_df.y.min(), merged_df.y.max()
xgrid = np.linspace(xmin, xmax, 200)
ygrid = np.linspace(ymin, ymax, 200)
X, Y = np.meshgrid(xgrid, ygrid)
grid_points = np.vstack([X.ravel(), Y.ravel()])
# 计算加权密度(即基因表达的空间分布)
Z = kde_weights(grid_points).reshape(X.shape)
# 创建画布
fig = plt.figure(figsize=(7, 5))
ax3d = fig.add_subplot(111, projection='3d')
# 设置长宽比
ax3d.set_box_aspect([15, 5, 5])
# 美化设置
ax3d.grid(False)
ax3d.set_facecolor('white')
ax3d.xaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
ax3d.yaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
ax3d.zaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
# 隐藏坐标轴
ax3d.set_xticks([])
ax3d.set_yticks([])
ax3d.set_zticks([])
ax3d.set_xticklabels([])
ax3d.set_yticklabels([])
ax3d.set_xlabel('')
ax3d.set_ylabel('')
ax3d.set_zlabel('')
# 绘制基因表达曲面
surf = ax3d.plot_surface(
X, Y, Z,
#cmap='viridis', # 使用热图色系表示表达强度
cmap='RdYlGn_r',
rstride=5, # 行采样步长
cstride=5, # 列采样步长
alpha=0.8, # 适当透明度
antialiased=True,
edgecolor='none',
linewidth=0.1
)
# 添加颜色条表示表达强度
cbar = fig.colorbar(surf, ax=ax3d, shrink=0.4, aspect=10, pad=0.05)
cbar.set_label(f'{gene} Expression', fontsize=10)
# 设置视角(从y轴方向俯视)
ax3d.view_init(elev=30, azim=-90)
# 调整布局并保存
plt.tight_layout()
plt.subplots_adjust(left=0, right=0.95, top=0.95, bottom=0.05)
#plt.savefig(f'{sample}_{gene}_expression3d.pdf')
plt.show()


绘制 3D 空间密度图的映射图
以细胞类型 AC 为例,绘制其 3D 空间密度图的映射图
python
for gene in ["IGF2","IGF1R","IGF2R"]:
# 提取坐标和基因表达值
xy = merged_df[['x', 'y']].values.T # 形状为 (2, N)
expression = merged_df[gene].values # 基因表达值
# 新增:判断数据点数和表达值有效性
if xy.shape[1] <= 1 or np.all(np.isnan(expression)) or np.all(expression == 0):
print(f"{sample} {gene} 数据点不足或表达值无效,跳过。")
continue
# 使用基因表达值作为权重进行核密度估计
# 注意:这里直接使用表达值作为 z 轴高度
kde_weights = gaussian_kde(xy, weights=expression, bw_method=0.2)
# 创建网格
xmin, xmax = merged_df.x.min(), merged_df.x.max()
ymin, ymax = merged_df.y.min(), merged_df.y.max()
xgrid = np.linspace(xmin, xmax, 200)
ygrid = np.linspace(ymin, ymax, 200)
X, Y = np.meshgrid(xgrid, ygrid)
grid_points = np.vstack([X.ravel(), Y.ravel()])
# 计算加权密度(即基因表达的空间分布)
Z = kde_weights(grid_points).reshape(X.shape)
# 创建画布
fig = plt.figure(figsize=(7, 5))
ax3d = fig.add_subplot(111, projection='3d')
# 设置长宽比
ax3d.set_box_aspect([15, 5, 0.01])
# 美化设置
ax3d.grid(False)
ax3d.set_facecolor('white')
ax3d.xaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
ax3d.yaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
ax3d.zaxis.set_pane_color((1.0, 1.0, 1.0, 1.0))
# 隐藏坐标轴
ax3d.set_xticks([])
ax3d.set_yticks([])
ax3d.set_zticks([])
ax3d.set_xticklabels([])
ax3d.set_yticklabels([])
ax3d.set_xlabel('')
ax3d.set_ylabel('')
ax3d.set_zlabel('')
# 绘制基因表达曲面
surf = ax3d.plot_surface(
X, Y, Z,
#cmap='viridis', # 使用热图色系表示表达强度
cmap='RdYlGn_r',
rstride=5, # 行采样步长
cstride=5, # 列采样步长
alpha=0.8, # 适当透明度
antialiased=True,
edgecolor='none',
linewidth=0.1
)
# 添加颜色条表示表达强度
cbar = fig.colorbar(surf, ax=ax3d, shrink=0.4, aspect=10, pad=0.05)
cbar.set_label(f'{gene} Expression', fontsize=10)
# 设置视角(从y轴方向俯视)
ax3d.view_init(elev=30, azim=-90)
# 调整布局并保存
plt.tight_layout()
plt.subplots_adjust(left=0, right=0.95, top=0.95, bottom=0.05)
# plt.savefig(f'{sample}_{gene}_expression2d.pdf')
plt.show()


绘制 2D 密度图
python
for gene in ["IGF2","IGF1R","IGF2R"]:
# 提取坐标和基因表达值
xy = merged_df[['x', 'y']].values.T # 形状为 (2, N)
expression = merged_df[gene].values # 基因表达值
# 新增:判断数据点数和表达值有效性
if xy.shape[1] <= 1 or np.all(np.isnan(expression)) or np.all(expression == 0):
print(f"{sample} {gene} 数据点不足或表达值无效,跳过。")
continue
# 使用基因表达值作为权重进行核密度估计
# 注意:这里直接使用表达值作为 z 轴高度
kde_weights = gaussian_kde(xy, weights=expression, bw_method=0.2)
# 创建网格
xmin, xmax = merged_df.x.min(), merged_df.x.max()
ymin, ymax = merged_df.y.min(), merged_df.y.max()
xgrid = np.linspace(xmin, xmax, 200)
ygrid = np.linspace(ymin, ymax, 200)
X, Y = np.meshgrid(xgrid, ygrid)
grid_points = np.vstack([X.ravel(), Y.ravel()])
# 计算加权密度(即基因表达的空间分布)
Z = kde_weights(grid_points).reshape(X.shape)
fig2d, ax2d = plt.subplots(figsize=(6, 4))
contour = ax2d.contourf(X, Y, Z, levels=20, cmap='RdYlGn_r')
# 坐标轴设置
ax2d.set(xlim=(xmin, xmax), ylim=(ymin, ymax), aspect='equal')
# 取消坐标轴刻度
ax2d.set_xticks([])
ax2d.set_yticks([])
# 取消坐标轴标签
ax2d.set_xlabel('')
ax2d.set_ylabel('')
plt.show()


python
