-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcell2location.R
244 lines (195 loc) · 10.2 KB
/
cell2location.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
##单细胞和空转联合分析
##cell2location
#线粒体编码基因与空间定位无关,去除
# find mitochondria-encoded (MT) genes
adata_spatial.var['MT_gene'] = [gene.startswith('mt-') for gene in adata_spatial.var.index]
# remove MT genes for spatial mapping (keeping their counts in the object)
adata_spatial.obsm['MT'] = adata_spatial[:, adata_spatial.var['MT_gene'].values].X.toarray()
adata_spatial = adata_spatial[:, ~adata_spatial.var['MT_gene'].values]
adata_spatial
##目录操作
import os
os.getcwd()
os.chdir("/mnt/sda/jwz/data/TTS/spatial/cell2location")
#2.1加载单细胞数据
adata_sc = sc.read_text("/mnt/sda/jwz/data/TTS/spatial/scATAC-seq/signac/new/integrated_counts.csv",delimiter='\t',first_column_names=True)
adata_sc.X = sp.csr_matrix(adata_sc.X)
metadata = pd.read_csv("/mnt/sda/jwz/data/TTS/spatial/scATAC-seq/signac/new/integrated_metadata.csv",sep='\t', header=0, index_col=0)
adata_sc.obs = metadata
r_embedding = pd.read_csv("/mnt/sda/jwz/data/TTS/spatial/scATAC-seq/signac/new/integrated_embedding.csv",sep='\t', header=0, index_col=0)
adata_sc.obsm["X_umap"] = r_embedding.values
adata_sc.write("adata_sc.h5ad")
#读入
adata_sc = sc.read_h5ad('adata_sc.h5ad')
##umap绘图
sc.pl.umap(adata_sc, color='celltype',save="adata_sc-umap.pdf")
#2.3 基因筛选
from cell2location.utils.filtering import filter_genes
selected = filter_genes(adata_sc, cell_count_cutoff=5, cell_percentage_cutoff2=0.03, nonz_mean_cutoff=1.12)
plt.savefig("adata_sc_filter_gene.pdf")
# filter the object
adata_sc = adata_sc[:, selected].copy()
#2.4 参考细胞类型特征估计(NB回归)
# prepare anndata for the regression model
cell2location.models.RegressionModel.setup_anndata(adata=adata_sc,
batch_key='dataset', # 10X reaction / sample / batch
labels_key='celltype' # cell type, covariate used for constructing signatures
#categorical_covariate_keys=['Method'] # multiplicative technical effects (platform, 3' vs 5', donor effect)
)
# create the regression model
from cell2location.models import RegressionModel
mod = RegressionModel(adata_sc)
# view anndata_setup as a sanity check
mod.view_anndata_setup()
#训练模型以估计参考细胞类型的特征。
mod.train(max_epochs=250)
#确定模型是否需要更多训练。
#在这里,我们绘制训练过程中的 ELBO 损失历史,从图中去掉前 20 个 epochs。这个图应该呈下降趋势,并在训练结束时趋于平稳。如果仍在下降,请增加 max_epochs。
#没有曲线,不知何故?
mod.plot_history(20)
plt.savefig("mod.plot_history.pdf")
# In this section, we export the estimated cell abundance (summary of the posterior distribution).
adata_sc = mod.export_posterior(
adata_sc, sample_kwargs={'num_samples': 1000, 'batch_size': 2500}
)
#可选用户可以直接计算后验分布的5%、50%和95%分位数,而不是使用1000个样本(或其他任何分位数)从分布中抽取。这样可以加快在大型数据集上的应用并减少内存需求,但无法用这种方式计算后验均值和标准差。
adata_sc = mod.export_posterior(
adata_sc, use_quantiles=True,
# choose quantiles
add_to_varm=["q05","q50", "q95", "q0001"],
sample_kwargs={'batch_size': 2500, 'use_gpu': True}
)
##QC plot
mod.plot_QC()
plt.savefig("mod.plot_QC1.pdf")
# Save model
mod.save("./mod.train5000/", overwrite=True)
# Save anndata object with results
adata_sc.write("./sc_cell2location.h5ad")
#模型和输出的h5ad,可以像这样加载:
adata_sc = sc.read_h5ad("./sc_cell2location.h5ad")
mod = cell2location.models.RegressionModel.load("./mod.train5000/", adata_sc)
# export estimated expression in each cluster
if 'means_per_cluster_mu_fg' in adata_sc.varm.keys():
inf_aver = adata_sc.varm['means_per_cluster_mu_fg'][[f'means_per_cluster_mu_fg_{i}'
for i in adata_sc.uns['mod']['factor_names']]].copy()
else:
inf_aver = adata_sc.var[[f'means_per_cluster_mu_fg_{i}'
for i in adata_sc.uns['mod']['factor_names']]].copy()
##good
inf_aver = adata_sc.varm['means_per_cluster_mu_fg'][[f'means_per_cluster_mu_fg_{i}'
for i in adata_sc.uns['mod']['factor_names']]].copy()
inf_aver.columns = adata_sc.uns['mod']['factor_names']
inf_aver.iloc[0:5, 0:5]
#后续Spatial Mapping仅需要inf_aver文件作为输入,在此保存和读取
inf_aver.to_csv('inf_aver.csv',index=True)
inf_aver = pd.read_csv('inf_aver.csv',index_col=0)
#Cell2location: spatial mapping
# find shared genes and subset both anndata and reference signatures
intersect = np.intersect1d(adata_spatial.var_names, inf_aver.index)
adata_spatial = adata_spatial[:, intersect].copy()
inf_aver = inf_aver.loc[intersect, :].copy()
# prepare anndata for cell2location model,need unnormalized count data
cell2location.models.Cell2location.setup_anndata(adata=adata_spatial, batch_key="library_id")
# create and train the model
mod = cell2location.models.Cell2location(
adata_spatial, cell_state_df=inf_aver,
# the expected average cell abundance: tissue-dependent
# hyper-prior which can be estimated from paired histology:
N_cells_per_location=30,
# hyperparameter controlling normalisation of
# within-experiment variation in RNA detection:
detection_alpha=20
)
mod.view_anndata_setup()
#Training cell2location
mod.train(max_epochs=30000,
# train using full data (batch_size=None)
batch_size=None,
# use all data points in training because
# we need to estimate cell abundance at all locations
train_size=1,
#use_gpu=True,
)
# plot ELBO loss history during training, removing first 100 epochs from the plot
mod.plot_history(1000)
plt.savefig("full_data_training.pdf")
plt.legend(labels=['full data training'])
#Exporting estimated posterior distributions of cell abundance and saving results
# In this section, we export the estimated cell abundance (summary of the posterior distribution).
adata_spatial = mod.export_posterior(
adata_spatial, sample_kwargs={'num_samples': 1000, 'batch_size': mod.adata.n_obs, 'use_gpu': True}
)
# Save model
mod.save("./spatial_cell2location", overwrite=True)
##QC
mod.plot_QC()
plt.savefig("cell2location_qc.pdf")
#导出预测的细胞丰度结果为csv文件:作者建议使用后验分布的5%分位数的结果,表示模型具有高置信度的细胞丰度值(即“至少存在这个数量”)。结果在adata_st.obsm下
pd.DataFrame(adata_spatial.obsm['q05_cell_abundance_w_sf']).to_csv("st_cell2location_ratio.csv")
pd.DataFrame(adata_spatial.obsm['q05_cell_abundance_w_sf']).to_csv("st_cell2location_ratio.txt",sep="\t")
# Save anndata object with results
adata_spatial.write("adata_spatial_cell2location.h5ad")
#The model and output h5ad can be loaded later like this:
adata_spatial= sc.read_h5ad("adata_spatial_cell2location.h5ad")
mod = cell2location.models.Cell2location.load("./spatial_cell2location", adata_spatial)
##Visualising cell abundance in spatial coordinates
# add 5% quantile, representing confident cell abundance, 'at least this amount is present',
# to adata.obs with nice names for plotting
adata_spatial.obs[adata_spatial.uns['mod']['factor_names']] = adata_spatial.obsm['q05_cell_abundance_w_sf']
# select one slide
from cell2location.utils import select_slide
adata_spatial.obs['sample']
slide = select_slide(adata_spatial, 'ISO3d-2') ##根据sample name选择
# plot in spatial coordinates
sc.pl.spatial(slide, cmap='viridis',
# show first 8 cell types
color=['Monocytes', 'Macrophages', 'Cardiomyocytes', 'Arterial endothelial cells'],
ncols=4, size=1.3,
img_key='hires',
# limit color scale at 99.2% quantile of cell abundance
vmin=0, vmax='p99.2'
)
sc.pl.spatial(slide, cmap='viridis',
# selected cell types
color=['Adipocytes', 'Arterial endothelial cells', 'Astrocytes', 'B cells', 'Cardiomyocytes', 'Epithelial cell', 'Fibroblasts', 'Macrophages', 'Monocytes', 'Neutrophils', 'Pericytes', 'T cells', 'Venous endothelial cells'],
ncols=5, size=1.3,
img_key='hires',
# limit color scale at 99.2% quantile of cell abundance
vmin=0, vmax='p99.2',
show=True
)
plt.savefig("ISO3d-2-cell2location.pdf")
##画单个细胞的分布随时间变化
fig, axes = plt.subplots(1, 8, figsize=(width, 3))
for x,y in zip(axes, sorted(adata_spatial.uns['spatial'].keys() )):
sc.pl.spatial(adata_spatial[adata_spatial.obs['sample']==y], frameon=False,color="Cardiomyocytes",size=1.3, library_id=y, title=y,ax=x,show=False, legend_loc="right margin",vmax=25)
plt.savefig('Cardiomyocytes-spatial.pdf')
##用Leiden聚类识别离散组织区域
##通过使用由cell2location估计的细胞丰度对位置进行聚类,我们识别在细胞组成方面存在差异的组织区域。
# compute KNN using the cell2location output stored in adata.obsm
sc.pp.neighbors(adata_spatial, use_rep='q05_cell_abundance_w_sf',
n_neighbors = 10)
# Cluster spots into regions using scanpy
sc.tl.leiden(adata_spatial, resolution=0.3)
# add region as categorical variable
adata_spatial.obs["region_cluster"] = adata_spatial.obs["leiden"].astype("category")
## compute UMAP using KNN graph based on the cell2location output
sc.tl.umap(adata_spatial, min_dist = 0.3, spread = 1)
# show regions in UMAP coordinates
with mpl.rc_context({'axes.facecolor': 'white',
'figure.figsize': [8, 8]}):
sc.pl.umap(adata_spatial, color=['region_cluster'], size=30,
ncols = 2, legend_loc='on data',
legend_fontsize=15)
plt.savefig("cell2location_region_cluster.pdf")
with mpl.rc_context({'axes.facecolor': 'white',
'figure.figsize': [8, 8]}):
sc.pl.umap(adata_spatial, color=['sample'], size=30,
ncols = 2)
plt.savefig("cell2location_sample.pdf")
# plot in spatial coordinates
with mpl.rc_context({'axes.facecolor': 'black',
'figure.figsize': [4.5, 5]}):
sc.pl.spatial(adata_spatial, color=['region_cluster'],
size=1.3, img_key='hires', alpha=0.5)