TF analysis for single-cell multimodal PBMC¶
[1]:
import os
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import muon as mu
from muon import atac as ac
from pyjaspar import jaspardb
import pychromvar as pc
[2]:
pc.__version__
[2]:
'0.0.2'
download data
[3]:
!wget https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5
--2023-01-28 05:38:05-- https://cf.10xgenomics.com/samples/cell-arc/2.0.0/pbmc_granulocyte_sorted_3k/pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5
Resolving cf.10xgenomics.com (cf.10xgenomics.com)... 104.18.0.173, 104.18.1.173, 2606:4700::6812:ad, ...
Connecting to cf.10xgenomics.com (cf.10xgenomics.com)|104.18.0.173|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 38844318 (37M) [binary/octet-stream]
Saving to: ‘pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5’
pbmc_granulocyte_so 100%[===================>] 37.04M 101MB/s in 0.4s
2023-01-28 05:38:05 (101 MB/s) - ‘pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5’ saved [38844318/38844318]
Loda data as Mudata object
[4]:
mdata = mu.read_10x_h5("pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5")
/home/rs619065/miniconda3/envs/r-4.1/lib/python3.8/site-packages/anndata/_core/anndata.py:1830: UserWarning: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
utils.warn_names_duplicates("var")
Added `interval` annotation for features from pbmc_granulocyte_sorted_3k_filtered_feature_bc_matrix.h5
/home/rs619065/miniconda3/envs/r-4.1/lib/python3.8/site-packages/mudata/_core/mudata.py:446: UserWarning: var_names are not unique. To make them unique, call `.var_names_make_unique`.
warnings.warn(
[5]:
mdata.var_names_make_unique()
Quality control of RNA-seq data¶
[6]:
# compute quality
mdata['rna'].var['mt'] = mdata['rna'].var_names.str.startswith('MT-')
sc.pp.calculate_qc_metrics(mdata['rna'], qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
# control quality
mu.pp.filter_var(mdata['rna'], 'n_cells_by_counts', lambda x: x >= 3)
mu.pp.filter_obs(mdata['rna'], 'n_genes_by_counts', lambda x: (x >= 200) & (x < 5000))
mu.pp.filter_obs(mdata['rna'], 'total_counts', lambda x: x < 15000)
mu.pp.filter_obs(mdata['rna'], 'pct_counts_mt', lambda x: x < 20)
Quality control of ATAC-seq data¶
[7]:
sc.pp.calculate_qc_metrics(mdata['atac'], percent_top=None, log1p=False, inplace=True)
mu.pp.filter_var(mdata['atac'], 'n_cells_by_counts', lambda x: x >= 50)
mu.pp.filter_obs(mdata['atac'], 'n_genes_by_counts', lambda x: (x >= 2000) & (x <= 15000))
mu.pp.filter_obs(mdata['atac'], 'total_counts', lambda x: (x >= 4000) & (x <= 40000))
only keep cells that pass the control for both modalities
[8]:
mu.pp.intersect_obs(mdata)
Data normalization and PCA for RNA¶
[9]:
mdata['rna'].layers["counts"] = mdata['rna'].X.copy()
sc.pp.normalize_total(mdata['rna'], target_sum=1e4)
sc.pp.log1p(mdata['rna'])
sc.pp.highly_variable_genes(mdata['rna'], min_mean=0.02, max_mean=4, min_disp=0.5)
sc.pp.scale(mdata['rna'], max_value=10)
sc.tl.pca(mdata['rna'], svd_solver='arpack')
Data normalization and LSI for ATAC¶
[10]:
mdata['atac'].layers["counts"] = mdata['atac'].X
ac.pp.tfidf(mdata['atac'], scale_factor=None)
ac.tl.lsi(mdata['atac'])
mdata['atac'].obsm['X_lsi'] = mdata['atac'].obsm['X_lsi'][:,1:]
mdata['atac'].varm["LSI"] = mdata['atac'].varm["LSI"][:,1:]
mdata['atac'].uns["lsi"]["stdev"] = mdata['atac'].uns["lsi"]["stdev"][1:]
Multi-modal data integration of RNA/ATAC with WNN analysis¶
[11]:
sc.pp.neighbors(mdata['rna'], n_neighbors=10, n_pcs=20)
sc.pp.neighbors(mdata['atac'], use_rep="X_lsi", n_neighbors=10, n_pcs=20)
mu.pp.neighbors(mdata, key_added='wnn')
/home/rs619065/miniconda3/envs/r-4.1/lib/python3.8/site-packages/pynndescent/pynndescent_.py:334: NumbaWarning: Cannot cache compiled function "generate_leaf_updates" as it uses dynamic globals (such as ctypes pointers and large global arrays)
init_rp_tree(data, dist, current_graph, leaf_array)
/home/rs619065/miniconda3/envs/r-4.1/lib/python3.8/site-packages/pynndescent/pynndescent_.py:334: NumbaWarning: Cannot cache compiled function "init_rp_tree" as it uses dynamic globals (such as ctypes pointers and large global arrays)
init_rp_tree(data, dist, current_graph, leaf_array)
/home/rs619065/miniconda3/envs/r-4.1/lib/python3.8/site-packages/pynndescent/pynndescent_.py:336: NumbaWarning: Cannot cache compiled function "init_random" as it uses dynamic globals (such as ctypes pointers and large global arrays)
init_random(n_neighbors, data, current_graph, dist, rng_state)
/home/rs619065/miniconda3/envs/r-4.1/lib/python3.8/site-packages/pynndescent/pynndescent_.py:346: NumbaWarning: Cannot cache compiled function "generate_graph_updates" as it uses dynamic globals (such as ctypes pointers and large global arrays)
nn_descent_internal_low_memory_parallel(
/home/rs619065/miniconda3/envs/r-4.1/lib/python3.8/site-packages/pynndescent/pynndescent_.py:346: NumbaWarning: Cannot cache compiled function "process_candidates" as it uses dynamic globals (such as ctypes pointers and large global arrays)
nn_descent_internal_low_memory_parallel(
We can visualize the cells in this integrated space
[12]:
mu.tl.umap(mdata, neighbors_key='wnn', random_state=10)
mdata.obsm["X_wnn_umap"] = mdata.obsm["X_umap"]
sc.tl.leiden(mdata, resolution=.3, neighbors_key='wnn', key_added='leiden_wnn')
mu.pl.umap(mdata, color=['leiden_wnn'], frameon=False, title="UMAP embedding", legend_loc="on data")
Add the clustering results to RNA and ATAC modality
[13]:
# add the clustering results to chromvar modality
mdata['rna'].obs['leiden_wnn'] = mdata.obs['leiden_wnn']
mdata['atac'].obs['leiden_wnn'] = mdata.obs['leiden_wnn']
Perform differential expression analysis
[14]:
sc.tl.rank_genes_groups(mdata['rna'], 'leiden_wnn', method='wilcoxon')
pd.DataFrame(mdata['rna'].uns['rank_genes_groups']['names']).head(10)
... storing 'feature_types' as categorical
... storing 'genome' as categorical
... storing 'interval' as categorical
/home/rs619065/miniconda3/envs/r-4.1/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:420: RuntimeWarning: invalid value encountered in log2
self.stats[group_name, 'logfoldchanges'] = np.log2(
[14]:
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DPYD | IL32 | FHIT | LEF1 | BANK1 | CCL5 | FCGR3A | GNLY | HLA-DPB1 | PTPRS |
| 1 | PLXDC2 | LTB | LEF1 | BACH2 | RALGPS2 | NKG7 | IFITM3 | PRF1 | HLA-DRA | TCF4 |
| 2 | NEAT1 | INPP4B | BCL11B | THEMIS | MS4A1 | GZMA | LST1 | NKG7 | HLA-DPA1 | LINC01374 |
| 3 | LRMDA | EEF1A1 | MALAT1 | NELL2 | AFF3 | CST7 | AIF1 | CD247 | CST3 | IRF8 |
| 4 | FCN1 | IL7R | CAMK4 | CD8B | PAX5 | CTSW | SERPINA1 | KLRD1 | HLA-DRB1 | ZFAT |
| 5 | JAK2 | ITGB1 | BACH2 | PDE3B | CD74 | IL32 | TCF7L2 | GZMA | CD74 | FCHSD2 |
| 6 | VCAN | TPT1 | INPP4B | OXNAD1 | CD79A | PRF1 | FCER1G | CTSW | FLT3 | LINC00996 |
| 7 | SLC8A1 | RPL41 | OXNAD1 | CD8A | LINC00926 | A2M | PSAP | CST7 | HLA-DRB5 | CUX2 |
| 8 | ARHGAP26 | TRAC | TCF7 | CCR7 | IGHM | GNLY | IFI30 | SPON2 | HLA-DQA1 | PLD4 |
| 9 | DENND1A | RPS18 | ANK3 | TXK | CD79B | HCST | CST3 | GZMB | GAPDH | CCDC50 |
Motif analysis with pychromVAR¶
We first download the genome sequence, here hg38 is used.
[15]:
pc.get_genome("hg38", output_dir="./")
Pre-processing data and perform motif matching
[16]:
mdata['atac'].X = mdata['atac'].layers["counts"]
pc.add_peak_seq(mdata, genome_file="./hg38.fa", delimiter=":|-")
pc.add_gc_bias(mdata)
pc.get_bg_peaks(mdata)
# get motifs
jdb_obj = jaspardb(release='JASPAR2020')
motifs = jdb_obj.fetch_motifs(
collection = 'CORE',
tax_group = ['vertebrates'])
pc.match_motif(mdata, motifs=motifs)
100%|██████████| 62871/62871 [00:00<00:00, 67751.41it/s]
100%|██████████| 62871/62871 [00:00<00:00, 86294.72it/s]
100%|██████████| 62871/62871 [00:47<00:00, 1320.46it/s]
[17]:
mdata
[17]:
MuData object with n_obs × n_vars = 2391 × 134920
obs: 'leiden_wnn'
var: 'gene_ids', 'feature_types', 'genome', 'interval'
obsm: 'X_umap', 'X_wnn_umap'
obsp: 'wnn_distances', 'wnn_connectivities'
2 modalities
rna: 2391 x 21256
obs: 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'leiden_wnn'
var: 'gene_ids', 'feature_types', 'genome', 'interval', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'mean', 'std'
uns: 'log1p', 'hvg', 'pca', 'neighbors', 'rank_genes_groups'
obsm: 'X_pca'
varm: 'PCs'
layers: 'counts'
obsp: 'distances', 'connectivities'
atac: 2391 x 62871
obs: 'n_genes_by_counts', 'total_counts', 'leiden_wnn'
var: 'gene_ids', 'feature_types', 'genome', 'interval', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'gc_bias'
uns: 'lsi', 'neighbors', 'peak_seq', 'motif_name'
obsm: 'X_lsi'
varm: 'LSI', 'bg_peaks', 'motif_match'
layers: 'counts'
obsp: 'distances', 'connectivities'[18]:
dev = pc.compute_deviations(mdata)
dev
2023-01-28 05:46:47 INFO computing expectation reads per cell and peak...
2023-01-28 05:46:47 INFO computing observed motif deviations...
2023-01-28 05:46:47 INFO computing background deviations...
[18]:
AnnData object with n_obs × n_vars = 2391 × 746
We can add it to mdata as another modality
[19]:
mdata.mod['chromvar'] = dev
mdata.mod['chromvar'].raw = dev
Then we can analysis and visualize the motif variability as if they are RNA-seq data
[20]:
# add the clustering results to chromvar modality
mdata['chromvar'].obs['leiden_wnn'] = mdata.obs['leiden_wnn']
We can perform differential analysis to identify marker motifs for each cluster
[21]:
sc.tl.rank_genes_groups(mdata['chromvar'], 'leiden_wnn', method='wilcoxon', use_raw=True)
/home/rs619065/miniconda3/envs/r-4.1/lib/python3.8/site-packages/scanpy/tools/_rank_genes_groups.py:420: RuntimeWarning: invalid value encountered in log2
self.stats[group_name, 'logfoldchanges'] = np.log2(
Print the top10 motifs per cluster:
[22]:
pd.DataFrame(mdata['chromvar'].uns['rank_genes_groups']['names']).head(10)
[22]:
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | MA0501.1.MAF::NFE2 | MA0769.2.TCF7 | MA0523.1.TCF7L2 | MA0523.1.TCF7L2 | MA0627.2.POU2F3 | MA0800.1.EOMES | MA1111.1.NR2F2 | MA0800.1.EOMES | MA0081.2.SPIB | MA1100.2.ASCL1 |
| 1 | MA0150.2.Nfe2l2 | MA1513.1.KLF15 | MA0768.1.LEF1 | MA0768.1.LEF1 | MA1115.1.POU5F1 | MA0690.1.TBX21 | MA0687.1.SPIC | MA0688.1.TBX2 | MA0080.5.SPI1 | MA1635.1.BHLHE22(var.2) |
| 2 | MA0089.2.NFE2L1 | MA0139.1.CTCF | MA0769.2.TCF7 | MA0769.2.TCF7 | MA0792.1.POU5F1B | MA0688.1.TBX2 | MA0598.3.EHF | MA0690.1.TBX21 | MA1508.1.IKZF1 | MA0500.2.MYOG |
| 3 | MA0833.2.ATF4 | MA0511.2.RUNX2 | MA0632.2.TCFL5 | MA0006.1.Ahr::Arnt | MA0507.1.POU2F2 | MA0802.1.TBR1 | MA0859.1.Rarg | MA0802.1.TBR1 | MA1652.1.ZKSCAN5 | MA0816.1.Ascl2 |
| 4 | MA0838.1.CEBPG | MA0685.1.SP4 | MA1421.1.TCF7L1 | MA1421.1.TCF7L1 | MA0785.1.POU2F1 | MA0803.1.TBX15 | MA0017.2.NR2F1 | MA0803.1.TBX15 | MA0598.3.EHF | MA1467.1.ATOH1(var.2) |
| 5 | MA0836.2.CEBPD | MA0481.3.FOXP1 | MA0006.1.Ahr::Arnt | MA0442.2.SOX10 | MA0789.1.POU3F4 | MA1567.1.TBX6 | MA0136.2.ELF5 | MA0689.1.TBX20 | MA0640.2.ELF3 | MA0832.1.Tcf21 |
| 6 | MA1633.1.BACH1 | MA1683.1.FOXA3 | MA1650.1.ZBTB14 | MA0103.3.ZEB1 | MA0784.1.POU1F1 | MA0689.1.TBX20 | MA1508.1.IKZF1 | MA1567.1.TBX6 | MA0687.1.SPIC | MA0048.2.NHLH1 |
| 7 | MA0102.4.CEBPA | MA0768.1.LEF1 | MA0732.1.EGR3 | MA0139.1.CTCF | MA0787.1.POU3F2 | MA1566.1.TBX3 | MA0160.1.NR4A2 | MA0805.1.TBX1 | MA0645.1.ETV6 | MA1472.1.BHLHA15(var.2) |
| 8 | MA1636.1.CEBPG(var.2) | MA0042.2.FOXI1 | MA0162.4.EGR1 | MA0162.4.EGR1 | MA0786.1.POU3F1 | MA0805.1.TBX1 | MA0080.5.SPI1 | MA0801.1.MGA | MA0517.1.STAT1::STAT2 | MA0698.1.ZBTB18 |
| 9 | MA0496.3.MAFK | MA0850.1.FOXP3 | MA1102.2.CTCFL | MA1102.2.CTCFL | MA0788.1.POU3F3 | MA0801.1.MGA | MA0081.2.SPIB | MA1566.1.TBX3 | MA0473.3.ELF1 | MA0521.1.Tcf12 |
We can visualize the motif activity and gene expression at the same time.
[23]:
mu.pl.umap(mdata, color=['MA0769.2.TCF7', 'TCF7', "IL7R"], frameon=False, vmin='p1', vmax='p99')
[ ]: