In [1]:
import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import episcanpy.api as epi
sc.settings.set_figure_params(dpi=300, color_map='gist_earth')

Load the data¶

In [2]:
adata = sc.read('./data/heart_lv_SM-JF1NY.h5ad')
adata
Out[2]:
AnnData object with n_obs × n_vars = 9110 × 305315
    obs: 'n_fragment', 'frac_dup', 'frac_mito', 'sample', 'cell_type', 'organ', 'tissue', 'batch'
    var: 'peaks'
    uns: 'peaks', 'peaks_new', 'reference_sequences'
    obsm: 'fragment_paired'
View the distribution of cell types¶
In [3]:
print(adata.obs['cell_type'].value_counts())
cell_type
Ventricular Cardiomyocyte        2813
Cardiac Fibroblasts              2230
Endothelial Cell (Myocardial)    1351
Macrophage (General)             1043
Cardiac Pericyte 1                886
Cardiac Pericyte 2                225
Schwann Cell (General)            152
T Lymphocyte 1 (CD8+)              93
Endothelial Cell (General) 1       90
Natural Killer T Cell              63
Endothelial Cell (General) 2       51
Macrophage (General,Alveolar)      25
Fibroblast (General)               18
Pericyte (General) 2               15
Pericyte (General) 1                9
Adipocyte                           8
Smooth Muscle (General)             7
Peripheral Nerve Stromal            6
Cardiac Pericyte 4                  5
Pericyte (General) 3                4
Cardiac Pericyte 3                  4
Lymphatic Endothelial Cell          3
Pericyte (General) 4                3
Vascular Smooth Muscle 2            2
Naive T cell                        1
Atrial Cardiomyocyte                1
T lymphocyte 2 (CD4+)               1
Mast Cell                           1
Name: count, dtype: int64
Filtered cells by number¶
In [4]:
threshold = 100
num = adata.obs['cell_type'].value_counts()
idx = np.where(num>threshold)[0]
types_kept = num.index[idx]
adata = adata[adata.obs['cell_type'].isin(types_kept)]
adata
Out[4]:
View of AnnData object with n_obs × n_vars = 8700 × 305315
    obs: 'n_fragment', 'frac_dup', 'frac_mito', 'sample', 'cell_type', 'organ', 'tissue', 'batch'
    var: 'peaks'
    uns: 'peaks', 'peaks_new', 'reference_sequences'
    obsm: 'fragment_paired'

Quality controls¶

remove any potential empty features or barcodes¶
In [5]:
epi.pp.filter_cells(adata, min_features=1)
epi.pp.filter_features(adata, min_cells=1)
epi.pl.violin(adata, ['nb_features'])
 ImplicitModificationWarning:/home/jiangqun/anaconda3/envs/episcanpy/lib/python3.8/site-packages/scanpy/preprocessing/_simple.py:139: Trying to modify attribute `.obs` of view, initializing view as actual.
 FutureWarning:/home/jiangqun/anaconda3/envs/episcanpy/lib/python3.8/site-packages/scanpy/plotting/_anndata.py:839: 

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.
create a new AnnData containing only the most variable features¶
In [6]:
nb_feature_selected = 50000
adata = epi.pp.select_var_feature(adata,
                                  nb_features=nb_feature_selected,
                                  show=False,
                                  copy=True)
adata
Out[6]:
View of AnnData object with n_obs × n_vars = 8700 × 50043
    obs: 'n_fragment', 'frac_dup', 'frac_mito', 'sample', 'cell_type', 'organ', 'tissue', 'batch', 'nb_features'
    var: 'peaks', 'n_cells', 'prop_shared_cells', 'variability_score'
    uns: 'peaks', 'peaks_new', 'reference_sequences'
    obsm: 'fragment_paired'

Visualisation¶

IF-IDF function¶
In [7]:
import scipy
def tfidf2(count_mat): 
    tf_mat = 1.0 * count_mat / np.tile(np.sum(count_mat,axis=0), (count_mat.shape[0],1))
    signac_mat = np.log(1 + np.multiply(1e4*tf_mat,  np.tile((1.0 * count_mat.shape[1] / np.sum(count_mat,axis=1)).reshape(-1,1), (1,count_mat.shape[1]))))
    return scipy.sparse.csr_matrix(signac_mat)
Umap plot¶
In [9]:
adata = sc.AnnData(adata.X, obs=adata.obs, var=adata.var, uns=adata.uns, dtype='float32')
adata.X = tfidf2(adata.X)
epi.pp.pca(adata, n_comps=10)
epi.pp.neighbors(adata)
epi.tl.umap(adata)
sc.pl.umap(adata, color=['nb_features', 'cell_type'], wspace=0.4)
 TqdmWarning:/home/jiangqun/anaconda3/envs/episcanpy/lib/python3.8/site-packages/tqdm/auto.py:21: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
WARNING: .obsp["connectivities"] have not been computed using umap
 UserWarning:/home/jiangqun/anaconda3/envs/episcanpy/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored

Differential chromatin analysis¶

In [10]:
epi.tl.rank_features(adata, 'cell_type', omic='ATAC', use_raw=False)
epi.pl.rank_feat_groups(adata)
In [11]:
epi.pl.rank_feat_groups_matrixplot(adata)
WARNING: dendrogram data not found (using key=dendrogram_cell_type). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.

Load Gene Annotation¶

Change the format of peak names¶
In [12]:
var_list = []
for i in adata.var_names:
    var_list.append(i.replace(":", "_").replace("-","_"))
adata.var_names = var_list
Get gene activity from atac¶
In [15]:
adata_gene = epi.tl.geneactivity(adata, 
                        gtf_file='./annotation/gencode.v38.annotation.gtf', 
                        key_added='gene', 
                        upstream=2000, 
                        feature_type='gene', 
                        annotation='HAVANA', 
                        layer_name='geneactivity', 
                        raw=False, 
                        copy=True)
 UserWarning:/home/jiangqun/anaconda3/envs/episcanpy/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: Variable names are not unique. To make them unique, call `.var_names_make_unique`.
In [18]:
adata_gene.write("./data/genes/HGCA_lv_SM-JF1NY.h5ad")
Preprocess gene data¶
In [19]:
sc.pp.normalize_total(adata_gene, target_sum=1e5)
sc.pp.log1p(adata_gene)
sc.pp.highly_variable_genes(adata_gene, mn_top_genes=5000)
sc.tl.pca(adata_gene, svd_solver='arpack')
sc.pp.neighbors(adata_gene)
adata_gene
Out[19]:
AnnData object with n_obs × n_vars = 8700 × 18172
    obs: 'n_fragment', 'frac_dup', 'frac_mito', 'sample', 'cell_type', 'organ', 'tissue', 'batch', 'nb_features'
    var: 'gene_id', 'transcript_id', 'gene_type', 'gene_name', 'transcript_type', 'transcript_name', 'protein_id', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'peaks', 'peaks_new', 'reference_sequences', 'pca', 'neighbors', 'umap', 'cell_type_colors', 'rank_features_groups', 'dendrogram_cell_type', 'log1p', 'hvg'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'
Differential expression gene analysis¶
In [21]:
sc.tl.rank_genes_groups(adata_gene, groupby='cell_type', method='wilcoxon')
sc.pl.rank_genes_groups(adata_gene, n_genes=25, sharey=False)
In [27]:
sc.pl.umap(adata_gene, color=['cell_type', 'TBX18'],wspace=0.8)
 UserWarning:/home/jiangqun/anaconda3/envs/episcanpy/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored