import scanpy as sc
import anndata as ad
import numpy as np
import pandas as pd
import episcanpy.api as epi
sc.settings.set_figure_params(dpi=300, color_map='gist_earth')


adata = sc.read('./data/heart_lv_SM-JF1NY.h5ad')
adata

AnnData object with n_obs × n_vars = 9110 × 305315
    obs: 'n_fragment', 'frac_dup', 'frac_mito', 'sample', 'cell_type', 'organ', 'tissue', 'batch'
    var: 'peaks'
    uns: 'peaks', 'peaks_new', 'reference_sequences'
    obsm: 'fragment_paired'


print(adata.obs['cell_type'].value_counts())

cell_type
Ventricular Cardiomyocyte        2813
Cardiac Fibroblasts              2230
Endothelial Cell (Myocardial)    1351
Macrophage (General)             1043
Cardiac Pericyte 1                886
Cardiac Pericyte 2                225
Schwann Cell (General)            152
T Lymphocyte 1 (CD8+)              93
Endothelial Cell (General) 1       90
Natural Killer T Cell              63
Endothelial Cell (General) 2       51
Macrophage (General,Alveolar)      25
Fibroblast (General)               18
Pericyte (General) 2               15
Pericyte (General) 1                9
Adipocyte                           8
Smooth Muscle (General)             7
Peripheral Nerve Stromal            6
Cardiac Pericyte 4                  5
Pericyte (General) 3                4
Cardiac Pericyte 3                  4
Lymphatic Endothelial Cell          3
Pericyte (General) 4                3
Vascular Smooth Muscle 2            2
Naive T cell                        1
Atrial Cardiomyocyte                1
T lymphocyte 2 (CD4+)               1
Mast Cell                           1
Name: count, dtype: int64


threshold = 100
num = adata.obs['cell_type'].value_counts()
idx = np.where(num>threshold)[0]
types_kept = num.index[idx]
adata = adata[adata.obs['cell_type'].isin(types_kept)]
adata

View of AnnData object with n_obs × n_vars = 8700 × 305315
    obs: 'n_fragment', 'frac_dup', 'frac_mito', 'sample', 'cell_type', 'organ', 'tissue', 'batch'
    var: 'peaks'
    uns: 'peaks', 'peaks_new', 'reference_sequences'
    obsm: 'fragment_paired'


epi.pp.filter_cells(adata, min_features=1)
epi.pp.filter_features(adata, min_cells=1)
epi.pl.violin(adata, ['nb_features'])

 ImplicitModificationWarning:/home/jiangqun/anaconda3/envs/episcanpy/lib/python3.8/site-packages/scanpy/preprocessing/_simple.py:139: Trying to modify attribute `.obs` of view, initializing view as actual.
 FutureWarning:/home/jiangqun/anaconda3/envs/episcanpy/lib/python3.8/site-packages/scanpy/plotting/_anndata.py:839: 

The `scale` parameter has been renamed and will be removed in v0.15.0. Pass `density_norm='width'` for the same effect.


nb_feature_selected = 50000
adata = epi.pp.select_var_feature(adata,
                                  nb_features=nb_feature_selected,
                                  show=False,
                                  copy=True)
adata

View of AnnData object with n_obs × n_vars = 8700 × 50043
    obs: 'n_fragment', 'frac_dup', 'frac_mito', 'sample', 'cell_type', 'organ', 'tissue', 'batch', 'nb_features'
    var: 'peaks', 'n_cells', 'prop_shared_cells', 'variability_score'
    uns: 'peaks', 'peaks_new', 'reference_sequences'
    obsm: 'fragment_paired'


import scipy
def tfidf2(count_mat): 
    tf_mat = 1.0 * count_mat / np.tile(np.sum(count_mat,axis=0), (count_mat.shape[0],1))
    signac_mat = np.log(1 + np.multiply(1e4*tf_mat,  np.tile((1.0 * count_mat.shape[1] / np.sum(count_mat,axis=1)).reshape(-1,1), (1,count_mat.shape[1]))))
    return scipy.sparse.csr_matrix(signac_mat)


adata = sc.AnnData(adata.X, obs=adata.obs, var=adata.var, uns=adata.uns, dtype='float32')
adata.X = tfidf2(adata.X)
epi.pp.pca(adata, n_comps=10)
epi.pp.neighbors(adata)
epi.tl.umap(adata)
sc.pl.umap(adata, color=['nb_features', 'cell_type'], wspace=0.4)

 TqdmWarning:/home/jiangqun/anaconda3/envs/episcanpy/lib/python3.8/site-packages/tqdm/auto.py:21: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

WARNING: .obsp["connectivities"] have not been computed using umap

 UserWarning:/home/jiangqun/anaconda3/envs/episcanpy/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored


epi.tl.rank_features(adata, 'cell_type', omic='ATAC', use_raw=False)
epi.pl.rank_feat_groups(adata)


epi.pl.rank_feat_groups_matrixplot(adata)

WARNING: dendrogram data not found (using key=dendrogram_cell_type). Running `sc.tl.dendrogram` with default parameters. For fine tuning it is recommended to run `sc.tl.dendrogram` independently.


var_list = []
for i in adata.var_names:
    var_list.append(i.replace(":", "_").replace("-","_"))
adata.var_names = var_list


adata_gene = epi.tl.geneactivity(adata, 
                        gtf_file='./annotation/gencode.v38.annotation.gtf', 
                        key_added='gene', 
                        upstream=2000, 
                        feature_type='gene', 
                        annotation='HAVANA', 
                        layer_name='geneactivity', 
                        raw=False, 
                        copy=True)

 UserWarning:/home/jiangqun/anaconda3/envs/episcanpy/lib/python3.8/site-packages/anndata/_core/anndata.py:1840: Variable names are not unique. To make them unique, call `.var_names_make_unique`.


adata_gene.write("./data/genes/HGCA_lv_SM-JF1NY.h5ad")


sc.pp.normalize_total(adata_gene, target_sum=1e5)
sc.pp.log1p(adata_gene)
sc.pp.highly_variable_genes(adata_gene, mn_top_genes=5000)
sc.tl.pca(adata_gene, svd_solver='arpack')
sc.pp.neighbors(adata_gene)
adata_gene

AnnData object with n_obs × n_vars = 8700 × 18172
    obs: 'n_fragment', 'frac_dup', 'frac_mito', 'sample', 'cell_type', 'organ', 'tissue', 'batch', 'nb_features'
    var: 'gene_id', 'transcript_id', 'gene_type', 'gene_name', 'transcript_type', 'transcript_name', 'protein_id', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'peaks', 'peaks_new', 'reference_sequences', 'pca', 'neighbors', 'umap', 'cell_type_colors', 'rank_features_groups', 'dendrogram_cell_type', 'log1p', 'hvg'
    obsm: 'X_pca', 'X_umap'
    varm: 'PCs'
    obsp: 'distances', 'connectivities'


sc.tl.rank_genes_groups(adata_gene, groupby='cell_type', method='wilcoxon')
sc.pl.rank_genes_groups(adata_gene, n_genes=25, sharey=False)


sc.pl.umap(adata_gene, color=['cell_type', 'TBX18'],wspace=0.8)

 UserWarning:/home/jiangqun/anaconda3/envs/episcanpy/lib/python3.8/site-packages/scanpy/plotting/_tools/scatterplots.py:394: No data for colormapping provided via 'c'. Parameters 'cmap' will be ignored

Load the data¶

View the distribution of cell types¶

Filtered cells by number¶

Quality controls¶

remove any potential empty features or barcodes¶

create a new AnnData containing only the most variable features¶

Visualisation¶

IF-IDF function¶

Umap plot¶

Differential chromatin analysis¶

Load Gene Annotation¶

Change the format of peak names¶

Get gene activity from atac¶

Preprocess gene data¶

Differential expression gene analysis¶