The example analysis is implemented by scEpiEnsemble.
The example data consist in ~3000 cells of human PBMCs. Cells were FAC sorted thus providing a ground truth for cell type identification. The data was produces by Buenrostro et al. 2018.
The data can be downloaded from here.

1 Initialization

First import EpiScanpy, Scanpy, Anndata, and some other packages we will be using for analyzing PBMCs.

import os
import numpy as np
import pandas as pd
import scipy
from sklearn import metrics

import anndata as ad
import scanpy as sc
import episcanpy.api as epi

import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import sys

import warnings
warnings.filterwarnings("ignore")

sc.settings.set_figure_params(dpi=80, color_map='gist_earth')

2 Load the data

The first step of analysis is to load the cell-by-feature matrix as an AnnData object. The matrix is analogous to the gene expression count matrix used to analyze single-cell RNA-seq. However, instead of genes, each row of the matrix represents a region of the genome (a peak/windows), that is predicted to represent a region of open chromatin. ScEpiTools will provide a task-specific ID after uploading the file.

Internal parameters:
task_id - The only identifier of each task.

User-defined parameters:
Input_file - The name of the input file you want to load. We. Default: ‘Example.h5ad’.
Cell_type_key - The key indicating the cell type of each cell stored in observation data frame. If cell type annotation is not provided, input ‘None’ for this parameter. Default: ‘cell_type’.

task_id = 'Example'

inputpath = "./%s/" % task_id
outputpath = "./%s/" % task_id
path_anno = "./%s/scEpiEnsemble_data/" % task_id

We start by creating a Anndata object. scEpiEnsemble uses a .h5ad-format file as input.

File_type = 'h5ad'
Input_file = task_id + ".h5ad"
Cell_type_key = "cell_type"

def load_data(Input_file, File_type):
    if File_type == 'h5ad':
        adata = ad.read(inputpath + Input_file)
    elif File_type == '10x-genomics':
        adata = sc.read_10x_h5(inputpath + Input_file, gex_only=False)
    elif File_type == 'seurat':
        os.system(f'/home/scepitools/miniconda3/envs/signac/bin/Rscript /var/www/html/scepitools/pipelines/Seurat_to_h5ad.R {task_id} {Input_file}')
        adata = ad.read(inputpath + Input_file.replace('h5Seurat', 'h5ad'))
    elif File_type == 'sce':
        os.system(f'/home/scepitools/miniconda3/envs/signac/bin/Rscript /var/www/html/scepitools/pipelines/SCE_to_h5ad.R {task_id} {Input_file}')
        adata = ad.read(inputpath + Input_file.replace('rds', 'h5ad'))
    else:
        print("Input error!")
    adata.X = scipy.sparse.csr_matrix(adata.X)
    return adata

adata = load_data(Input_file, File_type)
if Cell_type_key=="None" or Cell_type_key not in adata.obs:
    Cell_type_key = None
print(adata)

## AnnData object with n_obs × n_vars = 2034 × 491436
##     obs: 'batch', 'cell_name', 'cell_type'

# display information stored in adata.obs
print(adata.obs)

##                                                batch                                       cell_name cell_type
## BM1077-MPP-Frozen-160105-1                         0                      BM1077-MPP-Frozen-160105-1       MPP
## singles-20160726-scATAC-BM1137-GMP3high-HYC-88     0  singles-20160726-scATAC-BM1137-GMP3high-HYC-88       GMP
## singles-160808-scATAC-BM1137-GMP2mid-LS-65         0      singles-160808-scATAC-BM1137-GMP2mid-LS-65       GMP
## singles-BM0828-LMPP-frozen-151105-20               0            singles-BM0828-LMPP-frozen-151105-20      LMPP
## singles-160819-BM1137-CMP-LS-95                    0                 singles-160819-BM1137-CMP-LS-95       CMP
## ...                                              ...                                             ...       ...
## BM1077-LMPP-Frozen-160107-40                       1                    BM1077-LMPP-Frozen-160107-40      LMPP
## BM1077-MPP-Frozen-160105-74                        1                     BM1077-MPP-Frozen-160105-74       MPP
## singles-BM1214-GMP-160421-9                        1                     singles-BM1214-GMP-160421-9       GMP
## singles-BM0828-LMPP-frozen-151105-62               1            singles-BM0828-LMPP-frozen-151105-62      LMPP
## singles-BM0828-MEP-160420-43                       1                    singles-BM0828-MEP-160420-43       MEP
## 
## [2034 rows x 3 columns]

# checking the variable names (bulk peaks coordinates)
print(adata.var_names)

## Index(['chr1_10279_10779', 'chr1_13252_13752', 'chr1_16019_16519',
##        'chr1_29026_29526', 'chr1_96364_96864', 'chr1_115440_115940',
##        'chr1_237535_238035', 'chr1_240811_241311', 'chr1_540469_540969',
##        'chr1_713909_714409',
##        ...
##        'chrX_154822578_154823078', 'chrX_154840821_154841321',
##        'chrX_154841449_154841949', 'chrX_154841956_154842456',
##        'chrX_154842517_154843017', 'chrX_154862057_154862557',
##        'chrX_154870909_154871409', 'chrX_154880741_154881241',
##        'chrX_154891824_154892324', 'chrX_154896342_154896842'],
##       dtype='object', name='index', length=491436)

3 Preprocessing

Next we perform some useful per-cell or per-feature QC metrics.

3.1 Binarization

One of the special characteristics of Single-cell chromatin accessibility sequencing data is its close-to-binary nature. Sometimes we will convert the count matrix of fragments/reads to binary.

User-defined parameters:
Binarization - Whether to convert the count matrix into a binary matrix. Default: True.

Binarization = True
if Binarization:
    epi.pp.binarize(adata)
    print(adata)

## AnnData object with n_obs × n_vars = 2034 × 491436
##     obs: 'batch', 'cell_name', 'cell_type'

3.2 Quality controls

3.2.1 Remove empty features or barcodes

In the first time of quality control, scEpiTools removes any potential empty features or barcodes.

epi.pp.filter_cells(adata, min_features=1)
epi.pp.filter_features(adata, min_cells=1)
print(adata)

## AnnData object with n_obs × n_vars = 2034 × 467226
##     obs: 'batch', 'cell_name', 'cell_type', 'nb_features'
##     var: 'n_cells'

3.2.2 QC plots before filtering

adata.obs['log_nb_features'] = [np.log10(x) for x in adata.obs['nb_features']]
print(adata)

## AnnData object with n_obs × n_vars = 2034 × 467226
##     obs: 'batch', 'cell_name', 'cell_type', 'nb_features', 'log_nb_features'
##     var: 'n_cells'

epi.pl.violin(adata, ['nb_features'])

plot of chunk unnamed-chunk-11

epi.pl.violin(adata, ['log_nb_features'])

plot of chunk unnamed-chunk-11

3.2.3 Filter the cells and peaks based on the QC plots

User-defined parameters:
Minimum_peaks - Minimum number (integer) or ratio (float) of peaks required for a cell to pass filtering. Default: 1000.
Minimum_cells - Minimum number (integer) or ratio (float) of cells required for a peak to pass filtering. Default: 20.

Histogram of the number of open features (in the case of ATAC-seq data) per cell.

Minimum_peaks = 1000
if type(Minimum_peaks) == float:
    Minimum_peaks = np.ceil(Minimum_peaks*adata.shape[1])

epi.pp.coverage_cells(adata, binary=True, log=False, bins=50, threshold=Minimum_peaks)

plot of chunk unnamed-chunk-12

epi.pp.coverage_cells(adata, binary=True, log=10, bins=50, threshold=Minimum_peaks)

plot of chunk unnamed-chunk-12

Display how often a feature is measured as open (for ATAC-seq). Distribution of the feature commoness in cells.

Minimum_cells = 20
if type(Minimum_cells) == float:
    Minimum_cells = np.ceil(Minimum_cells*adata.shape[0])

epi.pp.coverage_features(adata, binary=True, log=False, threshold=Minimum_cells)

plot of chunk unnamed-chunk-13

epi.pp.coverage_features(adata, binary=True, log=True, threshold=Minimum_cells)

plot of chunk unnamed-chunk-13

Filter cell outliers based on counts and only keep cells with at least min_peaks counts.

epi.pp.filter_cells(adata, min_features=Minimum_peaks)
print(adata)

## AnnData object with n_obs × n_vars = 1945 × 467226
##     obs: 'batch', 'cell_name', 'cell_type', 'nb_features', 'log_nb_features'
##     var: 'n_cells', 'commonness'

Filter features based on number of cells and keep features that are expressed in at least min_cells cells.

epi.pp.filter_features(adata, min_cells=Minimum_cells)
print(adata)

## AnnData object with n_obs × n_vars = 1945 × 134919
##     obs: 'batch', 'cell_name', 'cell_type', 'nb_features', 'log_nb_features'
##     var: 'n_cells', 'commonness'

3.2.4 QC plots after filtering

Histogram of the number of open features (in the case of ATAC-seq data) per cell.

epi.pp.coverage_cells(adata, binary=True, log='log10', bins=50, threshold=Minimum_peaks)

plot of chunk unnamed-chunk-16

Display how often a feature is measured as open (for ATAC-seq). Distribution of the feature commoness in cells.

epi.pp.coverage_features(adata, binary=True, log='log10', bins=50, threshold=Minimum_cells)

plot of chunk unnamed-chunk-17

3.3 Term frequency–inverse document frequency

Automatically computes Term frequency–inverse document frequency.

User-defined parameters:
TFIDF - Whether to calculate the TFIDF for the dataset. Default: True.

def cal_tfidf(adata):
    adata.X = adata.X / np.sum(adata.X,axis=1)
    adata.X = adata.X / np.sum(adata.X,axis=0)
    adata.X = np.log(1 + 1e4 * adata.X.shape[0] * adata.X)
    adata.X = scipy.sparse.csr_matrix(adata.X)
    return adata

adata.raw = adata

TFIDF = True
if TFIDF:
    adata = cal_tfidf(adata)
    print(adata)

## AnnData object with n_obs × n_vars = 1945 × 134919
##     obs: 'batch', 'cell_name', 'cell_type', 'nb_features', 'log_nb_features'
##     var: 'n_cells', 'commonness'

3.4 Normalization

Save the the normalized matrix in a layer of the Anndata.

User-defined parameters:
Normalization - Whether to normalize the dataset. Default: True.

Normalization = True
if Normalization:
    sc.pp.normalize_total(adata)
    adata.layers['normalised'] = adata.X.copy()
    print(adata)

## AnnData object with n_obs × n_vars = 1945 × 134919
##     obs: 'batch', 'cell_name', 'cell_type', 'nb_features', 'log_nb_features'
##     var: 'n_cells', 'commonness'
##     layers: 'normalised'

4 Dimensionality reduction and clustering

ScEpiEnsemble performs the dimensionality reduction by ensembling the results of EpiScanpy, Signac and SnapATAC.
Cluster cells into subgroups using the latent features after PCA via the louvain, leiden, k-means or hc(hierarchical clustering) algorithm.

User-defined parameters:
Use_EpiScanpy - Whether to use EpiScanpy in scEpiEnsemble. Default: True.
Use_Signac - Whether to use Signac in scEpiEnsemble. Default: True.
Use_SnapATAC - Whether to use SnapATAC in scEpiEnsemble. Default: True.
Visualization_method - The visualization method for displaying dimensionality reduction and clustering results, scEpiTools provides two choices, ‘umap’ and ‘tsne’. Default: ‘umap’.

Use_EpiScanpy = True
Use_Signac = True
Use_SnapATAC = True
Visualization_method = ['umap']

Generate the files for the methods in R kernel.

Dir_intermediate = outputpath+"results/scEpiEnsemble_intermediate/"
os.makedirs(Dir_intermediate, exist_ok=True)
scipy.io.mmwrite(Dir_intermediate + '%s.mtx' % task_id, adata.X)
scipy.io.mmwrite(Dir_intermediate + '%s_raw.mtx' % task_id, adata.raw.X)

import re
peaks = []
var = list(adata.var['peak']) if 'peaks' in adata.var or 'peak' in adata.var else list(adata.var.index)
for p in var:
    ps = re.split(r"[-:,.;_]", p)
    peaks.append(ps[0] + ':' + ps[-2] + '-' + ps[-1])
df_peak = pd.DataFrame(peaks)
df_peak.to_csv(Dir_intermediate + "%s_peaks.csv" % task_id, sep='\t')
df_cell = pd.DataFrame(list(adata.obs.index))
df_cell.to_csv(Dir_intermediate + "%s_cells.csv" % task_id, sep='\t')

4.1 EpiScanpy

Perform the dimensionality reduction using EpiScanpy.

User-defined parameters:
Number_of_PCs - Number of principal component computed for PCA (and therefore neighbors, tsne and umap). Default: 50.
Number_of_neighbors - The size of local neighborhood (in terms of number of neighboring data points) used for manifold approximation. Default: 15.
Method_for_connectivity - Use ‘umap’ or ‘gauss’, kernel for computing connectivities. Default: ‘umap’.
Distance_metric - A known metric name that specify a kind of distance. Default: ‘euclidean’.
Perplexity - The perplexity is related to the number of nearest neighbors that is used in other manifold learning algorithms. Default: 30.
Minimum_distance - The effective minimum distance between embedded points. Default: 0.5.

Clustering_algorithm - The algorithm for clustering. Default: ‘louvain’.
Number_of_clusters - The number of clusters. Default: None.
Cluster_resolution - A parameter value controlling the coarseness of the clustering. Higher values lead to more clusters. Default: 1.
Random_seed - A random seed for the clustering. Default: 0.

# clustering
def epi_cluster(adata, use_method='scEpiEnsemble', cluster_method='louvain', n_cluster=None, resolution=1, random_state=0):
    # unknown n_cluster  
    colors = []
    if n_cluster == None:
        for cluster_method_ in cluster_method:
            if cluster_method_ == 'louvain':
                for resolution_ in resolution:
                    epi.tl.louvain(adata, resolution=resolution_, random_state=random_state, key_added='louvain_'+use_method+'_'+str(resolution_))
                    colors.append('louvain_'+use_method+'_'+str(resolution_))
            elif cluster_method_ == 'leiden':
                for resolution_ in resolution:
                    epi.tl.leiden(adata, resolution=resolution_, random_state=random_state, key_added='leiden_'+use_method+'_'+str(resolution_))
                    colors.append('leiden_'+use_method+'_'+str(resolution_))
            else:
                print("Clustering method error!")
        adata.obs[cluster_method[0]+'_'+use_method] = adata.obs[cluster_method[0]+'_'+use_method+'_'+str(resolution[0])].copy()
    # known n_cluster
    else:
        for cluster_method_ in cluster_method:
            if cluster_method_ == 'louvain' or cluster_method_ == 'leiden':
                for n_cluster_ in n_cluster:
                    epi.tl.getNClusters(adata, n_cluster=n_cluster_, method=cluster_method_,
                                        key_added=cluster_method_+'_'+use_method+'_n'+str(n_cluster_))
                    colors.append(cluster_method_+'_'+use_method+'_n'+str(n_cluster_))
            else:
                print("Clustering method error!")
        adata.obs[cluster_method[0]+'_'+use_method] = adata.obs[cluster_method[0]+'_'+use_method+'_n'+str(n_cluster[0])].copy()
    if len(colors) == 1:
        colors = [cluster_method[0]+'_'+use_method]
    return colors

Number_of_PCs = 50
Number_of_neighbors = 15
Method_for_connectivity = 'umap'
Distance_metric = 'euclidean'
Perplexity = 30
Minimum_distance = 0.5

Clustering_algorithm = ['louvain']
Number_of_clusters = None
Cluster_resolution = [1]
Random_seed = 0

if Use_EpiScanpy:
    epi.pp.pca(adata, n_comps=Number_of_PCs, random_state=Random_seed)
    adata_episcanpy = ad.AnnData(adata.obsm['X_pca'], obs=adata.obs)
    epi.pp.neighbors(adata_episcanpy, n_neighbors=Number_of_neighbors, method=Method_for_connectivity, 
                     metric=Distance_metric, random_state=Random_seed)
    for Visualization_method_ in Visualization_method:
        if Visualization_method_ == 'tsne':
            epi.tl.tsne(adata_episcanpy, perplexity=Perplexity, random_state=Random_seed)
        else:
            epi.tl.umap(adata_episcanpy, min_dist=Minimum_distance, random_state=Random_seed)
        
    colors = epi_cluster(adata_episcanpy, use_method='EpiScanpy', cluster_method=Clustering_algorithm, n_cluster=Number_of_clusters, 
                         resolution=Cluster_resolution, random_state=Random_seed)
    colors_episcanpy = colors.copy()
    print(adata_episcanpy)
    sc.settings.set_figure_params(facecolor='white',figsize=(4,4),frameon=True,fontsize=15)
    for Visualization_method_ in Visualization_method:
        if Cell_type_key is not None:
            colors.append(Cell_type_key)
        if Visualization_method_ == 'tsne':
            epi.pl.tsne(adata_episcanpy, color=colors, wspace=0.5)
        else:
            epi.pl.umap(adata_episcanpy, color=colors, wspace=0.5)
    adata_episcanpy.write(outputpath+'results/%s_EpiScanpy_result.h5ad.gz'%task_id, compression='gzip')

## AnnData object with n_obs × n_vars = 1945 × 50
##     obs: 'batch', 'cell_name', 'cell_type', 'nb_features', 'log_nb_features', 'louvain_EpiScanpy_1', 'louvain_EpiScanpy'
##     uns: 'neighbors', 'umap', 'louvain'
##     obsm: 'X_umap'
##     obsp: 'distances', 'connectivities'

plot of chunk unnamed-chunk-27

4.2 Signac

Perform the dimensionality reduction using Signac.

User-defined parameters:
Reference_genome - ‘hg19’, ‘hg38’, ‘mm10’ or ‘mm9’ are the 4 currently recognised genomic version in this pipeline. Default: None.

Reference_genome = 'hg19'

if Use_Signac:
    os.system(f'/home/scepitools/miniconda3/envs/signac/bin/Rscript scEpiEnsemble_Signac.R {task_id} {Number_of_PCs} {Reference_genome}')
    signac_embeddings = np.array(pd.read_table(Dir_intermediate + "%s_Signac.csv" % task_id).values)
    adata_signac = ad.AnnData(signac_embeddings, obs=adata.obs)
    epi.pp.neighbors(adata_signac, n_neighbors=Number_of_neighbors, method=Method_for_connectivity, 
                     metric=Distance_metric, random_state=Random_seed)
    for Visualization_method_ in Visualization_method:
        if Visualization_method_ == 'tsne':
            epi.tl.tsne(adata_signac, perplexity=Perplexity, random_state=Random_seed)
        else:
            epi.tl.umap(adata_signac, min_dist=Minimum_distance, random_state=Random_seed)
    
    colors = epi_cluster(adata_signac, use_method='Signac', cluster_method=Clustering_algorithm, n_cluster=Number_of_clusters, 
                         resolution=Cluster_resolution, random_state=Random_seed)
    colors_signac = colors.copy()
    print(adata_signac)
    sc.settings.set_figure_params(facecolor='white',figsize=(4,4),frameon=True,fontsize=15)
    for Visualization_method_ in Visualization_method:
        if Cell_type_key is not None:
            colors.append(Cell_type_key)
        if Visualization_method_ == 'tsne':
            epi.pl.tsne(adata_signac, color=colors, wspace=0.5)
        else:
            epi.pl.umap(adata_signac, color=colors, wspace=0.5)
    adata_signac.write(outputpath+'results/%s_Signac_result.h5ad.gz'%task_id, compression='gzip')

## 0
## AnnData object with n_obs × n_vars = 1945 × 50
##     obs: 'batch', 'cell_name', 'cell_type', 'nb_features', 'log_nb_features', 'louvain_Signac_1', 'louvain_Signac'
##     uns: 'neighbors', 'umap', 'louvain'
##     obsm: 'X_umap'
##     obsp: 'distances', 'connectivities'

plot of chunk unnamed-chunk-29

4.3 SnapATAC

Perform the dimensionality reduction using SnapATAC.

if Use_SnapATAC:
    os.system(f'/home/scepitools/miniconda3/envs/snapatac/bin/Rscript scEpiEnsemble_SnapATAC.R {task_id} {Number_of_PCs}')
    try:
        snapatac_embeddings = np.array(pd.read_table(Dir_intermediate + "%s_SnapATAC.csv" % task_id).values)
        adata_snapatac = ad.AnnData(snapatac_embeddings, obs=adata.obs)
        epi.pp.neighbors(adata_snapatac, n_neighbors=Number_of_neighbors, method=Method_for_connectivity, 
                         metric=Distance_metric, random_state=Random_seed)
        for Visualization_method_ in Visualization_method:
            if Visualization_method_ == 'tsne':
                epi.tl.tsne(adata_snapatac, perplexity=Perplexity, random_state=Random_seed)
            else:
                epi.tl.umap(adata_snapatac, min_dist=Minimum_distance, random_state=Random_seed)

        colors = epi_cluster(adata_snapatac, use_method='SnapATAC', cluster_method=Clustering_algorithm, n_cluster=Number_of_clusters, 
                             resolution=Cluster_resolution, random_state=Random_seed)
        colors_snapatac = colors.copy()
        print(adata_snapatac)
        sc.settings.set_figure_params(facecolor='white',figsize=(4,4),frameon=True,fontsize=15)
        for Visualization_method_ in Visualization_method:
            if Cell_type_key is not None:
                colors.append(Cell_type_key)
            if Visualization_method_ == 'tsne':
                epi.pl.tsne(adata_snapatac, color=colors, wspace=0.5)
            else:
                epi.pl.umap(adata_snapatac, color=colors, wspace=0.5)
        adata_snapatac.write(outputpath+'results/%s_SnapATAC_result.h5ad.gz'%task_id, compression='gzip')
    except:
        Use_SnapATAC = False
        print("Error when running SnapATAC, and it will default to not use SnapATAC!")

## 0
## AnnData object with n_obs × n_vars = 1945 × 50
##     obs: 'batch', 'cell_name', 'cell_type', 'nb_features', 'log_nb_features', 'louvain_SnapATAC_1', 'louvain_SnapATAC'
##     uns: 'neighbors', 'umap', 'louvain'
##     obsm: 'X_umap'
##     obsp: 'distances', 'connectivities'

plot of chunk unnamed-chunk-30

4.4 scEpiEnsemble

ScEpiEnsemble performs the dimensionality reduction by ensembling the results of EpiScanpy, Signac and SnapATAC.

User-defined parameters:
Ensemble_method - Ensemble method for integrating the results of selected kernels. Options are ‘raw’, ‘minmax’, ‘z-score’. Default: ‘z-score’.

raw: Ensemble the results of selected kernels by concatenating the latent features directly.
minmax: Ensemble the results of selected kernels by concatenating the latent features after a min-max normalization.
z-score: Ensemble the results of selected kernels by concatenating the latent features after a z-score normalization.

Ensemble_method = 'z-score'
latents = []
if Ensemble_method == 'raw':
    if Use_EpiScanpy:
        latents = np.concatenate((latents, adata_episcanpy.X), axis=-1) if len(latents) != 0 else adata_episcanpy.X.copy()
    if Use_Signac:
        latents = np.concatenate((latents, adata_signac.X), axis=-1) if len(latents) != 0 else adata_signac.X.copy()
    if Use_SnapATAC:
        latents = np.concatenate((latents, adata_snapatac.X), axis=-1) if len(latents) != 0 else adata_snapatac.X.copy()
elif Ensemble_method == 'minmax':
    if Use_EpiScanpy:
        scale_embeddings = (adata_episcanpy.X - np.min(adata_episcanpy.X)) / (np.max(adata_episcanpy.X) - np.min(adata_episcanpy.X))
        latents = np.concatenate((latents, scale_embeddings), axis=-1) if len(latents) != 0 else scale_embeddings.copy()
    if Use_Signac:
        scale_embeddings = (adata_signac.X - np.min(adata_signac.X)) / (np.max(adata_signac.X) - np.min(adata_signac.X))
        latents = np.concatenate((latents, scale_embeddings), axis=-1) if len(latents) != 0 else scale_embeddings.copy()
    if Use_SnapATAC:
        scale_embeddings = (adata_snapatac.X - np.min(adata_snapatac.X)) / (np.max(adata_snapatac.X) - np.min(adata_snapatac.X))
        latents = np.concatenate((latents, scale_embeddings), axis=-1) if len(latents) != 0 else scale_embeddings.copy()
elif Ensemble_method == 'z-score':
    if Use_EpiScanpy:
        scale_embeddings = (adata_episcanpy.X - np.mean(adata_episcanpy.X)) / np.std(adata_episcanpy.X)
        latents = np.concatenate((latents, scale_embeddings), axis=-1) if len(latents) != 0 else scale_embeddings.copy()
    if Use_Signac:
        scale_embeddings = (adata_signac.X - np.mean(adata_signac.X)) / np.std(adata_signac.X)
        latents = np.concatenate((latents, scale_embeddings), axis=-1) if len(latents) != 0 else scale_embeddings.copy()
    if Use_SnapATAC:
        scale_embeddings = (adata_snapatac.X - np.mean(adata_snapatac.X)) / np.std(adata_snapatac.X)
        latents = np.concatenate((latents, scale_embeddings), axis=-1) if len(latents) != 0 else scale_embeddings.copy()
else:
    print("Ensemble method error!")

adata_scepiensemble = ad.AnnData(latents, obs=adata.obs)
epi.pp.neighbors(adata_scepiensemble, n_neighbors=Number_of_neighbors, method=Method_for_connectivity, 
                 metric=Distance_metric, random_state=Random_seed, use_rep='X')
for Visualization_method_ in Visualization_method:
    if Visualization_method_ == 'tsne':
        epi.tl.tsne(adata_scepiensemble, perplexity=Perplexity, random_state=Random_seed)
    else:
        epi.tl.umap(adata_scepiensemble, min_dist=Minimum_distance, random_state=Random_seed)

colors = epi_cluster(adata_scepiensemble, use_method='scEpiEnsemble', cluster_method=Clustering_algorithm, n_cluster=Number_of_clusters, 
                     resolution=Cluster_resolution, random_state=Random_seed)
colors_scepiensemble = colors.copy()
print(adata_scepiensemble)

## AnnData object with n_obs × n_vars = 1945 × 150
##     obs: 'batch', 'cell_name', 'cell_type', 'nb_features', 'log_nb_features', 'louvain_scEpiEnsemble_1', 'louvain_scEpiEnsemble'
##     uns: 'neighbors', 'umap', 'louvain'
##     obsm: 'X_umap'
##     obsp: 'distances', 'connectivities'

sc.settings.set_figure_params(facecolor='white',figsize=(4,4),frameon=True,fontsize=15)
for Visualization_method_ in Visualization_method:
    if Cell_type_key is not None:
        colors.append(Cell_type_key)
    if Visualization_method_ == 'tsne':
        epi.pl.tsne(adata_scepiensemble, color=colors, wspace=0.5)
    else:
        epi.pl.umap(adata_scepiensemble, color=colors, wspace=0.5)

plot of chunk unnamed-chunk-32

adata_scepiensemble.write(outputpath+'results/%s_scEpiEnsemble_result.h5ad.gz'%task_id, compression='gzip')

5 Clustering performance

Calculate and compare clustering performance for scEpiEnsemble and other baseline methods.

def cluster_eval(adata, label_key, cluster_key, method, res_table_pd):
    cluster_method = cluster_key.replace(method+'_','').replace('_'+method,'')
    print(cluster_key)
    ARI = epi.tl.ARI(adata, label_key, cluster_key)
    AMI = epi.tl.AMI(adata, label_key, cluster_key)
    NMI = metrics.normalized_mutual_info_score(adata.obs[label_key], adata.obs[cluster_key])
    Homo = metrics.homogeneity_score(adata.obs[label_key], adata.obs[cluster_key])
    print('AMI:%.3f\tARI:%.3f\tNMI:%.3f\tHomo:%.3f'%(AMI,ARI,NMI,Homo))
    
    res_table_pd = res_table_pd.append(pd.DataFrame({'method':[method]*4, 'cluster_method':[cluster_method]*4, 
                                  'metric':['ARI','AMI','NMI','Homo'], 'value':[ARI,AMI,NMI,Homo]}), ignore_index=True)

    return res_table_pd

if Cell_type_key is not None:
    res_table_pd = pd.DataFrame(columns=['method', 'cluster_method', 'metric', 'value'])
    if Use_EpiScanpy:
        for colors_episcanpy_ in colors_episcanpy:
            res_table_pd = cluster_eval(adata_episcanpy, Cell_type_key, colors_episcanpy_, 'EpiScanpy', res_table_pd)
    if Use_Signac:
        for colors_signac_ in colors_signac:
            res_table_pd = cluster_eval(adata_signac, Cell_type_key, colors_signac_, 'Signac', res_table_pd)
    if Use_SnapATAC:
        for colors_snapatac_ in colors_snapatac:
            res_table_pd = cluster_eval(adata_snapatac, Cell_type_key, colors_snapatac_, 'SnapATAC', res_table_pd)
    for colors_scepiensemble_ in colors_scepiensemble:
        res_table_pd = cluster_eval(adata_scepiensemble, Cell_type_key, colors_scepiensemble_, 'scEpiEnsemble', res_table_pd)

## louvain_EpiScanpy
## AMI:0.615	ARI:0.445	NMI:0.619	Homo:0.662
## louvain_Signac
## AMI:0.575	ARI:0.392	NMI:0.580	Homo:0.609
## louvain_SnapATAC
## AMI:0.622	ARI:0.448	NMI:0.627	Homo:0.687
## louvain_scEpiEnsemble
## AMI:0.641	ARI:0.483	NMI:0.645	Homo:0.700

if Cell_type_key is not None:
    sc.settings.set_figure_params(dpi=80,facecolor='white',figsize=(14,12),frameon=True,fontsize=15)
    g = sns.catplot(col='cluster_method', x="metric", y="value", hue="method", data=res_table_pd,
                palette='Paired', kind="bar", height=6, aspect=1.2, sharey=False, col_wrap=1)
    sns.despine(top=False, right=False)

plot of chunk unnamed-chunk-35

6 Save h5ad

Write .h5ad-formatted hdf5 file. You can get the results (a html file, a h5ad file and figure files) via your taskID.

adata.obsm['scEpiEnsemble'] = adata_scepiensemble.X.copy()
if Use_EpiScanpy:
    adata.obsm['EpiScanpy'] = adata_episcanpy.X.copy()
if Use_Signac:
    adata.obsm['Signac'] = adata_signac.X.copy()
if Use_SnapATAC:
    adata.obsm['SnapATAC'] = adata_snapatac.X.copy()
print(adata)

## AnnData object with n_obs × n_vars = 1945 × 134919
##     obs: 'batch', 'cell_name', 'cell_type', 'nb_features', 'log_nb_features'
##     var: 'n_cells', 'commonness'
##     uns: 'pca'
##     obsm: 'X_pca', 'scEpiEnsemble', 'EpiScanpy', 'Signac', 'SnapATAC'
##     varm: 'PCs'
##     layers: 'normalised'

adata.write_h5ad(outputpath+'results/%s_result.h5ad'%task_id)

ScEpiTools tutorial