import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from pyfasta import Fasta
from scipy.stats import mannwhitneyu
loop_info.csv could
be downloaded from DOWNLOAD
page.df_all_loops = pd.read_csv('loop_info.csv', header=0)
df_gm12878_loops = df_all_loops.loc[(df_all_loops['cellLine'] == 'GM12878') & (df_all_loops['ChIP'] == 'H3K27ac')]
df_gm12878_loops.head()
## id anchor1ID anchor1 ... organ cellLine pipeline
## 0 0 166304 chr1:710000-715000 ... Blood GM12878 fithichip_5k
## 1 1 166304 chr1:710000-715000 ... Blood GM12878 fithichip_5k
## 2 2 166304 chr1:710000-715000 ... Blood GM12878 fithichip_5k
## 3 3 166304 chr1:710000-715000 ... Blood GM12878 fithichip_5k
## 4 4 166304 chr1:710000-715000 ... Blood GM12878 fithichip_5k
##
## [5 rows x 21 columns]
df_gm12878_loops['Distance (kb)'] = (df_gm12878_loops['anchor2'].apply(lambda x: int(x.split('-')[1]))-
df_gm12878_loops['anchor1'].apply(lambda x: int(x.split('-')[1])))/1000
## <string>:1: SettingWithCopyWarning:
## A value is trying to be set on a copy of a slice from a DataFrame.
## Try using .loc[row_indexer,col_indexer] = value instead
##
## See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
sns.set_style("whitegrid")
#print(min(df_gm12878_loops['Distance (kb)']),max(df_gm12878_loops['Distance (kb)']))
ax = sns.violinplot(x=df_gm12878_loops["Distance (kb)"])
plt.show()
#wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_19/gencode.v19.annotation.gtf.gz && gunzip gencode.v19.annotation.gtf.gz
df_hg19 = pd.read_csv('gencode.v19.annotation.gtf', header=None,
sep='\t', skiprows=5, on_bad_lines='skip')
df_coding_genes = df_hg19.loc[(df_hg19[2]=='gene') \
&(['protein_coding' in item for item in df_hg19[8]]), [0,3,4,8]]
df_coding_genes[8]=df_coding_genes[8].apply(lambda x: x.split(';')[4].split(' ')[2].strip('"'))
df_coding_genes.rename(columns={0:'chrom',3:'start',4:'end',8:'protein_coding_gene_name'},inplace=True)
df_coding_genes.head()
## chrom start end protein_coding_gene_name
## 109 chr1 69091 70008 OR4F5
## 141 chr1 134901 139379 AL627309.1
## 277 chr1 367640 368634 OR4F29
## 374 chr1 621059 622053 OR4F16
## 511 chr1 738532 739137 AL669831.1
df_left_anchors = df_gm12878_loops[['anchor1']].drop_duplicates()
df_right_anchors = df_gm12878_loops[['anchor2']].drop_duplicates()
df_left_anchors['nearest_gene_distance'] = df_left_anchors['anchor1'].apply(lambda x: \
(df_coding_genes.loc[df_coding_genes['chrom']== \
x.split(':')[0]]['start']-int(x.split(':')[1].split('-')[0])).abs().min())
df_right_anchors['nearest_gene_distance'] = df_right_anchors['anchor2'].apply(lambda x: \
(df_coding_genes.loc[df_coding_genes['chrom']== \
x.split(':')[0]]['start']-int(x.split(':')[1].split('-')[0])).abs().min())
print('Median distance between left anchor and nearest gene is %d bp.'\
%(df_left_anchors['nearest_gene_distance'].median()))
## Median distance between left anchor and nearest gene is 39457 bp.
print('Median distance between right anchor and nearest gene is %d bp.'\
%(df_right_anchors['nearest_gene_distance'].median()))
## Median distance between right anchor and nearest gene is 39830 bp.
df_left_anchors.rename(columns={'anchor1':'location'},inplace=True)
df_right_anchors.rename(columns={'anchor2':'location'},inplace=True)
df_left_anchors['anchor'] ='Left'
df_right_anchors['anchor'] ='Right'
df_combine_anchors=pd.concat([df_left_anchors,df_right_anchors],axis=0)
df_combine_anchors['nearest_gene_distance'] = np.log10(1+df_combine_anchors['nearest_gene_distance'])
ax = sns.violinplot(x="anchor", y="nearest_gene_distance", data=df_combine_anchors)
plt.xlabel('Anchors')
plt.ylabel('Nearest gene distance (log10 kb)')
plt.show()
p_value = mannwhitneyu(df_left_anchors['nearest_gene_distance'], \
df_right_anchors['nearest_gene_distance'],
alternative='two-sided')[1]
print('The distance distribution of left and right anchor are not statistically different, two-sided Mann-Whitney U test P-value=%.4f'%p_value)
## The distance distribution of left and right anchor are not statistically different, two-sided Mann-Whitney U test P-value=0.0230
#wget http://hgdownload.cse.ucsc.edu/goldenpath/hg19/bigZips/hg19.fa.gz && gunzip hg19.fa.gz
genome = Fasta('hg19.fa')
df_left_anchors['GC_content'] = df_left_anchors['location'].apply(\
lambda x: (genome[x.split(':')[0]][int(x.split(':')[1].split('-')[0]):\
int(x.split(':')[1].split('-')[1])].upper().count('C')+\
genome[x.split(':')[0]][int(x.split(':')[1].split('-')[0]):\
int(x.split(':')[1].split('-')[1])].upper().count('G'))/ \
(int(x.split(':')[1].split('-')[1])-int(x.split(':')[1].split('-')[0])))
df_right_anchors['GC_content'] = df_right_anchors['location'].apply(\
lambda x: (genome[x.split(':')[0]][int(x.split(':')[1].split('-')[0]):\
int(x.split(':')[1].split('-')[1])].upper().count('C')+\
genome[x.split(':')[0]][int(x.split(':')[1].split('-')[0]):\
int(x.split(':')[1].split('-')[1])].upper().count('G'))/ \
(int(x.split(':')[1].split('-')[1])-int(x.split(':')[1].split('-')[0])))
print('GC content of left anchor is around %.2f to %.2f.'\
%(df_left_anchors['GC_content'].min(),
df_left_anchors['GC_content'].max()))
## GC content of left anchor is around 0.02 to 0.74.
print('GC content of right anchor is around %.2f to %.2f.'\
%(df_right_anchors['GC_content'].min(),
df_right_anchors['GC_content'].max()))
## GC content of right anchor is around 0.02 to 0.74.
df_combine_anchors=pd.concat([df_left_anchors,df_right_anchors],axis=0)
ax = sns.violinplot(x="anchor", y="GC_content", data=df_combine_anchors)
plt.xlabel('Anchors')
plt.ylabel('GC content')
plt.show()
p_value = mannwhitneyu(df_left_anchors['GC_content'], \
df_right_anchors['GC_content'],
alternative='two-sided')[1]
print('The GC content in left and right anchor is not statistically different by two-sided Mann-Whitney U test, P-value = %.4f'%p_value)
## The GC content in left and right anchor is not statistically different by two-sided Mann-Whitney U test, P-value = 0.8625
We explore the cell type specificity of the loops from HiChIP CTCF experiments on GM12878 cell line. Some loops only exist in a few cell lines (high specificity) while some loops exist in most of the cell lines (low specificity).
We first extracted the unique and significant loops of HiChIP CTCF across diverse cell lines.
all_loops_CTCF = df_all_loops.loc[(df_all_loops['ChIP'] == 'CTCF')&\
(df_all_loops['qValue']<1e-10)]
all_uniq_loops = all_loops_CTCF[['anchor1','anchor2']].drop_duplicates()
nb_cl_by_loop = []
for i in range(len(all_uniq_loops)):
anchor1, anchor2 = all_uniq_loops.iloc[i].values
celllines = all_loops_CTCF.loc[(all_loops_CTCF['anchor1']==anchor1)&\
(all_loops_CTCF['anchor2']==anchor2)]['cellLine'].values
uniq_cellines = np.unique(celllines)
nb_cl_by_loop.append(len(uniq_cellines))
all_uniq_loops['nb_celllines'] = nb_cl_by_loop
all_uniq_loops['specificity']=all_uniq_loops['nb_celllines'].apply(lambda x: 'Rare loops' if x < 5 else ('Others' if x < 10 else 'Common loops'))
all_uniq_loops.head()
## anchor1 ... specificity
## 3665600 chr1:6550000-6555000 ... Rare loops
## 3665613 chr1:9345000-9350000 ... Others
## 3665628 chr1:16360000-16365000 ... Rare loops
## 3665631 chr1:16950000-16955000 ... Others
## 3665643 chr1:19985000-19990000 ... Others
##
## [5 rows x 4 columns]
ax = sns.violinplot(x="specificity", y="nb_celllines", data=all_uniq_loops)
plt.xlabel('Cell line specificity')
plt.ylabel('Number of cell lines')
plt.show()
we illustrate the effectiveness of HiChIPdb through an application of annotating GWAS risk genes by using HiChIP loops.
In brief, we sum the weighted effect size from GWAS summary statistics of all genetic variants that fall on the regions that have interactions (HiChIP loops) with the target risk gene as the HiChIP annotated gene score (HAGS).
we took the\(-log_{10}(P)\times \beta\) as the as the weighted effect size for each variant where β and p is the p-value and effect size in the original GWAS study.
The HAGS across 95 cell types and 20242 genes were directly provided through (link). Users can also refer to a detailed IPython notebook for how to calculate the HAGS.
The rows and columns of HAGS file corresponds to cell types and coding genes.
Note that -1 denotes HAGS does not exist as there is no corresponding GWAS SNPs fell into any region that has interaction with the target gene.
#!wget xxx/df_g_scores_minusLogPMultiBeta.csv
df_g_scores_minusLogPMultiBeta = pd.read_csv('df_g_scores_minusLogPMultiBeta.csv',index_col=0)
df_g_scores_minusLogPMultiBeta.head()
## SAMD11 AL627309.1 OR4F29 ... MT-ND5 MT-ND6 MT-CYB
## GM12878 14.972468 -1.0 -1.0 ... -1.0 -1.0 -1.0
## HCASMC 108.275940 -1.0 -1.0 ... -1.0 -1.0 -1.0
## K562 155.749202 -1.0 -1.0 ... -1.0 -1.0 -1.0
## MyLa 32.772657 -1.0 -1.0 ... -1.0 -1.0 -1.0
## Naive T primary cell -1.000000 -1.0 -1.0 ... -1.0 -1.0 -1.0
##
## [5 rows x 20242 columns]
n_top = 20
df_HAEC_scores = df_g_scores_minusLogPMultiBeta.loc['HAEC'].sort_values(ascending=False)
df_hiPSC_scores = df_g_scores_minusLogPMultiBeta.loc['hiPSC-CM'].sort_values(ascending=False)
HAEC_scores = df_HAEC_scores.values[:n_top]
hiPSC_scores = df_hiPSC_scores.values[:n_top]
HAEC_genes_names = df_HAEC_scores.index.to_list()[:n_top]
hiPSC_genes_names = df_hiPSC_scores.index.to_list()[:n_top]
x_pos = [i for i, _ in enumerate(HAEC_scores)]
plt.figure(figsize=(10,8))
plt.subplot(2, 1, 1)
ax1 = plt.bar(x_pos, HAEC_scores, color=np.array([118,218,145])/256)
plt.ylabel("HAGS",fontsize=12)
plt.title("The top-%d annotated risk genes in HAEC heart cell line"%n_top,fontsize=12)
ax2 = plt.xticks(x_pos, HAEC_genes_names,rotation=45,fontsize=12)
plt.subplot(2, 1, 2)
ax3 = plt.bar(x_pos, hiPSC_scores, color=np.array([118,218,145])/256)
plt.ylabel("HAGS",fontsize=12)
plt.title("The top-%d annotated risk genes in hiPSC-CM heart cell line"%n_top,fontsize=12)
ax4 = plt.xticks(x_pos, hiPSC_genes_names,rotation=45,fontsize=12)
plt.subplots_adjust(hspace=0.5)
plt.show()
genes = ['NOS1AP','KCNH2']
uniq_values = list(set(df_HAEC_scores))
uniq_values.sort(reverse=True)
rank_HAEC=df_HAEC_scores.map(lambda x:uniq_values.index(x)+1)
uniq_values = list(set(df_hiPSC_scores))
uniq_values.sort(reverse=True)
rank_hiPSC=df_hiPSC_scores.map(lambda x:uniq_values.index(x)+1)
print(['%s ranked %d in HAEC cell line'%(gene, rank_HAEC[gene]) for gene in genes])
## ['NOS1AP ranked 1 in HAEC cell line', 'KCNH2 ranked 12 in HAEC cell line']
print(['%s ranked %d in hiPSC-CM cell line'%(gene, rank_hiPSC[gene]) for gene in genes])
## ['NOS1AP ranked 1 in hiPSC-CM cell line', 'KCNH2 ranked 8 in hiPSC-CM cell line']
n_top = 20
df_HARA_scores = df_g_scores_minusLogPMultiBeta.loc['HARA'].sort_values(ascending=False)
df_PAEC_scores = df_g_scores_minusLogPMultiBeta.loc['Pulmonary artery endothelial cells (PAEC)'].sort_values(ascending=False)
HARA_scores = df_HARA_scores.values[:n_top]
PAEC_scores = df_PAEC_scores.values[:n_top]
HARA_genes_names = df_HARA_scores.index.to_list()[:n_top]
PAEC_genes_names = df_PAEC_scores.index.to_list()[:n_top]
x_pos = [i for i, _ in enumerate(HARA_scores)]
plt.figure(figsize=(10,8))
plt.subplot(2, 1, 1)
ax1 = plt.bar(x_pos, HARA_scores, color=np.array([248,203,127])/256)
plt.ylabel("HAGS",fontsize=12)
plt.title("The top-%d annotated risk genes in HARA lung cell line"%n_top,fontsize=12)
ax2 = plt.xticks(x_pos, HARA_genes_names,rotation=45,fontsize=12)
plt.subplot(2, 1, 2)
ax3 = plt.bar(x_pos, PAEC_scores, color=np.array([248,203,127])/256)
plt.ylabel("HAGS",fontsize=12)
plt.title("The top-%d annotated risk genes in PAEC lung cell line"%n_top,fontsize=12)
ax4 = plt.xticks(x_pos, PAEC_genes_names,rotation=45,fontsize=12)
plt.subplots_adjust(hspace=0.5)
plt.show()
genes = ['NOS1AP','KCNH2']
uniq_values = list(set(df_HARA_scores))
uniq_values.sort(reverse=True)
rank_HARA=df_HARA_scores.map(lambda x:uniq_values.index(x)+1)
uniq_values = list(set(df_PAEC_scores))
uniq_values.sort(reverse=True)
rank_PAEC=df_PAEC_scores.map(lambda x:uniq_values.index(x)+1)
print(['%s ranked %d in HARA cell line'%(gene, rank_HARA[gene]) for gene in genes])
## ['NOS1AP ranked 4179 in HARA cell line', 'KCNH2 ranked 4179 in HARA cell line']
print(['%s ranked %d in PAEC cell line'%(gene, rank_PAEC[gene]) for gene in genes])
## ['NOS1AP ranked 3 in PAEC cell line', 'KCNH2 ranked 11729 in PAEC cell line']
cell_types = df_g_scores_minusLogPMultiBeta.index.to_list()
heart_cell_types = ['hiPSC-CM','HAEC']
rank_list1, rank_list2 = [],[]
for each in cell_types:
if each not in heart_cell_types:
df_scores = df_g_scores_minusLogPMultiBeta.loc[each].sort_values(ascending=False)
uniq_values = list(set(df_scores))
uniq_values.sort(reverse=True)
rank=df_scores.map(lambda x:uniq_values.index(x)+1)
rank_list1 += [rank[genes[0]]]
rank_list2 += [rank[genes[1]]]
print('%s has a mean rank of %.1f, median rank of %.1f in heart irrelevant cell lines'%(genes[0],np.mean(rank_list1),np.median(rank_list1)))
## NOS1AP has a mean rank of 2229.1, median rank of 470.0 in heart irrelevant cell lines
print('%s has a mean rank of %.1f, median rank of %.1f in heart irrelevant cell lines'%(genes[1],np.mean(rank_list2),np.median(rank_list2)))
## KCNH2 has a mean rank of 2092.8, median rank of 280.0 in heart irrelevant cell lines