import torch
import os
from pyfasta import Fasta
import numpy as np
import pandas as pd
os.environ['CUDA_VISIBLE_DEVICES']='0'
from model import EpiGePT
from model.utils import *
from model.config import *


model = EpiGePT.EpiGePT(WORD_NUM,TF_DIM,BATCH_SIZE)
model.load_state_dict(torch.load('pretrainModel/model.ckpt',map_location='cuda:0')['state_dict'])
genome = Fasta('hg38.fa')
model.eval()
model = model.cuda()


acgt2num = {'A': 0,
            'C': 1,
            'G': 2,
            'T': 3}

def seq2mat(seq):
    seq = seq.upper()
    h = 4
    w = len(seq)
    mat = np.zeros((h, w), dtype=bool)  # True or false in mat
    for i in range(w):
        if seq[i] != 'N':
            mat[acgt2num[seq[i]], i] = 1.
    return mat

def model_predict(model,chrom,start,end,tf_feature):
    seq = genome[chrom][start:end]
    seq_embeds = seq2mat(seq)
    seq_embeds = np.array(seq_embeds,dtype='float16')#(50,)
    seq_embeds = np.expand_dims(seq_embeds,axis=0)
    seq_embeds = torch.from_numpy(seq_embeds)
    tf_feature = np.pad(tf_feature,((0, 0), (0, 1)),'constant',constant_values = (0,0))
    tf_feature = np.expand_dims(tf_feature,axis=0)
    tf_feature = torch.from_numpy(tf_feature)
    seq_embeds = seq_embeds.type(torch.FloatTensor)
    tf_feature = tf_feature.type(torch.FloatTensor)
    seq_embeds = seq_embeds.to('cuda')
    tf_feature = tf_feature.to('cuda')
    signals = model(seq_embeds,tf_feature)
    np_signals = signals.cpu().detach().numpy()
    return np_signals


# get information about 711 TFs
tf_list, motif2tf, isContain = get_tf_motif_match()
print(tf_list[:10])

['AFP', 'AHR', 'AIRE', 'ALX1', 'ALX3', 'ALX4', 'AR', 'ARHGEF12', 'ARID3A', 'ARID3B']


ref_tf_expression = pd.read_csv('reference_TPM_tf_expr.csv',header=0,sep='\t',index_col=[0])

def quantile_norm(df,ref_exp):
    """quantile normalization
    Arg:
        df: Pandas DataFrame with TFs x cells
    Return:
        normalized Pandas DataFrame 
    """
    rank_mean = ref_exp.stack().groupby(ref_exp.rank(method='first').stack().astype(int)).mean()
    return df.rank(method='min').stack().astype(int).map(rank_mean).unstack()

cell_id = 0 # index for the cell type
N = 64 # number for cell types
tf_TPM_expression = pd.read_csv('EXAMPLE_TPM.csv',header=0,sep='\t',index_col=[0]) # TPM value of 711 TFs
tf_expression = quantile_norm(tf_TPM_expression,ref_tf_expression)
tf_expression = np.array(tf_expression.iloc[:,cell_id]) # quntile transformed TPM values


tf_TPM_expression.head()


# scan the motif binding score using Homer tool
! findMotifsGenome.pl bins.bed hg38.fa ./ -find all_motif_rmdup.motif -p 32 -cache 320000 > homer_motifscore.txt


"""parse the motifscan results by homer tool.
    Args:
        bin_file: path to bed file that was used for motif scanning
        motifscan_file: path to the motifscan file from Homer tool (e.g. homer_motifscore.txt)
"""
tf_motifscore = get_motif_score(tf_list, motif2tf, isContain, bin_file, motifscan_file, save = True)


chrom, start, end = 'chr1', 768000, 896000
region = [chrom, start, end]
tf_motifscore = get_motifscore(region,tf_list,motif2tf) # motif binding status from homer tools


# TF feature
tf_feature = tf_motifscore * np.log(tf_expression + 1)
print(tf_feature.shape)

(1000, 711)


chrom, start, end = 'chr1', 768000, 896000
region = [chrom, start, end]
bins = [chrom+':'+str(start+i*128)+'-'+str(start+(i+1)*128) for i in range(1000)]


# bin level prediction
predict = model_predict(model,region[0],region[1],region[2],tf_feature)
pd_predict = pd.DataFrame(predict[0,:,:],index=bins,columns = ['DNase','CTCF','H3K27ac','H3K4me3','H3K36me3','H3K27me3','H3K9me3','H3K4me1'])
pd_predict


pd_predict.to_csv('demo_region_prediction.csv')

	A673
AFP	0.040
AHR	5.860
AIRE	0.005
ALX1	4.825
ALX3	0.380

	DNase	CTCF	H3K27ac	H3K4me3	H3K36me3	H3K27me3	H3K9me3	H3K4me1
chr1:768000-768128	0.218530	0.000000	0.006686	0.033173	0.056404	0.053041	0.083177	0.000000
chr1:768128-768256	0.674857	0.054612	0.082749	0.102566	0.172418	0.206133	0.227334	0.083132
chr1:768256-768384	0.132666	0.144564	0.151232	0.066611	0.214756	0.279354	0.266159	0.162440
chr1:768384-768512	0.928340	0.220737	0.198496	0.306024	0.354715	0.374888	0.422795	0.331158
chr1:768512-768640	0.128552	0.072632	0.112102	0.073463	0.132509	0.182067	0.170579	0.109985
...	...	...	...	...	...	...	...	...
chr1:895360-895488	0.443278	0.000000	0.000000	0.000000	0.000000	0.000000	0.032891	0.000000
chr1:895488-895616	0.315710	0.068977	0.090206	0.066755	0.133631	0.149681	0.155440	0.069176
chr1:895616-895744	0.274128	0.108964	0.108921	0.084879	0.129972	0.195359	0.248707	0.153418
chr1:895744-895872	0.264284	0.061002	0.061676	0.048003	0.089759	0.144322	0.100673	0.094833
chr1:895872-896000	1.156838	0.501072	1.083211	0.315828	2.169565	2.090127	1.949080	1.807102

EpiGePT tutorial¶

1 Initialization¶

Requirements¶

2 Load pretrained model¶

3 Predict¶

3.1 Prepare input: motif score for transcription factor binding and gene expression¶

Prepare tf expression¶

Prepare tf motif score¶

3.2 Prepare input: genomic region with 128kbp length¶

3.3 Model prediction¶

4 Save¶