import theano
import numpy as np
import sys
import pandas as pd
import scipy
from scipy.stats import spearmanr
%matplotlib inline
import matplotlib.pyplot as plt
我们将介绍加载模型和预测突变影响的基本函数。
下载预训练参数。
请首先使用 download_pretrained.sh 脚本下载预训练参数。
加载模型。
sys.path.insert(0, "../DeepSequence")
import model
import helper
import train
突变影响预测。
突变影响预测的辅助函数始终以比对中的焦点序列为参照。我们可以逐个请求对突变效应进行预测。
为了获得可靠的突变效应预测结果,我们建议从模型中抽取 500–2000 个 Monte Carlo 样本(通过 N_pred_iterations 参数设置)。
我们可以预测单个、双重、三重突变等的影响。突变以元组列表的形式组织,其中元组为(Uniprot位置,野生型氨基酸,突变氨基酸)。
PABP
首先让我们加载一个模型。我们不需要在这里计算序列权重,因为我们不是在训练模型,而且在 CPU 上进行这项计算可能会很慢。
在 "Explore model parameters.ipynb" 笔记本中,helper.py 代码被修改以预先指定 DataHelper 类使用的数据集。然而,我们可以传入一个比对名称和一些额外参数,这样就不必修改 helper.py 文件。
# Pass the PABP alignment path to the data helper explicitly, so helper.py
# does not have to be edited to pre-specify the dataset.
data_params = {
    "alignment_file": "datasets/PABP_YEAST_hmmerbit_plmc_n5_m30_f50_t0.2_r115-210_id100_b48.a2m",
}

# calc_weights=False: sequence weights are not computed, since the model is
# only used for prediction here and is not being trained.
pabp_data_helper = helper.DataHelper(
    alignment_file=data_params["alignment_file"],
    working_dir=".",
    calc_weights=False,
)
# Hyperparameters shared by every VariationalAutoencoder built in this
# notebook; each entry is forwarded to a constructor keyword argument.
model_params = dict(
    # Mini-batch size.
    batch_size=100,
    # Entries combined into the encoder / decoder architecture lists.
    encode_dim_zero=1500,
    encode_dim_one=1500,
    decode_dim_zero=100,
    decode_dim_one=500,
    # Forwarded as n_patterns / n_latent.
    n_patterns=4,
    n_latent=30,
    # Forwarded as logit_p / sparsity.
    logit_p=0.001,
    sparsity="logit",
    # Nonlinearity names for the encoder, decoder and final decoder layer.
    encode_nonlin="relu",
    decode_nonlin="relu",
    final_decode_nonlin="sigmoid",
    # Output-layer switches.
    output_bias=True,
    final_pwm_scale=True,
    # Forwarded as convolve_patterns / conv_decoder_size.
    conv_pat=True,
    d_c_size=40,
)
# Build the VAE for the PABP alignment from the shared hyperparameters.
pabp_vae_model = model.VariationalAutoencoder(
    pabp_data_helper,
    batch_size=model_params["batch_size"],
    # Layer sizes are passed as two-element lists.
    encoder_architecture=[
        model_params["encode_dim_zero"],
        model_params["encode_dim_one"],
    ],
    decoder_architecture=[
        model_params["decode_dim_zero"],
        model_params["decode_dim_one"],
    ],
    n_latent=model_params["n_latent"],
    n_patterns=model_params["n_patterns"],
    convolve_patterns=model_params["conv_pat"],
    conv_decoder_size=model_params["d_c_size"],
    logit_p=model_params["logit_p"],
    sparsity=model_params["sparsity"],
    encode_nonlinearity_type=model_params["encode_nonlin"],
    decode_nonlinearity_type=model_params["decode_nonlin"],
    final_decode_nonlinearity=model_params["final_decode_nonlin"],
    output_bias=model_params["output_bias"],
    final_pwm_scale=model_params["final_pwm_scale"],
    working_dir=".",
)

print ("Model built")
Encoding sequences
Neff = 151528.0
Data Shape = (151528, 82, 20)
Model built
从 'params' 文件夹中加载预训练模型的参数。
# Prefix handed to load_parameters to pick the pretrained PABP_YEAST
# parameters (per the accompanying note, these live in the 'params' folder).
file_prefix = "PABP_YEAST"
pabp_vae_model.load_parameters(file_prefix=file_prefix)
print ("Parameters loaded")
Parameters loaded
# Single mutant G126A. Each tuple is (Uniprot position, wild-type residue,
# mutant residue); N_pred_iterations sets the number of samples drawn.
print (pabp_data_helper.delta_elbo(pabp_vae_model,[(126,"G","A")], N_pred_iterations=500))
-2.03463650668
# Double mutant G126A + I137P, passed as one list of
# (position, wild-type residue, mutant residue) tuples.
print (pabp_data_helper.delta_elbo(pabp_vae_model,[(126,"G","A"), (137,"I","P")], N_pred_iterations=500))
-10.8308351474
# Triple mutant G126A + I137P + S155A, passed as one list of
# (position, wild-type residue, mutant residue) tuples.
print (pabp_data_helper.delta_elbo(pabp_vae_model,[(126,"G","A"), (137,"I","P"), (155,"S","A")], N_pred_iterations=500))
-16.058655309
我们可以预测所有单个突变的影响。优选使用此函数及以下函数,因为它们能够利用对突变数据进行小批量处理所带来的加速优势。
# Score every single mutant in one call; the helper returns the mutant names
# and their predicted values as two parallel sequences.
(pabp_full_matr_mutant_name_list,
 pabp_full_matr_delta_elbos) = pabp_data_helper.single_mutant_matrix(
    pabp_vae_model,
    N_pred_iterations=500,
)

# Show the first (name, prediction) pair as a quick sanity check.
print (pabp_full_matr_mutant_name_list[0], pabp_full_matr_delta_elbos[0])
('K123A', 0.5887526915685584)
我们还可以以批处理模式从文件中预测突变的影响。
# Score the mutants listed in a CSV file in batch mode; names and
# predictions come back as two parallel sequences.
(pabp_custom_matr_mutant_name_list,
 pabp_custom_matr_delta_elbos) = pabp_data_helper.custom_mutant_matrix(
    "mutations/PABP_YEAST_Fields2013-singles.csv",
    pabp_vae_model,
    N_pred_iterations=500,
)

# Show one (name, prediction) pair as a quick sanity check.
print (pabp_custom_matr_mutant_name_list[12], pabp_custom_matr_delta_elbos[12])
('N127D', -6.426795215037501)
我们也可以编写一个快速的函数来从一个突变文件计算 Spearman 系数(rho)。
def generate_spearmanr(mutant_name_list, delta_elbo_list, mutation_filename, phenotype_name):
    """Print the Spearman correlation between predictions and measurements.

    Reads the comma-separated file ``mutation_filename``, takes each row's
    ``mutant`` name and its value in the ``phenotype_name`` column, and
    correlates those measurements with the matching predictions.

    Rows are ignored when the mutant has no prediction or when its
    measurement is NaN. Among the remaining rows, those named ``wt``/``WT``
    and single mutants whose name starts and ends with the same character
    (e.g. a synonymous codon change) are pooled as wild-type measurements;
    if any exist, their mean is added as one extra data point paired with a
    prediction of 0.0.

    Args:
        mutant_name_list: mutant names, aligned index-for-index with
            ``delta_elbo_list``.
        delta_elbo_list: predicted value for each mutant name.
        mutation_filename: path to a CSV file containing a ``mutant`` column.
        phenotype_name: name of the CSV column holding the measurements.

    Returns:
        None. The result is printed as
        ``N: <count>, Spearmanr: <rho>, p-val: <p>``.
    """
    measurement_df = pd.read_csv(mutation_filename, sep=',')
    mutant_list = measurement_df.mutant.tolist()
    expr_values_ref_list = measurement_df[phenotype_name].tolist()

    mutant_name_to_pred = dict(zip(mutant_name_list, delta_elbo_list))

    wt_list = []
    preds_for_spearmanr = []
    measurements_for_spearmanr = []

    for mutant_name, expr_val in zip(mutant_list, expr_values_ref_list):
        # Only rows we have made a prediction for can be compared.
        if mutant_name not in mutant_name_to_pred:
            continue

        # No measurement for that mutant: pass over it.
        if np.isnan(expr_val):
            continue

        # Multi-mutants are written as colon-separated single mutants.
        is_single_mutant = ':' not in mutant_name

        # A codon change that keeps the residue is averaged into the wt value.
        if mutant_name[0] == mutant_name[-1] and is_single_mutant:
            wt_list.append(expr_val)
        # Rows explicitly labelled as the wt sequence are averaged as well.
        elif mutant_name in ('wt', 'WT'):
            wt_list.append(expr_val)
        else:
            measurements_for_spearmanr.append(expr_val)
            preds_for_spearmanr.append(mutant_name_to_pred[mutant_name])

    if wt_list:
        # The original referenced an undefined ``average_wt_list`` here and
        # raised NameError; the pooled wild-type values live in ``wt_list``.
        measurements_for_spearmanr.append(np.mean(wt_list))
        # The wild type is paired with a predicted change of zero.
        preds_for_spearmanr.append(0.0)

    num_data = len(measurements_for_spearmanr)
    spearman_r, spearman_pval = spearmanr(measurements_for_spearmanr, preds_for_spearmanr)
    print ("N: "+str(num_data)+", Spearmanr: "+str(spearman_r)+", p-val: "+str(spearman_pval))
# Compare the PABP batch predictions with the "log" measurement column of
# the same mutation file.
generate_spearmanr(
    pabp_custom_matr_mutant_name_list,
    pabp_custom_matr_delta_elbos,
    "mutations/PABP_YEAST_Fields2013-singles.csv",
    "log",
)
N: 1188, Spearmanr: 0.6509305755221257, p-val: 4.0800344026520655e-144
PDZ
# Data helper for the PDZ (DLG4_RAT) alignment; sequence weights are not
# computed (calc_weights=False).
data_params = {
    "alignment_file": "datasets/DLG4_RAT_hmmerbit_plmc_n5_m30_f50_t0.2_r300-400_id100_b50.a2m",
}

pdz_data_helper = helper.DataHelper(
    alignment_file=data_params["alignment_file"],
    working_dir=".",
    calc_weights=False,
)

# Same hyperparameters as before, applied to the PDZ data helper.
pdz_vae_model = model.VariationalAutoencoder(
    pdz_data_helper,
    batch_size=model_params["batch_size"],
    encoder_architecture=[
        model_params["encode_dim_zero"],
        model_params["encode_dim_one"],
    ],
    decoder_architecture=[
        model_params["decode_dim_zero"],
        model_params["decode_dim_one"],
    ],
    n_latent=model_params["n_latent"],
    n_patterns=model_params["n_patterns"],
    convolve_patterns=model_params["conv_pat"],
    conv_decoder_size=model_params["d_c_size"],
    logit_p=model_params["logit_p"],
    sparsity=model_params["sparsity"],
    encode_nonlinearity_type=model_params["encode_nonlin"],
    decode_nonlinearity_type=model_params["decode_nonlin"],
    final_decode_nonlinearity=model_params["final_decode_nonlin"],
    output_bias=model_params["output_bias"],
    final_pwm_scale=model_params["final_pwm_scale"],
    working_dir=".",
)

print ("Model built")

# Load the parameters stored under the DLG4_RAT prefix.
file_prefix = "DLG4_RAT"
pdz_vae_model.load_parameters(file_prefix=file_prefix)

print ("Parameters loaded\n\n")
# Batch-score the mutants listed in the PDZ mutation file.
(pdz_custom_matr_mutant_name_list,
 pdz_custom_matr_delta_elbos) = pdz_data_helper.custom_mutant_matrix(
    "mutations/DLG4_RAT_Ranganathan2012.csv",
    pdz_vae_model,
    N_pred_iterations=500,
)

# Correlate the predictions with the "CRIPT" measurement column.
generate_spearmanr(
    pdz_custom_matr_mutant_name_list,
    pdz_custom_matr_delta_elbos,
    "mutations/DLG4_RAT_Ranganathan2012.csv",
    "CRIPT",
)
Encoding sequences
Neff = 102246.0
Data Shape = (102246, 84, 20)
Model built
Parameters loaded
N: 1577, Spearmanr: 0.6199244929585085, p-val: 4.31636475994128e-168
β-lactamase
对于包含更多待预测突变的较大蛋白质,运行时间可能会更长。针对这种情况,我们建议使用支持 GPU 的计算。
# Here the data helper is selected by dataset name rather than by an
# alignment file path; sequence weights are not computed.
data_params = {
    "dataset": "BLAT_ECOLX",
}

blat_data_helper = helper.DataHelper(
    dataset=data_params["dataset"],
    working_dir=".",
    calc_weights=False,
)

# Same hyperparameters as before, applied to the BLAT data helper.
blat_vae_model = model.VariationalAutoencoder(
    blat_data_helper,
    batch_size=model_params["batch_size"],
    encoder_architecture=[
        model_params["encode_dim_zero"],
        model_params["encode_dim_one"],
    ],
    decoder_architecture=[
        model_params["decode_dim_zero"],
        model_params["decode_dim_one"],
    ],
    n_latent=model_params["n_latent"],
    n_patterns=model_params["n_patterns"],
    convolve_patterns=model_params["conv_pat"],
    conv_decoder_size=model_params["d_c_size"],
    logit_p=model_params["logit_p"],
    sparsity=model_params["sparsity"],
    encode_nonlinearity_type=model_params["encode_nonlin"],
    decode_nonlinearity_type=model_params["decode_nonlin"],
    final_decode_nonlinearity=model_params["final_decode_nonlin"],
    output_bias=model_params["output_bias"],
    final_pwm_scale=model_params["final_pwm_scale"],
    working_dir=".",
)

print ("Model built")

# Load the parameters stored under the BLAT_ECOLX prefix.
file_prefix = "BLAT_ECOLX"
blat_vae_model.load_parameters(file_prefix=file_prefix)

print ("Parameters loaded\n\n")
# Batch-score the mutants listed in the BLAT mutation file.
(blat_custom_matr_mutant_name_list,
 blat_custom_matr_delta_elbos) = blat_data_helper.custom_mutant_matrix(
    "mutations/BLAT_ECOLX_Ranganathan2015.csv",
    blat_vae_model,
    N_pred_iterations=500,
)

# Correlate the predictions with the "2500" measurement column.
generate_spearmanr(
    blat_custom_matr_mutant_name_list,
    blat_custom_matr_delta_elbos,
    "mutations/BLAT_ECOLX_Ranganathan2015.csv",
    "2500",
)
Encoding sequences
Neff = 8355.0
Data Shape = (8355, 253, 20)
Model built
Parameters loaded
N: 4807, Spearmanr: 0.743886370415797, p-val: 0.0