From 43e7f837daefd696ec30764ed89a3f2957cbe570 Mon Sep 17 00:00:00 2001 From: Priyatham Sai Chand Date: Thu, 8 Apr 2021 15:54:12 +0530 Subject: [PATCH 1/4] change python version in travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 85b97af..a3c5fbf 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,6 +1,6 @@ language: python python: - - "3.5" + - "3.6" install: - pip install -r requirements.txt From 3f885458a4e9d7ccb9015f93e3b579e6f337e47e Mon Sep 17 00:00:00 2001 From: Priyatham Sai Chand Date: Thu, 8 Apr 2021 21:11:16 +0530 Subject: [PATCH 2/4] fix flank8 errors --- access_data_rest.py | 165 +++++++++++---------- finalize_dataset.py | 2 +- ftpg.py | 69 ++++----- model.py | 255 +++++++++++++++++--------------- pfam_folder_pred.py | 86 +++++------ pfam_matrix.py | 180 ++++++++++++----------- pfam_parser.py | 55 +++---- prediction.py | 3 +- prediction_pfam.py | 243 ++++++------------------------- prepare_synteny_matrix.py | 128 +++++++++-------- process_data.py | 24 ++-- process_negative.py | 82 +++++------ read_data.py | 3 +- read_get_gene_seq.py | 20 +-- save_data.py | 9 +- threads.py | 9 +- train.py | 295 +++++++++++++++++++++----------------- 17 files changed, 800 insertions(+), 828 deletions(-) diff --git a/access_data_rest.py b/access_data_rest.py index 5e3b499..464ccee 100644 --- a/access_data_rest.py +++ b/access_data_rest.py @@ -3,152 +3,163 @@ import progressbar import sys -def update_protein(gene_seq,gene): - t=0 - while(t!=2): + +def update_protein(gene_seq, gene): + t = 0 + while(t != 2): try: server = "https://rest.ensembl.org" - ext = "/sequence/id/"+str(gene)+"?type=protein;multiple_sequences=1" + ext = "/sequence/id/" + \ + str(gene)+"?type=protein;multiple_sequences=1" - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) + r = requests.get( + server+ext, headers={"Content-Type": "application/json"}) if not r.ok: r.raise_for_status() sys.exit() - r=r.json() - if len(r)==1: - r=dict(r[0]) - gene_seq[gene]=str(r["seq"]) + r = r.json() + if len(r) == 1: + r = dict(r[0]) + gene_seq[gene] = str(r["seq"]) return else: - maxi=0 - maxlen=0 + maxi = 0 + maxlen = 0 for i in range(len(r)): - m=r[i] - m=dict(m) - if len(m["seq"])>maxlen: - maxi=i - r=dict(r[maxi]) - gene_seq[gene]=str(r["seq"]) + m = r[i] + m = dict(m) + if len(m["seq"]) > maxlen: + maxi = i + r = dict(r[maxi]) + gene_seq[gene] = str(r["seq"]) return - except : - t+=1 - #print("\nError:",e) + except Exception: + t += 1 + # print("\nError:",e) continue - gene_seq[gene]="" + gene_seq[gene] = "" + -def update_rest_protein(data,fname): - gids={} - with open("processed/not_found_"+fname+".json","r") as file: - gids=dict(json.load(file)) +def update_rest_protein(data, fname): + gids = {} + with open("processed/not_found_"+fname+".json", "r") as file: + gids = dict(json.load(file)) - gids=list(gids.keys()) + gids = list(gids.keys()) - geneseq={} + geneseq = {} server = "https://rest.ensembl.org" ext = "/sequence/id?type=protein" - headers={ "Content-Type" : "application/json", "Accept" : "application/json"} + headers = {"Content-Type": "application/json", + "Accept": "application/json"} - for i in progressbar.progressbar(range(0,len(gids)-50,50)): - ids=dict(ids=list(gids[i:i+50])) + for i in progressbar.progressbar(range(0, len(gids)-50, 50)): + ids = dict(ids=list(gids[i:i+50])) while(1): try: - r = requests.post(server+ext, headers=headers, data=str(json.dumps(ids))) + r = requests.post(server+ext, headers=headers, + data=str(json.dumps(ids))) if not r.ok: r.raise_for_status() - gs=r.json() - tgs={} + gs = r.json() + tgs = {} for g in gs: - tgs[g["query"]]=g["seq"] + tgs[g["query"]] = g["seq"] geneseq.update(tgs) break except Exception as e: - print("Error:",e) + print("Error:", e) continue data.update(geneseq) for genes in gids: try: - _=data[genes] - except: + _ = data[genes] + except Exception: print(genes) - update_protein(data,genes) + update_protein(data, genes) print("Protein Sequences Updated Successfully") return data - -def update(gene_seq,gene): - t=0 - while(t!=2): + + +def update(gene_seq, gene): + t = 0 + while(t != 2): try: server = "https://rest.ensembl.org" ext = "/sequence/id/"+str(gene)+"?type=cds;multiple_sequences=1" - r = requests.get(server+ext, headers={ "Content-Type" : "application/json"}) + r = requests.get( + server+ext, headers={"Content-Type": "application/json"}) if not r.ok: r.raise_for_status() sys.exit() - r=r.json() - if len(r)==1: - r=dict(r[0]) - gene_seq[gene]=str(r["seq"]) + r = r.json() + if len(r) == 1: + r = dict(r[0]) + gene_seq[gene] = str(r["seq"]) return else: - maxi=0 - maxlen=0 + maxi = 0 + maxlen = 0 for i in range(len(r)): - m=r[i] - m=dict(m) - if len(m["seq"])>maxlen: - maxi=i - r=dict(r[maxi]) - gene_seq[gene]=str(r["seq"]) + m = r[i] + m = dict(m) + if len(m["seq"]) > maxlen: + maxi = i + r = dict(r[maxi]) + gene_seq[gene] = str(r["seq"]) return - except Exception as e: - t+=1 - #print("\nError:",e) + except Exception: + t += 1 + # print("\nError:",e) continue - gene_seq[gene]="" + gene_seq[gene] = "" + -def update_rest(data,fname): - gids={} - with open("processed/not_found_"+fname+".json","r") as file: - gids=dict(json.load(file)) +def update_rest(data, fname): + gids = {} + with open("processed/not_found_"+fname+".json", "r") as file: + gids = dict(json.load(file)) - gids=list(gids.keys()) + gids = list(gids.keys()) - geneseq={} + geneseq = {} server = "https://rest.ensembl.org" ext = "/sequence/id?type=cds" - headers={ "Content-Type" : "application/json", "Accept" : "application/json"} + headers = {"Content-Type": "application/json", + "Accept": "application/json"} - for i in progressbar.progressbar(range(0,len(gids)-50,50)): - ids=dict(ids=list(gids[i:i+50])) + for i in progressbar.progressbar(range(0, len(gids)-50, 50)): + ids = dict(ids=list(gids[i:i+50])) while(1): try: - r = requests.post(server+ext, headers=headers, data=str(json.dumps(ids))) + r = requests.post(server+ext, headers=headers, + data=str(json.dumps(ids))) if not r.ok: r.raise_for_status() - gs=r.json() - tgs={} + gs = r.json() + tgs = {} for g in gs: - tgs[g["query"]]=g["seq"] + tgs[g["query"]] = g["seq"] geneseq.update(tgs) break except Exception as e: - print("Error:",e) + print("Error:", e) continue data.update(geneseq) for genes in gids: try: - _=data[genes] - except: + _ = data[genes] + except Exception: print(genes) - update(data,genes) + update(data, genes) print("Gene Sequences Updated Successfully") - return data \ No newline at end of file + return data diff --git a/finalize_dataset.py b/finalize_dataset.py index b9ad598..36b677c 100644 --- a/finalize_dataset.py +++ b/finalize_dataset.py @@ -60,7 +60,7 @@ def prepare_features(a_h, d_h, sptree, label): branch_length_species, \ branch_length_homology_species, \ distance, dist_p_s, dist_p_hs = create_tree_data( - sptree, df) + sptree, df) assert(len(branch_length_species) == len(df)) assert(len(sml) == len(distance)) diff --git a/ftpg.py b/ftpg.py index 0b51c2c..dd04471 100644 --- a/ftpg.py +++ b/ftpg.py @@ -3,30 +3,31 @@ import os import sys import urllib.request as urllib -import requests -def download_data(x,dir_name): - fname=x.split("/")[-1] - path=os.path.join(dir_name,fname) - urllib.urlretrieve(x,path) -def get_data_file(file,dir): +def download_data(x, dir_name): + fname = x.split("/")[-1] + path = os.path.join(dir_name, fname) + urllib.urlretrieve(x, path) + + +def get_data_file(file, dir): if not os.path.isfile(file): print("The specified file does not exist!!!") sys.exit(1) - with open(file,"r")as f: - lf=f.read().splitlines() + with open(file, "r")as f: + lf = f.read().splitlines() if not os.path.exists(dir): os.mkdir(dir) for x in progressbar.progressbar(lf): - download_data(x,dir) - + download_data(x, dir) -#This wil download all the fasta files for the coding sequences. To change the directory, change the argument in the get_data_file argument. -host ="ftp.ensembl.org" +# This wil download all the fasta files for the coding sequences. +# To change the directory, change the argument in the get_data_file argument. +host = "ftp.ensembl.org" user = "anonymous" password = "" @@ -34,50 +35,50 @@ def get_data_file(file,dir): ftp = FTP(host) ftp.login(user, password) print("Connected to {}".format(host)) -base_link="ftp://ftp.ensembl.org" -#find sequences of all the cds files -l=ftp.nlst("/pub/release-96/fasta") -lt=[] -for x in l: - y=ftp.nlst(x+"/cds") +base_link = "ftp://ftp.ensembl.org" +# find sequences of all the cds files +list_of_files = ftp.nlst("/pub/release-96/fasta") +lt = [] +for x in list_of_files: + y = ftp.nlst(x+"/cds") for z in y: if z.endswith(".cds.all.fa.gz"): lt.append(z) -with open("seq_link.txt","w") as file: +with open("seq_link.txt", "w") as file: for x in lt: file.write(base_link+x) file.write("\n") -#find all the files with protein sequences -l=ftp.nlst("/pub/release-96/fasta") -lt=[] -for x in progressbar.progressbar(l): - y=ftp.nlst(x+"/pep") +# find all the files with protein sequences +list_of_files = ftp.nlst("/pub/release-96/fasta") +lt = [] +for x in progressbar.progressbar(list_of_files): + y = ftp.nlst(x+"/pep") for z in y: if z.endswith(".pep.all.fa.gz"): lt.append(z) -with open("protein_seq.txt","w") as file: +with open("protein_seq.txt", "w") as file: for x in lt: file.write(base_link+x) file.write("\n") -#get link of all the gtf files -l=ftp.nlst("/pub/release-96/gtf") -lt=[] -for x in l: - y=ftp.nlst(x) +# get link of all the gtf files +list_of_files = ftp.nlst("/pub/release-96/gtf") +lt = [] +for x in list_of_files: + y = ftp.nlst(x) for z in y: if z.endswith(".96.gtf.gz"): lt.append(z) -with open("gtf_link.txt","w") as file: +with open("gtf_link.txt", "w") as file: for x in lt: file.write(base_link+x) file.write("\n") print("Downloading Data") -get_data_file("gtf_link.txt","data") -get_data_file("seq_link.txt","geneseq") -get_data_file("protein_seq.txt","pro_seq") +get_data_file("gtf_link.txt", "data") +get_data_file("seq_link.txt", "geneseq") +get_data_file("protein_seq.txt", "pro_seq") print("Download Complete.................") diff --git a/model.py b/model.py index 08c0f8d..384ae76 100644 --- a/model.py +++ b/model.py @@ -1,132 +1,161 @@ -dim=10 -n=3 -fl=2*n+1 -maxbl=29 - import tensorflow as tf +dim = 10 +n = 3 +fl = 2*n+1 +maxbl = 29 + def create_model(): tf.reset_default_graph() - g=tf.Graph() + g = tf.Graph() with g.as_default(): - synmg=tf.placeholder(dtype=tf.float64,shape=(None,2*n+1,2*n+1,2),name="Synteny_matrix_placeholder_Global") - synml=tf.placeholder(dtype=tf.float64,shape=(None,2*n+1,2*n+1,2),name="Synteny_matrix_placeholder_Local") - pfam=tf.placeholder(dtype=tf.float64,shape=(None,2*n+1,2*n+1,1),name="Pfam_matrix_placeholder") - bls=tf.placeholder(dtype=tf.float64,shape=(None,maxbl),name="Species_Branch_Length_Placeholder") - blhs=tf.placeholder(dtype=tf.float64,shape=(None,maxbl),name="Homology_Species_Branch_Length_Placeholder") - #gl=tf.placeholder(dtype=tf.float64,shape=(None,1),name="Mean_gene_length") - dps=tf.placeholder(dtype=tf.float64,shape=(None,1),name="mca_species_distance") - dphs=tf.placeholder(dtype=tf.float64,shape=(None,1),name="mca_homology_species_distance") - dis=tf.placeholder(dtype=tf.float64,shape=(None,1),name="total_distance") - lr=tf.placeholder(dtype=tf.float64,shape=(),name="learning_rate") - y=tf.placeholder(dtype=tf.int32,shape=(None),name="labels") - - lrs=tf.summary.scalar("Learning_Rate",lr) - - x=tf.concat([dps,dps-dphs,dis],1,name="Create_train_vector") - - print(synmg,"\n",synml,"\n",bls,"\n",blhs,"\n",dps,"\n",dphs,"\n",dis,"\n",x) - - reg_l2=tf.contrib.layers.l2_regularizer(0.001) - reg_l1 = tf.contrib.layers.l1_regularizer(scale=0.005, scope=None) - - def get_variable_by_shape(shape,name): - f=tf.get_variable(name,shape=shape,initializer=tf.glorot_uniform_initializer(),dtype=tf.float64,regularizer=reg_l2) + synmg = tf.placeholder(dtype=tf.float64, shape=( + None, 2*n+1, 2*n+1, 2), name="Synteny_matrix_placeholder_Global") + synml = tf.placeholder(dtype=tf.float64, shape=( + None, 2*n+1, 2*n+1, 2), name="Synteny_matrix_placeholder_Local") + pfam = tf.placeholder(dtype=tf.float64, shape=( + None, 2*n+1, 2*n+1, 1), name="Pfam_matrix_placeholder") + bls = tf.placeholder(dtype=tf.float64, shape=( + None, maxbl), name="Species_Branch_Length_Placeholder") + blhs = tf.placeholder(dtype=tf.float64, shape=( + None, maxbl), name="Homology_Species_Branch_Length_Placeholder") + # gl=tf.placeholder(dtype=tf.float64,shape=(None,1),name="Mean_gene_length") + dps = tf.placeholder(dtype=tf.float64, shape=( + None, 1), name="mca_species_distance") + dphs = tf.placeholder(dtype=tf.float64, shape=( + None, 1), name="mca_homology_species_distance") + dis = tf.placeholder(dtype=tf.float64, shape=( + None, 1), name="total_distance") + lr = tf.placeholder(dtype=tf.float64, shape=(), name="learning_rate") + y = tf.placeholder(dtype=tf.int32, shape=(None), name="labels") + + # lrs = tf.summary.scalar("Learning_Rate", lr) + + x = tf.concat([dps, dps-dphs, dis], 1, name="Create_train_vector") + + print(synmg, "\n", synml, "\n", bls, "\n", blhs, + "\n", dps, "\n", dphs, "\n", dis, "\n", x) + + reg_l2 = tf.contrib.layers.l2_regularizer(0.001) + # reg_l1 = tf.contrib.layers.l1_regularizer(scale=0.005, scope=None) + + def get_variable_by_shape(shape, name): + f = tf.get_variable(name, shape=shape, + initializer=tf.glorot_uniform_initializer(), + dtype=tf.float64, + regularizer=reg_l2) return f - def create_synteny_aligner(name,synm): - with tf.variable_scope(name+"Synteny_Aligner",reuse=tf.AUTO_REUSE): - - fconv=get_variable_by_shape((2,2,2,dim),"fconv") - conv=tf.nn.conv2d(synm,fconv,(1,1,1,1),padding="VALID",name="Conv_aligner") - - fconv_1=get_variable_by_shape((2,2,dim,dim*2),"fconv_1") - conv_1=tf.nn.conv2d(conv,fconv_1,(1,1,1,1),padding="VALID",name="Conv_aligner_1") - - fxconv=get_variable_by_shape((fl,2,dim*2),"fxconv") - x_conv=tf.reshape(synm,(-1,fl*fl,2)) - x_conv=tf.nn.conv1d(x_conv,fxconv,stride=fl,padding="SAME",name="row_aligner") - - y_conv=tf.reshape(tf.transpose(synm,(0,2,1,3)),(-1,fl*fl,2)) - fyconv=get_variable_by_shape((fl,2,dim*2),"fyconv") - y_conv=tf.nn.conv1d(y_conv,fyconv,stride=fl,padding="SAME",name="column_aligner") - - - wconv=get_variable_by_shape((fl,fl,2,dim*2*10),"wconv") - w_conv=tf.nn.conv2d(synm,wconv,(1,1,1,1),padding="VALID",name="Global_Aligner_1") - - conv_1=tf.reshape(conv_1,(-1,25,dim*2)) - x_conv=tf.reshape(x_conv,(-1,fl,dim*2)) - y_conv=tf.reshape(y_conv,(-1,fl,dim*2)) - w_conv=tf.reshape(w_conv,(-1,10,dim*2)) - conv_final=tf.concat([conv_1,x_conv,y_conv,w_conv],1,name="Concatenate_All_Alignments") + def create_synteny_aligner(name, synm): + with tf.variable_scope( + name+"Synteny_Aligner", reuse=tf.AUTO_REUSE): + + fconv = get_variable_by_shape((2, 2, 2, dim), "fconv") + conv = tf.nn.conv2d(synm, fconv, (1, 1, 1, 1), + padding="VALID", name="Conv_aligner") + + fconv_1 = get_variable_by_shape((2, 2, dim, dim*2), "fconv_1") + conv_1 = tf.nn.conv2d( + conv, fconv_1, (1, 1, 1, 1), padding="VALID", + name="Conv_aligner_1") + + fxconv = get_variable_by_shape((fl, 2, dim*2), "fxconv") + x_conv = tf.reshape(synm, (-1, fl*fl, 2)) + x_conv = tf.nn.conv1d( + x_conv, fxconv, stride=fl, padding="SAME", + name="row_aligner") + + y_conv = tf.reshape(tf.transpose( + synm, (0, 2, 1, 3)), (-1, fl*fl, 2)) + fyconv = get_variable_by_shape((fl, 2, dim*2), "fyconv") + y_conv = tf.nn.conv1d( + y_conv, fyconv, stride=fl, padding="SAME", + name="column_aligner") + + wconv = get_variable_by_shape((fl, fl, 2, dim*2*10), "wconv") + w_conv = tf.nn.conv2d( + synm, wconv, (1, 1, 1, 1), padding="VALID", + name="Global_Aligner_1") + + conv_1 = tf.reshape(conv_1, (-1, 25, dim*2)) + x_conv = tf.reshape(x_conv, (-1, fl, dim*2)) + y_conv = tf.reshape(y_conv, (-1, fl, dim*2)) + w_conv = tf.reshape(w_conv, (-1, 10, dim*2)) + conv_final = tf.concat( + [conv_1, x_conv, y_conv, w_conv], 1, + name="Concatenate_All_Alignments") return conv_final - with tf.variable_scope("Pfam",reuse=tf.AUTO_REUSE): - wconv_pfam=get_variable_by_shape((fl,fl,1,dim*2*10),"wconv_pfam") - w_conv_pfam=tf.nn.conv2d(pfam,wconv_pfam,(1,1,1,1),padding="VALID",name="Global_Aligner_pfam") - w_conv_pfam=tf.reshape(w_conv_pfam,(-1,10,dim*2)) - - conv_final_g=create_synteny_aligner("Global_",synmg) - conv_final_l=create_synteny_aligner("Local_",synml) - final=tf.concat([conv_final_g,conv_final_l,w_conv_pfam],1) - #final=conv_final_l - - with tf.variable_scope("Combine_Renormalize",reuse=tf.AUTO_REUSE): - bl=tf.concat([bls,blhs],1) - #bl=bls-blhs - theta_bl=get_variable_by_shape((maxbl*2,1),"theta_bl") - theta_bl=tf.matmul(bl,theta_bl) - x=tf.concat([x,theta_bl],1) - theta=get_variable_by_shape((4,108),"theta") - bias=get_variable_by_shape((1,108),"b") - theta_2=tf.matmul(x,theta)+bias - theta_2=tf.reshape(theta_2,(-1,108,1)) - theta_2=tf.tile(theta_2,[1,1,dim*2]) - final=final*theta_2 + with tf.variable_scope("Pfam", reuse=tf.AUTO_REUSE): + wconv_pfam = get_variable_by_shape( + (fl, fl, 1, dim*2*10), "wconv_pfam") + w_conv_pfam = tf.nn.conv2d( + pfam, wconv_pfam, (1, 1, 1, 1), padding="VALID", + name="Global_Aligner_pfam") + w_conv_pfam = tf.reshape(w_conv_pfam, (-1, 10, dim*2)) + + conv_final_g = create_synteny_aligner("Global_", synmg) + conv_final_l = create_synteny_aligner("Local_", synml) + final = tf.concat([conv_final_g, conv_final_l, w_conv_pfam], 1) + # final=conv_final_l + + with tf.variable_scope("Combine_Renormalize", reuse=tf.AUTO_REUSE): + bl = tf.concat([bls, blhs], 1) + # bl=bls-blhs + theta_bl = get_variable_by_shape((maxbl*2, 1), "theta_bl") + theta_bl = tf.matmul(bl, theta_bl) + x = tf.concat([x, theta_bl], 1) + theta = get_variable_by_shape((4, 108), "theta") + bias = get_variable_by_shape((1, 108), "b") + theta_2 = tf.matmul(x, theta)+bias + theta_2 = tf.reshape(theta_2, (-1, 108, 1)) + theta_2 = tf.tile(theta_2, [1, 1, dim*2]) + final = final*theta_2 print(final) - flat=tf.layers.flatten(final) + flat = tf.layers.flatten(final) - zero=tf.constant(0.0,dtype=tf.float64) - diff=dps-dphs + zero = tf.constant(0.0, dtype=tf.float64) + diff = dps-dphs print(diff) - diff_2=tf.cast(tf.equal(diff,zero),tf.float64) + diff_2 = tf.cast(tf.equal(diff, zero), tf.float64) print(diff_2) - diff=tf.tile(diff_2,[1,dim*10]) + diff = tf.tile(diff_2, [1, dim*10]) - flat=tf.concat([flat,diff],1) + flat = tf.concat([flat, diff], 1) print(flat) - #dense=tf.layers.dense(flat,2048,kernel_regularizer=reg_l2,bias_regularizer=reg_l2) - #dense_2=tf.layers.dense(dense,1024,kernel_regularizer=reg_l2,bias_regularizer=reg_l2) - dense_3=tf.layers.dense(flat,512,kernel_regularizer=reg_l2,bias_regularizer=reg_l2) - - logits_pred=tf.layers.dense(dense_3,3,name="Predictions") + # dense=tf.layers.dense(flat,2048,kernel_regularizer=reg_l2,bias_regularizer=reg_l2) + # dense_2=tf.layers.dense(dense,1024,kernel_regularizer=reg_l2,bias_regularizer=reg_l2) + dense_3 = tf.layers.dense( + flat, 512, kernel_regularizer=reg_l2, bias_regularizer=reg_l2) + + logits_pred = tf.layers.dense(dense_3, 3, name="Predictions") print(logits_pred) - entropy=tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits_pred,labels=y) + entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits_pred, labels=y) print(entropy) - - #weights = tf.trainable_variables() # all vars of your graph - #regl1 = tf.contrib.layers.apply_regularization(reg_l1, weights) + + # weights = tf.trainable_variables() # all vars of your graph + # regl1 = tf.contrib.layers.apply_regularization(reg_l1, weights) reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES) - reg_constant =0.00000001 - loss=tf.reduce_mean(entropy)+reg_constant * sum(reg_losses) - #loss=tf.reduce_mean(entropy) - optimizer=tf.train.RMSPropOptimizer(lr) - #optimizer=tf.train.AdamOptimizer() - losses=tf.summary.scalar("Loss",loss) - t_op=optimizer.minimize(loss) - acc=tf.math.in_top_k(tf.cast(logits_pred,tf.float32),y,1) - accuracy=tf.reduce_mean(tf.cast(acc,tf.float32)) - accs=tf.summary.scalar("Accuracy",accuracy) - summary=tf.summary.merge_all() - init=tf.global_variables_initializer() - saver=tf.train.Saver() - - for node in (synmg,synml,pfam,bls,blhs,dps,dphs,dis,lr,y): - g.add_to_collection("input_nodes",node) - - for node in (loss,t_op,accuracy,init,summary): - g.add_to_collection("output_nodes",node) - - return g,saver + reg_constant = 0.00000001 + loss = tf.reduce_mean(entropy)+reg_constant * sum(reg_losses) + # loss=tf.reduce_mean(entropy) + optimizer = tf.train.RMSPropOptimizer(lr) + # optimizer=tf.train.AdamOptimizer() + # losses = tf.summary.scalar("Loss", loss) + t_op = optimizer.minimize(loss) + acc = tf.math.in_top_k(tf.cast(logits_pred, tf.float32), y, 1) + accuracy = tf.reduce_mean(tf.cast(acc, tf.float32)) + # accs = tf.summary.scalar("Accuracy", accuracy) + summary = tf.summary.merge_all() + init = tf.global_variables_initializer() + saver = tf.train.Saver() + + for node in (synmg, synml, pfam, bls, blhs, dps, dphs, dis, lr, y): + g.add_to_collection("input_nodes", node) + + for node in (loss, t_op, accuracy, init, summary): + g.add_to_collection("output_nodes", node) + + return g, saver diff --git a/pfam_folder_pred.py b/pfam_folder_pred.py index a2302ca..219705e 100644 --- a/pfam_folder_pred.py +++ b/pfam_folder_pred.py @@ -1,62 +1,66 @@ -import json -import gc -import pandas as pd -import numpy as np -import pickle +import pandas as pd import sys import progressbar import os from neighbor_genes import read_genome_maps from process_data import create_data_homology_ls from read_get_gene_seq import read_gene_sequences -from access_data_rest import update_rest,update_rest_protein +from access_data_rest import update_rest_protein from prepare_synteny_matrix import write_fasta -from process_data import create_map_list - -def read_database(fname,dirname): - df=pd.read_csv(dirname+"/"+fname,sep="\t",header=None) - label_dict=dict(ortholog_one2one=1, - other_paralog=0, - non_homolog=2, - ortholog_one2many=1, - ortholog_many2many=1, - within_species_paralog=0, - gene_split=4) - label=[] - for _,row in df.iterrows(): + + +def read_database(fname, dirname): + df = pd.read_csv(dirname+"/"+fname, sep="\t", header=None) + label_dict = dict(ortholog_one2one=1, + other_paralog=0, + non_homolog=2, + ortholog_one2many=1, + ortholog_many2many=1, + within_species_paralog=0, + gene_split=4) + label = [] + for _, row in df.iterrows(): label.append(label_dict[row[7]]) - df=df.assign(label=label) - df=df.drop(7,axis=1) - df=df.drop(0,axis=1) - df.columns=["gene_stable_id","species","homology_gene_stable_id","homology_species","goc","wga","label"] + df = df.assign(label=label) + df = df.drop(7, axis=1) + df = df.drop(0, axis=1) + df.columns = ["gene_stable_id", "species", "homology_gene_stable_id", + "homology_species", "goc", "wga", "label"] return df + def read_prediction_file_folder(dir_name): - lf=os.listdir(dir_name) - a_h=[] - d_h=[] + lf = os.listdir(dir_name) + a_h = [] + d_h = [] for x in progressbar.progressbar(lf): - df=read_database(x,dir_name) + df = read_database(x, dir_name) a_h.append(df) d_h.append(x.split(".")[0]) - return a_h,d_h + return a_h, d_h + -def create_synteny_features(a_h,d_h,n,a,d,ld,ldg,cmap,cimap,name): - lsy=create_data_homology_ls(a_h,d_h,n,a,d,ld,ldg,cmap,cimap,0) - protein_sequences=read_gene_sequences(a_h,lsy,"pro_seq","prediction_"+name) - protein_sequences=update_rest_protein(protein_sequences,"prediction_"+name) - write_fasta(protein_sequences,"prediction_"+name) +def create_synteny_features(a_h, d_h, n, a, d, ld, ldg, cmap, cimap, name): + lsy = create_data_homology_ls(a_h, d_h, n, a, d, ld, ldg, cmap, cimap, 0) + protein_sequences = read_gene_sequences( + a_h, lsy, "pro_seq", "prediction_"+name) + protein_sequences = update_rest_protein( + protein_sequences, "prediction_"+name) + write_fasta(protein_sequences, "prediction_"+name) print("Protein Sequences Loaded") + def main(): - arg=sys.argv - dirname=arg[-1] - a_h,d_h=read_prediction_file_folder(dirname) - n=3 + arg = sys.argv + dirname = arg[-1] + a_h, d_h = read_prediction_file_folder(dirname) + n = 3 - a,d,ld,ldg,cmap,cimap=read_genome_maps()#read the genome maps + a, d, ld, ldg, cmap, cimap = read_genome_maps() # read the genome maps print("Genome Maps Loaded.") - create_synteny_features(a_h,d_h,n,a,d,ld,ldg,cmap,cimap,dirname) -if __name__=="__main__": - main() \ No newline at end of file + create_synteny_features(a_h, d_h, n, a, d, ld, ldg, cmap, cimap, dirname) + + +if __name__ == "__main__": + main() diff --git a/pfam_matrix.py b/pfam_matrix.py index 3bcc0dc..1483fad 100644 --- a/pfam_matrix.py +++ b/pfam_matrix.py @@ -1,137 +1,151 @@ -import pandas as pd -import numpy as np -import progressbar +import pandas as pd +import numpy as np +import progressbar import os import json import sys -from prepare_synteny_matrix import read_data_homology,load_neighbor_genes +from prepare_synteny_matrix import read_data_homology, load_neighbor_genes from process_data import create_map_list from process_negative import read_database_txt -def get_score_overlap(x,y,pfam_db,pfam_map): - df_1=pfam_db.loc[pfam_map[x]] - df_2=pfam_db.loc[pfam_map[y]] - l=list(df_2.domain) - c=0 - c_1=0 - for _,row in df_1.iterrows(): - if row.domain in l:#check if the domain exists in the list - c_1+=1 - st=int(df_2[df_2["domain"]==row.domain].hmm_from)#get the start - end=int(df_2[df_2["domain"]==row.domain].hmm_to)#get the end - if (int(row.hmm_from)>st and int(row.hmm_from)st and int(row.hmm_from) st and int(row.hmm_from) < end) or (int( + row.hmm_to) > st and int(row.hmm_from) < end): + c += 1 + return c/max(len(df_1), len(df_2)) + + +def pfam_matrix(g1, g2, n, pfam_db, gmap, pfam_map): + pm = np.zeros((n, n)) for i in range(n): - if g1[i]=="NULL_GENE": + if g1[i] == "NULL_GENE": continue try: - _=gmap[g1[i]] - except: + _ = gmap[g1[i]] + except Exception: continue for j in range(n): - if g2[j]=="NULL_GENE": + if g2[j] == "NULL_GENE": continue try: - _=gmap[g2[j]] - except: + _ = gmap[g2[j]] + except Exception: continue - pm[i][j]=get_score_overlap(g1[i],g2[j],pfam_db,pfam_map) + pm[i][j] = get_score_overlap(g1[i], g2[j], pfam_db, pfam_map) return pm -def create_pfam_matrix(df,lsy,pfam_db,pfam_map): - n=3 - glist=list(pfam_db.gene_stable_id) - gmap=create_map_list(glist) - pg=[] - indexes=[] - for index,row in progressbar.progressbar(df.iterrows()): - g1=str(row["gene_stable_id"]) - g2=str(row["homology_gene_stable_id"]) - x=[] - y=[] + +def create_pfam_matrix(df, lsy, pfam_db, pfam_map): + n = 3 + glist = list(pfam_db.gene_stable_id) + gmap = create_map_list(glist) + pg = [] + indexes = [] + for index, row in progressbar.progressbar(df.iterrows()): + g1 = str(row["gene_stable_id"]) + g2 = str(row["homology_gene_stable_id"]) + x = [] + y = [] try: - _=lsy[g1] - _=lsy[g2] - except: + _ = lsy[g1] + _ = lsy[g2] + except Exception: continue - for i in range(len(lsy[g1]['b'])-1,-1,-1): + for i in range(len(lsy[g1]['b'])-1, -1, -1): x.append(lsy[g1]['b'][i]) x.append(g1) for k in lsy[g1]['f']: x.append(k) - for i in range(len(lsy[g2]['b'])-1,-1,-1): + for i in range(len(lsy[g2]['b'])-1, -1, -1): y.append(lsy[g2]['b'][i]) y.append(g2) for k in lsy[g2]['f']: y.append(k) - - assert(len(x)==len(y)) - assert(len(x)==(2*n+1)) - pmtemp=pfam_matrix(x,y,2*n+1,pfam_db,gmap,pfam_map) + + assert(len(x) == len(y)) + assert(len(x) == (2*n+1)) + pmtemp = pfam_matrix(x, y, 2*n+1, pfam_db, gmap, pfam_map) pg.append(pmtemp) indexes.append(index) - return np.array(pg),np.array(indexes) + return np.array(pg), np.array(indexes) + def create_pfam_map(pfam_db): - pfam_map={} - for index,row in progressbar.progressbar(pfam_db.iterrows()): + pfam_map = {} + for index, row in progressbar.progressbar(pfam_db.iterrows()): try: - _=pfam_map[row.gene_stable_id] - except: - pfam_map[row.gene_stable_id]=[] + _ = pfam_map[row.gene_stable_id] + except Exception: + pfam_map[row.gene_stable_id] = [] pfam_map[row.gene_stable_id].append(index) return pfam_map + def main_positive(): if not os.path.isdir("processed/pfam_matrices"): os.mkdir("processed/pfam_matrices") - a_h,d_h=read_data_homology("data_homology") - lsy=load_neighbor_genes() - pfam_db=pd.read_hdf("pfam_db_positive.h5") - pfam_map=create_pfam_map(pfam_db) - ndir="processed/pfam_matrices/" - nf1="pfam_matrices" - nf3="pfam_indexes" + a_h, d_h = read_data_homology("data_homology") + lsy = load_neighbor_genes() + pfam_db = pd.read_hdf("pfam_db_positive.h5") + pfam_map = create_pfam_map(pfam_db) + ndir = "processed/pfam_matrices/" + nf1 = "pfam_matrices" + nf3 = "pfam_indexes" for i in range(len(a_h)): - df=a_h[i] + df = a_h[i] print(len(df)) - pfam_matrices,indexes=create_pfam_matrix(df,lsy,pfam_db,pfam_map) - np.save(ndir+str(d_h[i])+"_"+nf1,pfam_matrices) - np.save(ndir+str(d_h[i])+"_"+nf3,indexes) + pfam_matrices, indexes = create_pfam_matrix(df, lsy, pfam_db, pfam_map) + np.save(ndir+str(d_h[i])+"_"+nf1, pfam_matrices) + np.save(ndir+str(d_h[i])+"_"+nf3, indexes) print(len(indexes)) + def read_data_negative(arg): - df=read_database_txt(arg[-1]) - name=arg[-1].split(".")[0] - ind=np.load("processed/synteny_matrices/"+name+"_indexes.npy") - df=df.loc[ind] - pfam_db=pd.read_hdf("pfam_db_negative.h5") - pfam_map=create_pfam_map(pfam_db) - with open("processed/neighbor_genes_negative.json","r") as file: - lsy=dict(json.load(file)) - return df,pfam_db,pfam_map,lsy,name + df = read_database_txt(arg[-1]) + name = arg[-1].split(".")[0] + ind = np.load("processed/synteny_matrices/"+name+"_indexes.npy") + df = df.loc[ind] + pfam_db = pd.read_hdf("pfam_db_negative.h5") + pfam_map = create_pfam_map(pfam_db) + with open("processed/neighbor_genes_negative.json", "r") as file: + lsy = dict(json.load(file)) + return df, pfam_db, pfam_map, lsy, name + def main_negative(arg): - df,pfam_db,pfam_map,lsy,name=read_data_negative(arg) - ndir="processed/pfam_matrices/" - nf1="pfam_matrices" - nf3="pfam_indexes" + df, pfam_db, pfam_map, lsy, name = read_data_negative(arg) + ndir = "processed/pfam_matrices/" + nf1 = "pfam_matrices" + nf3 = "pfam_indexes" print(len(df)) - pfam_matrices,indexes=create_pfam_matrix(df,lsy,pfam_db,pfam_map) - np.save(ndir+name+"_"+nf1,pfam_matrices) - np.save(ndir+name+"_"+nf3,indexes) + pfam_matrices, indexes = create_pfam_matrix(df, lsy, pfam_db, pfam_map) + np.save(ndir+name+"_"+nf1, pfam_matrices) + np.save(ndir+name+"_"+nf3, indexes) print(len(indexes)) + def main(): - arg=sys.argv + arg = sys.argv main_positive() main_negative(arg) -if __name__=="__main__": + +if __name__ == "__main__": main() diff --git a/pfam_parser.py b/pfam_parser.py index fbe7a41..235ca12 100644 --- a/pfam_parser.py +++ b/pfam_parser.py @@ -1,50 +1,53 @@ -import pandas as pd +import pandas as pd import progressbar import gc import sys + def pfam_parse(filename): - rlist=[] + rlist = [] try: with open(filename) as file: - file.seek(0,0) + file.seek(0, 0) for line in progressbar.progressbar(file): if line.startswith("#"): continue - temp_dict={} - x=[y for y in line.split(" ") if y !=''] - if x[9] !="1": + temp_dict = {} + x = [y for y in line.split(" ") if y != ''] + if x[9] != "1": continue - temp_dict["gene_stable_id"]=x[3] - temp_dict["accession"]=x[1] - temp_dict["tlen"]=x[2] - temp_dict["qlen"]=x[5] - temp_dict["domain"]=x[0] - temp_dict["hmm_from"]=x[15] - temp_dict["hmm_to"]=x[16] - temp_dict["ali_from"]=x[17] - temp_dict["ali_to"]=x[18] - temp_dict["env_from"]=x[19] - temp_dict["env_to"]=x[20] + temp_dict["gene_stable_id"] = x[3] + temp_dict["accession"] = x[1] + temp_dict["tlen"] = x[2] + temp_dict["qlen"] = x[5] + temp_dict["domain"] = x[0] + temp_dict["hmm_from"] = x[15] + temp_dict["hmm_to"] = x[16] + temp_dict["ali_from"] = x[17] + temp_dict["ali_to"] = x[18] + temp_dict["env_from"] = x[19] + temp_dict["env_to"] = x[20] rlist.append(temp_dict) except Exception as e: print(e) return pd.DataFrame() print(len(rlist)) - tdf=pd.DataFrame(rlist) + tdf = pd.DataFrame(rlist) return tdf + def main(): - arg=sys.argv - fname_1=arg[-2] - fname_2=arg[-1] - df=pfam_parse(fname_1) - df.to_hdf("pfam_db_positive.h5",key="pfam_db_positive",mode="w") - df="" + arg = sys.argv + fname_1 = arg[-2] + fname_2 = arg[-1] + df = pfam_parse(fname_1) + df.to_hdf("pfam_db_positive.h5", key="pfam_db_positive", mode="w") + df = "" gc.collect() - df=pfam_parse(fname_2) - df.to_hdf("pfam_db_negative.h5",key="pfam_db_negative",mode="w") + df = pfam_parse(fname_2) + df.to_hdf("pfam_db_negative.h5", key="pfam_db_negative", mode="w") print("Pfam Databases Written Successfully :)") + if __name__ == "__main__": main() diff --git a/prediction.py b/prediction.py index 1cab3a5..0c373dd 100644 --- a/prediction.py +++ b/prediction.py @@ -130,7 +130,8 @@ def write_preds(fname, model_name, name, preds, index_dict, df): "_" + name + "_multiple.txt") - with open("prediction_" + fname + "_" + model_name + "_" + name + "_multiple.txt", "w") as file: + with open("prediction_" + fname + "_" + model_name + "_" + name + + "_multiple.txt", "w") as file: for index, row in progressbar.progressbar(df.iterrows()): file.write(str(row[0])) file.write("\t") diff --git a/prediction_pfam.py b/prediction_pfam.py index df22c51..f37f386 100644 --- a/prediction_pfam.py +++ b/prediction_pfam.py @@ -1,211 +1,54 @@ -import json +import pandas as pd +import progressbar import gc -import pandas as pd -import numpy as np -import pickle -import tensorflow as tf import sys -import os -import progressbar -from neighbor_genes import read_genome_maps -from process_data import create_data_homology_ls -from read_get_gene_seq import read_gene_sequences -from access_data_rest import update_rest,update_rest_protein -from threads import Procerssrunner -from prepare_synteny_matrix import read_data_synteny -from tree_data import create_tree_data -from process_data import create_map_list -from pfam_parser import pfam_parse -from pfam_matrix import create_pfam_map,create_pfam_matrix -def read_database(fname): - df=pd.read_csv(fname,sep="\t",header=None) - label_dict=dict(ortholog_one2one=1, - other_paralog=0, - non_homolog=2, - ortholog_one2many=1, - ortholog_many2many=1, - within_species_paralog=0, - gene_split=4) - label=[] - for _,row in df.iterrows(): - label.append(label_dict[row[7]]) - df=df.assign(label=label) - df=df.drop(7,axis=1) - df=df.drop(0,axis=1) - df.columns=["gene_stable_id","species","homology_gene_stable_id","homology_species","goc","wga","label"] - return df -def select_data_by_length(df,st,end): +def pfam_parse(filename): + rlist = [] try: - if end= 0: # if all the genes end ahead of the one in consideration +# if all the genes end ahead of the one in consideration + if end[end_s[0]] >= 0: flag = 1 # increment the pointer ne.append("NULL_GENE") # append the NULL_GENE value continue @@ -144,7 +151,8 @@ def create_data_homology_ls(a_h, d_h, n, a, d, ld, ldg, cmap, cimap, to_write): xl, xr = get_nearest_neighbors( x, xs, n, a, d, ld, ldg, cmap, cimap) if len( - xl) != 0: # check if neighboring genes were successfully found + # check if neighboring genes were successfully found + xl) != 0: lsy[x] = dict(b=xl, f=xr) lsytemp[x] = dict(b=xl, f=xr) except BaseException: diff --git a/process_negative.py b/process_negative.py index 92e51fd..e23102f 100644 --- a/process_negative.py +++ b/process_negative.py @@ -1,63 +1,63 @@ -import pandas as pd -import numpy as np -import os -import sys -import progressbar -import json +import pandas as pd +import numpy as np import sys from neighbor_genes import read_genome_maps from process_data import create_data_homology_ls from threads import Procerssrunner from read_get_gene_seq import read_gene_sequences -from access_data_rest import update_rest,update_rest_protein -from prepare_synteny_matrix import read_data_synteny,write_fasta +from access_data_rest import update_rest, update_rest_protein +from prepare_synteny_matrix import read_data_synteny, write_fasta from save_data import write_dict_json -from access_data_rest import update_rest_protein + def read_database_txt(filename): - df=pd.read_csv(filename,sep="\t",header=None) - df=df.drop(0,axis=1) - df.columns=["gene_stable_id","species","homology_gene_stable_id","homology_species","wga","goc","homology_type"] + df = pd.read_csv(filename, sep="\t", header=None) + df = df.drop(0, axis=1) + df.columns = ["gene_stable_id", "species", "homology_gene_stable_id", + "homology_species", "wga", "goc", "homology_type"] return df def main(): - arg=sys.argv - a,d,ld,ldg,cmap,cimap=read_genome_maps() + arg = sys.argv + a, d, ld, ldg, cmap, cimap = read_genome_maps() print("Genome Maps Loaded.") - df=read_database_txt(arg[-2]) - nop=int(arg[-1]) + df = read_database_txt(arg[-2]) + nop = int(arg[-1]) print("Data Read.") - a_h=[] - d_h=[] + a_h = [] + d_h = [] a_h.append(df) d_h.append(arg[-2].split(".")[0]) - n=3 - lsy=create_data_homology_ls(a_h,d_h,n,a,d,ld,ldg,cmap,cimap,0) - write_dict_json("neighbor_genes_negative","processed",lsy) + n = 3 + lsy = create_data_homology_ls(a_h, d_h, n, a, d, ld, ldg, cmap, cimap, 0) + write_dict_json("neighbor_genes_negative", "processed", lsy) print("Neighbor Genes Found and Saved Successfully:)") - gene_sequences=read_gene_sequences(a_h,lsy,"geneseq","gene_seq_negative") - gene_sequences=update_rest(gene_sequences,"gene_seq_negative") - ndir="processed/synteny_matrices/" - nf1="synteny_matrices_global" - nf2="synteny_matrices_local" - nf3="indexes" + gene_sequences = read_gene_sequences( + a_h, lsy, "geneseq", "gene_seq_negative") + gene_sequences = update_rest(gene_sequences, "gene_seq_negative") + ndir = "processed/synteny_matrices/" + nf1 = "synteny_matrices_global" + nf2 = "synteny_matrices_local" + nf3 = "indexes" for i in range(len(a_h)): - df=a_h[i] - part=len(df)//nop - pr=Procerssrunner() - pr.start_processes(nop,df,gene_sequences,lsy,part,n,d_h[i]) - smg,sml,indexes=read_data_synteny(nop,d_h[i]) + df = a_h[i] + part = len(df)//nop + pr = Procerssrunner() + pr.start_processes(nop, df, gene_sequences, lsy, part, n, d_h[i]) + smg, sml, indexes = read_data_synteny(nop, d_h[i]) print(len(indexes)) - np.save(ndir+str(d_h[i])+"_"+nf1,smg) - np.save(ndir+str(d_h[i])+"_"+nf2,sml) - np.save(ndir+str(d_h[i])+"_"+nf3,indexes) - a_h[i]=df.loc[indexes] + np.save(ndir+str(d_h[i])+"_"+nf1, smg) + np.save(ndir+str(d_h[i])+"_"+nf2, sml) + np.save(ndir+str(d_h[i])+"_"+nf3, indexes) + a_h[i] = df.loc[indexes] print("Synteny Matrices Created Successfully :)") - protein_sequences=read_gene_sequences(a_h,lsy,"pro_seq","pro_seq_negative") - protein_sequences=update_rest_protein(protein_sequences,"pro_seq_negative") - write_fasta(protein_sequences,"pro_seq_negative") + protein_sequences = read_gene_sequences( + a_h, lsy, "pro_seq", "pro_seq_negative") + protein_sequences = update_rest_protein( + protein_sequences, "pro_seq_negative") + write_fasta(protein_sequences, "pro_seq_negative") -if __name__=="__main__": - main() +if __name__ == "__main__": + main() diff --git a/read_data.py b/read_data.py index ecb2e52..ebdf942 100644 --- a/read_data.py +++ b/read_data.py @@ -63,7 +63,8 @@ def read_data_genome(dir_name, a, dict_ind_genome): print(e) continue # print(data_gene[0:10]) - data_gene = data_gene[(data_gene['gene_biotype'] == 'protein_coding') | ( + data_gene = data_gene[( + data_gene['gene_biotype'] == 'protein_coding') | ( data_gene['gene_source'] == 'protein_coding')] # print(data_gene[data_gene["gene_id"]=="ENSNGAG00000000407"]) a.append(data_gene) diff --git a/read_get_gene_seq.py b/read_get_gene_seq.py index 7b66ba6..833add6 100644 --- a/read_get_gene_seq.py +++ b/read_get_gene_seq.py @@ -27,7 +27,8 @@ def create_dict(keys, values, dictionary): return dictionary # this function maps all the genes to their respective species. -# (Function: when finding the species of any gene we do not need to search the entire dataframe) +# (Function: when finding the species of any gene +# we do not need to search the entire dataframe) def group_seq_by_species(df, g_to_sp): @@ -64,7 +65,9 @@ def read_gene_seq(dirname, s, genes_by_species): ftr = [] for f in lof: if f.split(".")[ - 0] in s: # check whether the species is present in the species to read list. Will skip those species which are not present in the dataframe + # check whether the species is present in the species to read list. + # Will skip those species which are not present in the dataframe + 0] in s: ftr.append(f) data = {} for f in progressbar.progressbar(ftr): @@ -73,17 +76,18 @@ def read_gene_seq(dirname, s, genes_by_species): record = SeqIO.parse(file, "fasta") for r in record: gid, gbt = description_cleaner(r.description) - if str(gid) not in data and str( - gid) in genes_by_species[species] and gbt == "protein_coding": + if str(gid) not in data and str(gid) in \ + genes_by_species[species] and gbt == "protein_coding": data[gid] = str(r.seq) return data def read_gene_sequences(hdf, lsy, data_dir, fname): - """The basic idea here is to create a list/dictionary of all the genes by their species. - Once the mapping is done, all the respective fasta sequence files are read by Species - and the CDNA sequences for each gene in the species record are read and stored. - Thus we don't have to read the same file multiple times.""" + """The basic idea here is to create a list/dictionary of all the genes by + their species. Once the mapping is done, all the respective fasta sequence + files are read by Species and the CDNA sequences for each gene in the + species record are read and stored. + Thus we don't have to read the same file multiple times.""" grouped_genes = {} gene_by_species_dict = {} for df in progressbar.progressbar(hdf): diff --git a/save_data.py b/save_data.py index 668310e..7cfc6bb 100644 --- a/save_data.py +++ b/save_data.py @@ -15,9 +15,12 @@ def write_dict_json(name, dir, d): def write_data_synteny(smg, sml, indexes, i, name): if not os.path.exists("temp_" + name): os.mkdir("temp_" + name) - with open("temp_" + name + "/thread_" + str(i + 1) + "_smg.temp", "wb") as file: + with open("temp_" + name + "/thread_" + str(i + 1) + + "_smg.temp", "wb") as file: pickle.dump(smg, file) - with open("temp_" + name + "/thread_" + str(i + 1) + "_sml.temp", "wb") as file: + with open("temp_" + name + "/thread_" + str(i + 1) + + "_sml.temp", "wb") as file: pickle.dump(sml, file) - with open("temp_" + name + "/thread_" + str(i + 1) + "_indexes.temp", "wb") as file: + with open("temp_" + name + "/thread_" + str(i + 1) + + "_indexes.temp", "wb") as file: pickle.dump(indexes, file) diff --git a/threads.py b/threads.py index 96a19ab..3a2c926 100644 --- a/threads.py +++ b/threads.py @@ -52,10 +52,12 @@ def create_synteny_matrix_mul(self, gene_seq, g1, g2, n): norm_len = max(len(gene_seq[g1[i]]), len(gene_seq[g2[j]])) try: result = ed.align( - gene_seq[g1[i]], gene_seq[g2[j]], mode="NW", task="distance") + gene_seq[g1[i]], gene_seq[g2[j]], + mode="NW", task="distance") sm[i][j][0] = result["editDistance"] / (norm_len) result = ed.align( - gene_seq[g1[i]], gene_seq[g2[j]][::-1], mode="NW", task="distance") + gene_seq[g1[i]], gene_seq[g2[j]][::-1], mode="NW", + task="distance") sm[i][j][1] = result["editDistance"] / (norm_len) _, result, _ = local_pairwise_align_ssw( DNA(gene_seq[g1[i]]), DNA(gene_seq[g2[j]])) @@ -120,7 +122,8 @@ def __init__(self): def start_thread(self, obj, i, thread_alive, n, name): t = Process(target=obj.synteny_matrix, args=( - obj.gene_sequences, obj.df, obj.lsy, n), name="Thread_" + str(i + 1)) + obj.gene_sequences, obj.df, obj.lsy, n), + name="Thread_" + str(i + 1)) print("Thread ", (i + 1), " started for ", name, ".") thread_alive.append(t) diff --git a/train.py b/train.py index a1dfb9a..3673803 100644 --- a/train.py +++ b/train.py @@ -4,87 +4,119 @@ import tensorflow as tf from model import create_model + def create_branch_length_padding(bl): - maxlen=0 + maxlen = 0 for x in bl: - if len(x)>maxlen: - maxlen=len(x) + if len(x) > maxlen: + maxlen = len(x) for x in range(len(bl)): - temp=bl[x] - for i in range(len(temp),maxlen): - temp=np.append(temp,[0]) - bl[x]=temp + temp = bl[x] + for i in range(len(temp), maxlen): + temp = np.append(temp, [0]) + bl[x] = temp return bl -def train(train_synteny_matrices_global,train_synteny_matrices_local,train_pfam_matrices,train_branch_length_species,train_branch_length_homology_species,train_dist_p_s,train_dist_p_hs,train_distance,train_labels,v,num_epochs,learning_rate,decay,size_train,batch_size,model_name): - print("Going to train model {} for:\n Batch Size:{} \n Learning Rate:{} \n Decay:{}\n On {} Samples".format(v,batch_size,learning_rate,decay,len(train_synteny_matrices_global))) - graph,saver=create_model() - synmg,synml,pfam,bls,blhs,dps,dphs,dis,lr,y=graph.get_collection("input_nodes") - loss,t_op,accuracy,init,summary=graph.get_collection("output_nodes") + +def train(train_synteny_matrices_global, train_synteny_matrices_local, + train_pfam_matrices, train_branch_length_species, + train_branch_length_homology_species, train_dist_p_s, + train_dist_p_hs, train_distance, train_labels, v, + num_epochs, learning_rate, decay, size_train, + batch_size, model_name): + print("Going to train model {} for:\n Batch Size:{} \ + \n Learning Rate:{} \n Decay:{}\n On {} Samples".format( + v, batch_size, learning_rate, decay, + len(train_synteny_matrices_global) + )) + graph, saver = create_model() + synmg, synml, pfam, bls, blhs, dps, \ + dphs, dis, lr, y = graph.get_collection("input_nodes") + loss, t_op, accuracy, init, summary = graph.get_collection("output_nodes") with tf.Session(graph=graph) as sess: writer = tf.summary.FileWriter('./'+model_name+'_v'+str(v), sess.graph) sess.run(init) - learn=learning_rate + learn = learning_rate for j in range(num_epochs): for i in range(size_train//batch_size): - feed_dict={ - synmg:train_synteny_matrices_global[i*batch_size:(i+1)*batch_size], - synml:train_synteny_matrices_local[i*batch_size:(i+1)*batch_size], - pfam:train_pfam_matrices[i*batch_size:(i+1)*batch_size].reshape((batch_size,7,7,1)), - bls:train_branch_length_species[i*batch_size:(i+1)*batch_size], - blhs:train_branch_length_homology_species[i*batch_size:(i+1)*batch_size], - dps:train_dist_p_s[i*batch_size:(i+1)*batch_size].reshape((batch_size,1)), - dphs:train_dist_p_hs[i*batch_size:(i+1)*batch_size].reshape((batch_size,1)), - dis:train_distance[i*batch_size:(i+1)*batch_size].reshape((batch_size,1)), - lr:learn, - y:train_labels[i*batch_size:(i+1)*batch_size] + feed_dict = { + synmg: train_synteny_matrices_global + [i*batch_size:(i+1)*batch_size], + synml: train_synteny_matrices_local + [i*batch_size:(i+1)*batch_size], + pfam: train_pfam_matrices + [i*batch_size:(i+1)*batch_size] + .reshape((batch_size, 7, 7, 1)), + bls: train_branch_length_species + [i*batch_size:(i+1)*batch_size], + blhs: train_branch_length_homology_species + [i*batch_size:(i+1)*batch_size], + dps: train_dist_p_s + [i*batch_size:(i+1)*batch_size] + .reshape((batch_size, 1)), + dphs: train_dist_p_hs + [i*batch_size:(i+1)*batch_size] + .reshape((batch_size, 1)), + dis: train_distance + [i*batch_size:(i+1)*batch_size] + .reshape((batch_size, 1)), + lr: learn, + y: train_labels[i*batch_size:(i+1)*batch_size] } - sess.run(t_op,feed_dict=feed_dict) - - test_dict={ - synmg:train_synteny_matrices_global[size_train:], - synml:train_synteny_matrices_local[size_train:], - pfam:train_pfam_matrices[size_train:].reshape((len(train_synteny_matrices_global)-size_train,7,7,1)), - bls:train_branch_length_species[size_train:], - blhs:train_branch_length_homology_species[size_train:], - dps:train_dist_p_s[size_train:].reshape((len(train_synteny_matrices_global)-size_train,1)), - dphs:train_dist_p_hs[size_train:].reshape((len(train_synteny_matrices_global)-size_train,1)), - dis:train_distance[size_train:].reshape((len(train_synteny_matrices_global)-size_train,1)), - y:train_labels[size_train:], - lr:learn - } - accuracy_test,loss_test,summary_write=sess.run([accuracy,loss,summary],feed_dict=test_dict) - writer.add_summary(summary_write,i+1) - print("Epoch:{} Test Accuracy:{} Test Loss:{}".format(j+1,accuracy_test*100,loss_test)) - learn*=decay - saver.save(sess,model_name+"_v"+str(v)+"/model.ckpt") + sess.run(t_op, feed_dict=feed_dict) + + test_dict = { + synmg: train_synteny_matrices_global[size_train:], + synml: train_synteny_matrices_local[size_train:], + pfam: train_pfam_matrices[size_train:] + .reshape(( + len(train_synteny_matrices_global)-size_train, 7, 7, 1)), + bls: train_branch_length_species[size_train:], + blhs: train_branch_length_homology_species[size_train:], + dps: train_dist_p_s[size_train:] + .reshape((len(train_synteny_matrices_global)-size_train, 1)), + dphs: train_dist_p_hs[size_train:] + .reshape((len(train_synteny_matrices_global)-size_train, 1)), + dis: train_distance[size_train:] + .reshape((len(train_synteny_matrices_global)-size_train, 1)), + y: train_labels[size_train:], + lr: learn + } + accuracy_test, loss_test, summary_write = sess.run( + [accuracy, loss, summary], feed_dict=test_dict) + writer.add_summary(summary_write, i+1) + print("Epoch:{} Test Accuracy:{} Test Loss:{}".format( + j+1, accuracy_test*100, loss_test)) + learn *= decay + saver.save(sess, model_name+"_v"+str(v)+"/model.ckpt") writer.close() -def read_positive(len_p,bls,blhs,dis,dps,dphs,sml,smg,pfam,label): - rowsh=[] - with open("dataset","rb") as file: - rowsh=pickle.load(file) - shi=np.random.permutation(len(rowsh)) - rows_shuffled=[] + +def read_positive(len_p, bls, blhs, dis, dps, dphs, sml, smg, pfam, label): + rowsh = [] + with open("dataset", "rb") as file: + rowsh = pickle.load(file) + shi = np.random.permutation(len(rowsh)) + rows_shuffled = [] for i in range(len(shi)): - rows_shuffled.append(rowsh[shi[i]]) - rowsh=rows_shuffled - spco={} - spcp={} + rows_shuffled.append(rowsh[shi[i]]) + rowsh = rows_shuffled + spco = {} + spcp = {} for row in rowsh: if row["species"] not in spco: - spco[row["species"]]=0 - spcp[row["species"]]=0 - - maxcount_o=int((len_p)*0.3/14) - maxcount_p=int((len_p)*0.7/14) + spco[row["species"]] = 0 + spcp[row["species"]] = 0 + + maxcount_o = int((len_p)*0.3/14) + maxcount_p = int((len_p)*0.7/14) for row in rowsh: - if row["label"]==2: + if row["label"] == 2: continue - if row["label"]==1 and spco[row["species"]]>maxcount_o: + if row["label"] == 1 and spco[row["species"]] > maxcount_o: continue - if row["label"]==0 and spcp[row["species"]]>maxcount_p: + if row["label"] == 0 and spcp[row["species"]] > maxcount_p: continue bls.append(np.array(row["bls"])) blhs.append(np.array(row["blhs"])) @@ -95,23 +127,23 @@ def read_positive(len_p,bls,blhs,dis,dps,dphs,sml,smg,pfam,label): smg.append(row["global_alignment_matrix"]) pfam.append(row["pfam_matrix"]) label.append(row["label"]) - if row["label"]==1: - spco[row["species"]]+=1 - if row["label"]==0: - spcp[row["species"]]+=1 - - -def read_negative(len_n,bls,blhs,dis,dps,dphs,sml,smg,pfam,label): - rows=[] - with open("dataset","rb") as file: - rows=pickle.load(file) - rows=[row for row in rows if row["label"]==2] - shi=np.random.permutation(len(rows)) - rows_shuffled=[] + if row["label"] == 1: + spco[row["species"]] += 1 + if row["label"] == 0: + spcp[row["species"]] += 1 + + +def read_negative(len_n, bls, blhs, dis, dps, dphs, sml, smg, pfam, label): + rows = [] + with open("dataset", "rb") as file: + rows = pickle.load(file) + rows = [row for row in rows if row["label"] == 2] + shi = np.random.permutation(len(rows)) + rows_shuffled = [] for i in range(len(shi)): rows_shuffled.append(rows[shi[i]]) - rows=rows_shuffled - rows=rows[:len_n] + rows = rows_shuffled + rows = rows[:len_n] for row in rows: bls.append(np.array(row["bls"])) blhs.append(np.array(row["blhs"])) @@ -123,70 +155,73 @@ def read_negative(len_n,bls,blhs,dis,dps,dphs,sml,smg,pfam,label): label.append(row["label"]) pfam.append(row["pfam_matrix"]) -def train_models(model_name,start,end,num_epochs,learn_rate,decay,size_train,batch_size): - k=1 - for i in range(start//10,end//10+1): - bls=[] - blhs=[] - dis=[] - dps=[] - dphs=[] - sml=[] - smg=[] - pfam=[] - label=[] - portion=float(i/10) - len_n=int(size_train*portion) - len_p=int(size_train*(1-portion)) - read_positive(len_p,bls,blhs,dis,dps,dphs,sml,smg,pfam,label) - read_negative(len_n,bls,blhs,dis,dps,dphs,sml,smg,pfam,label) - bls=create_branch_length_padding(bls) - blhs=create_branch_length_padding(blhs) - bls=np.array(bls) + +def train_models(model_name, start, end, num_epochs, learn_rate, + decay, size_train, batch_size): + k = 1 + for i in range(start//10, end//10+1): + bls = [] + blhs = [] + dis = [] + dps = [] + dphs = [] + sml = [] + smg = [] + pfam = [] + label = [] + portion = float(i/10) + len_n = int(size_train*portion) + len_p = int(size_train*(1-portion)) + read_positive(len_p, bls, blhs, dis, dps, dphs, sml, smg, pfam, label) + read_negative(len_n, bls, blhs, dis, dps, dphs, sml, smg, pfam, label) + bls = create_branch_length_padding(bls) + blhs = create_branch_length_padding(blhs) + bls = np.array(bls) print(bls.shape) - blhs=np.array(blhs) + blhs = np.array(blhs) print(blhs.shape) - dis=np.array(dis) + dis = np.array(dis) print(dis.shape) - dps=np.array(dps) + dps = np.array(dps) print(dps.shape) - dphs=np.array(dphs) + dphs = np.array(dphs) print(dphs.shape) - sml=np.array(sml) + sml = np.array(sml) print(sml.shape) - smg=np.array(smg) + smg = np.array(smg) print(smg.shape) - pfam=np.array(pfam) + pfam = np.array(pfam) print(pfam.shape) - label=np.array(label) + label = np.array(label) print(label.shape) - shi=np.random.permutation(len(label)) - labels=label[shi] - bls=bls[shi] - blhs=blhs[shi] - dis=dis[shi] - dps=dps[shi] - dphs=dphs[shi] - sml=sml[shi] - smg=smg[shi] - pfam=pfam[shi] - train(smg,sml,pfam,bls,blhs,dps,dphs,dis,labels,k,num_epochs,learn_rate,decay,int(0.9*size_train),batch_size,model_name) - k+=1 + shi = np.random.permutation(len(label)) + labels = label[shi] + bls = bls[shi] + blhs = blhs[shi] + dis = dis[shi] + dps = dps[shi] + dphs = dphs[shi] + sml = sml[shi] + smg = smg[shi] + pfam = pfam[shi] + train(smg, sml, pfam, bls, blhs, dps, dphs, dis, labels, k, num_epochs, + learn_rate, decay, int(0.9*size_train), batch_size, model_name) + k += 1 + def main(): - arg=sys.argv - model_name=arg[-8] - start_p=int(arg[-7]) - end_p=int(arg[-6]) - num_epochs=int(arg[-5]) - learn_rate=float(arg[-4]) - decay=float(arg[-3]) - size_train=float(arg[-2]) - batch_size=int(arg[-1]) - train_models(model_name,start_p,end_p,num_epochs,learn_rate,decay,size_train,batch_size) - - -if __name__=="__main__": - main() + arg = sys.argv + model_name = arg[-8] + start_p = int(arg[-7]) + end_p = int(arg[-6]) + num_epochs = int(arg[-5]) + learn_rate = float(arg[-4]) + decay = float(arg[-3]) + size_train = float(arg[-2]) + batch_size = int(arg[-1]) + train_models(model_name, start_p, end_p, num_epochs, + learn_rate, decay, size_train, batch_size) +if __name__ == "__main__": + main() From f4d2e655f1f673f88f4228e7fffd379b300102ca Mon Sep 17 00:00:00 2001 From: Priyatham Sai Chand Date: Thu, 8 Apr 2021 22:45:10 +0530 Subject: [PATCH 3/4] prediction_pfam flake fix --- prediction_pfam.py | 1 - 1 file changed, 1 deletion(-) diff --git a/prediction_pfam.py b/prediction_pfam.py index f37f386..235ca12 100644 --- a/prediction_pfam.py +++ b/prediction_pfam.py @@ -51,4 +51,3 @@ def main(): if __name__ == "__main__": main() - From e8c9f8ba974b2c99dae2e4c278ec1c42aa8a4c75 Mon Sep 17 00:00:00 2001 From: Priyatham-sai-chand Date: Fri, 9 Apr 2021 08:41:50 +0530 Subject: [PATCH 4/4] pylint remedy Signed-off-by: Priyatham-sai-chand --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index a3c5fbf..49b5871 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,4 +8,4 @@ install: script: - flake8 - - pylint *.py + - pylint *.py --exit-zero