forked from DBT-BIF/Biosensor-Python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdata_parsing_function.py
More file actions
58 lines (40 loc) · 2.05 KB
/
data_parsing_function.py
File metadata and controls
58 lines (40 loc) · 2.05 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
def merge(names, nodes, delnodes, merged, output="final_merge.csv"):
    """Join taxonomy nodes with their scientific names and write a CSV.

    The four inputs are read as tab-separated files without a header row.
    The column positions used here (0/2/4 for nodes, 0/2/6 for names) match
    the pipe-and-tab layout of taxonomy ``.dmp`` dumps, where every second
    column holds only the ``|`` separator -- presumably the NCBI taxdump
    format; confirm against the data source.

    Args:
        names: Path to the names file. Only rows whose column 6 equals
            ``'scientific name'`` are kept.
        nodes: Path to the nodes file. Columns 0, 2 and 4 become
            ``taxa_ids``, ``parent_id`` and ``rank``.
        delnodes: Path to the deleted-nodes file. It is parsed but its
            contents are not used.
        merged: Path to the merged-nodes file. Its column 0 is compared
            against the node ids for the printed report.
        output: Destination path of the CSV file. Defaults to
            ``final_merge.csv`` in the current working directory.

    Returns:
        A tuple ``(csv_result, row_count)``. ``csv_result`` is whatever
        ``DataFrame.to_csv`` returns, which is ``None`` when a path is
        given; ``row_count`` is the number of rows written.
    """
    import csv
    import pandas as pd

    def read_dmp(path):
        # QUOTE_NONE keeps double quotes inside a field as literal text.
        # With the default quoting, a field starting with '"' loses its
        # quotes, and an unbalanced quote can swallow the following lines.
        return pd.read_csv(path, delimiter='\t', header=None,
                           quoting=csv.QUOTE_NONE)

    # Name the columns before merging so the result does not depend on the
    # automatic "_x"/"_y" suffixes pandas adds to clashing labels.
    df_nodes = read_dmp(nodes)[[0, 2, 4]].rename(
        columns={0: 'taxa_ids', 2: 'parent_id', 4: 'rank'})

    df_names = read_dmp(names)
    is_scientific = df_names[6] == 'scientific name'
    # The spelling 'scientfic_name' is kept on purpose: it is the header
    # written to the CSV, so correcting it would change the output format.
    df_names = (
        df_names.loc[is_scientific, [0, 2]]
        .rename(columns={0: 'taxa_ids', 2: 'scientfic_name'})
        .reset_index(drop=True)
    )

    # Left join: every node row is kept, with NaN where no scientific name
    # was found for its id.
    df_merge = df_nodes.merge(df_names, how="left", on='taxa_ids')

    # delnodes is parsed only so that an unreadable file still fails here,
    # as it did before; nothing below uses its contents.
    read_dmp(delnodes)
    df_merged = read_dmp(merged)

    # Report the ids that occur both in the node table and in column 0 of
    # the merged file.
    common_ids = set(df_merge['taxa_ids']) & set(df_merged[0])
    print("modified values are", list(common_ids))

    # to_csv returns None when it is given a path; the value is still
    # returned so the shape of the return tuple stays the same.
    file_csv = df_merge.to_csv(output)
    print("csv file created as", output)
    return file_csv, len(df_merge)
# Run the merge on the .dmp files under taxadmp/, passing each path by keyword.
merge(
    names='taxadmp/names.dmp',
    nodes='taxadmp/nodes.dmp',
    delnodes='taxadmp/delnodes.dmp',
    merged='taxadmp/merged.dmp',
)