-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnetst.py
More file actions
executable file
·230 lines (182 loc) · 9.84 KB
/
netst.py
File metadata and controls
executable file
·230 lines (182 loc) · 9.84 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import argparse
import csv
import datetime
import importlib
import multiprocessing
import os
import sys
__description__ = '''K-mer based haplotype detection and haplotype network building tool.'''
SUBCOMMAND_DESC = f'''These subcommands invoke a specific stage of processing. Use "pipeline" to run the whole pipeline.
makedata convert a packed csv or fasta file into individual fasta sequences
getblast infer haplotypes from sequences using blastn search
gettype infer mixed haplotypes from sequences using k-mer based search
getmixseq search for fragments supporting each haplotype using gettype results
viewhaplo build and visualize a haplotype network from sequences
pipeline run the haplotype network analysis pipeline
'''
MODULE_NAMES = ["MakeData", "GetBlast", "GetType", "GetMixSeq", "ViewHaplo"] # "GenNetworkConfig", "HapFasta"
def run_pipeline(args):
import GetBlast
import GetType
import GetMixSeq
import GenNetworkConfig
import ViewHaplo
work_dir = args.work_dir
method_type = args.method
net_type = args.net_type
if not os.path.exists(work_dir):
os.makedirs(work_dir)
target_file = args.input
ref_file = args.ref
cut_value = args.cut
k_value = args.k
e_value = args.e
word_size = args.word_size
haplo_seq = args.haplo_seq
haplo_seq_type = args.haplo_seq_type
index_name = args.index_name
index_seq = args.index_seq
index_count = args.index_count
index_type = args.index_type
index_quant = args.index_quant
skip_getmixseq = args.skip_getmixseq
include_all_seqs = args.all_seqs
process_num = args.t
if args.db:
db_file = args.db
elif method_type == "blast":
db_file = os.path.join(work_dir, 'blast', 'db')
else:
db_file = os.path.join(work_dir, f'db_{k_value}.gz')
result_file = os.path.join(work_dir, 'type_analysis')
mix_csv_path = os.path.join(work_dir, 'type_analysis_mix.csv')
null_csv_path = os.path.join(work_dir, 'type_analysis_null.csv')
all_csv_path = os.path.join(work_dir, 'type_analysis.csv')
if method_type == "blast":
GetBlast.run(target_file, ref_file, db_file, None, e_value, word_size, result_file, cut_value, process_num)
else:
GetType.run(k_value, db_file, ref_file, target_file, result_file, cut_value)
print("Finished haplotype analysis.")
print()
if not skip_getmixseq:
current_time = datetime.datetime.now().strftime("%y%m%d%H%M%S")
GetMixSeq.run(k_value,
mix_csv_path,
"",
ref_file,
100,
os.path.join(work_dir, f'mix-{current_time}'),
process_num)
print("Finished sequence search.")
print()
hap_seq_path = os.path.join(work_dir, 'hap_seq.fasta')
network_dir = os.path.join(work_dir, 'network')
single_haps = {}
mix_haps = set()
with open(hap_seq_path, 'w') as wfile:
if haplo_seq:
if haplo_seq_type == 'csv':
c2f = ViewHaplo.csv2fas_generator(haplo_seq, index_name, index_seq, index_count, index_type, index_quant)
for line in c2f:
wfile.write(line)
elif index_count != -1 or index_type != -1 or index_quant != -1:
f2f = ViewHaplo.fas2fas_generator(haplo_seq, index_name, index_seq, index_count, index_type, index_quant)
for line in f2f:
wfile.write(line)
else:
with open(haplo_seq, 'r') as rfile:
for line in rfile:
wfile.write(line)
with open(mix_csv_path, 'r') as rfile:
reader = csv.reader(rfile, delimiter=',')
next(reader)
for row in reader:
mix_haps.add(row[1])
wfile.write(f">{row[1]}|{row[3]}\n")
wfile.write(f"{row[5]}\n")
with open(null_csv_path, 'r') as rfile:
reader = csv.reader(rfile, delimiter=',')
next(reader)
for row in reader:
mix_haps.add(row[1])
wfile.write(f">{row[1]}|[null]\n")
wfile.write(f"{row[3]}\n")
if include_all_seqs:
with open(all_csv_path, 'r') as rfile:
reader = csv.reader(rfile, delimiter=',')
next(reader)
for row in reader:
if row[3] in single_haps:
mix_haps.add(row[1])
else:
single_haps[row[1]] = row[3]
for name in mix_haps:
if name in single_haps:
del single_haps[name]
seq_name = ''
with open(target_file, 'r') as f:
for line in f:
line = line.strip()
if line.startswith(">"):
seq_name = line[1:]
if seq_name in single_haps:
wfile.write(f">{seq_name}|{single_haps[seq_name]}\n")
elif seq_name in single_haps:
wfile.write(f"{line}\n")
if not single_haps and not mix_haps:
print("No haplotypes eligible for haplotype network. You might want to run with -all_seqs to include all sequences in the network.")
return
ViewHaplo.run(hap_seq_path, False, net_type, network_dir)
if __name__ == "__main__":
if sys.platform.startswith('win'):
multiprocessing.freeze_support()
pars = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=__description__)
subpars = pars.add_subparsers(dest='module_name', description=SUBCOMMAND_DESC, metavar='<command>', help='one of the above commands')
sub_parsers = {}
modules = {}
# Load each script into modules dict (with keys in lower case)
for name in MODULE_NAMES:
mod = importlib.import_module(name)
mod_pars = subpars.add_parser(name.lower(),
formatter_class=argparse.RawDescriptionHelpFormatter,
description=mod.__description__)
for param in mod.ARGS:
mod_pars.add_argument(param[0], **param[1])
modules[name.lower()] = mod
sub_parsers[name.lower()] = mod_pars
pip_pars = subpars.add_parser('pipeline', description='Run the haplotype analysis pipeline. A haplotype network will be generated from analyzed sequences and the reference database.')
pip_pars.add_argument('-work_dir', metavar='<str>', type=str, help='''directory to save intermediate files and analysis results''', required=True)
pip_pars.add_argument('-method', choices=['blast', 'kmer'], help='''haplotype detection method (default kmer)''', required=False, default='kmer')
pip_pars.add_argument('-net_type', choices=['original_tcs', 'modified_tcs', 'mjn', 'msn'], help='''type of haplotype network (default modified_tcs)''', required=False, default='modified_tcs')
pip_pars.add_argument('-input', metavar='<str>', type=str, help='''fasta sequences awaiting analysis''', required=True)
pip_pars.add_argument('-ref', metavar='<str>', type=str, help='''reference fasta sequences for haplotype detection''', required=True)
pip_pars.add_argument('-db', metavar='<str>', type=str, help='''location of pre-built database for the selected detection method''', required=False)
pip_pars.add_argument('-cut', metavar='<int>', type=int, help='''minimum percentage of identical nucleotides to assume a sequence is not recombinant''', required=False, default=85)
pip_pars.add_argument('-k', metavar='<int>', type=int, help='''k-mer size''', required=False, default=21)
pip_pars.add_argument('-e', metavar='<num>', type=float, help='''maximum E value in BLAST queries''', required=False, default=1e-5)
pip_pars.add_argument('-word_size', metavar='<int>', type=int, help='''word size in BLAST queries''', required=False, default=11)
pip_pars.add_argument('-haplo_seq', metavar='<str>', type=str, help='''other sequences to display in haplotype network''', required=False)
pip_pars.add_argument('-haplo_seq_type', metavar='<str>', type=str, help='''type of haplo_seq (csv or fas)''', required=False, default='fas')
pip_pars.add_argument('-index_name', metavar='<int>', type=int, help='''index (0-based) for names in haplo_seq''', required=False, default=1)
pip_pars.add_argument('-index_seq', metavar='<int>', type=int, help='''index (0-based) for sequence data in haplo_seq (csv only)''', required=False, default=2)
pip_pars.add_argument('-index_count', metavar='<int>', type=int, help='''index (0-based) for population count in haplo_seq''', required=False, default=-1)
pip_pars.add_argument('-index_type', metavar='<int>', type=int, help='''index (0-based) for haplotype in haplo_seq''', required=False, default=-1)
pip_pars.add_argument('-index_quant', metavar='<int>', type=int, help='''index (0-based) for quantitative (continuous) trait''', required=False, default=-1)
pip_pars.add_argument('-all_seqs', help='''include putative non-recombinant sequences in haplotype network''', action='store_true')
pip_pars.add_argument('-skip_getmixseq', help='''skip time-consuming search of haplotype-supporting fragments''', action='store_true')
pip_pars.add_argument('-t', metavar='<int>', type=int, help='''number of concurrent processes''', required=False, default=10)
args = pars.parse_args()
if not args.module_name:
# No subcommand, print help and exit
print("Please specify a subcommand to execute.")
print()
pars.print_help()
sys.exit(1)
module_name = args.module_name.lower()
if module_name != "pipeline":
# Invoke the respective module with args
modules[module_name].main(sub_parsers[module_name], args)
else:
run_pipeline(args)