diff --git a/envs/mtoolbox.yaml b/envs/mtoolbox.yaml index c0c1b0f..30f614e 100644 --- a/envs/mtoolbox.yaml +++ b/envs/mtoolbox.yaml @@ -3,387 +3,23 @@ channels: - bioconda - defaults dependencies: - - _libgcc_mutex=0.1 - - _r-mutex=1.0.0 - - aioeasywebdav=2.2.0 - - aiohttp=3.4.4 - - appdirs=1.4.3 - - asn1crypto=0.24.0 - - async-timeout=3.0.1 - - atomicwrites=1.3.0 - - attrs=18.2.0 - - backcall=0.1.0 - - bamutil=1.0.14 - - bbmap=38.22 - - bcftools=1.9 - - bcrypt=3.1.4 - - binutils_impl_linux-64=2.31.1 - - binutils_linux-64=2.31.1 - - biopython=1.72 - - blas=1.0 - - bleach=3.1.0 - - boto3=1.9.35 - - botocore=1.12.35 - - bwidget=1.9.11 - - bzip2=1.0.6 - - ca-certificates=2020.6.20 - - cachetools=2.1.0 - - cairo=1.14.12 - - certifi=2020.6.20 - - cffi=1.11.5 - - chardet=3.0.4 - - click=7.1.2 - - configargparse=0.13.0 - - cryptography=2.4.1 - - curl=7.64.0 - - datrie=0.7.1 - - decorator=4.3.0 - - defusedxml=0.6.0 - - docutils=0.14 - - dropbox=9.1.0 - - entrypoints=0.3 - - expat=2.2.6 - - fastqc=0.11.8 - - filechunkio=1.6 - - font-ttf-dejavu-sans-mono=2.37 - - fontconfig=2.13.0 - - freetype=2.9.1 - - fribidi=1.0.5 - - ftputil=3.2 - - gatk-framework=3.6.24 - - gcc_impl_linux-64=7.3.0 - - gcc_linux-64=7.3.0 - - gettext=0.19.8.1 - - gfortran_impl_linux-64=7.3.0 - - gfortran_linux-64=7.3.0 - - gitdb2=2.0.5 - - gitpython=2.1.11 - - glib=2.56.2 - - gmap=2018.07.04 - - gmp=6.1.2 - - google-auth=1.2.1 - - google-auth-httplib2=0.0.3 - - google-cloud-core=0.24.1 - - google-cloud-storage=1.1.1 - - google-resumable-media=0.0.2 - - googleapis-common-protos=1.5.5 - - graphite2=1.3.12 - - graphviz=2.40.1 - - gsl=2.4 - - gxx_impl_linux-64=7.3.0 - - gxx_linux-64=7.3.0 - - harfbuzz=1.8.8 - - httplib2=0.12.0 - - icu=58.2 - - idna=2.7 - - idna_ssl=1.1.0 - - importlib_metadata=0.18 - - intel-openmp=2019.1 - - ipykernel=5.1.2 - - ipython=7.2.0 - - ipython_genutils=0.2.0 - - jedi=0.13.2 - - jinja2=2.10.1 - - jmespath=0.9.3 - - jpeg=9b - - jsonschema=2.6.0 - - jupyter_client=5.3.1 - - jupyter_core=4.5.0 - - krb5=1.16.3 - - libblas=3.8.0 - - libcblas=3.8.0 - - libcurl=7.64.0 - - libdeflate=1.0 - - libedit=3.1.20170329 - - libffi=3.2.1 - - libgcc=7.2.0 - - libgcc-ng=8.2.0 - - libgfortran-ng=7.3.0 - - libiconv=1.15 - - liblapack=3.8.0 - - libopenblas=0.3.7 - - libpng=1.6.35 - - libprotobuf=3.6.1 - - libsodium=1.0.16 - - libssh2=1.8.0 - - libstdcxx-ng=8.2.0 - - libtiff=4.0.9 - - libuuid=1.0.3 - - libxcb=1.13 - - libxml2=2.9.8 - - line_profiler=2.1.2 - - make=4.2.1 - - markupsafe=1.1.0 - - memory_profiler=0.55.0 - - mistune=0.8.4 - - mkl=2018.0.3 - - mkl_fft=1.0.6 - - mkl_random=1.0.1 - - more-itertools=7.2.0 - - multidict=4.4.2 - - muscle=3.8.1551 - - nbconvert=5.5.0 - - nbformat=4.4.0 - - ncurses=6.1 - - networkx=2.2 - - notebook=6.0.0 - - numpy=1.15.4 - - numpy-base=1.15.4 - - openblas=0.3.7 - - openjdk=8.0.152 - - openssl=1.1.1g - - packaging=19.0 - - pandas=0.23.4 - - pandoc=2.2.3.2 - - pandocfilters=1.4.2 - - pango=1.42.4 - - paramiko=2.4.2 - - parso=0.3.1 - - pcre=8.42 - - perl=5.26.2 - - pexpect=4.6.0 - - picard=2.18.16 - - pickleshare=0.7.5 - - pip=18.1 - - pixman=0.34.0 - - pluggy=0.12.0 - - prettytable=0.7.2 - - prometheus_client=0.7.1 - - prompt_toolkit=2.0.7 - - protobuf=3.6.1 - - psutil=5.4.8 - - pthread-stubs=0.4 - - ptyprocess=0.6.0 - - py=1.8.0 - - pyasn1=0.4.4 - - pyasn1-modules=0.0.5 - - pycparser=2.19 - - pygments=2.3.1 - - pygraphviz=1.3.1 - - pynacl=1.3.0 - - pyopenssl=18.0.0 - - pyparsing=2.4.2 - - pysftp=0.2.9 - - pysocks=1.6.8 - - pytest=5.0.1 - - python=3.6.7 - - python-dateutil=2.7.5 - - python-irodsclient=0.7.0 - - python_abi=3.6 - - pytz=2018.7 - - pyvcf=0.6.7 - - pyyaml=3.13 - - pyzmq=17.1.2 - - r-abind=1.4_5 - - r-assertthat=0.2.0 - - r-backports=1.1.2 - - r-base=3.5.1 - - r-base64enc=0.1_3 - - r-bh=1.66.0_1 - - r-bindr=0.1.1 - - r-bindrcpp=0.2.2 - - r-bookdown=0.7 - - r-boot=1.3_20 - - r-broom=0.5.0 - - r-callr=2.0.4 - - r-caret=6.0_80 - - r-cellranger=1.1.0 - - r-class=7.3_14 - - r-classint=0.3_3 - - r-cli=1.0.0 - - r-clipr=0.4.1 - - r-cluster=2.0.7_1 - - r-codetools=0.2_15 - - r-colorspace=1.3_2 - - r-crayon=1.3.4 - - r-crosstalk=1.0.0 - - r-curl=3.2 - - r-cvst=0.2_2 - - r-data.table=1.11.4 - - r-dbi=1.0.0 - - r-dbplyr=1.2.2 - - r-ddalpha=1.3.4 - - r-deoptimr=1.0_8 - - r-dichromat=2.0_0 - - r-digest=0.6.15 - - r-dimred=0.1.0 - - r-dplyr=0.7.6 - - r-drr=0.0.3 - - r-dt=0.4 - - r-e1071=1.7_2 - - r-essentials=3.5.1 - - r-evaluate=0.11 - - r-fansi=0.2.3 - - r-forcats=0.3.0 - - r-foreach=1.4.4 - - r-foreign=0.8_71 - - r-formatr=1.5 - - r-funr=0.3.2 - - r-geometry=0.3_6 - - r-ggplot2=3.0.0 - - r-glmnet=2.0_16 - - r-glue=1.3.0 - - r-gower=0.1.2 - - r-gtable=0.2.0 - - r-haven=1.1.2 - - r-hexbin=1.27.2 - - r-highr=0.7 - - r-hms=0.4.2 - - r-htmltools=0.3.6 - - r-htmlwidgets=1.2 - - r-httpuv=1.4.5.1 - - r-httr=1.3.1 - - r-ipred=0.9_6 - - r-irdisplay=0.5.0 - - r-irkernel=0.8.12 - - r-iterators=1.0.10 - - r-jsonlite=1.5 - - r-kernlab=0.9_26 - - r-kernsmooth=2.23_15 - - r-knitr=1.20 - - r-labeling=0.3 - - r-labelled=1.1.0 - - r-later=0.7.3 - - r-lattice=0.20_35 - - r-lava=1.6.2 - - r-lazyeval=0.2.1 - - r-lubridate=1.7.4 - - r-magic=1.5_8 - - r-magrittr=1.5 - - r-maps=3.3.0 - - r-markdown=0.8 - - r-mass=7.3_50 - - r-matrix=1.2_14 - - r-mgcv=1.8_24 - - r-mime=0.5 - - r-miniui=0.1.1.1 - - r-modelmetrics=1.1.0 - - r-modelr=0.1.2 - - r-munsell=0.5.0 - - r-nlme=3.1_137 - - r-nnet=7.3_12 - - r-numderiv=2016.8_1.1 - - r-openssl=1.0.2 - - r-pbdzmq=0.3_3 - - r-pillar=1.3.0 - - r-pkgconfig=2.0.1 - - r-plogr=0.2.0 - - r-pls=2.6_0 - - r-plyr=1.8.4 - - r-praise=1.0.0 - - r-prettydoc=0.2.1 - - r-processx=3.1.0 - - r-prodlim=2018.04.18 - - r-promises=1.0.1 - - r-purrr=0.2.5 - - r-quantmod=0.4_13 - - r-questionr=0.7.0 - - r-r6=2.2.2 - - r-randomforest=4.6_14 - - r-rbokeh=0.6.3 - - r-rcolorbrewer=1.1_2 - - r-rcpp=0.12.18 - - r-rcpproll=0.3.0 - - r-readr=1.1.1 - - r-readxl=1.1.0 - - r-recipes=0.1.3 - - r-recommended=3.5.1 - - r-rematch=1.0.1 - - r-repr=0.15.0 - - r-reprex=0.2.0 - - r-reshape2=1.4.3 - - r-reticulate=1.12 - - r-rlang=0.2.1 - - r-rmarkdown=1.10 - - r-rmdformats=0.3.5 - - r-robustbase=0.93_2 - - r-rpart=4.1_13 - - r-rprojroot=1.3_2 - - r-rstudioapi=0.7 - - r-rvest=0.3.2 - - r-scales=0.5.0 - - r-selectr=0.4_1 - - r-sfsmisc=1.1_2 - - r-shiny=1.1.0 - - r-sourcetools=0.1.7 - - r-spatial=7.3_11 - - r-squarem=2017.10_1 - - r-stringi=1.2.4 - - r-stringr=1.3.1 - - r-survival=2.42_6 - - r-testthat=2.0.0 - - r-tibble=1.4.2 - - r-tidyr=0.8.1 - - r-tidyselect=0.2.4 - - r-tidyverse=1.2.1 - - r-timedate=3043.102 - - r-tinytex=0.6 - - r-ttr=0.23_3 - - r-utf8=1.1.4 - - r-uuid=0.1_2 - - r-viridislite=0.3.0 - - r-whisker=0.3_2 - - r-withr=2.1.2 - - r-xfun=0.3 - - r-xml2=1.2.0 - - r-xtable=1.8_2 - - r-xts=0.11_0 - - r-yaml=2.2.0 - - r-zoo=1.8_3 - - ratelimiter=1.2.0 - - readline=7.0 - - requests=2.20.1 - - rsa=3.1.4 - - s3transfer=0.1.13 - - samtools=1.9 - - scipy=1.3.1 - - send2trash=1.5.0 + - samtools=1.11 + - numpy=1.19.2 + - picard=2.23.7 + - bcftools=1.11 + - trimmomatic=0.39 + - biopython=1.78 + - pip=20.2.3 - seqtk=1.3 - - setuptools=40.6.2 - - six=1.11.0 - - smmap2=2.0.5 - - snakemake=5.4.3 - - snakemake-minimal=5.4.3 - - sqlalchemy=1.2.16 - - sqlite=3.25.3 - - terminado=0.8.2 - - testpath=0.4.2 - - tk=8.6.9 - - tktable=2.10 - - tornado=6.0.3 - - traitlets=4.3.2 - - trimmomatic=0.38 - - urllib3=1.23 - - wcwidth=0.1.7 - - webencodings=0.5.1 - - wheel=0.32.3 - - wrapt=1.10.11 - - xmlrunner=1.7.7 - - xorg-kbproto=1.0.7 - - xorg-libice=1.0.10 - - xorg-libsm=1.2.2 - - xorg-libx11=1.6.8 - - xorg-libxau=1.0.9 - - xorg-libxdmcp=1.1.3 - - xorg-libxext=1.3.4 - - xorg-libxrender=0.9.10 - - xorg-renderproto=0.11.1 - - xorg-xextproto=7.3.0 - - xorg-xproto=7.0.31 - - xz=5.2.4 - - yaml=0.1.7 - - yarl=1.2.6 - - zeromq=4.2.5 - - zipp=0.5.2 - - zlib=1.2.11 + - snakemake=5.26 + - pyvcf=0.6.7 + - gmap=2020.10.14 + - yaml=0.2.5 + - python=3.6 + - scipy=1.5.2 + - bamutil=1.0.14 + - fastqc=0.11.9 + - gatk-framework=3.6 + - pandas=1.1.3 - pip: - - apybiomart==0.5.2 - - asyncio==3.4.3 - - cython==0.29.20 - - dask==2.19.0 - mtoolnote==0.2.0 - - pysam==0.16.0.1 - - scikit-allel==1.2.1 - - toolz==0.10.0 - - vcfpy==0.12.2 - diff --git a/install.sh b/install.sh old mode 100644 new mode 100755 diff --git a/modules/mtVariantCaller.py b/modules/mtVariantCaller.py index 8f69da0..1cb796e 100644 --- a/modules/mtVariantCaller.py +++ b/modules/mtVariantCaller.py @@ -6,6 +6,7 @@ """ from collections import OrderedDict +import sys import glob import gzip import math @@ -23,6 +24,7 @@ def extract_mismatches(seq, qs, len_mism, position_in_read): + '''extract mismatches from read sequence using MD flag''' start = position_in_read - 1 end = start + len_mism mismatch_seq = seq[start:end] @@ -32,6 +34,7 @@ def extract_mismatches(seq, qs, len_mism, position_in_read): def check_strand(mate): + '''check whether read is forward or reverse''' if mate & 16 == 16: strand = '-' else: @@ -94,6 +97,9 @@ def parse_sam_row(row): for field in row[11:]: if field.startswith("MD"): md = field.split(':')[2] + if '*' in md: + sys.stderr.write('SAM field without MD flag or with non-canonical MD flag found. Skip this row\n') + md = '0' leftmost = row[3]-1 read_id = row[0] seq = list(row[9]) @@ -114,8 +120,8 @@ def parse_sam_row(row): def read_length_from_cigar(cigar_bases, cigar_nt): """ - Cigar bases: ['S', 'M', 'D', 'M', 'I', 'M'] - Cigar nt: [10, 30, 5, 15, 10, 20] + Cigar nt: ['S', 'M', 'D', 'M', 'I', 'M'] + Cigar bases: [10, 30, 5, 15, 10, 20] The function will compute the effective read length to discard MD variants in softclipped regions and to calculate the distance of a mismatch from @@ -133,8 +139,8 @@ def parse_mismatches_from_cigar_md(sam_record, minqs=25, tail=5, """ Logic of this function is: - - MD flag reflect the **mapped portion of the read** - no soft clipping no - insertions (although some mutations in softclipped regions have been + - MD flag reflects the **mapped portion of the read** - no soft clipping no + insertions - CIGAR flag reflects the absolute reads length - including soft clipping and insertions; - The script first equals the length of the read to that of the mapped @@ -150,13 +156,12 @@ def parse_mismatches_from_cigar_md(sam_record, minqs=25, tail=5, ins_pos_in_seq += int(cigar_bases[n]) # cut out from the read sequence and qs the bases representing the ins elif cigar_nt[n] == 'I': - # original version - # ins_len = len(cigar_bases[n]) + # if insertion found then insert in the read sequence and qs the bases representing the ins ins_len = int(cigar_bases[n]) new_seq = new_seq[:ins_pos_in_seq]+new_seq[(ins_pos_in_seq+ins_len):] new_qs = new_qs[:ins_pos_in_seq]+new_qs[(ins_pos_in_seq+ins_len):] - # insert in the read sequence and qs the bases representing the ins - elif cigar_nt[n] == 'D': + elif cigar_nt[n] == 'D': #check this + # if deletion found then ins_len = int(cigar_bases[n]) new_seq = new_seq[:ins_pos_in_seq] + ["I"]*ins_len + new_seq[ins_pos_in_seq:] new_qs = new_qs[:ins_pos_in_seq] + ["I"]*ins_len + new_qs[ins_pos_in_seq:] @@ -212,11 +217,10 @@ def parse_mismatches_from_cigar_md(sam_record, minqs=25, tail=5, all_ref.append(bases_ref[x]) all_mism.append(new_seq[t]) all_qs.append(ord(new_qs[t])-33) - except IndexError: + except IndexError: #check this - shouldn't we raise here a more human readable error? pass return positions_ref_final, positions_read_final, all_ref, all_mism, all_qs, strand - def allele_strand_counter(strand): """ Initialize a strand counter instance for mismatch detection. """ if strand == "+": @@ -245,15 +249,11 @@ def varnames(i): refposleft = int(i[3]) - 1 mate = int(i[1]) # check strand - # TODO: duplicated, can be replaced by check_mate() - if mate & 16 == 16: - strand = '-' - else: - strand = '+' + strand = check_strand(mate) return CIGAR, readNAME, seq, qs, refposleft, strand -# defines global variables for MT-table parsing +# defines global variables for MT-table parsing #check this - this function might be dismissed at some point if we don't use the mt-table anymore def varnames2(b, c, i): global Position, Ref, Cov, A, C, G, T, A_f, C_f, G_f, T_f, A_r, C_r, G_r, T_r Position = int((i[0]).strip()) @@ -621,7 +621,7 @@ def s_encoding(s): def mtvcf_main_analysis(mtable_file=None, coverage_data_file=None, sam_file=None, name2=None, tail=5, Q=25, minrd=5, ref_mt=None, tail_mismatch=5): - + coverage_data = parse_coverage_data_file(coverage_data_file) if sam_file.endswith("gz"): @@ -642,7 +642,8 @@ def mtvcf_main_analysis(mtable_file=None, coverage_data_file=None, sam_file=None Coverage = [] ref = SeqIO.index(ref_mt, 'fasta') ref_seq = ref[list(ref.keys())[0]].seq - for n in range(len(coverage_data)): + #for n in range(len(coverage_data)): + for n in range(len(ref_seq)): Coverage.append(coverage_data[n + 1]) mtDNA.append(ref_seq[n]) mtDNAseq = "".join(mtDNA)