diff --git a/.appveyor.yml b/.appveyor.yml index 14145cf1b..8fe288094 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -18,19 +18,23 @@ skip_tags: true # - docs/* # - '**/*.html' +# Appveyor Windows images are based on the Visual Studio version +image: Visual Studio 2019 + # We use Mingw/Msys, so use pacman for installs install: - set HOME=. - set MSYSTEM=MINGW64 - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% - set MINGWPREFIX=x86_64-w64-mingw32 - - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl\"" + - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-autotools mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-tools-git\"" build_script: - set HOME=. - set MSYSTEM=MINGW64 - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% - - "sh -lc \"autoheader && autoconf && ./configure --enable-werror CFLAGS='-g -O3' && make -j2\"" + - git submodule update --init --recursive + - "sh -lc \"autoreconf -i && ./configure --enable-werror CFLAGS='-g -O3' && make -j2\"" #build_script: # - make diff --git a/.cirrus.yml b/.cirrus.yml new file mode 100644 index 000000000..fc4405b08 --- /dev/null +++ b/.cirrus.yml @@ -0,0 +1,209 @@ +# Note we have a maximum of 16 CPUs available, so adjust our +# builds so we can start all concurrently without needing to schedule. + +# Sadly though there is still a finite limit to macOS of one instance. +# Can we cull our Mac test to just one instance? 
+ +timeout_in: 20m + +#-------------------------------------------------- +# Template: build libdeflate dependency + +libdeflate_template: &LIBDEFLATE + libdeflate_script: | + if test "x$USE_LIBDEFLATE" == "xyes"; then + pushd "$HOME" + git clone --depth 1 https://github.com/ebiggers/libdeflate.git + pushd libdeflate + cmake -B build -DLIBDEFLATE_BUILD_SHARED_LIB=OFF -DLIBDEFLATE_BUILD_GZIP=OFF -DCMAKE_C_FLAGS="-g -O3 -fPIC $LIBDEFLATE_CFLAGS" + cmake --build build --verbose + popd + popd + fi + +#-------------------------------------------------- +# Template: compile and test + +compile_template: &COMPILE + compile_script: | + git submodule update --init --recursive + if test "x$USE_LIBDEFLATE" = "xyes"; then + CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate/build" --with-libdeflate' + else + CONFIG_OPTS='--without-libdeflate' + fi + if test "$USE_CONFIG" = "yes"; then + MAKE_OPTS= + autoreconf -i + eval ./configure --enable-plugins --enable-werror $CONFIG_OPTS CFLAGS=\"-g -O3 $CFLAGS\" || \ + ( cat config.log; false ) + else + MAKE_OPTS=-e + fi + if test "x$DO_MAINTAINER_CHECKS" = "xyes"; then + make maintainer-check + fi + make -j 4 $MAKE_OPTS + +test_template: &TEST + test_script: | + make test-shlib-exports + make test + if test "x$DO_UNTRACKED_FILE_CHECK" = "xyes"; then make check-untracked ; fi + +#-------------------------------------------------- +# Task: linux builds. 
+ +# Debian + latest GCC +gcc_task: + name: debian-gcc + container: + image: gcc:latest + cpu: 2 + memory: 1G + + environment: + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + + matrix: + - environment: + DO_MAINTAINER_CHECKS: yes + DO_UNTRACKED_FILE_CHECK: yes + USE_CONFIG: no + - environment: + USE_CONFIG: yes + CFLAGS: -std=c99 -pedantic -Wformat=2 + USE_LIBDEFLATE: yes + + install_script: | + apt-get update + apt-get install -y --no-install-suggests --no-install-recommends \ + ca-certificates libc-dev make git autoconf automake \ + zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev \ + cmake + + << : *LIBDEFLATE + << : *COMPILE + << : *TEST + + +# Ubuntu + Clang +ubuntu_task: + name: ubuntu-clang + container: + image: ubuntu:latest + # image: ubuntu:devel + cpu: 2 + memory: 1G + + environment: + CC: clang + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + + matrix: + - environment: + USE_CONFIG: yes + DO_UNTRACKED_FILE_CHECK: yes + - environment: + USE_CONFIG: yes + CFLAGS: -g -Wall -O3 -fsanitize=address + LDFLAGS: -fsanitize=address + USE_LIBDEFLATE: yes + + # NB: we could consider building a docker image with these + # preinstalled and specifying that instead, to speed up testing. + install_script: | + apt-get update + apt-get install -y --no-install-suggests --no-install-recommends \ + ca-certificates clang libc-dev make git autoconf automake \ + zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev \ + libdeflate-dev + + << : *COMPILE + << : *TEST + + +# Rocky Linux +rocky_task: + name: rockylinux-gcc + container: + image: rockylinux:9 + cpu: 2 + memory: 1G + + environment: + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + USE_CONFIG: yes + CFLAGS: -std=gnu90 + + # NB: we could consider building a docker image with these + # preinstalled and specifying that instead, to speed up testing. 
+ install_script: | + yum install -y autoconf automake make gcc perl-Data-Dumper perl-FindBin \ + zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel \ + ncurses-devel diffutils git + + << : *COMPILE + << : *TEST + +# Arm Linux +arm_ubuntu_task: + name: ubuntu-arm + arm_container: + image: ubuntu:latest + cpu: 2 + memory: 1G + + environment: + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + DO_UNTRACKED_FILE_CHECK: yes + USE_CONFIG: yes + CFLAGS: -g -Wall -O3 -std=c99 -pedantic + + # NB: we could consider building a docker image with these + # preinstalled and specifying that instead, to speed up testing. + install_script: | + apt-get update + apt-get install -y --no-install-suggests --no-install-recommends \ + ca-certificates clang libc-dev make git autoconf automake \ + zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev \ + libdeflate-dev + + << : *COMPILE + << : *TEST + +#-------------------------------------------------- +# Task: macOS builds + +macosx_task: + name: macosx + clang + macos_instance: + image: ghcr.io/cirruslabs/macos-ventura-base:latest + + environment: + CC: clang + CFLAGS: "-Wall -arch arm64 -arch x86_64" + LDFLAGS: "-arch arm64 -arch x86_64" + LIBDEFLATE_CFLAGS: "-arch arm64 -arch x86_64" + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + + matrix: + - environment: + USE_CONFIG: no + - environment: + USE_CONFIG: yes + USE_LIBDEFLATE: yes + + package_install_script: | + HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz git \ + cmake + + << : *LIBDEFLATE + << : *COMPILE + << : *TEST + diff --git a/.gitattributes b/.gitattributes index efd67e585..5d9850bc7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -9,10 +9,18 @@ # Omit these files from release tarballs. /.appveyor.yml export-ignore .git* export-ignore -/.travis.yml export-ignore +/.cirrus.yml export-ignore README.md export-ignore # Remove the text attribute from reference files, so that git doesn't convert # line separators on Windows machines. 
It causes the index files to become out # of sync with the fasta files. *.fa* -text + +# Remove the text attribute from index_dos.sam, so that the line separators +# for the test file don't get converted into Unix format. +test/index_dos.sam -text + +# Remove the text attribute from various faidx test files +test/faidx/faidx*.fa* -text +test/faidx/fastqs*.fq* -text diff --git a/.gitignore b/.gitignore index 363698f1c..1dafc3615 100644 --- a/.gitignore +++ b/.gitignore @@ -7,16 +7,20 @@ *.dll.a *.pc.tmp *-uninstalled.pc +config_vars.h /version.h autom4te.cache config.cache +config.guess config.h config.h.in config.log config.mk config.status +config.sub configure +install-sh hfile_*.bundle hfile_*.cygdll @@ -25,6 +29,7 @@ hfile_*.so hts-object-files htslib_static.mk +htscodecs.mk cyg*.dll lib*.a @@ -39,25 +44,34 @@ shlib-exports-*.txt /bgzip /htsfile /tabix +/test/faidx/*.tmp* +/test/faidx/FAIL* /test/fieldarith /test/hfile /test/hts_endian /test/longrefs/*.tmp.* /test/pileup +/test/pileup_mod /test/plugins-dlhts /test/sam /test/tabix/*.tmp.* /test/tabix/FAIL* /test/test-bcf-sr /test/test-bcf-translate +/test/test-bcf_set_variant_type /test/test_bgzf +/test/test_expr +/test/test_faidx /test/test_index +/test/test_introspection /test/test_kfunc /test/test_kstring +/test/test_mod /test/test-parse-reg /test/test_realn /test/test-regidx /test/test_str2int +/test/test_time_funcs /test/test-vcf-api /test/test-vcf-sweep /test/test_view diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..cb6f98d42 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "htscodecs"] + path = htscodecs + url = https://github.com/samtools/htscodecs.git + fetchRecurseSubmodules = true diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 9c604d52c..000000000 --- a/.travis.yml +++ /dev/null @@ -1,108 +0,0 @@ -# Control file for continuous integration testing at http://travis-ci.org/ - -language: c - -matrix: - include: - - compiler: gcc - os: linux - 
env: DO_MAINTAINER_CHECKS=yes USE_CONFIG=no - - - compiler: gcc-8 - os: linux - env: USE_CONFIG=yes CC=gcc-8 AR=gcc-ar-8 - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - gcc-8 - - # An optimised build with address and leak checking, also using libdeflate - - compiler: gcc-8 - os: linux - dist: xenial - env: USE_CONFIG=yes USE_LIBDEFLATE=yes CC=gcc-8 AR=gcc-ar-8 CFLAGS="-g -Wall -O3 -fsanitize=address" LDFLAGS="-fsanitize=address" - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - gcc-8 - - - compiler: clang - os: osx - env: USE_CONFIG=no - - - compiler: clang - os: osx - env: USE_CONFIG=yes - - - compiler: clang - os: osx - env: USE_CONFIG=yes USE_LIBDEFLATE=yes - - - compiler: gcc - os: linux - env: USE_CONFIG=yes - - - compiler: clang - os: linux - env: USE_CONFIG=yes - - - compiler: gcc - os: linux - env: CFLAGS="-std=c99 -pedantic" USE_CONFIG=yes - - # Big-endian - - compiler: gcc - arch: s390x - os: linux - env: USE_CONFIG=yes - addons: - apt: - packages: - - libbz2-dev - - liblzma-dev - - -# For MacOSX systems -before_install: - - | - if [[ "$TRAVIS_OS_NAME" == "osx" && "$USE_CONFIG" == "no" ]]; then - HOMEBREW_NO_AUTO_UPDATE=1 brew install xz || ( brew update && brew install xz ) - fi - -before_script: - - | - if test "x$USE_LIBDEFLATE" == "xyes"; then - pushd "$HOME" && \ - git clone --depth 1 https://github.com/ebiggers/libdeflate.git && \ - pushd libdeflate && \ - make -j 2 CFLAGS='-fPIC -O3' libdeflate.a && \ - popd && \ - popd - fi - -script: - - | - if test "x$USE_LIBDEFLATE" = "xyes"; then - CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate" --with-libdeflate' - else - CONFIG_OPTS='--without-libdeflate' - fi - - | - if test "$USE_CONFIG" = "yes"; then - MAKE_OPTS= ; - autoreconf && \ - eval ./configure --enable-plugins --enable-werror $CONFIG_OPTS CFLAGS=\"-g -O3 $CFLAGS\" || \ - ( cat config.log; false ) - else - MAKE_OPTS=-e - fi && \ - if test "x$DO_MAINTAINER_CHECKS" = 
"xyes"; then - make maintainer-check - fi && \ - make -j 2 $MAKE_OPTS && \ - make test-shlib-exports && \ - make test diff --git a/INSTALL b/INSTALL index 3ee36d0cf..e0fddd9d7 100644 --- a/INSTALL +++ b/INSTALL @@ -17,6 +17,7 @@ In addition, building the configure script requires: autoheader autoconf + autoreconf Running the configure script uses awk, along with a number of standard UNIX tools (cat, cp, grep, mv, rm, sed, among others). Almost @@ -41,21 +42,38 @@ a development ('-dev' or '-devel') package separate from the main library. Disabling libbzip2 and liblzma will make some CRAM files unreadable, so is not recommended. -Using libcurl provides HTSlib with better network protocol support, for -example it enables the use of https:// URLs. It is also required if -direct access to Amazon S3 or Google Cloud Storage is enabled. +Using libcurl provides HTSlib with network protocol support, for +example it enables the use of ftp://, http://, and https:// URLs. +It is also required if direct access to Amazon S3 or Google Cloud +Storage is enabled. Amazon S3 support requires an HMAC function to calculate a message authentication code. On MacOS, the CCHmac function from the standard -library is used. Systems that do not have CChmac will get this from +library is used. Systems that do not have CCHmac will get this from libcrypto. libcrypto is part of OpenSSL or one of its derivatives (LibreSSL or BoringSSL). -On Microsoft Windows we recommend use of Mingw64/Msys2. Note that -currently for the test harness to work you will need to override the -test temporary directory with e.g.: make check TEST_OPTS="-t C:/msys64/tmp/_" -Whilst the code may work on Windows with other environments, these have -not be verified. +On Microsoft Windows we recommend use of Mingw64/Msys2. Whilst the +code may work on Windows with other environments, these have not been +verified. Use of the configure script is a requirement too. 
+ +Update htscodecs submodule +========================== + +Note that this section only applies to git checkouts. If you're building +from a release tar file, you can skip this section. + +Some parts of HTSlib are provided by the external "htscodecs" project. This +is included as a submodule. When building from the git repository, +either clone the project using "git clone --recurse-submodules", or run: + + git submodule update --init --recursive + +to ensure the correct version of the submodule is present. + +It is also possible to link against an external libhtscodecs library +by using the '--with-external-htscodecs' configure option. When +this is used, the submodule files will be ignored. Building Configure ================== @@ -64,12 +82,7 @@ This step is only needed if configure.ac has been changed, or if configure does not exist (for example, when building from a git clone). The configure script and config.h.in can be built by running: - autoheader - autoconf - -If you have a full GNU autotools install, you can alternatively run: - - autoreconf + autoreconf -i Basic Installation ================== @@ -88,7 +101,7 @@ configure and just type 'make; make install' as for previous versions of HTSlib. However if the build fails you should run './configure' as it can diagnose the common reasons for build failures. -The 'make' command builds the HTSlib library and and various useful +The 'make' command builds the HTSlib library and various useful utilities: bgzip, htsfile, and tabix. If compilation fails you should run './configure' as it can diagnose problems with your build environment that cause build failures. @@ -97,6 +110,7 @@ The 'make install' command installs the libraries, library header files, utilities, several manual pages, and a pkgconfig file to /usr/local. The installation location can be changed by configuring with --prefix=DIR or via 'make prefix=DIR install' (see Installation Locations below). +Shared library permissions can be set via e.g. 
'make install LIB_PERM=755'. Configuration @@ -115,6 +129,9 @@ various features and specify further optional external requirements: any enabled pluggable facilities (such as libcurl file access) are built directly within HTSlib. + Programs that are statically linked to a libhts.a with plugins enabled + need to be linked using -rdynamic or a similar linker option. + The https://github.com/samtools/htslib-plugins repository contains several additional plugins, including the iRODS (http://irods.org/) file access plugin previously distributed with HTSlib. @@ -129,10 +146,16 @@ various features and specify further optional external requirements: searched; you can use --with-plugin-path='DIR:$(plugindir):DIR' and so on to cause additional directories to be searched. +--with-external-htscodecs + Build and link against an external copy of the htscodecs library + instead of using the source files in the htscodecs directory. + --enable-libcurl - Use libcurl (http://curl.haxx.se/libcurl/) to implement network access to - remote files via FTP, HTTP, HTTPS, etc. By default, HTSlib uses its - own simple networking code to provide access via FTP and HTTP only. + Use libcurl (https://curl.se/libcurl/) to implement network access to + remote files via FTP, HTTP, HTTPS, etc. By default or with + --enable-libcurl=check, configure will probe for libcurl and include + this functionality if libcurl is available. Use --disable-libcurl + to prevent this. --enable-gcs Implement network access to Google Cloud Storage. By default or with @@ -158,6 +181,12 @@ various features and specify further optional external requirements: By default, ./configure will probe for libdeflate and use it if available. To prevent this, use --without-libdeflate. +Each --enable-FEATURE/--disable-FEATURE/--with-PACKAGE/--without-PACKAGE +option listed also has an opposite, e.g., --without-external-htscodecs +or --disable-plugins. However, apart from those options for which the +default is to probe for related facilities, using these opposite options +is mostly unnecessary as they just select the default configure behaviour. 
+ The configure script also accepts the usual options and environment variables for tuning installation locations and compilers: type './configure --help' for details. For example, @@ -232,13 +261,49 @@ RedHat / CentOS sudo yum install autoconf automake make gcc perl-Data-Dumper zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel +Note: On some versions perl FindBin will need to be installed to make the tests work. + +sudo yum install perl-FindBin + Alpine Linux ------------ -sudo apk update # Ensure the package list is up to date -sudo apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev +doas apk update # Ensure the package list is up to date +doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev openssl-dev + +Note: some older Alpine versions use libressl-dev rather than openssl-dev. OpenSUSE -------- sudo zypper install autoconf automake make gcc perl zlib-devel libbz2-devel xz-devel libcurl-devel libopenssl-devel + +Windows MSYS2/MINGW64 +--------------------- + +The configure script must be used as without it the compilation will +likely fail. + +Follow the MSYS2 installation instructions at +https://www.msys2.org/wiki/MSYS2-installation/ + +Then relaunch the MSYS2 shell using the "MSYS2 MinGW x64" executable. +Once in that environment (check $MSYSTEM equals "MINGW64") install the +compilers using pacman -S and the following package list: + +base-devel mingw-w64-x86_64-toolchain +mingw-w64-x86_64-libdeflate mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 +mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-autotools +mingw-w64-x86_64-tools-git + +(The last is only needed for building libraries compatible with MSVC.) + +HP-UX +----- + +HP-UX requires that shared libraries have execute permission. The +default for HTSlib is to install with permission 644 (read-write for +owner and read-only for group / other). 
This can be overridden by +setting the LIB_PERM variable at install time with: + + make install LIB_PERM=755 diff --git a/LICENSE b/LICENSE index 8456da591..925d47b40 100644 --- a/LICENSE +++ b/LICENSE @@ -3,7 +3,7 @@ according to the terms of the following MIT/Expat license.] The MIT/Expat License -Copyright (C) 2012-2020 Genome Research Ltd. +Copyright (C) 2012-2023 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,7 +29,7 @@ according to the terms of the following Modified 3-Clause BSD license.] The Modified-BSD License -Copyright (C) 2012-2020 Genome Research Ltd. +Copyright (C) 2012-2023 Genome Research Ltd. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/Makefile b/Makefile index 245b7a16c..ef5d61147 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile for htslib, a C library for high-throughput sequencing data formats. # -# Copyright (C) 2013-2020 Genome Research Ltd. +# Copyright (C) 2013-2023 Genome Research Ltd. # # Author: John Marshall # @@ -25,7 +25,7 @@ CC = gcc AR = ar RANLIB = ranlib - +SOURCE_DIR = . 
# Default libraries to link if configure is not used htslib_default_libs = -lz -lm -lbz2 -llzma -lcurl @@ -35,9 +35,21 @@ CPPFLAGS = # TODO: probably update cram code to make it compile cleanly with -Wc++-compat # For testing strict C99 support add -std=c99 -D_XOPEN_SOURCE=600 #CFLAGS = -g -Wall -O2 -pedantic -std=c99 -D_XOPEN_SOURCE=600 -CFLAGS = -g -Wall -O2 -fvisibility=hidden +ifdef DEBUG + CFLAGS = -DDEBUG -g3 -gdwarf-3 + LDFLAGS = -g3 -gdwarf-3 +else + CFLAGS = -O3 + LDFLAGS = +endif +ifdef PROFILE + CFLAGS += -pg +endif +CFLAGS += -Wall -fPIC EXTRA_CFLAGS_PIC = -fpic -LDFLAGS = -fvisibility=hidden +TARGET_CFLAGS = +LDFLAGS = +VERSION_SCRIPT_LDFLAGS = -Wl,-version-script,$(srcprefix)htslib.map LIBS = $(htslib_default_libs) prefix = /usr/local @@ -57,7 +69,8 @@ MKDIR_P = mkdir -p INSTALL = install -p INSTALL_DATA = $(INSTALL) -m 644 INSTALL_DIR = $(MKDIR_P) -m 755 -INSTALL_LIB = $(INSTALL_DATA) +LIB_PERM = 644 +INSTALL_LIB = $(INSTALL) -m $(LIB_PERM) INSTALL_MAN = $(INSTALL_DATA) INSTALL_PROGRAM = $(INSTALL) @@ -74,14 +87,19 @@ BUILT_TEST_PROGRAMS = \ test/fieldarith \ test/hfile \ test/pileup \ + test/pileup_mod \ test/plugins-dlhts \ test/sam \ test/test_bgzf \ + test/test_expr \ + test/test_faidx \ test/test_kfunc \ test/test_kstring \ + test/test_mod \ test/test_realn \ test/test-regidx \ test/test_str2int \ + test/test_time_funcs \ test/test_view \ test/test_index \ test/test-vcf-api \ @@ -89,7 +107,9 @@ BUILT_TEST_PROGRAMS = \ test/test-bcf-sr \ test/fuzz/hts_open_fuzzer.o \ test/test-bcf-translate \ - test/test-parse-reg + test/test-parse-reg \ + test/test_introspection \ + test/test-bcf_set_variant_type BUILT_THRASH_PROGRAMS = \ test/thrash_threads1 \ @@ -103,19 +123,50 @@ BUILT_THRASH_PROGRAMS = \ all: lib-static lib-shared $(BUILT_PROGRAMS) plugins $(BUILT_TEST_PROGRAMS) \ htslib_static.mk htslib-uninstalled.pc +ALL_CPPFLAGS = -I. 
$(CPPFLAGS) + +# Usually htscodecs.mk is generated by running configure or config.status, +# but if those aren't used create a default here. +htscodecs.mk: + echo '# Default htscodecs.mk generated by Makefile' > $@ + echo 'include $$(HTSPREFIX)htscodecs_bundled.mk' >> $@ + $(srcdir)/hts_probe_cc.sh '$(CC)' '$(CFLAGS) $(CPPFLAGS)' '$(LDFLAGS)' >> $@ + +srcdir = . +srcprefix = $(SOURCE_DIR)/ HTSPREFIX = -include htslib_vars.mk + +# Flags for SIMD code +HTS_CFLAGS_AVX2 = +HTS_CFLAGS_AVX512 = +HTS_CFLAGS_SSE4 = + +# Control building of SIMD code. Not used if configure has been run. +HTS_BUILD_AVX2 = +HTS_BUILD_AVX512 = +HTS_BUILD_SSSE3 = +HTS_BUILD_POPCNT = +HTS_BUILD_SSE4_1 = + +include $(SOURCE_DIR)/htslib_vars.mk +include htscodecs.mk # If not using GNU make, you need to copy the version number from version.sh # into here. -PACKAGE_VERSION := $(shell ./version.sh) +PACKAGE_VERSION := $(shell $(srcdir)/version.sh) LIBHTS_SOVERSION = 3 -MACH_O_COMPATIBILITY_VERSION = $(LIBHTS_SOVERSION) + +# Version numbers for the Mac dynamic library. Note that the leading 3 +# is not strictly necessary and should be removed the next time +# LIBHTS_SOVERSION is bumped (see #1144 and +# https://developer.apple.com/library/archive/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html#//apple_ref/doc/uid/TP40002013-SW23) +MACH_O_COMPATIBILITY_VERSION = 3.1.18 +MACH_O_CURRENT_VERSION = 3.1.18 # $(NUMERIC_VERSION) is for items that must have a numeric X.Y.Z string # even if this is a dirty or untagged Git working tree. -NUMERIC_VERSION := $(shell ./version.sh numeric) +NUMERIC_VERSION := $(shell $(srcdir)/version.sh numeric) # Force version.h to be remade if $(PACKAGE_VERSION) has changed. 
version.h: $(if $(wildcard version.h),$(if $(findstring "$(PACKAGE_VERSION)",$(shell cat version.h)),,force)) @@ -130,18 +181,32 @@ show-version: @echo PACKAGE_VERSION = $(PACKAGE_VERSION) @echo NUMERIC_VERSION = $(NUMERIC_VERSION) +config_vars.h: override escape=$(subst ',\x27,$(subst ",\",$(subst \,\\,$(1)))) +config_vars.h: override hts_cc_escaped=$(call escape,$(CC)) +config_vars.h: override hts_cppflags_escaped=$(call escape,$(CPPFLAGS)) +config_vars.h: override hts_cflags_escaped=$(call escape,$(CFLAGS)) +config_vars.h: override hts_ldflags_escaped=$(call escape,$(LDFLAGS)) +config_vars.h: override hts_libs_escaped=$(call escape,$(LIBS)) + +config_vars.h: + printf '#define HTS_CC "%s"\n#define HTS_CPPFLAGS "%s"\n#define HTS_CFLAGS "%s"\n#define HTS_LDFLAGS "%s"\n#define HTS_LIBS "%s"\n' \ + '$(hts_cc_escaped)' \ + '$(hts_cppflags_escaped)' \ + '$(hts_cflags_escaped)' \ + '$(hts_ldflags_escaped)' \ + '$(hts_libs_escaped)' > $@ + .SUFFIXES: .bundle .c .cygdll .dll .o .pico .so .c.o: - $(CC) $(CFLAGS) -I. $(CPPFLAGS) -c -o $@ $< + $(CC) $(CFLAGS) -I$(SOURCE_DIR) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $< .c.pico: - $(CC) $(CFLAGS) -I. 
$(CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< + $(CC) $(CFLAGS) -I$(SOURCE_DIR) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< LIBHTS_OBJS = \ kfunc.o \ - knetfile.o \ kstring.o \ bcf_sr_sort.o \ bgzf.o \ @@ -149,8 +214,8 @@ LIBHTS_OBJS = \ faidx.o \ header.o \ hfile.o \ - hfile_net.o \ hts.o \ + hts_expr.o \ hts_os.o\ md5.o \ multipart.o \ @@ -159,6 +224,7 @@ LIBHTS_OBJS = \ regidx.o \ region.o \ sam.o \ + sam_mods.o \ synced_bcf_reader.o \ vcf_sweep.o \ tbx.o \ @@ -172,13 +238,12 @@ LIBHTS_OBJS = \ cram/cram_external.o \ cram/cram_index.o \ cram/cram_io.o \ - cram/cram_samtools.o \ cram/cram_stats.o \ cram/mFILE.o \ cram/open_trace_file.o \ cram/pooled_alloc.o \ - cram/rANS_static.o \ cram/string_alloc.o \ + $(HTSCODECS_OBJS) \ $(NONCONFIGURE_OBJS) # Without configure we wish to have a rich set of default figures, @@ -197,14 +262,15 @@ cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) cram_structs_h = cram/cram_structs.h $(htslib_thread_pool_h) $(htslib_cram_h) cram/string_alloc.h cram/mFILE.h $(htslib_khash_h) cram_open_trace_file_h = cram/open_trace_file.h cram/mFILE.h bcf_sr_sort_h = bcf_sr_sort.h $(htslib_synced_bcf_reader_h) $(htslib_kbitset_h) +fuzz_settings_h = fuzz_settings.h header_h = header.h cram/string_alloc.h cram/pooled_alloc.h $(htslib_khash_h) $(htslib_kstring_h) $(htslib_sam_h) hfile_internal_h = hfile_internal.h $(htslib_hts_defs_h) $(htslib_hfile_h) $(textutils_internal_h) hts_internal_h = hts_internal.h $(htslib_hts_h) $(textutils_internal_h) +hts_time_funcs_h = hts_time_funcs.h sam_internal_h = sam_internal.h $(htslib_sam_h) textutils_internal_h = textutils_internal.h $(htslib_kstring_h) thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) - # To be effective, config.mk needs to appear after most Makefile variables are # set but before most rules appear, so that it can both use previously-set # variables in its own rules' prerequisites and also update variables for use @@ -218,6 +284,9 @@ 
thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) # but if those aren't used create a default config.h here. config.h: echo '/* Default config.h generated by Makefile */' > $@ + echo '#ifndef _XOPEN_SOURCE' >> $@ + echo '#define _XOPEN_SOURCE 600' >> $@ + echo '#endif' >> $@ echo '#define HAVE_LIBBZ2 1' >> $@ echo '#define HAVE_LIBLZMA 1' >> $@ echo '#ifndef __APPLE__' >> $@ @@ -225,12 +294,28 @@ config.h: echo '#endif' >> $@ echo '#define HAVE_DRAND48 1' >> $@ echo '#define HAVE_LIBCURL 1' >> $@ + if [ "x$(HTS_BUILD_POPCNT)" != "x" ] && \ + [ "x$(HTS_BUILD_SSE4_1)" != "x" ] && \ + [ "x$(HTS_BUILD_SSSE3)" != "x" ]; then \ + echo '#define HAVE_POPCNT 1' >> $@ ; \ + echo '#define HAVE_SSE4_1 1' >> $@ ; \ + echo '#define HAVE_SSSE3 1' >> $@ ; \ + echo '#if defined(HTS_ALLOW_UNALIGNED) && HTS_ALLOW_UNALIGNED == 0' >> $@ ; \ + echo '#define UBSAN 1' >> $@ ; \ + echo '#endif' >> $@ ; \ + fi + if [ "x$(HTS_BUILD_AVX2)" != "x" ] ; then \ + echo '#define HAVE_AVX2 1' >> $@ ; \ + fi + if [ "x$(HTS_BUILD_AVX512)" != "x" ] ; then \ + echo '#define HAVE_AVX512 1' >> $@ ; \ + fi # And similarly for htslib.pc.tmp ("pkg-config template"). No dependency # on htslib.pc.in listed, as if that file is newer the usual way to regenerate # this target is via configure or config.status rather than this rule. htslib.pc.tmp: - sed -e '/^static_libs=/s/@static_LIBS@/$(htslib_default_libs)/;s#@[^-][^@]*@##g' htslib.pc.in > $@ + sed -e '/^static_libs=/s/@static_LIBS@/$(htslib_default_libs)/;s#@[^-][^@]*@##g' $(srcprefix)htslib.pc.in > $@ # Create a makefile fragment listing the libraries and LDFLAGS needed for # static linking. 
This can be included by projects that want to build @@ -254,10 +339,10 @@ SHLIB_FLAVOUR = cygdll lib-shared: cyghts-$(LIBHTS_SOVERSION).dll else ifeq "$(findstring MSYS,$(PLATFORM))" "MSYS" SHLIB_FLAVOUR = dll -lib-shared: hts-$(LIBHTS_SOVERSION).dll +lib-shared: hts-$(LIBHTS_SOVERSION).dll hts-$(LIBHTS_SOVERSION).def hts-$(LIBHTS_SOVERSION).lib else ifeq "$(findstring MINGW,$(PLATFORM))" "MINGW" SHLIB_FLAVOUR = dll -lib-shared: hts-$(LIBHTS_SOVERSION).dll +lib-shared: hts-$(LIBHTS_SOVERSION).dll hts-$(LIBHTS_SOVERSION).def hts-$(LIBHTS_SOVERSION).lib else SHLIB_FLAVOUR = so lib-shared: libhts.so @@ -265,6 +350,9 @@ endif BUILT_PLUGINS = $(PLUGIN_OBJS:.o=$(PLUGIN_EXT)) +ifneq "$(BUILT_PLUGINS)" "" +plugins: lib-shared +endif plugins: $(BUILT_PLUGINS) @@ -274,6 +362,10 @@ libhts.a: $(LIBHTS_OBJS) -$(RANLIB) $@ print-config: + @echo HTS_CFLAGS_AVX2 = $(HTS_CFLAGS_AVX2) + @echo HTS_CFLAGS_AVX512 = $(HTS_CFLAGS_AVX512) + @echo HTS_CFLAGS_SSE4 = $(HTS_CFLAGS_SSE4) + @echo HTS_HAVE_NEON = $(HTS_HAVE_NEON) @echo LDFLAGS = $(LDFLAGS) @echo LIBHTS_OBJS = $(LIBHTS_OBJS) @echo LIBS = $(LIBS) @@ -285,7 +377,7 @@ print-config: # file used at runtime (when $LD_LIBRARY_PATH includes the build directory). libhts.so: $(LIBHTS_OBJS:.o=.pico) - $(CC) -shared -Wl,-soname,libhts.so.$(LIBHTS_SOVERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS:.o=.pico) $(LIBS) -lpthread + $(CC) -shared -Wl,-soname,libhts.so.$(LIBHTS_SOVERSION) $(VERSION_SCRIPT_LDFLAGS) $(LDFLAGS) -o $@ $(LIBHTS_OBJS:.o=.pico) $(LIBS) -lpthread ln -sf $@ libhts.so.$(LIBHTS_SOVERSION) # Similarly this also creates libhts.NN.dylib as a byproduct, so that programs @@ -293,7 +385,7 @@ libhts.so: $(LIBHTS_OBJS:.o=.pico) # includes this project's build directory). 
libhts.dylib: $(LIBHTS_OBJS) - $(CC) -dynamiclib -install_name $(libdir)/libhts.$(LIBHTS_SOVERSION).dylib -current_version $(NUMERIC_VERSION) -compatibility_version $(MACH_O_COMPATIBILITY_VERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS) $(LIBS) + $(CC) -dynamiclib -install_name $(libdir)/libhts.$(LIBHTS_SOVERSION).dylib -current_version $(MACH_O_CURRENT_VERSION) -compatibility_version $(MACH_O_COMPATIBILITY_VERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS) $(LIBS) ln -sf $@ libhts.$(LIBHTS_SOVERSION).dylib cyghts-$(LIBHTS_SOVERSION).dll libhts.dll.a: $(LIBHTS_OBJS) @@ -302,16 +394,59 @@ cyghts-$(LIBHTS_SOVERSION).dll libhts.dll.a: $(LIBHTS_OBJS) hts-$(LIBHTS_SOVERSION).dll hts.dll.a: $(LIBHTS_OBJS) $(CC) -shared -Wl,--out-implib=hts.dll.a -Wl,--enable-auto-import -Wl,--exclude-all-symbols $(LDFLAGS) -o $@ -Wl,--whole-archive $(LIBHTS_OBJS) -Wl,--no-whole-archive $(LIBS) -lpthread +hts-$(LIBHTS_SOVERSION).def: hts-$(LIBHTS_SOVERSION).dll + gendef hts-$(LIBHTS_SOVERSION).dll + +hts-$(LIBHTS_SOVERSION).lib: hts-$(LIBHTS_SOVERSION).def + dlltool -m i386:x86-64 -d hts-$(LIBHTS_SOVERSION).def -l hts-$(LIBHTS_SOVERSION).lib + +# Bundling libraries, binaries, dll dependencies, and licenses into a +# single directory. NB: This is not needed for end-users, but a test bed +# for maintainers building binary distributions. +# +# NOTE: only tested on the supported MSYS2/MINGW64 environment. 
+dist-windows: DESTDIR= +dist-windows: prefix=dist-windows +dist-windows: install + cp hts-$(LIBHTS_SOVERSION).def hts-$(LIBHTS_SOVERSION).lib dist-windows/lib + cp `ldd hts-$(LIBHTS_SOVERSION).dll| awk '/mingw64/ {print $$3}'` dist-windows/bin + mkdir -p dist-windows/share/licenses/htslib + -cp -r /mingw64/share/licenses/mingw-w64-libraries \ + /mingw64/share/licenses/brotli \ + /mingw64/share/licenses/bzip2 \ + /mingw64/share/licenses/gcc-libs \ + /mingw64/share/licenses/libdeflate \ + /mingw64/share/licenses/libpsl \ + /mingw64/share/licenses/libtre \ + /mingw64/share/licenses/libwinpthread \ + /mingw64/share/licenses/openssl \ + /mingw64/share/licenses/xz \ + /mingw64/share/licenses/zlib \ + /mingw64/share/licenses/zstd \ + dist-windows/share/licenses/ + -cp -r /usr/share/licenses/curl \ + dist-windows/share/licenses/ + cp LICENSE dist-windows/share/licenses/htslib/ + + # Target to allow htslib.mk to build all the object files before it # links the shared and static libraries. hts-object-files: $(LIBHTS_OBJS) touch $@ +# On Unix dlopen("libhts.so.NN", RTLD_LAZY) may default to RTLD_LOCAL. +# Hence plugins need to link to (shared) libhts.so.NN themselves, as they +# may not be able to access libhts symbols via the main program's libhts +# if that was dynamically loaded without an explicit RTLD_GLOBAL. %.so: %.pico libhts.so $(CC) -shared -Wl,-E $(LDFLAGS) -o $@ $< libhts.so $(LIBS) -lpthread -%.bundle: %.o libhts.dylib - $(CC) -bundle -Wl,-undefined,dynamic_lookup $(LDFLAGS) -o $@ $< libhts.dylib $(LIBS) +# For programs *statically* linked to libhts.a, on macOS loading a plugin +# linked to a shared libhts.NN.dylib would lead to conflicting duplicate +# symbols. Fortunately macOS dlopen() defaults to RTLD_GLOBAL so there +# is less need for plugins to link back to libhts themselves. 
+%.bundle: %.o + $(CC) -bundle -Wl,-undefined,dynamic_lookup $(LDFLAGS) -o $@ $< $(LIBS) %.cygdll: %.o libhts.dll.a $(CC) -shared $(LDFLAGS) -o $@ $< libhts.dll.a $(LIBS) @@ -323,18 +458,18 @@ hts-object-files: $(LIBHTS_OBJS) bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h) errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h) kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) -knetfile.o knetfile.pico: knetfile.c config.h $(htslib_hts_log_h) $(htslib_knetfile_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(htslib_kstring_h) $(hts_internal_h) $(htslib_khash_h) hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) -hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) $(htslib_knetfile_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) -hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) -hts.o hts.pico: hts.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) +hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(hts_time_funcs_h) +hts.o hts.pico: hts.c config.h os/lzma_stub.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h 
$(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_expr_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h) +hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(htslib_hts_log_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c -vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) -sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) +vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) +sam.o sam.pico: sam.c config.h $(fuzz_settings_h) $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) +sam_mods.o sam_mods.pico: sam_mods.c config.h $(htslib_sam_h) $(textutils_internal_h) tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h) faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h) $(hts_internal_h) bcf_sr_sort.o bcf_sr_sort.pico: bcf_sr_sort.c config.h $(bcf_sr_sort_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h) @@ -351,21 +486,38 @@ probaln.o probaln.pico: probaln.c config.h $(htslib_hts_h) realn.o realn.pico: realn.c config.h $(htslib_hts_h) 
$(htslib_sam_h) textutils.o textutils.pico: textutils.c config.h $(htslib_hfile_h) $(htslib_kstring_h) $(htslib_sam_h) $(hts_internal_h) -cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(cram_h) +cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(fuzz_settings_h) $(htslib_hts_endian_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(cram_h) cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) -cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) -cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) +cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) $(textutils_internal_h) +cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htscodecs_rANS_static4x16_h) $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) -cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) -cram/cram_samtools.o cram/cram_samtools.pico: cram/cram_samtools.c config.h $(cram_h) $(htslib_sam_h) $(sam_internal_h) +cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(fuzz_settings_h) $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) cram/cram_stats.o 
cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h $(htslib_hts_log_h) $(cram_os_h) cram/mFILE.h cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(htslib_hts_h) cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c config.h cram/pooled_alloc.h $(cram_misc_h) -cram/rANS_static.o cram/rANS_static.pico: cram/rANS_static.c config.h cram/rANS_static.h cram/rANS_byte.h cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/string_alloc.h thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) $(htslib_hts_log_h) +htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_utils_h) $(htscodecs_c_simple_model_h) +htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_utils_h) $(htscodecs_c_simple_model_h) +htscodecs/htscodecs/htscodecs.o htscodecs/htscodecs/htscodecs.pico: htscodecs/htscodecs/htscodecs.c $(htscodecs_htscodecs_h) $(htscodecs_version_h) +htscodecs/htscodecs/pack.o htscodecs/htscodecs/pack.pico: htscodecs/htscodecs/pack.c config.h $(htscodecs_pack_h) +htscodecs/htscodecs/rANS_static32x16pr.o htscodecs/htscodecs/rANS_static32x16pr.pico: htscodecs/htscodecs/rANS_static32x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static32x16pr_avx2.o htscodecs/htscodecs/rANS_static32x16pr_avx2.pico: htscodecs/htscodecs/rANS_static32x16pr_avx2.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) 
$(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) $(htscodecs_permute_h) +htscodecs/htscodecs/rANS_static32x16pr_avx512.o htscodecs/htscodecs/rANS_static32x16pr_avx512.pico: htscodecs/htscodecs/rANS_static32x16pr_avx512.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static32x16pr_neon.o htscodecs/htscodecs/rANS_static32x16pr_neon.pico: htscodecs/htscodecs/rANS_static32x16pr_neon.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static32x16pr_sse4.o htscodecs/htscodecs/rANS_static32x16pr_sse4.pico: htscodecs/htscodecs/rANS_static32x16pr_sse4.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(htscodecs_utils_h) $(htscodecs_rANS_static32x16pr_h) +htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_utils_h) $(htscodecs_rANS_static_h) +htscodecs/htscodecs/rle.o htscodecs/htscodecs/rle.pico: htscodecs/htscodecs/rle.c config.h $(htscodecs_varint_h) $(htscodecs_rle_h) +htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/utils.o htscodecs/htscodecs/utils.pico: htscodecs/htscodecs/utils.c config.h 
$(htscodecs_utils_h) + +# Extra CFLAGS for specific files +htscodecs/htscodecs/rANS_static32x16pr_avx2.o htscodecs/htscodecs/rANS_static32x16pr_avx2.pico: TARGET_CFLAGS = $(HTS_CFLAGS_AVX2) +htscodecs/htscodecs/rANS_static32x16pr_avx512.o htscodecs/htscodecs/rANS_static32x16pr_avx512.pico: TARGET_CFLAGS = $(HTS_CFLAGS_AVX512) +htscodecs/htscodecs/rANS_static32x16pr_sse4.o htscodecs/htscodecs/rANS_static32x16pr_sse4.pico: TARGET_CFLAGS = $(HTS_CFLAGS_SSE4) bgzip: bgzip.o libhts.a $(CC) $(LDFLAGS) -o $@ bgzip.o libhts.a $(LIBS) -lpthread @@ -376,10 +528,54 @@ htsfile: htsfile.o libhts.a tabix: tabix.o libhts.a $(CC) $(LDFLAGS) -o $@ tabix.o libhts.a $(LIBS) -lpthread -bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) +bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_hfile_h) htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h) +# Runes to check that the htscodecs submodule is present +ifdef HTSCODECS_SOURCES +htscodecs/htscodecs/%.c: | htscodecs/htscodecs + @if test -e htscodecs/.git && test ! -e "$@" ; then \ + echo "Missing file '$@'" ; \ + echo " - Do you need to update the htscodecs submodule?" ; \ + false ; \ + fi + +htscodecs/htscodecs/%.h: | htscodecs/htscodecs + @if test -e htscodecs/.git && test ! -e "$@" ; then \ + echo "Missing file '$@'" ; \ + echo " - Do you need to update the htscodecs submodule?" 
; \ + false ; \ + fi + +htscodecs/htscodecs: + @if test -e .git ; then \ + printf "\\n\\nError: htscodecs submodule files not present for htslib.\\n\ + Try running: \\n\ + git submodule update --init --recursive\\n\ + in the top-level htslib directory and then re-run make.\\n\\n\\n" ; \ + else \ + printf "\\n\\nError: htscodecs submodule files not present and this is not a git checkout.\\n\ + You have an incomplete distribution. Please try downloading one of the\\n\ + official releases from https://www.htslib.org/\\n" ; \ + fi + @false + +# Build the htscodecs/htscodecs/version.h file if necessary +htscodecs/htscodecs/version.h: force + @if test -e $(srcdir)/htscodecs/.git && test -e $(srcdir)/htscodecs/configure.ac ; then \ + vers=`cd $(srcdir)/htscodecs && git describe --always --dirty --match 'v[0-9]\.[0-9]*'` && \ + case "$$vers" in \ + v*) vers=$${vers#v} ;; \ + *) iv=`awk '/^AC_INIT/ { match($$0, /^AC_INIT\(htscodecs, *([0-9](\.[0-9])*)\)/, m); print substr($$0, m[1, "start"], m[1, "length"]) }' $(srcdir)/htscodecs/configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \ + esac ; \ + if ! grep -s -q '"'"$$vers"'"' $@ ; then \ + echo 'Updating $@ : #define HTSCODECS_VERSION_TEXT "'"$$vers"'"' ; \ + echo '#define HTSCODECS_VERSION_TEXT "'"$$vers"'"' > $@ ; \ + fi ; \ + fi +endif + # Maintainer source code checks # - copyright boilerplate presence # - tab and trailing space detection @@ -387,32 +583,54 @@ maintainer-check: test/maintainer/check_copyright.pl . test/maintainer/check_spaces.pl . +# Look for untracked files in the git repository. +check-untracked: + @if test -e .git && git status --porcelain | grep '^\?'; then \ + echo 'Untracked files detected (see above). Please either clean up, add to .gitignore, or for test output files consider naming them to match *.tmp or *.tmp.*' ; \ + false ; \ + fi + +# Create a shorthand. 
We use $(SRC) or $(srcprefix) rather than $(srcdir)/ +# for brevity in test and install rules, and so that build logs do not have +# ./ sprinkled throughout. +SRC = $(srcprefix) + # For tests that might use it, set $REF_PATH explicitly to use only reference # areas within the test suite (or set it to ':' to use no reference areas). # # If using MSYS, avoid poor shell expansion via: # MSYS2_ARG_CONV_EXCL="*" make check -check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) +check test: all $(HTSCODECS_TEST_TARGETS) test/hts_endian + test/test_expr test/test_kfunc test/test_kstring test/test_str2int + test/test_time_funcs test/fieldarith test/fieldarith.sam test/hfile - HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -g ./libhts.$(SHLIB_FLAVOUR) - HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -l ./libhts.$(SHLIB_FLAVOUR) + if test "x$(BUILT_PLUGINS)" != "x"; then \ + HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -g ./libhts.$(SHLIB_FLAVOUR); \ + fi + if test "x$(BUILT_PLUGINS)" != "x"; then \ + HTS_PATH=. 
test/with-shlib.sh test/plugins-dlhts -l ./libhts.$(SHLIB_FLAVOUR); \ + fi test/test_bgzf test/bgziptest.txt test/test-parse-reg -t test/colons.bam + cd test/faidx && ./test-faidx.sh faidx.tst + cd test/sam_filter && ./filter.sh filter.tst cd test/tabix && ./test-tabix.sh tabix.tst cd test/mpileup && ./test-pileup.sh mpileup.tst - REF_PATH=: test/sam test/ce.fa test/faidx.fa test/fastqs.fq + cd test/fastq && ./test-fastq.sh + cd test/base_mods && ./base-mods.sh base-mods.tst + REF_PATH=: test/sam test/ce.fa test/faidx/faidx.fa test/faidx/fastqs.fq test/test-regidx cd test && REF_PATH=: ./test.pl $${TEST_OPTS:-} test/hts_endian: test/hts_endian.o $(CC) $(LDFLAGS) -o $@ test/hts_endian.o $(LIBS) -test/fuzz/hts_open_fuzzer: test/fuzz/hts_open_fuzzer.o +test/fuzz/hts_open_fuzzer: test/fuzz/hts_open_fuzzer.o libhts.a $(CC) $(LDFLAGS) -o $@ test/fuzz/hts_open_fuzzer.o libhts.a $(LIBS) -lpthread test/fieldarith: test/fieldarith.o libhts.a @@ -424,6 +642,9 @@ test/hfile: test/hfile.o libhts.a test/pileup: test/pileup.o libhts.a $(CC) $(LDFLAGS) -o $@ test/pileup.o libhts.a $(LIBS) -lpthread +test/pileup_mod: test/pileup_mod.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/pileup_mod.o libhts.a $(LIBS) -lpthread + test/plugins-dlhts: test/plugins-dlhts.o $(CC) $(LDFLAGS) -o $@ test/plugins-dlhts.o $(LIBS) @@ -433,12 +654,21 @@ test/sam: test/sam.o libhts.a test/test_bgzf: test/test_bgzf.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_bgzf.o libhts.a -lz $(LIBS) -lpthread +test/test_expr: test/test_expr.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_expr.o libhts.a -lz $(LIBS) -lpthread + +test/test_faidx: test/test_faidx.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_faidx.o libhts.a -lz $(LIBS) -lpthread + test/test_kfunc: test/test_kfunc.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a -lz $(LIBS) -lpthread test/test_kstring: test/test_kstring.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_kstring.o libhts.a -lz $(LIBS) -lpthread +test/test_mod: test/test_mod.o libhts.a + 
$(CC) $(LDFLAGS) -o $@ test/test_mod.o libhts.a $(LIBS) -lpthread + test/test_realn: test/test_realn.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_realn.o libhts.a $(LIBS) -lpthread @@ -451,6 +681,9 @@ test/test-parse-reg: test/test-parse-reg.o libhts.a test/test_str2int: test/test_str2int.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_str2int.o libhts.a $(LIBS) -lpthread +test/test_time_funcs: test/test_time_funcs.o + $(CC) $(LDFLAGS) -o $@ test/test_time_funcs.o + test/test_view: test/test_view.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LIBS) -lpthread @@ -469,26 +702,89 @@ test/test-bcf-sr: test/test-bcf-sr.o libhts.a test/test-bcf-translate: test/test-bcf-translate.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a -lz $(LIBS) -lpthread +test/test_introspection: test/test_introspection.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_introspection.o libhts.a $(LIBS) -lpthread + +test/test-bcf_set_variant_type: test/test-bcf_set_variant_type.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test-bcf_set_variant_type.o libhts.a $(LIBS) -lpthread + +# Extra tests for bundled htscodecs +test_htscodecs_rans4x8: htscodecs/tests/rans4x8 + cd htscodecs/tests && srcdir=. && export srcdir && ./rans4x8.test + +test_htscodecs_rans4x16: htscodecs/tests/rans4x16pr + cd htscodecs/tests && srcdir=. && export srcdir && ./rans4x16.test + +test_htscodecs_arith: htscodecs/tests/arith_dynamic + cd htscodecs/tests && srcdir=. && export srcdir && ./arith.test + +test_htscodecs_tok3: htscodecs/tests/tokenise_name3 + cd htscodecs/tests && srcdir=. && export srcdir && ./tok3.test + +test_htscodecs_fqzcomp: htscodecs/tests/fqzcomp_qual + cd htscodecs/tests && srcdir=. 
&& export srcdir && ./fqzcomp.test + +test_htscodecs_varint: htscodecs/tests/varint + cd htscodecs/tests && ./varint + +htscodecs/tests/arith_dynamic: htscodecs/tests/arith_dynamic_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/fqzcomp_qual: htscodecs/tests/fqzcomp_qual_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/rans4x16pr: htscodecs/tests/rANS_static4x16pr_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/rans4x8: htscodecs/tests/rANS_static_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/tokenise_name3: htscodecs/tests/tokenise_name3_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/varint: htscodecs/tests/varint_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/arith_dynamic_test.o: CPPFLAGS += -Ihtscodecs +htscodecs/tests/arith_dynamic_test.o: htscodecs/tests/arith_dynamic_test.c config.h $(htscodecs_arith_dynamic_h) +htscodecs/tests/fqzcomp_qual_test.o: CPPFLAGS += -Ihtscodecs +htscodecs/tests/fqzcomp_qual_test.o: htscodecs/tests/fqzcomp_qual_test.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) +htscodecs/tests/rANS_static4x16pr_test.o: CPPFLAGS += -Ihtscodecs +htscodecs/tests/rANS_static4x16pr_test.o: htscodecs/tests/rANS_static4x16pr_test.c config.h $(htscodecs_rANS_static4x16_h) +htscodecs/tests/rANS_static_test.o: CPPFLAGS += -Ihtscodecs +htscodecs/tests/rANS_static_test.o: htscodecs/tests/rANS_static_test.c config.h $(htscodecs_rANS_static_h) +htscodecs/tests/tokenise_name3_test.o: CPPFLAGS += -Ihtscodecs +htscodecs/tests/tokenise_name3_test.o: htscodecs/tests/tokenise_name3_test.c config.h $(htscodecs_tokenise_name3_h) +htscodecs/tests/varint_test.o: CPPFLAGS += -Ihtscodecs +htscodecs/tests/varint_test.o: htscodecs/tests/varint_test.c config.h $(htscodecs_varint_h) + 
test/hts_endian.o: test/hts_endian.c config.h $(htslib_hts_endian_h) test/fuzz/hts_open_fuzzer.o: test/fuzz/hts_open_fuzzer.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) test/fieldarith.o: test/fieldarith.c config.h $(htslib_sam_h) test/hfile.o: test/hfile.c config.h $(htslib_hfile_h) $(htslib_hts_defs_h) $(htslib_kstring_h) test/pileup.o: test/pileup.c config.h $(htslib_sam_h) $(htslib_kstring_h) +test/pileup_mod.o: test/pileup_mod.c config.h $(htslib_sam_h) test/plugins-dlhts.o: test/plugins-dlhts.c config.h test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_khash_h) $(htslib_hts_log_h) -test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hfile_internal_h) +test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(hfile_internal_h) +test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h) test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h) test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h) +test/test_mod.o: test/test_mod.c config.h $(htslib_sam_h) test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_sam_h) test/test_realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(textutils_internal_h) test/test_str2int.o: test/test_str2int.c config.h $(textutils_internal_h) +test/test_time_funcs.o: test/test_time_funcs.c config.h $(hts_time_funcs_h) test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_hts_log_h) +test/test_faidx.o: test/test_faidx.c config.h $(htslib_faidx_h) test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h) test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h) 
test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h) -test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h) +test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h) $(htslib_hts_h) $(htslib_vcf_h) test/test-bcf-translate.o: test/test-bcf-translate.c config.h $(htslib_vcf_h) +test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hfile_h) +test/test-bcf_set_variant_type.o: test/test-bcf_set_variant_type.c config.h $(htslib_hts_h) vcf.c test/thrash_threads1: test/thrash_threads1.o libhts.a @@ -527,13 +823,12 @@ test-shlib-exports: header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt ( echo "Error: Found unexported symbols (listed above)" ; false ) # Extract symbols that should be exported from public headers using ctags -# Filter out macros in htslib/hts_defs.h, and knet_win32_ functions that -# aren't needed on non-Windows platforms. +# Filter out macros in htslib/hts_defs.h. header-exports.txt: test/header_syms.pl htslib/*.h test/header_syms.pl htslib/*.h | sort -u -o $@ shlib-exports-so.txt: libhts.so - nm -D -g libhts.so | awk '$$2 == "T" { print $$3 }' | sort -u -o $@ + nm -D -g libhts.so | awk '$$2 == "T" { sub("@.*", "", $$3); print $$3 }' | sort -u -o $@ shlib-exports-dylib.txt: libhts.dylib nm -Ug libhts.dylib | awk '$$2 == "T" { sub("^_", "", $$3); print $$3 }' | sort -u -o $@ @@ -541,14 +836,39 @@ shlib-exports-dylib.txt: libhts.dylib shlib-exports-dll.txt: hts.dll.a nm -g hts.dll.a | awk '$$2 == "T" { print $$3 }' | sort -u -o $@ +# Append a new HTSLIB_<current version> node to htslib.map listing the symbols that nm reports libhts.so exporting as @@Base; refuses to run if the version is unchanged. +$(srcprefix)htslib.map: libhts.so + LC_ALL=C ; export LC_ALL; \ + curr_vers=`expr 'X$(PACKAGE_VERSION)' : 'X\([0-9]*\.[0-9.]*\)'` ; \ + last_vers=`awk '/^HTSLIB_[0-9](\.[0-9]+)+/ { lv = $$1 } END { print lv }' htslib.map` ; \ + if test "x$$curr_vers" = 'x' || test "x$$last_vers" = 'x' ; then \ + echo "Version check failed : $$curr_vers / $$last_vers" 1>&2 ; \ + exit 1 ; \ + fi && \ + if test "HTSLIB_$$curr_vers" = "$$last_vers" ;
then \ + echo "Refusing to update $@ - HTSlib version not changed" 1>&2 ; \ + exit 1 ; \ + fi && \ + nm --with-symbol-versions -D -g libhts.so | awk '$$2 ~ /^[DGRT]$$/ && $$3 ~ /@@Base$$/ && $$3 !~ /^(_init|_fini|_edata)@@/ { sub(/@@Base$$/, ";", $$3); print " " $$3 }' > $@.tmp && \ + if [ -s $@.tmp ] ; then \ + cat $@ > $@.new.tmp && \ + printf '\n%s {\n' "HTSLIB_$$curr_vers" >> $@.new.tmp && \ + cat $@.tmp >> $@.new.tmp && \ + printf '} %s;\n' "$$last_vers" >> $@.new.tmp && \ + rm -f $@.tmp && \ + mv $@.new.tmp $@ ; \ + else \ + rm -f $@.tmp ; \ + fi + install: libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) installdirs install-$(SHLIB_FLAVOUR) install-pkgconfig $(INSTALL_PROGRAM) $(BUILT_PROGRAMS) $(DESTDIR)$(bindir) if test -n "$(BUILT_PLUGINS)"; then $(INSTALL_PROGRAM) $(BUILT_PLUGINS) $(DESTDIR)$(plugindir); fi - $(INSTALL_DATA) htslib/*.h $(DESTDIR)$(includedir)/htslib + $(INSTALL_DATA) $(SRC)htslib/*.h $(DESTDIR)$(includedir)/htslib $(INSTALL_DATA) libhts.a $(DESTDIR)$(libdir)/libhts.a - $(INSTALL_MAN) bgzip.1 htsfile.1 tabix.1 $(DESTDIR)$(man1dir) - $(INSTALL_MAN) faidx.5 sam.5 vcf.5 $(DESTDIR)$(man5dir) - $(INSTALL_MAN) htslib-s3-plugin.7 $(DESTDIR)$(man7dir) + $(INSTALL_MAN) $(SRC)bgzip.1 $(SRC)htsfile.1 $(SRC)tabix.1 $(DESTDIR)$(man1dir) + $(INSTALL_MAN) $(SRC)faidx.5 $(SRC)sam.5 $(SRC)vcf.5 $(DESTDIR)$(man5dir) + $(INSTALL_MAN) $(SRC)htslib-s3-plugin.7 $(DESTDIR)$(man7dir) installdirs: $(INSTALL_DIR) $(DESTDIR)$(bindir) $(DESTDIR)$(includedir) $(DESTDIR)$(includedir)/htslib $(DESTDIR)$(libdir) $(DESTDIR)$(man1dir) $(DESTDIR)$(man5dir) $(DESTDIR)$(man7dir) $(DESTDIR)$(pkgconfigdir) @@ -589,18 +909,27 @@ htslib-uninstalled.pc: htslib.pc.tmp testclean: - -rm -f test/*.tmp test/*.tmp.* test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt + -rm -f test/*.tmp test/*.tmp.* test/faidx/*.tmp* test/faidx/FAIL* \ + test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* \ + header-exports.txt
shlib-exports-$(SHLIB_FLAVOUR).txt + -rm -rf htscodecs/tests/test.out + +# Only remove this in git checkouts +DEL_HTSCODECS_VERSION := $(if $(wildcard htscodecs/.git),htscodecs/htscodecs/version.h) mostlyclean: testclean - -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM version.h + -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM config_vars.h version.h + -rm -f htscodecs/htscodecs/*.o htscodecs/htscodecs/*.pico $(DEL_HTSCODECS_VERSION) -rm -f hts-object-files + -rm -f htscodecs/tests/*.o clean: mostlyclean clean-$(SHLIB_FLAVOUR) -rm -f libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) $(BUILT_TEST_PROGRAMS) $(BUILT_THRASH_PROGRAMS) + -rm -f htscodecs/tests/rans4x8 htscodecs/tests/rans4x16pr htscodecs/tests/arith_dynamic htscodecs/tests/tokenise_name3 htscodecs/tests/fqzcomp_qual htscodecs/tests/varint distclean maintainer-clean: clean -rm -f config.cache config.h config.log config.mk config.status - -rm -f TAGS *.pc.tmp *-uninstalled.pc htslib_static.mk + -rm -f TAGS *.pc.tmp *-uninstalled.pc htslib_static.mk htscodecs.mk -rm -rf autom4te.cache clean-so: @@ -634,7 +963,7 @@ distdir: force: -.PHONY: all check clean distclean distdir force +.PHONY: all check check-untracked clean distclean distdir force .PHONY: install install-pkgconfig installdirs lib-shared lib-static .PHONY: maintainer-check maintainer-clean mostlyclean plugins .PHONY: print-config print-version show-version tags @@ -643,3 +972,5 @@ force: .PHONY: clean-cygdll install-cygdll .PHONY: clean-dll install-dll .PHONY: clean-dylib install-dylib +.PHONY: test_htscodecs_rans4x8 test_htscodecs_rans4x16 test_htscodecs_arith +.PHONY: test_htscodecs_tok3 test_htscodecs_fqzcomp test_htscodecs_varint diff --git a/NEWS b/NEWS index 290183012..afc4ad9a8 100644 --- a/NEWS +++ b/NEWS @@ -1,15 +1,1213 @@ Noteworthy changes in release a.b ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* Added tabix --separate-regions option, which lets the user group the output by - the number of corresponding target regions 
supplied in the command line. +Noteworthy changes in release 1.18 (25th July 2023) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Updates +------- + +* Using CRAM 3.1 no longer gives a warning about the specification + being draft. Note CRAM 3.0 is still the default output format. + (PR#1583) + +* Replaced use of sprintf with snprintf, to silence potential warnings + from Apple's compilers and those who implement similar checks. + (PR#1594, fixes #1586. Reported by Oleksii Nikolaienko) + +* Fastq output will now generate empty records for reads with no + sequence data (i.e. sequence is "*" in SAM format). (PR#1576, + fixes samtools/samtools#1576. Reported by Nils Homer) + +* CRAM decoding speed-ups. (PR#1580) + +* A new MN aux tag can now be used to verify that MM/ML base modification + data has not been broken by hard clipping. (PR#1590, PR#1612. See also + PR samtools/hts-specs#714 and issue samtools/hts-specs#646. + Reported by Jared Simpson) + +* The base modification API has been improved to make it easier for callers + to tell unchecked bases from unmodified ones. (PR#1636, fixes #1550. + Requested by Chris Wright) + +* A new bam_mods_queryi() API has been added to return additional + data about the i-th base modification returned by bam_mods_recorded(). + (PR#1636, fixes #1550 and #1635. Requested by Jared Simpson) + +* Speed up index look-ups for whole-chromosome queries. (PR#1596) + +* Mpileup now merges adjacent (mis)match CIGAR operations, so CIGARs + using the X/= operators give the same results as if the M operator + was used. (PR#1607, fixes #1597. Reported by Marcel Martin) + +* It's now possible to call bcf_sr_set_regions() after adding readers + using bcf_sr_add_reader() (previously this returned an error). Doing so + will discard any unread data, and reset the readers so they iterate over + the new regions. (PR#1624, fixes samtools/bcftools#1918. 
Reported by + Gregg Thomas) + +* The synced BCF reader can now accept regions with reference names including + colons and hyphens, by enclosing them in curly braces. For example, + {chr_part:1-1001}:10-20 will return bases 10 to 20 from reference + "chr_part:1-1001". (PR#1630, fixes #1620. Reported by Bren) + +* Add a "samples" directory with code demonstrating usage of HTSlib plus + a tutorial document. (PR#1589) + +Build changes +------------- + +* Htscodecs has been updated to 1.5.1 (PR#1654) + +* Htscodecs SIMD code now works with Apple multiarch binaries. + (PR#1587, HTSlib fix for samtools/htscodecs#76. Reported by John Marshall) + +* Improve portability of "expr" usage in version.sh. + (PR#1593, fixes #1592. Reported by John Marshall) + +* Improve portability to *BSD targets by ensuring _XOPEN_SOURCE is defined + correctly and that source files properly include "config.h". Perl + scripts also now all use #!/usr/bin/env instead of assuming that + it's in /usr/bin/perl. (PR#1628, fixes #1606. + Reported by Robert Clausecker) + +* Fixed NAME entry in htslib-s3-plugin man page so the whatis and apropos + commands find it. (PR#1634, thanks to Étienne Mollier) + +* Assorted dependency tracking fixes. (PR#1653, thanks to John Marshall) + +Documentation updates +--------------------- + +* Changed Alpine build instructions as they've switched back to using openssl. + (PR#1609) + +* Recommend using -rdynamic when statically linking a libhts.a with + plugins enabled. (PR#1611, thanks to John Marshall. Fixes #1600, + reported by Jack Wimberley) + +* Fixed example in docs for sam_hdr_add_line(). (PR#1618, thanks to kojix2) + +* Improved test harness for base modifications API. (PR#1648) + +Bug fixes +--------- + +* Fix a major bug when searching against a CRAM index where one container + has start and end coordinates entirely contained within the previous + container. This would occasionally miss data, and sometimes return much + more than required. 
The bug affected versions 1.11 to 1.17, although the + change in 1.11 was bug-fixing multi-threaded index queries. This bug did + not affect index building. There is no need to reindex your CRAM files. + (PR#1574, PR#1640. Fixes #1569, #1639, samtools/samtools#1808, + samtools/samtools#1819. Reported by xuxif, Jens Reeder and Jared Simpson) + +* Prevent CRAM blocks from becoming too big in files with short + sequences but very long aux tags. (PR #1613) + +* Fix bug where the CRAM decoder for CONST_INT and CONST_BYTE + codecs may incorrectly look for extra data in the CORE block. + Note that this bug only affected the experimental CRAM v4.0 decoder. + (PR#1614) + +* Fix crypt4gh redirection so it works in conjunction with non-file + IO, such as using htsget. (PR#1577) + +* Improve error checking for the VCF POS column, when facing invalid + data. (PR#1575, replaces #1570 originally reported and fixed + by Colin Nolan.) + +* Improved error checking on VCF indexing to validate the data is BGZF + compressed. (PR#1581) + +* Fix bug where bin number calculation could overflow when making iterators + over regions that go to the end of a chromosome. (PR#1595) + +* Backport attractivechaos/klib#78 (by Pall Melsted) to HTSlib. + Prevents infinite loops in kseq_read() when reading broken gzip files. + (PR#1582, fixes #1579. Reported by Goran Vinterhalter) + +* Backport attractivechaos/klib@384277a (by innoink) to HTSlib. + Fixes the kh_int_hash_func2() macro definition. + (PR#1599, fixes #1598. Reported by fanxinping) + +* Remove a compilation warning on systems with newer libcurl releases. + (PR#1572) + +* Windows: Fixed BGZF EOF check for recent MinGW releases. (PR#1601, + fixes samtools/bcftools#1901) + +* Fixed bug where tabix would not return the correct regions for files + where the column ordering is end, ..., begin instead of begin, ..., end. + (PR#1626, fixes #1622. Reported by Hiruna Samarakoon) + +* sam_format_aux1() now always NUL-terminates Z/H tags. 
(PR#1631) + +* Ensure base modification iterator is reset when no MM tag is present. + (PR#1631, PR#1647) + +* Fix segfault when attempting to write an uncompressed BAM file opened using + hts_open(name, "wbu"). This was attempting to write BAM data without + wrapping it in BGZF blocks, which is invalid according to the BAM + specification. "wbu" is now internally converted to "wb0" to output + uncompressed data wrapped in BGZF blocks. (PR#1632, fixes #1617. + Reported by Joyjit Daw) + +* Fixed over-strict bounds check in probaln_glocal() which caused it to make + sub-optimal alignments when the requested band width was greater than the + query length. (PR#1616, fixes #1605. Reported by Jared Simpson) + +* Fixed possible double frees when handling errors in bcf_hdr_add_hrec(), + if particular memory allocations fail. (PR#1637) + +* Ensure that bcf_hdr_remove() clears up all pointers to the items removed + from dictionaries. Failing to do this could have resulted in a call + requesting a deleted item via bcf_hdr_get_hrec() returning a stale pointer. + (PR#1637) + +* Stop the gzip decompresser from finishing prematurely when an empty + gzip block is followed by more data. (PR#1643, PR#1646) + +Noteworthy changes in release 1.17 (21st February 2023) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* A new API for iterating through a BAM record's aux field. + (PR#1354, addresses #1319. Thanks to John Marshall) + +* Text mode for bgzip. Allows bgzip to compress lines of text with block breaks + at newlines. + (PR#1493, thanks to Mike Lin for the initial version PR#1369) + +* Make tabix support CSI indices with large positions. Unlike SAM and VCF + files, BED files do not set a maximum reference length which hindered CSI + support. This change sets an arbitrary large size of 100G to enable it to + work. + (PR#1506) + +* Add a fai_line_length function. Exposes the internal line-wrap length. + (PR#1516) + +* Check for invalid barcode tags in fastq output. 
+ (PR#1518, fixes samtools#1728. Reported by Poshi) + +* Warn if reference found in a CRAM file is not contained in the specified + reference file. + (PR#1517 and PR#1521, adds diagnostics for #1515. Reported by Wei WeiDeng) + +* Add a faidx_seq_len64 function that can return sequence lengths longer than + INT_MAX. At the same time limit faidx_seq_len to INT_MAX output. Also add a + fai_adjust_region to ensure given ranges do not go beyond the end of the + requested sequence. + (PR#1519) + +* Add a bcf_strerror function to give text descriptions of BCF errors. + (PR#1510) + +* Add CRAM SQ/M5 header checking when specifying a fasta file. This is to + prevent creating a CRAM that cannot be decoded again. + (PR#1522. In response to samtools#1748 though not a direct fix) + +* Improve support for very long input lines (> 2Gbyte). This is mostly useful + for tabix which does not do much interpretation of its input. + (PR#1542, a partial fix for #1539) + +* Speed up load_ref_portion. This function has been sped up by about 7x, which + speeds up low-depth CRAM decoding by about 10%. + (PR#1551) + +* Expand CRAM API to cope with new samtools cram_size command. + (PR#1546) + +* Merges neighbouring I and D ops into one op within pileup. This means + 4M1D1D1D3M is reported as 4M3D3M. Fixing this in sam.c means not only is + samtools mpileup now looking better, but any tool using the mpileup API will + be getting consistent results. + (PR#1552, fixes the last remaining part of samtools#139) + +* Update the API documentation for bgzf_mt as it referred to a previous + iteration. + (PR#1556, fixes #1553. Reported by Raghavendra Padmanabhan) + + +Build changes +------------- + +* Use POSIX grep in testing as egrep and fgrep are considered obsolete. + (PR#1509, thanks to David Seifert) + +* Switch to building libdeflate with cmake for Cirrus CI. + (PR#1511) + +* Ensure strings in config_vars.h are escaped correctly. + (PR#1530, fixes #1527.
Reported by Lucas Czech) + +* Easier modification of shared library permissions during install. + (PR#1532, fixes #1525. Reported by StephDC) + +* Fix build on ancient compilers. Added -std=gnu90 to build tests so older + C compilers will still be happy. + (PR#1524, fixes #1523. Reported by Martin Jakt) + +* Switch MacOS CI tests to an ARM-based image. + (PR#1536) + +* Cut down the number of embed_ref=2 tests that get run. + (PR#1537) + +* Add symbol versions to libhts.so. This is to aid package developers. + (PR#1560 addresses #1505, thanks to John Marshall. Reported by Stefan Bruens) + +* htscodecs now updated to v1.4.0. + (PR#1563) + +* Cleaned up misleading system error reports in test_bgzf. + (PR#1565) + +Bug fixes +--------- + +* VCF. Fix n-squared complexity in sample line with many adjacent tabs [fuzz]. + (PR#1503) + +* Improved bcftools detection and reporting of bgzf decode errors. + (PR#1504, thanks to Lilian Janin. PR#1529 thanks to Bergur Ragnarsson, fixes + #1528. PR#1554) + +* Prevent crash when the only FASTA entry has no sequence [fuzz]. + (PR#1507) + +* Fixed typo in sam.h documentation. + (PR#1512, thanks to kojix2) + +* Fix buffer read-overrun in bam_plp_insertion_mod. + (PR#1520) + +* Fix hash keys being left behind by bcf_hdr_remove. + (PR#1535, fixes #1533. Reported by Giulio Genovese in #842) + +* Make bcf_hdr_idinfo_exists more robust by checking id value exists. + (PR#1544, fixes #1538. Reported by Giulio Genovese) + +* CRAM improvements. Fixed crash with multi-threaded CRAM. Fixed a bug in the + codec parameter learning for CRAM 3.1 name tokeniser. Fixed Cram compression + container substitution matrix generation, + (PR#1558, PR#1559 and PR#1562) + +Noteworthy changes in release 1.16 (18th August 2022) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* Make hfile_s3 refresh AWS credentials on expiry in order to make HTSlib work + better with AWS IAM credentials, which have a limited lifespan. 
+ (PR#1462 and PR#1474, addresses #344) + +* Allow BAM headers between 2GB and 4GB in size once more. This is not + permitted in the BAM specification but was allowed in an earlier version of + HTSlib. There is now a warning at 2GB and a hard failure at 4GB. + (PR#1421, fixes #1420 and samtools#1613. Reported by John Marshall and + R C Mueller) + +* Improve error message when failing to load an index. + (PR#1468, example of the problem samtools#1637) + +* Permit MM (base modification) tags containing "." and "?" suffixes. These + define implicit vs explicit coordinates. See the SAM tags specification for + details. + (PR#1423 and PR#1426, fixes #1418. PR#1469, fixes #1466. Reported + by cjw85) + +* Warn if spaces instead of tabs are detected in a VCF file to prevent + confusion. + (PR#1328, fixes bcftools#1575. Reported by ketkijoshi278) + +* Add an "sclen" filter expression keyword. This is the length of a soft-clip, + both left and right end. It may be combined with qlen (qlen-sclen) to obtain + the number of bases in the query sequence that have been aligned to the genome + ie it provides a way to compare local-alignment vs global-alignment length. + (PR#1441 and PR/samtools#1661, fixes #1436. Requested by Chang Y) + +* Improve error messages for CRAM reference mismatches. If the user specifies + the wrong reference, the CRAM slice header MD5sum checks fail. We now report + the SQ line M5 string too so it is possible to validate against the whole + chr in the ref.fa file. The error message has also been improved to report + the reference name instead of #num. Finally, we now hint at the likely cause, + which counters the misleading samtools supplied error of "truncated or + corrupt" file. + (PR#1427, fixes samtools#1640. Reported by Jian-Guo Zhou) + +* Expose more of the CRAM API and add new functionality to extract the reference + from a CRAM file. 
+ (PR#1429 and PR#1442) + +* Improvements to the implementation of embedded references in CRAM where no + external reference is specified. + (PR#1449, addresses some of the issues in #1445) + +* The CRAM writer now allows alignment records with RG:Z: aux tags that + don't have a corresponding @RG ID in the file header. Previously these + tags would have been silently dropped. HTSlib will complain whenever it + has to add one though, as such tags do not conform to recommended practice + for the SAM, BAM and CRAM formats. + (PR#1480, fixes #1479. Reported by Alex Leonard) + +* Set tab delimiter in man page for tabix GFF3 sort. + (PR#1457. Thanks to Colin Diesh) + +* When using libdeflate, the 1...9 scale of BGZF compression levels is + now remapped to the 1...12 range used by libdeflate instead of being + passed directly. In particular, HTSlib levels 8 and 9 now map to + libdeflate levels 10 and 12, so it is possible to select the highest (but + slowest) compression offered by libdeflate. + (PR#1488, fixes #1477. Reported by Gert Hulselmans) + +* The VCF variant API has been extended so that it can return separate flags + for INS and DEL variants as well as the existing INDEL one. These flags + have not been added to the old bcf_get_variant_types() interface as + it could break existing users. To access them, it is necessary to use new + functions bcf_has_variant_type() and bcf_has_variant_types(). + (PR#1467) + +* The missing, but trivial, `le_to_u8()` function has been added to hts_endian. + (PR#1494, Thanks to John Marshall) + +* bcf_format_gt() now works properly on big-endian platforms. + (PR#1495, Thanks to John Marshall) + +Build changes +------------- + +These are compiler, configuration and makefile based changes. + +* Update htscodecs to version 1.3.0 for new SIMD code + various fixes. + Updates the htscodecs submodule and adds changes necessary to make HTSlib + build the new SIMD codec implementations. 
+ (PR#1438, PR#1489, PR#1500) + +* Fix clang builds under mingw. Under mingw, clang requires dllexport to be + applied to both function declarations and function definitions. + (PR#1435, PR#1497, PR#1498 fixes #1433. Reported by teepean) + +* Fix curl type warning with gcc 12.1 on Windows. + (PR#1443) + +* Detect ARM Neon support and only build appropriate SIMD object files. + (PR#1451, fixes #1450. Thanks to John Marshall) + +* `make print-config` now reports extra CFLAGS that are needed to build the + SIMD parts of htscodecs. These may be of use to third-party build + systems that don't use HTSlib's or htscodecs' build infrastructure. (PR#1485. + Thanks to John Marshall) + +* Fixed some Makefile dependency issues for the "check"/"test" targets + and plugins. In particular, "make check" will now build the "all" target, + if not done already, before running the tests. + (PR#1496) + +Bug fixes +--------- + +* Fix bug when reading position -1 in BCF (0 in VCF), which is used to indicate + telomeric regions. The BCF reader was incorrectly assuming the value stored + in the file was unsigned, so a VCF->BCF->VCF round-trip would change it + from 0 to 4294967296. + (PR#1476, fixes #1475 and bcftools#1753. Reported by Rodrigo Martin) + +* Various bugs and quirks have been fixed in the filter expression engine, + mostly related to the handling of absent tags, and the is_true flag. + Note that as a result of these fixes, some filter expressions may give + different results: + - Fixed and-expressions including aux tag values which could give an invalid + true result depending on the order of terms. + - The expression `![NM]` is now true if only `NM` does not exist. In + earlier versions it would also report true for tags like `NM:i:0` which + exist but have a value of zero. + - The expression `[X1] != 0` is now false when `X1` does not exist. Earlier + versions would return true for this comparison when the tag was missing. 
+ - NULL values due to missing tags now propagate through string, bitwise + and mathematical operations. Logical operations always treat them as + false. + (PR#1463, fixes samtools#1670. Reported by Gert Hulselmans; + PR#1478, fixes samtools#1677. Reported by johnsonzcode) + +* Fix buffer overrun in bam_plp_insertion_mod. Memory now grows to the proper + size needed for base modification data. + (PR#1430, fixes samtools#1652. Reported by hd2326) + +* Remove limit of returned size from fai_retrieve(). + (PR#1446, fixes samtools#1660. Reported by Shane McCarthy) + +* Cap hts_getline() return value at INT_MAX. Prevents hts_getline() from + returning a negative number (a fail) for very long string length values. + (PR#1448. Thanks to John Marshall) + +* Fix breakend detection and test bcf_set_variant_type(). + (PR#1456, fixes #1455. Thanks to Martin Pollard) + +* Prevent arrays of BCF_BT_NULL values found in BCF files from causing + bcf_fmt_array() to call exit() as the type is unsupported. These are + now tested for and caught by bcf_record_check(), which returns an + error code instead. (PR#1486) + +* Improved detection of fasta and fastq files that have very long comments + following identifiers. (PR#1491, thanks to John Marshall. + Fixes samtools/samtools#1689, reported by cjw85) + +* Fixed a SEGV triggered by giving a SAM file to `samtools import`. + (PR#1492) + +Noteworthy changes in release 1.15.1 (7th April 2022) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* Security fix: Fixed broken error reporting in the sam_prob_realn() + function, due to a missing hts_log() parameter. Prior to this fix + (i.e., in HTSlib versions 1.8 to 1.15) it was possible to abuse + the log message format string by passing a specially crafted + alignment record to this function. (PR#1406) + +* HTSlib now uses libhtscodecs release 1.2.2. This fixes a number + of bugs where invalid compressed data could trigger usage of + uninitialised values. 
(PR#1416) + +* Fixed excessive memory used by multi-threaded SAM output on + long reads. (Part of PR#1384) + +* Fixed a bug where tabix would misinterpret region specifiers + starting at position 0. It will also now warn if the file + being indexed is supposed to be 1-based but has positions + less than or equal to 0. (PR#1411) + +* The VCF header parser will now issue a warning if it finds an + INFO header with Type=Flag but Number not equal to 0. It will + also ignore the incorrect Number so the flag can be used. (PR#1415) + +Noteworthy changes in release 1.15 (21st February 2022) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Features and Updates +-------------------- + +* Bgzip now has a --keep option to not remove the input file after + compressing. (PR#1331) + +* Improved file format detection so some BED files are no longer + detected as FASTQ or FASTA. (PR#1350, thanks to John Marshall) + +* Added xz (lzma), zstd and D4 formats to the file type detection + functions. We don't actively support reading these data types, but + function calls and htsfile can detect them. (PR#1340, thanks to + John Marshall) + +* CRAM now also uses libdeflate for read-names if the libdeflate + version is new enough (1.9 onwards). Previously we used zlib for + this due to poor performance of libdeflate. This gives a slight + speed up and reduction in file size. (PR#1383) + +* The VCF and BCF readers will now issue a warning if contig, INFO + or FORMAT IDs do not match the formats described in the VCFv4.3 + specification. Note that while the invalid names will mostly still + be accepted, future updates will convert the warnings to errors + causing files including invalid names to be rejected. (PR#1389) + +Build changes +------------- + +These are compiler, configuration and makefile based changes. + +* HTSlib now uses libhtscodecs release 1.2.1. + +* Improved support for compiling and linking against HTSlib with + Microsoft Visual Studio. (PR#1380, #1377, #1375. 
Thanks to + Aidan Bickford and John Marshall) + +* Various internal CI improvements. + +Bug fixes +--------- + +* Fixed CRAM index queries for HTSJDK output (PR#1388, reported by + Chris Norman). Note this also fixes CRAM writing, to match + the specification (and HTSJDK), from version 3.1 onwards. + +* Fixed CRAM index queries when required-fields settings are selected + to ignore CIGARs (PR#1372, reported by Giulio Genovese). + +* Unmapped but placed reads (having chr/pos) are now included in the BAM + indices. (PR#1352, thanks to John Marshall) + +* CRAM now honours the filename##idx##index nomenclature for + specifying non-standard index locations. (PR#1360, reported by + Michael Cariaso) + +* Minor CRAM v1.0 read-group fix (PR#1349, thanks to John Marshall) + +* Permit .fa and .fq file type detection as synonyms for FASTA and + FASTQ. (PR#1386). + +* Empty VCF format fields are now output as ":.:" instead of "::". + (PR#1370) + +* Repeated bcf_sr_seek calls now work. (PR#1363, reported by + Giulio Genovese) + +* Bcf_remove_allele_set now works on unpacked BCF records. (PR#1358, + reported by Brent Pedersen). + +* The hts_parse_decimal() function used to read numbers in region lists + is now better at rejecting non-numeric values. In particular it + now rejects a lone 'G' instead of interpreting it as '0G', i.e. zero. + (PR#1396, PR#1400, reported by SSSimon Yang; thanks to John Marshall). + +* Improve support for GPU issues listed by -Wdouble-promotion. + (PR#1365, reported by David Seifert) + +* Fix example code in header file documentation. (PR#1381, Thanks to + Aidan Bickford) + +Noteworthy changes in release 1.14 (22nd October 2021) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Features and Updates +-------------------- + +* Added a keep option to bgzip to leave the original file untouched. This + brings bgzip into line with gzip.
(PR #1331, thanks to Alex Petty) + +* "endpos" has been added to the filter language, giving the position + of the rightmost mapped base as measured by the CIGAR string. For + unmapped reads it is the same as "pos". (PR #1307, thanks to John Marshall) + +* Interfaces have been added to interpret the new base modification tags + added to the SAMtags document in samtools/hts-specs#418. (PR #1132) + +* New API functions hts_flush()/sam_flush()/bcf_flush() for flushing output + htsFile/samFile/vcfFile streams. (PR #1326, thanks to John Marshall) + +* The synced_bcf_reader now sorts lines with symbolic alleles by END tag as + well as POS. (PR #1321) + +* Added synced_bcf_reader options BCF_SR_REGIONS_OVERLAP and + BCF_SR_TARGETS_OVERLAP for better control of how records that start outside + the desired region but overlap it are handled. Fixes samtools/bcftools#1420 + and samtools/bcftools#1421 raised by John Marshall. (PR #1327) + +* HTSlib will now accept long-cigar CG:B: tags made by htsjdk which don't + quite follow the specification properly (using signed values instead of + unsigned). Thanks to Colin Diesh for reporting an example file. (PR #1317) + +* The warning printed when the BGZF reader finds a file with no EOF block + has been changed to be less alarming. Unfortunately some third-party + BGZF encoders don't write EOF blocks at the end of files. Thanks to + Keiran Raine for reporting an example file. (PR #1323) + +* The FASTA and FASTQ readers get an option to skip over the first item on + the header line, and use the second as the read name. It allows the original + name to be restored on some of the fastq files served from the European + Nucleotide Archive (ENA). (PR #1325) + +* HTSlib is now more strict when parsing the VCF samples line (beginning + #CHROM). It will only accept tabs between the mandatory field names and + sample names must be separated with tabs.
(PR #1328) + +* HTSlib will now warn if it looks like the header has been corrupted + by diagnostic messages from the program that made it. This can happen when + using `nohup`, which by default mixes stdout and stderr into the same + stream. (PR#1339, thanks to John Marshall) + +* File format detection will now recognise signatures for XZ, Zstd and D4 + files (note that HTSlib will not read them yet). (PR #1340, thanks to + John Marshall) + +Build changes +------------- + +These are compiler, configuration and makefile based changes. + +* Some redundant tests have been removed from the test harness, speeding it up. + (PR #1308) + +* The version.sh script now works better on shallow checkouts. (PR #1324) + +* A check-untracked Makefile target has been added to catch untracked files + (mostly) left by the test harness. (PR #1324) + +Bug fixes +--------- + +* Fixed a case where flushing the thread pool could very occasionally cause + a deadlock. (PR #1309) + +* Fixed a bug where some CRAM files could fail to decode if the required_fields + option was in use. Thanks to Matt Sexton for reporting the issue. + (PR #1314, fixes samtools/samtools#1475) -* New method vcf_open_mode() changes the opening mode of a variant call file, - based on its file extension. Similar to sam_open_mode(). +* Fixed a regression where the S3 plugin could not read public files unless + you supplied some Amazon credentials. Thanks to Chris Saunders for reporting. + (PR #1332, fixes samtools/samtools#1491) + +* Fixed a possible CRAM thread deadlock discovered by @ryancaicse. + (PR #1330, fixes #1329) + +* Some set-but-unused variables have been removed. (PR #1334) + +* Fixed a bug which prevented "flag.read2" from working in the filter + language unless it was at the end of the expression. Thanks to Vamsi Kodali + for reporting the issue. (PR #1342) + +* Fixed a memory leak that could happen if CRAM fails to inflate a LZMA + block. 
(PR #1340, thanks to John Marshall) + +Noteworthy changes in release 1.13 (7th July 2021) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Features and Updates +-------------------- + +* In case a PG header line has multiple ID tags supplied by other applications, + the header API now selects the first one encountered as the identifying tag + and issues a warning when detecting subsequent ID tags. + (#1256; fixed samtools/samtools#1393) + +* VCF header reading function (vcf_hdr_read) no longer tries to download a + remote index file by default. + (#1266; fixes #380) + +* Support reading and writing FASTQ format in the same way as SAM, BAM or CRAM. + Records read from a FASTQ file will be treated as unmapped data. + (#1156) + +* Added GCP requester pays bucket access. Thanks to @indraniel. + (#1255) + +* Made mpileup's overlap removal choose which copy to remove at random instead + of always removing the second one. This avoids strand bias in experiments + where the +ve and -ve strand reads always appear in the same order. + (#1273; fixes samtools/bcftools#1459) + +* It is now possible to use platform specific BAQ parameters. This also + selects long-read parameters for read lengths bigger than 1kb, which helps + bcftools mpileup call SNPs on PacBio CCS reads. + (#1275) + +* Improved bcf_remove_allele_set. This fixes a bug that stopped iteration over + alleles prematurely, marks removed alleles as 'missing' and does automatic + lazy unpacking. + (#1288; fixes #1259) + +* Improved compression metrics for unsorted CRAM files. This improves the + choice of codecs when handling unsorted data. + (#1291) + +* Linear index entries for empty intervals are now initialised with the file + offset in the next non-empty interval instead of the previous one. This + may reduce the amount of data iterators have to discard before reaching + the desired region, when the starting location is in a sequence gap. + Thanks to @carsonh for reporting the issue. 
+ (#1286; fixes #486) + +* A new hts_bin_level API function has been added, to compute the level of a + given bin in the binning index. + (#1286) + +* Related to the above, a new API method, hts_idx_nseq, now returns the total + number of contigs from an index. + (#1295 and #1299) + +* Added bracket handling to bcf_hdr_parse_line, for use with ##META lines. + Thanks to Alberto Casas Ortiz. + (#1240) + +Build changes +------------- + +These are compiler, configuration and makefile based changes. + +* HTSlib now uses libhtscodecs release 1.1.1. + +* Added a curl/curl.h check to configure and improved INSTALL documentation on + build options. Thanks to Melanie Kirsche and John Marshall. + (#1265; fixes #1261) + +* Some fixes to address GCC 11.1 warnings. + (#1280, #1284, #1285; fixes #1283) + +* Supports building HTSlib in a separate directory. Thanks to John Marshall. + (#1277; fixes #231) + +* Supports building HTSlib on MinGW 32-bit environments. Thanks to + John Marshall. + (#1301) + +Bug fixes +--------- + +* Fixed hts_itr_query() et al region queries: fixed bug introduced in + HTSlib 1.12, which led to iterators producing very few reads for some + queries (especially for larger target regions) when unmapped reads were + present. HTSlib 1.11 had a related problem in which iterators would omit + a few unmapped reads that should have been produced; cf #1142. + Thanks to Daniel Cooke for reporting the issue. + (#1281; fixes #1279) + +* Removed compressBound assertions on opening bgzf files. Thanks to + Gert Hulselmans for reporting the issue. + (#1258; fixed #1257) + +* Duplicate sample name error message for a VCF file now only displays the + duplicated name rather than the entire sample name list. + (#1262; fixes samtools/bcftools#1451) + +* Fix to make samtools cat work on CRAMs again. + (#1276; fixes samtools/samtools#1420) + +* Fix for a double memory free in SAM header creation. Thanks to @ihsineme. + (#1274) + +* Prevent assert in bcf_sr_set_regions.
Thanks to Dr K D Murray. + (#1270) + +* Fixed crash in knet_open() etc stubs. Thanks to John Marshall. + (#1289) + +* Fixed filter expression "cigar" on unmapped reads. Stop treating an empty + CIGAR string as an error. Thanks to Chang Y for reporting the issue. + (#1298, fixes samtools/samtools#1445) + +* Bug fixes in the bundled copy of htscodecs: + + - Fixed an uninitialized access in the name tokeniser decoder. + (samtools/htscodecs#23) + + - Fixed a bug with name tokeniser and variable number of names per slice, + causing it to incorrectly report an error on certain valid inputs. + (samtools/htscodecs#24) + + +Noteworthy changes in release 1.12 (17th March 2021) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Features and Updates +-------------------- + +* Added experimental CRAM 3.1 and 4.0 support. (#929) + + These should not be used for long term data storage as the + specification still needs to be ratified by GA4GH and may be subject + to changes in format. (This is highly likely for 4.0). However it + may be tested using: + + test/test_view -t ref.fa -C -o version=3.1 in.bam -p out31.cram + + For smaller but slower files, try varying the compression profile + with an additional "-o small". Profile choices are fast, normal, + small and archive, and can be applied to all CRAM versions. + +* Added a general filtering syntax for alignment records in SAM/BAM/CRAM + readers. (#1181, #1203) + + An example to find chromosome spanning read-pairs with high mapping + quality: 'mqual >= 30 && mrname != rname' + + To find significant sized deletions: + 'cigar =~ "[0-9]{2}D"' or 'rlen - qlen > 10'. + + To report duplicates that aren't part of a "proper pair": + 'flag.dup && !flag.proper_pair' + + More details are in the samtools.1 man page under "FILTER EXPRESSIONS". + +* The knet networking code has been removed. It only supported the http + and ftp protocols, and a better and safer alternative using libcurl + has been available since release 1.3. 
If you need access to ftp:// and + http:// URLs, HTSlib should be built with libcurl support. (#1200) + +* The old htslib/knetfile.h interfaces have been marked as deprecated. Any + code still using them should be updated to use hFILE instead. (#1200) + +* Added an introspection API for checking some of the capabilities provided + by HTSlib. (#1170) Thanks also to John Marshall for contributions. (#1222) + - `hfile_list_schemes`: returns the number of schemes found + - `hfile_list_plugins`: returns the number of plugins found + - `hfile_has_plugin`: checks if a specific plugin is available + - `hts_features`: returns a bit mask with all available features + - `hts_test_feature`: test if a feature is available + - `hts_feature_string`: return a string summary of enabled features + +* Made performance improvements to `probaln_glocal` method, which + speeds up mpileup BAQ calculations. (#1188) + - Caching of reused loop variables and removal of loop invariants + - Code reordering to remove instruction latency. + - Other refactoring and tidyups. + +* Added a public method for constructing a BAM record from the + component pieces. Thanks to Anders Kaplan. (#1159, #1164) + +* Added two public methods, `sam_parse_cigar` and `bam_parse_cigar`, as part of + a small CIGAR API (#1169, #1182). Thanks to Daniel Cameron for input. (#1147) + +* HTSlib, and the included htsfile program, will now recognise the old + RAZF compressed file format. Note that while the format is detected, + HTSlib is unable to read it. It is recommended that RAZF files are + uncompressed with `gunzip` before using them with HTSlib. Thanks to + John Marshall (#1244); and Matthew J. Oldach who reported problems + with uncompressing some RAZF files (samtools/samtools#1387). + +* The S3 plugin now has options to force the address style. It will recognise + the addressing_style and host_bucket entries in the respective aws + .credentials and s3cmd .s3cfg files. 
There is also a new HTS_S3_ADDRESS_STYLE + environment variable. Details are in the htslib-s3-plugin.7 man file (#1249). + +Build changes +------------- + +These are compiler, configuration and makefile based changes. + +* Added new Makefile targets for the applications that embed HTSlib and + want to run its test suite or clean its generated artefacts. (#1230, #1238) + +* The CRAM codecs are now obtained via the htscodecs submodule, hence + when cloning it is now best to use "git clone --recursive". In an + existing clone, you may use "git submodule update --init" to obtain + the htscodecs submodule checkout. + +* Updated CI test configuration to recurse HTSlib submodules. (#1359) + +* Added Cirrus-CI integration as a replacement for Travis, which was + phased out. (#1175; #1212) + +* Updated the Windows image used by Appveyor to 'Visual Studio 2019'. (#1172; + fixed #1166) + +* Fixed a buglet in configure.ac, exposed by the release 2.70 of autoconf. + Thanks to John Marshall. (#1198) + +* Fixed plugin linking on macOS, to prevent symbol conflict when linking + with a static HTSlib. Thanks to John Marshall. (#1184) + +* Fixed a clang++9 error in `cram_io.h`. Thanks to Pjotr Prins. (#1190) + +* Introduced $(ALL_CPPFLAGS) to allow for more flexibility in setting the + compiler flags. Thanks to John Marshall. (#1187) + +* Added 'fall through' comments to prevent warnings issued by Clang on + intentional fall through case statements, when building with + `-Wextra flag`. Thanks to John Marshall. (#1163) + +* Non-configure builds now define _XOPEN_SOURCE=600 to allow them to work + when the `gcc -std=c99` option is used. Thanks to John Marshall. (#1246) + +Bug fixes +--------- + +* Fixed VCF `#CHROM` header parsing to only separate columns at tab characters. + Thanks to Sam Morris for reporting the issue. + (#1237; fixed samtools/bcftools#1408) + +* Fixed a crash reported in `bcf_sr_sort_set`, which expects REF to be present. 
+ (#1204; fixed samtools/bcftools#1361) + +* Fixed a bcf synced reader bug when filtering with a region list, and + the first record for a chromosome had the same position as the last + record for the previous chromosome. (#1254; fixed samtools/bcftools#1441) + +* Fixed a bug in the overlapping logic of mpileup, dealing with iterating over + CIGAR segments. Thanks to `@wulj2` for the analysis. (#1202; fixed #1196) + +* Fixed a tabix bug that prevented setting the correct number of lines to be + skipped in a region file. Thanks to Jim Robinson for reporting it. (#1189; + fixed #1186) + +* Made `bam_itr_next` an alias for `sam_itr_next`, to prevent it from crashing + when working with htsFile pointers. Thanks to Torbjörn Klatt for + reporting it. (#1180; fixed #1179) + +* Fixed once per outgoing multi-threaded block `bgzf_idx_flush` assertion, to + accommodate situations when a single record could span multiple blocks. + Thanks to `@lacek`. (#1168; fixed samtools/samtools#1328) + +* Fixed assumption of pthread_t being a non-structure, as permitted by POSIX. + Thanks also to John Marshall and Anders Kaplan. (#1167, #1153, #1153) + +* Fixed the minimum offset of a BAI index bin, to account for unmapped reads. + Thanks to John Marshall for spotting the issue. (#1158; fixed #1142) + +* Fixed the CRLF handling in `sam_parse_worker` method. Thanks to + Anders Kaplan. (#1149; fixed #1148) + +* Included unistd.h and errno.h directly in HTSlib files, as opposed to + including them indirectly, via third party code. Thanks to + Andrew Patterson (#1143) and John Marshall (#1145). + + +Noteworthy changes in release 1.11 (22nd September 2020) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Features and Updates +-------------------- + +* Support added for remote reference files. fai_path() can take a remote + reference file and will return the corresponding index file. Remote indexes + can be handled by refs_load_fai(). 
UR tags in @SQ lines can now be set to + remote URIs. (#1017) + +* Added tabix --separate-regions option, which adds header comment lines + separating different regions' output records when multiple target regions + are supplied on the command line. (#1108) + +* Added tabix --cache option to set a BGZF block cache size. Most beneficial + when the -R option is used and the same blocks need to be re-read multiple + times. (#1053) + +* Improved error checking in tabix and added a --verbosity option so + it is possible to change the amount of logging when it runs. (#1040) + +* A note about the maximum chromosome length usable with TBI indexes has been + added to the tabix manual page. Thanks to John Marshall. (#1070) + +* New method vcf_open_mode() changes the opening mode of a variant file + based on its file extension. Similar to sam_open_mode(). (#1096) + +* The VCF parser has been made faster and easier to maintain. (#1057) + +* bcf_record_check() has been made faster, giving a 15% speed increase when + reading an uncompressed BCF file. (#1130) + +* The VCF parser now recognises the "<NON_REF>" symbolic allele produced + by GATK. (#1045) + +* Support has been added for simultaneous reading of unindexed VCF/BCF files + when using the synced_bcf_reader interface. Input files must have the + chromosomes in the same order as each other and be consistent with the order + of sequences in the header. (#1089) + +* The VCF and BCF readers will now attempt to fix up invalid INFO/END tags + where the stored END value is less than POS, resulting in an apparently + negative record length. Such files have been generated by programs which + used END incorrectly, and by broken lift-over processes that failed to + update any END tags present. (#1021; fixed samtools/bcftools#1154) + +* The htsFile interface can now detect the crypt4gh encrypted format (see + https://samtools.github.io/hts-specs/crypt4gh.pdf).
If HTSlib is + built with external plug-in support, and the hfile_crypt4gh plug-in is + present, the file will be passed to it for decryption. The plug-in + can be obtained from https://github.com/samtools/htslib-crypt4gh. (#1046) * hts_srand48() now seeds the same POSIX-standard sequences of pseudo-random numbers regardless of platform, including on OpenBSD where plain srand48() produces a different cryptographically-strong non-deterministic sequence. + Thanks to John Marshall. (#1002) + +* Iterators now work with 64 bit positions. (#1018) + +* Improved the speed of range queries when using BAI indexes by + making better use of the linear index data included in the file. + The best improvement is on low-coverage data. (#1031) + +* Alignments which consume no reference bases are now considered to have + length 1. This would make such alignments cover 1 reference position in + the same manner as alignments that are unmapped or have no CIGAR strings. + These alignments can now be returned by iterator-based queries. Thanks + to John Marshall. (#1063; fixed samtools/samtools#1240, see also + samtools/hts-specs#521). + +* A bam_set_seqi() function to modify a single base in the BAM structure + has been added. This is a companion function to bam_seqi(). (#1022) + +* Writing SAM format is around 30% faster. (#1035) + +* Added sam_format_aux1() which converts a BAM aux tag to a SAM format string. + (#1134) + +* bam_aux_update_str() no longer requires NUL-terminated strings. It + is also now possible to create tags containing part of a longer string. + (#1088) + +* It is now possible to use external plug-ins in language bindings that + dynamically load HTSlib. Note that a side-effect of this change is that + some plug-ins now link against libhts.so, which means that they have to be + able to find the shared library when they are started up. Thanks to + John Marshall. 
(#1072) + +* bgzf_close(), and therefore hts_close(), will now return non-zero when + closing a BGZF handle on which errors have been detected. (Part of #1117) + +* Added a special case to the kt_fisher_exact() test for when the table + probability is too small to be represented in a double. This fixes a + bug where it would, for some inputs, fail to correctly determine which + side of the distribution the table was on resulting in swapped p-values + being returned for the left- and right-tailed tests. The two-tailed + test value was not affected by this problem. (#1126) + +* Improved error diagnostics in the CRAM decoder (#1042), BGZF (#1049), + the VCF and BCF readers (#1059), and the SAM parser (#1073). + +* ks_resize() now allocates 1.5 times the requested size when it needs + to expand a kstring instead of rounding up to the next power of two. + This has been done mainly to make the inlined function smaller, but it + also reduces the overhead of storing data in kstrings at the expense of + possibly needing a few more reallocations. (#1129) + +CRAM improvements +----------------- + +* Delay CRAM crc32 checks until the data actually needs to be used. With + other changes this leads to a 20x speed up in indexing and other sub-query + based actions. (#988) + +* CRAM now handles the transition from mapped to unmapped data in a better + way, improving compression of the unmapped data. (#961) + +* CRAM can now use libdeflate. (#961) + +* Fixed bug in MD tag generation with "b" read feature codes, causing the + numbers in the tag to be too large. Note that HTSlib never uses this + feature code so it is unlikely that this bug would be seen on real data. + The problem was found when testing against hand-crafted CRAM files. (#1086) + +* Fixed a regression where the CRAM multi-region iterator became much less + efficient when using threads. It now works more like the single iterator + and does not preemptively decode the next container unless it will be used. 
+ (#1061) + +* Set CRAM default quality in lossy quality modes. If lossy quality is enabled + and 'B', 'q' or 'Q' features are used, CRAM starts off with QUAL being all 255 + (as per BAM spec and "*" quality) and then modifies individual qualities as + dictated by the specific features. + + However that then produces ASCII quality " " (space, q=-1) for the unmodified + bases. Instead ASCII quality "?" (q=30) is used, as per HTSJDK. Quality 255 + is still used for sequences with no modifications at all. (#1094) + + +Build changes +------------- + +These are compiler, configuration and makefile based changes. + +* `make all` now also builds htslib_static.mk and htslib-uninstalled.pc. + Thanks to John Marshall. (#1011) + +* Various cppcheck-1.90 warnings have been fixed. (#995, #1011) + +* HTSlib now prefers its own headers when being compiled, fixing build + failures on machines that already had a system-installed HTSlib. Thanks to + John Marshall. (#1078; fixed #347) + +* Define HTSLIB_EXPORT without using a helper macro to reduce the length of + compiler diagnostics that mention exported functions. Thanks to + John Marshall. (#1029) + +* Fix dirty default build by including latest pkg.m4 instead of using + aclocal.m4. Thanks to Damien Zammit. (#1091) + +* Struct tags have been added to htslib/*.h public typedefs. This makes it + possible to forward declare htsFile without including htslib/hts.h. Thanks + to Lucas Czech and John Marshall. (#1115; fixed #1106) + +* Fixed compiler warnings emitted by the latest gcc and clang releases + when compiling HTSlib, along with some -Wextra warnings in the public + include files. Thanks to John Marshall. (#1066, #1063, #1083) + +Bug fixes +--------- + +* Fixed hfile_libcurl breakage when using libcurl 7.69.1 or later. Thanks to + John Marshall for tracking down the exact libcurl change that caused the + incompatibility. 
(#1105; fixed samtools/samtools#1254 and + samtools/samtools#1284) + +* Fixed overflows in kroundup32() and kroundup_size_t() which caused them to + return zero when rounding up values where the most significant bit was + set. When this happens they now return the highest value that can + be stored (#1044). All of the kroundup macro definitions have also been + gathered together into a unified implementation (#1051). + +* Fixed missing return parameter value in idx_test_and_fetch(). Thanks to + Lilian Janin. (#1014) + +* Fixed crashes due to inconsistent selection between BGZF and plain (hFILE) + interfaces when reading files. [fuzz] (#1019) + +* Added and/or fixed byte swapping code for big-endian platforms. Thanks + to Jun Aruga, John Marshall, Michael R Crusoe and Gianfranco Costamagna + for their help. (#1023; fixed #119 and #355) + +* Fixed a problem with multi-threaded on-the-fly indexes which would + occasionally write virtual offsets pointing at the end of a BGZF block. + Attempting to read from such an offset caused EOF to be incorrectly + reported. These offsets are now handled correctly, and the indexer + has been updated to avoid generating them. (#1028; fixed + samtools/samtools#1197) + +* In sam_hdr_create(), free newly allocated SN strings when encountering an + error. [fuzz] (#1034) + +* Prevent double free in case of idx_test_and_fetch() failure. Thanks to + @fanwayne for the bug report. (#1047; fixed #1033) + +* In the header, link a new PG line only to valid chains. Prevents an + explosive growth of PG lines on headers where PG lines are already present + but not linked together correctly. (#1062; fixed samtools/samtools#1235) + +* Also in the header, when calling sam_hdr_update_line(), update target arrays + only when the name or length is changed. (#1007) + +* Fixed buffer overflows in CRAM MD5 calculation triggered by + files with invalid compression headers, or files with embedded + references that were one byte too short.
[fuzz] (#1024, #1068) + +* Fix mpileup regression between 1.9 and 1.10 where overlap detection + was incorrectly skipped on reads where RNEXT, PNEXT and TLEN were + set to the "unavailable" values ("*", 0, 0 in SAM). (#1097) + +* kputs() now checks for null pointer in source string. [fuzz] (#1087) + +* Fix potential bcf_update_alleles() crash on 0 alleles. Thanks to + John Marshall. (#994) + +* Added bcf_unpack() calls to some bcf_update functions to fix a bug + where updates made after a call to bcf_dup() could be lost. (#1032; + fixed #1030) + +* Error message typo "Number=R" instead of "Number=G" fixed in + bcf_remove_allele_set(). Thanks to Ilya Vorontsov. (#1100) + +* Fixed crashes that could occur in BCF files that use IDX= header annotations + to create a sparse set of CHROM, FILTER or FORMAT indexes, and + include records that use one of the missing index values. [fuzz] (#1092) + +* Fixed potential integer overflows in the VCF parser and ensured that + the total length of FORMAT fields cannot go over 2Gbytes. [fuzz] (#1044, + #1104; latter is CVE-2020-36403 affecting all HTSlib versions up to 1.10.2) + +* Download index files atomically in idx_test_and_fetch(). This prevents + corruption when running parallel jobs on S3 files. Thanks to John Marshall. + (#1112; samtools/samtools#1242). + +* The pileup constructor callback is now given the copy of the bam1_t struct + made by pileup instead of the original one passed to bam_plp_push(). This + makes it the same as the one passed to the destructor and ensures that + cached data, for example the location of an aux tag, will remain valid. + (#1127) + +* Fixed possible error in code_sort() on negative CRAM Huffman code + length. (#1008) + +* Fixed possible undefined shift in cram_byte_array_stop_decode_init(). (#1009) + +* Fixed a bug where range queries to the end of a given reference + would return incorrect results on CRAM files. 
(#1016; + fixed samtools/samtools#1173) + +* Fixed an integer overflow in cram_read_slice(). [fuzz] (#1026) + +* Fixed a memory leak on failure in cram_decode_slice(). [fuzz] (#1054) + +* Fixed a regression which caused cram_transcode_rg() to fail, resulting + in a crash in "samtools cat" on CRAM files. (#1093; + fixed samtools/samtools#1276) + +* Fixed an undersized string reallocation in the threaded SAM reader which + caused it to crash when reading SAM files with very long lines. Numerous + memory allocation checks have also been added. (#1117) Noteworthy changes in release 1.10.2 (19th December 2019) @@ -634,7 +1832,7 @@ Noteworthy changes in release 1.8 (3rd April 2018) Noteworthy changes in release 1.7 (26th January 2018) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* BAM: HTSlib now supports BAMs which include CIGARs with more than +* BAM: HTSlib now supports BAMs which include CIGARs with more than 65535 operations as per HTS-Specs 18th November (dab57f4 and 2f915a8). * BCF/VCF: @@ -652,13 +1850,13 @@ Noteworthy changes in release 1.7 (26th January 2018) (#651, #653; reported by Imran Haque and @egafni via pysam). * Multi-region iterator: The new structure takes a list of regions and - iterates over all, deduplicating reads in the process, and producing a - full list of file offset intervals. This is usually much faster than + iterates over all, deduplicating reads in the process, and producing a + full list of file offset intervals. This is usually much faster than repeatedly using the old single-region iterator on a series of regions. * Curl improvements: - Add Bearer token support via HTS_AUTH_LOCATION env (#600). - - Use CURL_CA_BUNDLE environment variable to override the CA (#622; + - Use CURL_CA_BUNDLE environment variable to override the CA (#622; thanks to Garret Kelly & David Alexander). - Speed up (removal of excessive waiting) for both http(s) and ftp. - Avoid repeatedly reconnecting by removal of unnecessary seeks. 
@@ -667,7 +1865,7 @@ Noteworthy changes in release 1.7 (26th January 2018) * BGZF block caching, if enabled, now performs far better (#629; reported by Ram Yalamanchili). -* Added an hFILE layer for in-memory I/O buffers (#590; thanks to Thomas +* Added an hFILE layer for in-memory I/O buffers (#590; thanks to Thomas Hickman). * Tidied up the drand48 support (intended for systems that do not @@ -765,7 +1963,7 @@ Release 1.4 (13 March 2017) * HTSlib now links against libbz2 and liblzma by default. To remove these dependencies, run configure with options --disable-bz2 and --disable-lzma, - but note that this may make some CRAM files produced elsewhere unreadable. + but note that this may make some CRAM files produced elsewhere unreadable. * Added a thread pool interface and replaced the bgzf multi-threading code to use this pool. BAM and CRAM decoding is now multi-threaded diff --git a/README b/README index 4225bec20..db368afdf 100644 --- a/README +++ b/README @@ -3,3 +3,25 @@ formats, such as SAM, CRAM, VCF, and BCF, used for high-throughput sequencing data. It is the core library used by samtools and bcftools. See INSTALL for building and installation instructions. 
+ +Please cite this paper when using HTSlib for your publications: + +HTSlib: C library for reading/writing high-throughput sequencing data +James K Bonfield, John Marshall, Petr Danecek, Heng Li, Valeriu Ohan, Andrew Whitwham, Thomas Keane, Robert M Davies +GigaScience, Volume 10, Issue 2, February 2021, giab007, https://doi.org/10.1093/gigascience/giab007 + +@article{10.1093/gigascience/giab007, + author = {Bonfield, James K and Marshall, John and Danecek, Petr and Li, Heng and Ohan, Valeriu and Whitwham, Andrew and Keane, Thomas and Davies, Robert M}, + title = "{HTSlib: C library for reading/writing high-throughput sequencing data}", + journal = {GigaScience}, + volume = {10}, + number = {2}, + year = {2021}, + month = {02}, + abstract = "{Since the original publication of the VCF and SAM formats, an explosion of software tools have been created to process these data files. To facilitate this a library was produced out of the original SAMtools implementation, with a focus on performance and robustness. The file formats themselves have become international standards under the jurisdiction of the Global Alliance for Genomics and Health.We present a software library for providing programmatic access to sequencing alignment and variant formats. It was born out of the widely used SAMtools and BCFtools applications. Considerable improvements have been made to the original code plus many new features including newer access protocols, the addition of the CRAM file format, better indexing and iterators, and better use of threading.Since the original Samtools release, performance has been considerably improved, with a BAM read-write loop running 5 times faster and BAM to SAM conversion 13 times faster (both using 16 threads, compared to Samtools 0.1.19). Widespread adoption has seen HTSlib downloaded \\>1 million times from GitHub and conda. 
The C library has been used directly by an estimated 900 GitHub projects and has been incorporated into Perl, Python, Rust, and R, significantly expanding the number of uses via other languages. HTSlib is open source and is freely available from htslib.org under MIT/BSD license.}", + issn = {2047-217X}, + doi = {10.1093/gigascience/giab007}, + url = {https://doi.org/10.1093/gigascience/giab007}, + note = {giab007}, + eprint = {https://academic.oup.com/gigascience/article-pdf/10/2/giab007/36332285/giab007.pdf}, +} diff --git a/README.md b/README.md index 1b01d7271..47afdba2a 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![Build Status](https://travis-ci.org/samtools/htslib.svg?branch=develop)](https://travis-ci.org/samtools/htslib) +[![Build Status](https://api.cirrus-ci.com/github/samtools/htslib.svg?branch=develop)](https://api.cirrus-ci.com/github/samtools/htslib) [![Build status](https://ci.appveyor.com/api/projects/status/v46hkwyfjp3l8nd3/branch/develop?svg=true)](https://ci.appveyor.com/project/samtools/htslib/branch/develop) [![Github All Releases](https://img.shields.io/github/downloads/samtools/htslib/total.svg)](https://github.com/samtools/htslib) @@ -12,7 +12,7 @@ HTSlib implements a generalized BAM index, with file extension `.csi` (coordinate-sorted index). The HTSlib file reader first looks for the new index and then for the old if the new index is absent. -This project also includes the popular tabix indexer, which indexes both `.tbi` +This project also includes the popular tabix indexer, which creates both `.tbi` and `.csi` formats, and the bgzip compression utility. [1]: http://samtools.github.io/hts-specs/ @@ -28,11 +28,36 @@ committed to this repository, so building the code from a Git repository requires extra steps: ```sh -autoheader # If using configure, generate the header template... 
-autoconf # ...and configure script (or use autoreconf to do both) +autoreconf -i # Build the configure script and install files it uses ./configure # Optional but recommended, for choosing extra functionality make make install ``` [download]: http://www.htslib.org/download/ + +### Citing + +Please cite this paper when using HTSlib for your publications. + +> HTSlib: C library for reading/writing high-throughput sequencing data
+> James K Bonfield, John Marshall, Petr Danecek, Heng Li, Valeriu Ohan, Andrew Whitwham, Thomas Keane, Robert M Davies
+> _GigaScience_, Volume 10, Issue 2, February 2021, giab007, https://doi.org/10.1093/gigascience/giab007 + +``` +@article{10.1093/gigascience/giab007, + author = {Bonfield, James K and Marshall, John and Danecek, Petr and Li, Heng and Ohan, Valeriu and Whitwham, Andrew and Keane, Thomas and Davies, Robert M}, + title = "{HTSlib: C library for reading/writing high-throughput sequencing data}", + journal = {GigaScience}, + volume = {10}, + number = {2}, + year = {2021}, + month = {02}, + abstract = "{Since the original publication of the VCF and SAM formats, an explosion of software tools have been created to process these data files. To facilitate this a library was produced out of the original SAMtools implementation, with a focus on performance and robustness. The file formats themselves have become international standards under the jurisdiction of the Global Alliance for Genomics and Health.We present a software library for providing programmatic access to sequencing alignment and variant formats. It was born out of the widely used SAMtools and BCFtools applications. Considerable improvements have been made to the original code plus many new features including newer access protocols, the addition of the CRAM file format, better indexing and iterators, and better use of threading.Since the original Samtools release, performance has been considerably improved, with a BAM read-write loop running 5 times faster and BAM to SAM conversion 13 times faster (both using 16 threads, compared to Samtools 0.1.19). Widespread adoption has seen HTSlib downloaded \\>1 million times from GitHub and conda. The C library has been used directly by an estimated 900 GitHub projects and has been incorporated into Perl, Python, Rust, and R, significantly expanding the number of uses via other languages. 
HTSlib is open source and is freely available from htslib.org under MIT/BSD license.}", + issn = {2047-217X}, + doi = {10.1093/gigascience/giab007}, + url = {https://doi.org/10.1093/gigascience/giab007}, + note = {giab007}, + eprint = {https://academic.oup.com/gigascience/article-pdf/10/2/giab007/36332285/giab007.pdf}, +} +``` diff --git a/bcf_sr_sort.c b/bcf_sr_sort.c index b7299a4d0..01e98bb39 100644 --- a/bcf_sr_sort.c +++ b/bcf_sr_sort.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017-2019 Genome Research Ltd. + Copyright (C) 2017-2021 Genome Research Ltd. Author: Petr Danecek @@ -259,6 +259,7 @@ static int cmpstringp(const void *p1, const void *p2) return strcmp(* (char * const *) p1, * (char * const *) p2); } +#define DEBUG_VSETS 0 #if DEBUG_VSETS void debug_vsets(sr_sort_t *srt) { @@ -280,6 +281,7 @@ void debug_vsets(sr_sort_t *srt) } #endif +#define DEBUG_VBUF 0 #if DEBUG_VBUF void debug_vbuf(sr_sort_t *srt) { @@ -380,13 +382,33 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, if ( srt->str.l ) kputc(';',&srt->str); srt->off[srt->noff++] = srt->str.l; - size_t beg = srt->str.l; + size_t beg = srt->str.l; + int end_pos = -1; for (ivar=1; ivarn_allele; ivar++) { if ( ivar>1 ) kputc(',',&srt->str); kputs(line->d.allele[0],&srt->str); kputc('>',&srt->str); kputs(line->d.allele[ivar],&srt->str); + + // If symbolic allele, check also the END tag in case there are multiple events, + // such as s, starting at the same positions + if ( line->d.allele[ivar][0]=='<' ) + { + if ( end_pos==-1 ) + { + bcf_info_t *end_info = bcf_get_info(reader->header,line,"END"); + if ( end_info ) + end_pos = (int)end_info->v1.i; // this is only to create a unique id, we don't mind a potential int64 overflow + else + end_pos = 0; + } + if ( end_pos ) + { + kputc('/',&srt->str); + kputw(end_pos, &srt->str); + } + } } if ( line->n_allele==1 ) { diff --git a/bgzf.c b/bgzf.c index 09f18a643..b3f760f08 100644 --- a/bgzf.c +++ b/bgzf.c @@ -2,7 +2,7 @@ Copyright (c) 
2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos - Copyright (C) 2009, 2013-2020 Genome Research Ltd + Copyright (C) 2009, 2013-2022 Genome Research Ltd Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -49,6 +49,10 @@ #include "cram/pooled_alloc.h" #include "hts_internal.h" +#ifndef EFTYPE +#define EFTYPE ENOEXEC +#endif + #define BGZF_CACHE #define BGZF_MT @@ -273,7 +277,7 @@ static int bgzf_idx_flush(BGZF *fp) { hts_idx_cache_entry *e = mt->idx_cache.e; int i; - assert(mt->idx_cache.nentries == 0 || mt->block_written >= e[0].block_number); + assert(mt->idx_cache.nentries == 0 || mt->block_written <= e[0].block_number); for (i = 0; i < mt->idx_cache.nentries && e[i].block_number == mt->block_written; i++) { if (hts_idx_push(mt->hts_idx, e[i].tid, e[i].beg, e[i].end, @@ -315,6 +319,37 @@ static inline void packInt32(uint8_t *buffer, uint32_t value) buffer[3] = value >> 24; } +static void razf_info(hFILE *hfp, const char *filename) +{ + uint64_t usize, csize; + off_t sizes_pos; + + if (filename == NULL || strcmp(filename, "-") == 0) filename = "FILE"; + + // RAZF files end with USIZE,CSIZE stored as big-endian uint64_t + if ((sizes_pos = hseek(hfp, -16, SEEK_END)) < 0) goto no_sizes; + if (hread(hfp, &usize, 8) != 8 || hread(hfp, &csize, 8) != 8) goto no_sizes; + if (!ed_is_big()) ed_swap_8p(&usize), ed_swap_8p(&csize); + if (csize >= sizes_pos) goto no_sizes; // Very basic validity check + + hts_log_error( +"To decompress this file, use the following commands:\n" +" truncate -s %" PRIu64 " %s\n" +" gunzip %s\n" +"The resulting uncompressed file should be %" PRIu64 " bytes in length.\n" +"If you do not have a truncate command, skip that step (though gunzip will\n" +"likely produce a \"trailing garbage ignored\" message, which can be ignored).", + csize, filename, filename, usize); + return; + +no_sizes: + hts_log_error( 
+"To decompress this file, use the following command:\n" +" gunzip %s\n" +"This will likely produce a \"trailing garbage ignored\" message, which can\n" +"usually be safely ignored.", filename); +} + static const char *bgzf_zerr(int errnum, z_stream *zs) { static char buffer[32]; @@ -352,7 +387,7 @@ static const char *bgzf_zerr(int errnum, z_stream *zs) } } -static BGZF *bgzf_read_init(hFILE *hfpr) +static BGZF *bgzf_read_init(hFILE *hfpr, const char *filename) { BGZF *fp; uint8_t magic[18]; @@ -368,12 +403,22 @@ static BGZF *bgzf_read_init(hFILE *hfpr) fp->compressed_block = (char *)fp->uncompressed_block + BGZF_MAX_BLOCK_SIZE; fp->is_compressed = (n==18 && magic[0]==0x1f && magic[1]==0x8b); fp->is_gzip = ( !fp->is_compressed || ((magic[3]&4) && memcmp(&magic[12], "BC\2\0",4)==0) ) ? 0 : 1; + if (fp->is_compressed && (magic[3]&4) && memcmp(&magic[12], "RAZF", 4)==0) { + hts_log_error("Cannot decompress legacy RAZF format"); + razf_info(hfpr, filename); + free(fp->uncompressed_block); + free(fp); + errno = EFTYPE; + return NULL; + } #ifdef BGZF_CACHE if (!(fp->cache = malloc(sizeof(*fp->cache)))) { + free(fp->uncompressed_block); free(fp); return NULL; } if (!(fp->cache->h = kh_init(cache))) { + free(fp->uncompressed_block); free(fp->cache); free(fp); return NULL; @@ -446,11 +491,10 @@ static BGZF *bgzf_write_init(const char *mode) BGZF *bgzf_open(const char *path, const char *mode) { BGZF *fp = 0; - assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); if (strchr(mode, 'r')) { hFILE *fpr; if ((fpr = hopen(path, mode)) == 0) return 0; - fp = bgzf_read_init(fpr); + fp = bgzf_read_init(fpr, path); if (fp == 0) { hclose_abruptly(fpr); return NULL; } fp->fp = fpr; } else if (strchr(mode, 'w') || strchr(mode, 'a')) { @@ -469,11 +513,10 @@ BGZF *bgzf_open(const char *path, const char *mode) BGZF *bgzf_dopen(int fd, const char *mode) { BGZF *fp = 0; - assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); if (strchr(mode, 'r')) { hFILE *fpr; if ((fpr = 
hdopen(fd, mode)) == 0) return 0; - fp = bgzf_read_init(fpr); + fp = bgzf_read_init(fpr, NULL); if (fp == 0) { hclose_abruptly(fpr); return NULL; } // FIXME this closes fd fp->fp = fpr; } else if (strchr(mode, 'w') || strchr(mode, 'a')) { @@ -492,9 +535,8 @@ BGZF *bgzf_dopen(int fd, const char *mode) BGZF *bgzf_hopen(hFILE *hfp, const char *mode) { BGZF *fp = NULL; - assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); if (strchr(mode, 'r')) { - fp = bgzf_read_init(hfp); + fp = bgzf_read_init(hfp, NULL); if (fp == NULL) return NULL; } else if (strchr(mode, 'w') || strchr(mode, 'a')) { fp = bgzf_write_init(mode); @@ -532,6 +574,8 @@ int bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int le } else { level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default // NB levels go up to 12 here. + int lvl_map[] = {0,1,2,3,5,6,7,8,10,12}; + level = lvl_map[level>9 ?9 :level]; struct libdeflate_compressor *z = libdeflate_alloc_compressor(level); if (!z) return -1; @@ -572,6 +616,7 @@ int bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int le uint8_t *dst = (uint8_t*)_dst; if (level == 0) { + uncomp: // Uncompressed data if (*dlen < slen+5 + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH) return -1; dst[BLOCK_HEADER_LENGTH] = 1; // BFINAL=1, BTYPE=00; see RFC1951 @@ -593,9 +638,21 @@ int bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int le return -1; } if ((ret = deflate(&zs, Z_FINISH)) != Z_STREAM_END) { - hts_log_error("Deflate operation failed: %s", bgzf_zerr(ret, ret == Z_DATA_ERROR ? &zs : NULL)); + if (ret == Z_OK && zs.avail_out == 0) { + deflateEnd(&zs); + goto uncomp; + } else { + hts_log_error("Deflate operation failed: %s", bgzf_zerr(ret, ret == Z_DATA_ERROR ? &zs : NULL)); + } return -1; } + // If we used up the entire output buffer, then we either ran out of + // room or we *just* fitted, but either way we may as well store + // uncompressed for faster decode. 
+ if (zs.avail_out == 0) { + deflateEnd(&zs); + goto uncomp; + } if ((ret = deflateEnd(&zs)) != Z_OK) { hts_log_error("Call to deflateEnd failed: %s", bgzf_zerr(ret, NULL)); return -1; @@ -676,6 +733,10 @@ static int bgzf_uncompress(uint8_t *dst, size_t *dlen, } uint32_t crc = libdeflate_crc32(0, (unsigned char *)dst, *dlen); +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Pretend the CRC was OK so the fuzzer doesn't have to get it right + crc = expected_crc; +#endif if (crc != expected_crc) { hts_log_error("CRC32 checksum mismatch"); return -2; @@ -718,6 +779,10 @@ static int bgzf_uncompress(uint8_t *dst, size_t *dlen, *dlen = *dlen - zs.avail_out; uint32_t crc = crc32(crc32(0L, NULL, 0L), (unsigned char *)dst, *dlen); +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Pretend the CRC was OK so the fuzzer doesn't have to get it right + crc = expected_crc; +#endif if (crc != expected_crc) { hts_log_error("CRC32 checksum mismatch"); return -2; @@ -932,6 +997,8 @@ int bgzf_read_block(BGZF *fp) { hts_tpool_result *r; + if (fp->errcode) return -1; + if (fp->mt) { again: if (fp->mt->hit_eof) { @@ -948,11 +1015,14 @@ int bgzf_read_block(BGZF *fp) if (fp->uncompressed_block == NULL) return -1; fp->compressed_block = (char *)fp->uncompressed_block + BGZF_MAX_BLOCK_SIZE; } // else it's already allocated with malloc, maybe even in-use. - if (mt_destroy(fp->mt) < 0) + if (mt_destroy(fp->mt) < 0) { fp->errcode = BGZF_ERR_IO; + } fp->mt = NULL; hts_tpool_delete_result(r, 0); - + if (fp->errcode) { + return -1; + } goto single_threaded; } @@ -961,13 +1031,14 @@ int bgzf_read_block(BGZF *fp) hts_log_error("BGZF decode jobs returned error %d " "for block offset %"PRId64, j->errcode, j->block_address); + hts_tpool_delete_result(r, 0); return -1; } if (j->hit_eof) { if (!fp->last_block_eof && !fp->no_eof_block) { fp->no_eof_block = 1; - hts_log_warning("EOF marker is absent. The input is probably truncated"); + hts_log_warning("EOF marker is absent. 
The input may be truncated"); } fp->mt->hit_eof = 1; } @@ -1069,7 +1140,7 @@ int bgzf_read_block(BGZF *fp) if (count == 0) { // no data read if (!fp->last_block_eof && !fp->no_eof_block && !fp->is_gzip) { fp->no_eof_block = 1; - hts_log_warning("EOF marker is absent. The input is probably truncated"); + hts_log_warning("EOF marker is absent. The input may be truncated"); } fp->block_length = 0; return 0; @@ -1412,7 +1483,7 @@ static void *bgzf_mt_writer(void *vp) { int bgzf_mt_read_block(BGZF *fp, bgzf_job *j) { uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block; - int count, size = 0, block_length, remaining; + int count, block_length, remaining; // NOTE: Guaranteed to be compressed as we block multi-threading in // uncompressed mode. However it may be gzip compression instead @@ -1422,6 +1493,8 @@ int bgzf_mt_read_block(BGZF *fp, bgzf_job *j) int64_t block_address; block_address = htell(fp->fp); + j->block_address = block_address; // in case we exit with j->errcode + if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0; count = hpeek(fp->fp, header, sizeof(header)); if (count == 0) // no data read @@ -1441,7 +1514,6 @@ int bgzf_mt_read_block(BGZF *fp, bgzf_job *j) if (count != sizeof(header)) // no data read return -1; - size = count; block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1" if (block_length < BLOCK_HEADER_LENGTH) { j->errcode |= BGZF_ERR_HEADER; @@ -1455,7 +1527,6 @@ int bgzf_mt_read_block(BGZF *fp, bgzf_job *j) j->errcode |= BGZF_ERR_IO; return -1; } - size += count; j->comp_len = block_length; j->uncomp_len = BGZF_MAX_BLOCK_SIZE; j->block_address = block_address; @@ -2217,7 +2288,13 @@ int bgzf_getline(BGZF *fp, int delim, kstring_t *str) if (fp->block_length == 0) { state = -1; break; } } unsigned char *buf = fp->uncompressed_block; - for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l); + + // Equivalent to a naive byte by byte search from + // buf 
+ block_offset to buf + block_length. + void *e = memchr(&buf[fp->block_offset], delim, + fp->block_length - fp->block_offset); + l = e ? (unsigned char *)e - buf : fp->block_length; + if (l < fp->block_length) state = 1; l -= fp->block_offset; if (ks_expand(str, l + 2) < 0) { state = -3; break; } @@ -2230,11 +2307,12 @@ int bgzf_getline(BGZF *fp, int delim, kstring_t *str) fp->block_length = 0; } } while (state == 0); + if (state < -1) return state; if (str->l == 0 && state < 0) return state; fp->uncompressed_address += str->l + 1; if ( delim=='\n' && str->l>0 && str->s[str->l-1]=='\r' ) str->l--; str->s[str->l] = 0; - return str->l; + return str->l <= INT_MAX ? (int) str->l : INT_MAX; } void bgzf_index_destroy(BGZF *fp) @@ -2488,6 +2566,7 @@ int bgzf_useek(BGZF *fp, off_t uoffset, int where) else break; } int i = ilo-1; + off_t offset = 0; if (bgzf_seek_common(fp, fp->idx->offs[i].caddr, 0) < 0) return -1; @@ -2495,9 +2574,14 @@ int bgzf_useek(BGZF *fp, off_t uoffset, int where) fp->errcode |= BGZF_ERR_IO; return -1; } - if ( uoffset - fp->idx->offs[i].uaddr > 0 ) + offset = uoffset - fp->idx->offs[i].uaddr; + if ( offset > 0 ) { - fp->block_offset = uoffset - fp->idx->offs[i].uaddr; + if (offset > fp->block_length) { + fp->errcode |= BGZF_ERR_IO; + return -1; //offset outside the available data + } + fp->block_offset = offset; assert( fp->block_offset <= fp->block_length ); // todo: skipped, unindexed, blocks } fp->uncompressed_address = uoffset; diff --git a/bgzip.1 b/bgzip.1 index f01a9a643..b5786c190 100644 --- a/bgzip.1 +++ b/bgzip.1 @@ -1,10 +1,10 @@ -.TH bgzip 1 "19 December 2019" "htslib-1.10.2" "Bioinformatics tools" +.TH bgzip 1 "25 July 2023" "htslib-1.18" "Bioinformatics tools" .SH NAME .PP bgzip \- Block compression/decompression utility .\" .\" Copyright (C) 2009-2011 Broad Institute. -.\" Copyright (C) 2018 Genome Research Limited. +.\" Copyright (C) 2018, 2021-2023 Genome Research Limited. 
.\" .\" Author: Heng Li .\" @@ -43,7 +43,7 @@ bgzip \- Block compression/decompression utility .SH SYNOPSIS .PP .B bgzip -.RB [ -cdfhir ] +.RB [ -cdfhikrt ] .RB [ -b .IR virtualOffset ] .RB [ -I @@ -54,7 +54,7 @@ bgzip \- Block compression/decompression utility .IR size ] .RB [ -@ .IR threads ] -.RI [ file ] +.RI [ file " ...]" .PP .SH DESCRIPTION .PP @@ -70,10 +70,22 @@ If the -c option is used, the result will be written to standard output, otherwise when compressing bgzip will write to a new file with a .gz suffix and remove the original. When decompressing the input file must have a .gz suffix, which will be removed to make the output name. Again -after decompression completes the input file will be removed. +after decompression completes the input file will be removed. When multiple +files are given as input, the operation is performed on all of them. .SH OPTIONS .TP 10 +.B "--binary" +Bgzip will attempt to ensure BGZF blocks end on a newline when the +input is a text file. The exception to this is where a single line is +larger than a BGZF block (64Kb). This can aid tools that use the +index to perform random access on the compressed stream, as the start +of a block is likely to also be the start of a text record. + +This option processes text files as if they were binary content, +ignoring the location of newlines. This also restores the behaviour +for text files to bgzip version 1.15 and earlier. +.TP .BI "-b, --offset " INT Decompress to standard output from virtual file position (0-based uncompressed offset). @@ -90,6 +102,14 @@ Overwrite files without asking, or decompress files that don't have a known compression filename extension (e.g., \fI.gz\fR) without asking. Use \fB--force\fR twice to do both without asking. .TP +.B "-g, --rebgzip" +Try to use an existing index to create a compressed file with matching +block offsets. The index must be specified using the \fB-I +\fIfile.gzi\fR option. 
+Note that this assumes that the same compression library and level are in use +as when making the original file. +Don't use it unless you know what you're doing. +.TP .B "-h, --help" Displays a help message. .TP @@ -101,6 +121,9 @@ file with .gzi appended to it. .BI "-I, --index-name " FILE Index file name. .TP +.B "-k, --keep" +Do not delete input file during operation. +.TP .BI "-l, --compress-level " INT Compression level to use when compressing. From 0 to 9, or -1 for the default level set by the compression library. [-1] @@ -108,17 +131,13 @@ From 0 to 9, or -1 for the default level set by the compression library. [-1] .B "-r, --reindex" Rebuild the index on an existing compressed file. .TP -.B "-g, --rebgzip" -Try to use an existing index to create a compressed file with matching -block offsets. -Note that this assumes that the same compression library and level are in use -as when making the original file. -Don't use it unless you know what you're doing. -.TP .BI "-s, --size " INT Decompress INT bytes (uncompressed size) to standard output. Implies -c. .TP +.B "-t, --test" +Test the integrity of the compressed file. +.TP .BI "-@, --threads " INT Number of threads to use [1]. .PP @@ -175,6 +194,5 @@ The BGZF library was originally implemented by Bob Handsaker and modified by Heng Li for remote file access and in-memory caching. .SH SEE ALSO -.PP -.BR gzip (1), -.BR tabix (1) +.IR gzip (1), +.IR tabix (1) diff --git a/bgzip.c b/bgzip.c index 097e3f80c..5f577af3a 100644 --- a/bgzip.c +++ b/bgzip.c @@ -1,7 +1,7 @@ /* bgzip.c -- Block compression/decompression utility. Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology - Copyright (C) 2010, 2013-2019 Genome Research Ltd. + Copyright (C) 2010, 2013-2019, 2021-2023 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -36,13 +36,14 @@ #include #include "htslib/bgzf.h" #include "htslib/hts.h" +#include "htslib/hfile.h" #ifdef _WIN32 # define WIN32_LEAN_AND_MEAN # include #endif -static const int WINDOW_SIZE = 64 * 1024; +static const int WINDOW_SIZE = BGZF_BLOCK_SIZE; static void error(const char *format, ...) { @@ -112,27 +113,28 @@ static int bgzip_main_usage(FILE *fp, int status) fprintf(fp, " -c, --stdout write on standard output, keep original files unchanged\n"); fprintf(fp, " -d, --decompress decompress\n"); fprintf(fp, " -f, --force overwrite files without asking\n"); + fprintf(fp, " -g, --rebgzip use an index file to bgzip a file\n"); fprintf(fp, " -h, --help give this help\n"); fprintf(fp, " -i, --index compress and create BGZF index\n"); fprintf(fp, " -I, --index-name FILE name of BGZF index file [file.gz.gzi]\n"); + fprintf(fp, " -k, --keep don't delete input files during operation\n"); fprintf(fp, " -l, --compress-level INT Compression level to use when compressing; 0 to 9, or -1 for default [-1]\n"); fprintf(fp, " -r, --reindex (re)index compressed file\n"); - fprintf(fp, " -g, --rebgzip use an index file to bgzip a file\n"); fprintf(fp, " -s, --size INT decompress INT bytes (uncompressed size)\n"); + fprintf(fp, " -t, --test test integrity of compressed file\n"); + fprintf(fp, " --binary Don't align blocks with text lines\n"); fprintf(fp, " -@, --threads INT number of compression threads to use [1]\n"); - fprintf(fp, " -t, --test test integrity of compressed file"); - fprintf(fp, "\n"); return status; } int main(int argc, char **argv) { - int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0; + int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0, keep, binary; BGZF *fp; - void *buffer; + char *buffer; long start, end, 
size; char *index_fname = NULL; - int threads = 1; + int threads = 1, isstdin = 0, usedstdout = 0, ret = 0; static const struct option loptions[] = { @@ -150,11 +152,13 @@ int main(int argc, char **argv) {"threads", required_argument, NULL, '@'}, {"test", no_argument, NULL, 't'}, {"version", no_argument, NULL, 1}, + {"keep", no_argument, NULL, 'k'}, + {"binary", no_argument, NULL, 2}, {NULL, 0, NULL, 0} }; - compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; - while((c = getopt_long(argc, argv, "cdh?fb:@:s:iI:l:grt",loptions,NULL)) >= 0){ + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; keep = 0; binary = 0; + while((c = getopt_long(argc, argv, "cdh?fb:@:s:iI:l:grtk",loptions,NULL)) >= 0){ switch(c){ case 'd': compress = 0; break; case 'c': pstdout = 1; break; @@ -168,11 +172,13 @@ int main(int argc, char **argv) case 'r': reindex = 1; compress = 0; break; case '@': threads = atoi(optarg); break; case 't': test = 1; compress = 0; reindex = 0; break; + case 'k': keep = 1; break; case 1: printf( "bgzip (htslib) %s\n" -"Copyright (C) 2020 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2023 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; + case 2: binary = 1; break; case 'h': return bgzip_main_usage(stdout, EXIT_SUCCESS); case '?': return bgzip_main_usage(stderr, EXIT_FAILURE); } @@ -182,241 +188,380 @@ int main(int argc, char **argv) fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); return 1; } - if (compress == 1) { - int f_src = fileno(stdin); - char out_mode[3] = "w\0"; - char out_mode_exclusive[4] = "wx\0"; - - if (compress_level < -1 || compress_level > 9) { - fprintf(stderr, "[bgzip] Invalid compress-level: %d\n", compress_level); - return 1; - } - if (compress_level >= 0) { - out_mode[1] = compress_level + '0'; - out_mode_exclusive[2] = compress_level + '0'; - } - if ( argc>optind ) - { - if ((f_src = open(argv[optind], O_RDONLY)) < 0) { - 
fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + if ( (index || reindex) && rebgzip ) + { + fprintf(stderr, "[bgzip] Can't produce a index and rebgzip simultaneously\n"); + return 1; + } + if ( rebgzip && !index_fname ) + { + fprintf(stderr, "[bgzip] Index file name expected with rebgzip. See -I option.\n"); + return 1; + } + /* avoid -I / indexfile with multiple inputs while index/reindex. these wont be set during + read/decompress and are not considered even if set */ + if ( (index || reindex) && index_fname && argc - optind > 1) { + fprintf(stderr, "[bgzip] Cannot specify index filename with multiple data file on index, reindex.\n"); + return 1; + } + + do { + isstdin = optind >= argc ? 1 : !strcmp("-", argv[optind]); //using stdin or not? + /*stdout is in use when explicitly selected or when stdin in is in use, it need to be closed + explicitly to get all io errors*/ + usedstdout |= isstdin || pstdout || test; + + if (compress == 1) { + hFILE* f_src = NULL; + char out_mode[3] = "w\0"; + char out_mode_exclusive[4] = "wx\0"; + + if (compress_level < -1 || compress_level > 9) { + fprintf(stderr, "[bgzip] Invalid compress-level: %d\n", compress_level); + return 1; + } + if (compress_level >= 0) { + out_mode[1] = compress_level + '0'; + out_mode_exclusive[2] = compress_level + '0'; + } + if (!(f_src = hopen(!isstdin ? argv[optind] : "-", "r"))) { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), isstdin ? "stdin" : argv[optind]); return 1; } - if (pstdout) - fp = bgzf_open("-", out_mode); - else + if ( argc>optind && !isstdin ) //named input file that isn't an explicit "-" { - char *name = malloc(strlen(argv[optind]) + 5); - strcpy(name, argv[optind]); - strcat(name, ".gz"); - fp = bgzf_open(name, is_forced? 
out_mode : out_mode_exclusive); - if (fp == NULL && errno == EEXIST && confirm_overwrite(name)) - fp = bgzf_open(name, out_mode); - if (fp == NULL) { - fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno)); + if (pstdout) + fp = bgzf_open("-", out_mode); + else + { + char *name = malloc(strlen(argv[optind]) + 5); + strcpy(name, argv[optind]); + strcat(name, ".gz"); + fp = bgzf_open(name, is_forced? out_mode : out_mode_exclusive); + if (fp == NULL && errno == EEXIST) { + if (confirm_overwrite(name)) { + fp = bgzf_open(name, out_mode); + } + else { + ret = 2; //explicit N - no overwrite, continue and return 2 + if (hclose(f_src) < 0) + ; //ignoring return value + free(name); + continue; + } + } + if (fp == NULL) { + fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno)); + free(name); + return 1; + } free(name); - return 1; } - free(name); } - } - else if (!pstdout && isatty(fileno((FILE *)stdout)) ) - return bgzip_main_usage(stderr, EXIT_FAILURE); - else if ( index && !index_fname ) - { - fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); - return 1; - } - else - fp = bgzf_open("-", out_mode); - - if ( index && rebgzip ) - { - fprintf(stderr, "[bgzip] Can't produce a index and rebgzip simultaneously\n"); - return 1; - } + else if (!pstdout && isatty(fileno((FILE *)stdout)) ) + return bgzip_main_usage(stderr, EXIT_FAILURE); + else if ( index && !index_fname ) + { + fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); + return 1; + } + else + fp = bgzf_open("-", out_mode); - if ( rebgzip && !index_fname ) - { - fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); - return 1; - } + if ( index ) bgzf_index_build_init(fp); + if (threads > 1) + bgzf_mt(fp, threads, 256); - if ( index ) bgzf_index_build_init(fp); - if (threads > 1) - bgzf_mt(fp, threads, 256); + buffer = malloc(WINDOW_SIZE); + if (!buffer) + return 1; + if (rebgzip){ + if ( bgzf_index_load(fp, 
index_fname, NULL) < 0 ) error("Could not load index: %s.%s\n", !isstdin ? argv[optind] : index_fname, !isstdin ? "gzi" : ""); - buffer = malloc(WINDOW_SIZE); -#ifdef _WIN32 - _setmode(f_src, O_BINARY); -#endif - if (rebgzip){ - if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) error("Could not load index: %s.gzi\n", argv[optind]); + while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0) + if (bgzf_block_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); + } + else { + htsFormat fmt; + int textual = 0; + if (!binary + && hts_detect_format(f_src, &fmt) == 0 + && fmt.compression == no_compression) { + switch(fmt.format) { + case text_format: + case sam: + case vcf: + case bed: + case fasta_format: + case fastq_format: + case fai_format: + case fqi_format: + textual = 1; + break; + default: break; // silence clang warnings + } + } - while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) - if (bgzf_block_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); - } - else { - while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) - if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); + if (binary || !textual) { + // Binary data, either detected or explicit + while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0) + if (bgzf_write(fp, buffer, c) < 0) + error("Could not write %d bytes: Error %d\n", + c, fp->errcode); + } else { + /* Text mode, try a flush after a newline */ + int in_header = 1, n = 0, long_line = 0; + while ((c = hread(f_src, buffer+n, WINDOW_SIZE-n)) > 0) { + int c2 = c+n; + int flush = 0; + if (in_header && + (long_line || buffer[0] == '@' || buffer[0] == '#')) { + // Scan forward to find the last header line. 
+ int last_start = 0; + n = 0; + while (n < c2) { + if (buffer[n++] != '\n') + continue; + + last_start = n; + if (n < c2 && + !(buffer[n] == '@' || buffer[n] == '#')) { + in_header = 0; + break; + } + } + if (!last_start) { + n = c2; + long_line = 1; + } else { + n = last_start; + flush = 1; + long_line = 0; + } + } else { + // Scan backwards to find the last newline. + n += c; // c read plus previous n overflow + while (--n >= 0 && ((char *)buffer)[n] != '\n') + ; + + if (n >= 0) { + flush = 1; + n++; + } else { + n = c2; + } + } + + // Pos n is either at the end of the buffer with flush==0, + // or the first byte after a newline and a flush point. + if (bgzf_write(fp, buffer, n) < 0) + error("Could not write %d bytes: Error %d\n", + n, fp->errcode); + if (flush) + if (bgzf_flush_try(fp, 65536) < 0) // force + return -1; + + memmove(buffer, buffer+n, c2-n); + n = c2-n; + } + + // Trailing data. + if (bgzf_write(fp, buffer, n) < 0) + error("Could not write %d bytes: Error %d\n", + n, fp->errcode); + } + } + if ( index ) + { + if (index_fname) { + if (bgzf_index_dump(fp, index_fname, NULL) < 0) + error("Could not write index to '%s'\n", index_fname); + } else if (!isstdin) { + if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0) + error("Could not write index to '%s.gz.gzi'\n", argv[optind]); + } + else { + //stdin, cant create index file as name is not present "-.gz.gzi" not a valid one! 
+ error("Can not write index for stdin data without index filename, use -I option to set index file.\n"); + } + } + if (bgzf_close(fp) < 0) + error("Output close failed: Error %d\n", fp->errcode); + if (hclose(f_src) < 0) + error("Input close failed\n"); + if (argc > optind && !pstdout && !keep && !isstdin) unlink(argv[optind]); + free(buffer); } - if ( index ) + else if ( reindex ) { - if (index_fname) { + if ( argc>optind && !isstdin ) + { + fp = bgzf_open(argv[optind], "r"); + if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]); + } + else + { + if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n"); + fp = bgzf_open("-", "r"); + if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno)); + } + + buffer = malloc(BGZF_BLOCK_SIZE); + bgzf_index_build_init(fp); + int ret; + while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ; + free(buffer); + if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n"); + + if ( index_fname ) { if (bgzf_index_dump(fp, index_fname, NULL) < 0) error("Could not write index to '%s'\n", index_fname); - } else { - if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0) - error("Could not write index to '%s.gz.gzi'", argv[optind]); + } else if (!isstdin) { + if (bgzf_index_dump(fp, argv[optind], ".gzi") < 0) + error("Could not write index to '%s.gzi'\n", argv[optind]); + } + else { + //stdin, cant create index file as name is not present "-.gzi" not a valid one! 
+ error("Can not write index for stdin data without index filename, use -I option to set index file.\n"); } - } - if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode); - if (argc > optind && !pstdout) unlink(argv[optind]); - free(buffer); - close(f_src); - return 0; - } - else if ( reindex ) - { - if ( argc>optind ) - { - fp = bgzf_open(argv[optind], "r"); - if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]); - } - else - { - if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n"); - fp = bgzf_open("-", "r"); - if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno)); - } - buffer = malloc(BGZF_BLOCK_SIZE); - bgzf_index_build_init(fp); - int ret; - while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ; - free(buffer); - if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n"); - - if ( index_fname ) { - if (bgzf_index_dump(fp, index_fname, NULL) < 0) - error("Could not write index to '%s'\n", index_fname); - } else { - if (bgzf_index_dump(fp, argv[optind], ".gzi") < 0) - error("Could not write index to '%s.gzi'\n", argv[optind]); + if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode); } - - if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode); - return 0; - } - else - { - int f_dst; - - if ( argc>optind ) + else { - fp = bgzf_open(argv[optind], "r"); - if (fp == NULL) { - fprintf(stderr, "[bgzip] Could not open %s: %s\n", argv[optind], strerror(errno)); - return 1; - } - if (bgzf_compression(fp) == no_compression) { - fprintf(stderr, "[bgzip] %s: not a compressed file -- ignored\n", argv[optind]); - bgzf_close(fp); - return 1; - } + int f_dst, is_forced_tmp = is_forced; - if (pstdout || test) { - f_dst = fileno(stdout); - } - else { - const int wrflags = O_WRONLY | O_CREAT | O_TRUNC; - char *name = argv[optind], *ext; - size_t pos; - for (pos = strlen(name); pos > 0; --pos) - if (name[pos] == '.' 
|| name[pos] == '/') break; - if (pos == 0 || name[pos] != '.') { - fprintf(stderr, "[bgzip] can't remove an extension from %s -- please rename\n", argv[optind]); - bgzf_close(fp); + if ( argc>optind && !isstdin ) + { + fp = bgzf_open(argv[optind], "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not open %s: %s\n", argv[optind], strerror(errno)); return 1; } - name = strdup(argv[optind]); - name[pos] = '\0'; - ext = &name[pos+1]; - if (! (known_extension(ext) || confirm_filename(&is_forced, name, ext))) { - fprintf(stderr, "[bgzip] unknown extension .%s -- declining to decompress to %s\n", ext, name); + if (bgzf_compression(fp) == no_compression) { + fprintf(stderr, "[bgzip] %s: not a compressed file -- ignored\n", argv[optind]); bgzf_close(fp); - free(name); return 1; } - f_dst = open(name, is_forced? wrflags : wrflags|O_EXCL, 0666); - if (f_dst < 0 && errno == EEXIST && confirm_overwrite(name)) - f_dst = open(name, wrflags, 0666); - if (f_dst < 0) { - fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno)); + + if (pstdout || test) { + f_dst = fileno(stdout); + } + else { + const int wrflags = O_WRONLY | O_CREAT | O_TRUNC; + char *name = argv[optind], *ext; + size_t pos; + for (pos = strlen(name); pos > 0; --pos) + if (name[pos] == '.' || name[pos] == '/') break; + if (pos == 0 || name[pos] != '.') { + fprintf(stderr, "[bgzip] can't remove an extension from %s -- please rename\n", argv[optind]); + bgzf_close(fp); + return 1; + } + name = strdup(argv[optind]); + name[pos] = '\0'; + ext = &name[pos+1]; + if (! (known_extension(ext) || confirm_filename(&is_forced_tmp, name, ext))) { + fprintf(stderr, "[bgzip] unknown extension .%s -- declining to decompress to %s\n", ext, name); + bgzf_close(fp); + free(name); + ret = 2; //explicit N, continue and return 2 + continue; + } + f_dst = open(name, is_forced_tmp? 
wrflags : wrflags|O_EXCL, 0666); + if (f_dst < 0 && errno == EEXIST) { + if (confirm_overwrite(name)) { + f_dst = open(name, wrflags, 0666); + } + else { + ret = 2; //explicit N - no overwrite, continue and return 2 + free(name); + bgzf_close(fp); + continue; + } + } + if (f_dst < 0) { + fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno)); + free(name); + return 1; + } free(name); - return 1; } - free(name); } - } - else if (!pstdout && isatty(fileno((FILE *)stdin)) ) - return bgzip_main_usage(stderr, EXIT_FAILURE); - else - { - f_dst = fileno(stdout); - fp = bgzf_open("-", "r"); - if (fp == NULL) { - fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); - return 1; + else if (!pstdout && isatty(fileno((FILE *)stdin)) ) + return bgzip_main_usage(stderr, EXIT_FAILURE); + else + { + f_dst = fileno(stdout); + fp = bgzf_open("-", "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); + return 1; + } + if (bgzf_compression(fp) == no_compression) { + fprintf(stderr, "[bgzip] stdin is not compressed -- ignored\n"); + bgzf_close(fp); + return 1; + } } - if (bgzf_compression(fp) == no_compression) { - fprintf(stderr, "[bgzip] stdin is not compressed -- ignored\n"); - bgzf_close(fp); - return 1; + + buffer = malloc(WINDOW_SIZE); + if ( start>0 ) + { + if (index_fname) { + if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) + error("Could not load index: %s\n", index_fname); + } else { + if (optind >= argc || isstdin) { + error("The -b option requires -I when reading from stdin " + "(and stdin must be seekable)\n"); + } + if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) + error("Could not load index: %s.gzi\n", argv[optind]); + } + if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start); } - } - buffer = malloc(WINDOW_SIZE); - if ( start>0 ) - { - if (index_fname) { - if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) - error("Could not 
load index: %s\n", index_fname); - } else { - if (optind >= argc) { - error("The -b option requires -I when reading from stdin " - "(and stdin must be seekable)\n"); + if (threads > 1) + bgzf_mt(fp, threads, 256); + + #ifdef _WIN32 + _setmode(f_dst, O_BINARY); + #endif + long start_reg = start, end_reg = end; + while (1) { + if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); + else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); + if (c == 0) break; + if (c < 0) error("Error %d in block starting at offset %" PRId64 "(%" PRIX64 ")\n", fp->errcode, fp->block_address, fp->block_address); + start += c; + if ( !test && write(f_dst, buffer, c) != c ) { + #ifdef _WIN32 + if (GetLastError() != ERROR_NO_DATA) + #endif + error("Could not write %d bytes\n", c); } - if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) - error("Could not load index: %s.gzi\n", argv[optind]); + if (end >= 0 && start >= end) break; + } + start = start_reg; + end = end_reg; + free(buffer); + if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode); + if (argc > optind && !pstdout && !test && !keep && !isstdin) unlink(argv[optind]); + if (!isstdin && !pstdout && !test) { + close(f_dst); //close output file when it is not stdout } - if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start); } + } while (++optind < argc); - if (threads > 1) - bgzf_mt(fp, threads, 256); - -#ifdef _WIN32 - _setmode(f_dst, O_BINARY); -#endif - while (1) { - if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); - else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? 
WINDOW_SIZE:(end - start)); - if (c == 0) break; - if (c < 0) error("Error %d in block starting at offset %" PRId64 "(%" PRIX64 ")\n", fp->errcode, fp->block_address, fp->block_address); - start += c; - if ( !test && write(f_dst, buffer, c) != c ) { -#ifdef _WIN32 - if (GetLastError() != ERROR_NO_DATA) -#endif - error("Could not write %d bytes\n", c); - } - if (end >= 0 && start >= end) break; + if (usedstdout && !reindex) { + //stdout in use, have to close explicitly to get any pending write errors + if (fclose(stdout) != 0 && errno != EBADF) { + fprintf(stderr, "[bgzip] Failed to close stdout, errno %d", errno); + ret = 1; } - free(buffer); - if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode); - if (argc > optind && !pstdout && !test) unlink(argv[optind]); - return 0; } + return ret; } diff --git a/builddir_vars.mk.in b/builddir_vars.mk.in new file mode 100644 index 000000000..09bb20fea --- /dev/null +++ b/builddir_vars.mk.in @@ -0,0 +1,58 @@ +# Separate build directory Makefile overrides for htslib. +# +# Copyright (C) 2021 University of Glasgow. +# +# Author: John Marshall +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# This is @configure_input@ +# +# When building from a separate build directory, this file is included by +# HTSlib's Makefile or htslib.mk instead of htslib_vars.mk. It adjusts +# variables to account for a separate source directory and then includes +# the real makefile fragment. + +ifneq "$(HTSPREFIX)" "" +# When included externally via htslib.mk, just update $(HTSSRCDIR) and hence +# $(HTSPREFIX) to point to the source directory (without using any extra +# unprefixed variables, which would be in the external project's namespace). + +# Set to $(HTSDIR)/$(srcdir) (or just $(srcdir) if that's absolute) +HTSSRCDIR = @HTSDIRslash_if_relsrcdir@@srcdir@ + +include $(HTSSRCDIR)/htslib_vars.mk + +else +# When included from HTSlib's Makefile, override $(srcdir) and set VPATH, +# and make any other adjustments required. ($(HTSPREFIX) remains empty as +# the items it prefixes will be found via VPATH instead.) + +srcdir = @srcdir@ +VPATH = @srcdir@ + +srcprefix = $(srcdir)/ + +# Ensure that htscodecs.c can include its version.h. This -I option must come +# before -I. so that these targets get this version.h rather than HTSlib's. +htscodecs/htscodecs/htscodecs.o htscodecs/htscodecs/htscodecs.pico: ALL_CPPFLAGS = -Ihtscodecs/htscodecs -I. $(CPPFLAGS) + +include $(srcdir)/htslib_vars.mk + +endif diff --git a/config.mk.in b/config.mk.in index 976e557df..7341a170d 100644 --- a/config.mk.in +++ b/config.mk.in @@ -1,6 +1,6 @@ # Optional configure Makefile overrides for htslib. # -# Copyright (C) 2015-2017, 2019 Genome Research Ltd. +# Copyright (C) 2015-2017, 2019, 2023 Genome Research Ltd. 
# # Author: John Marshall # @@ -43,6 +43,7 @@ RANLIB = @RANLIB@ CPPFLAGS = @CPPFLAGS@ CFLAGS = @CFLAGS@ LDFLAGS = @LDFLAGS@ +VERSION_SCRIPT_LDFLAGS = @VERSION_SCRIPT_LDFLAGS@ LIBS = @LIBS@ PLATFORM = @PLATFORM@ @@ -94,7 +95,7 @@ pluginpath = @pluginpath@ LIBHTS_OBJS += plugin.o PLUGIN_OBJS += $(plugin_OBJS) -plugin.o plugin.pico: CPPFLAGS += -DPLUGINPATH=\"$(pluginpath)\" +plugin.o plugin.pico: ALL_CPPFLAGS += -DPLUGINPATH=\"$(pluginpath)\" # When built as separate plugins, these record their version themselves. hfile_gcs.o hfile_gcs.pico: version.h @@ -112,3 +113,9 @@ LDFLAGS += $(noplugin_LDFLAGS) LIBS += $(noplugin_LIBS) endif + +# Extra CFLAGS for specific files +HTS_CFLAGS_AVX2 = @hts_cflags_avx2@ +HTS_CFLAGS_AVX512 = @hts_cflags_avx512@ +HTS_CFLAGS_SSE4 = @hts_cflags_sse4@ +HTS_HAVE_NEON = @hts_have_neon@ diff --git a/configure.ac b/configure.ac index ec137b75d..c1afb38e7 100644 --- a/configure.ac +++ b/configure.ac @@ -1,6 +1,6 @@ # Configure script for htslib, a C library for high-throughput sequencing data. # -# Copyright (C) 2015-2019 Genome Research Ltd. +# Copyright (C) 2015-2023 Genome Research Ltd. # # Author: John Marshall # @@ -30,11 +30,12 @@ AC_CONFIG_SRCDIR(hts.c) AC_CONFIG_HEADERS(config.h) m4_include([m4/hts_prog_cc_warnings.m4]) +m4_include([m4/hts_check_compile_flags_needed.m4]) m4_include([m4/hts_hide_dynamic_syms.m4]) m4_include([m4/pkg.m4]) dnl Copyright notice to be copied into the generated configure script -AC_COPYRIGHT([Portions copyright (C) 2018 Genome Research Ltd. +AC_COPYRIGHT([Portions copyright (C) 2020-2023 Genome Research Ltd. This configure script is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law.]) @@ -69,6 +70,112 @@ dnl Flags to treat warnings as errors. 
These need to be applied to CFLAGS dnl later as they can interfere with some of the tests (notably AC_SEARCH_LIBS) HTS_PROG_CC_WERROR(hts_late_cflags) +# HTSlib uses X/Open-only facilities (M_SQRT2 etc, drand48() etc), and +# various POSIX functions that are provided by various _POSIX_C_SOURCE values +# or by _XOPEN_SOURCE >= 500. It also uses usleep(), which is removed when +# _XOPEN_SOURCE >= 700. Additionally, some definitions may require +# _XOPEN_SOURCE >= 600 on some platforms (snprintf on MinGW, +# PTHREAD_MUTEX_RECURSIVE on some Linux distributions). Hence we set it to 600. + +# Define _XOPEN_SOURCE unless the user has already done so via $CPPFLAGS etc. +AC_CHECK_DECL([_XOPEN_SOURCE], [], + [AC_DEFINE([_XOPEN_SOURCE], [600], [Specify X/Open requirements])], + []) + + +dnl Check for various compiler flags to enable SIMD features +dnl Options for rANS32x16 sse4.1 version - ssse3 +hts_cflags_sse4="" +HTS_CHECK_COMPILE_FLAGS_NEEDED([ssse3], [-mssse3], [AC_LANG_PROGRAM([[ + #ifdef __x86_64__ + #include "x86intrin.h" + #endif + ]],[[ + #ifdef __x86_64__ + __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1); + __m128i c = _mm_shuffle_epi8(a, b); + return *((char *) &c); + #endif + ]])], [ + hts_cflags_sse4="$flags_needed $hts_cflags_sse4" + AC_DEFINE([HAVE_SSSE3],1,[Defined to 1 if rANS source using SSSE3 can be compiled.]) +]) + +dnl Options for rANS32x16 sse4.1 version - popcnt +HTS_CHECK_COMPILE_FLAGS_NEEDED([popcnt], [-mpopcnt], [AC_LANG_PROGRAM([[ + #ifdef __x86_64__ + #include "x86intrin.h" + #endif + ]],[[ + #ifdef __x86_64__ + unsigned int i = _mm_popcnt_u32(1); + return i != 1; + #endif + ]])], [ + hts_cflags_sse4="$flags_needed $hts_cflags_sse4" + AC_DEFINE([HAVE_POPCNT],1,[Defined to 1 if rANS source using popcnt can be compiled.]) +]) + +dnl Options for rANS32x16 sse4.1 version - sse4.1 +HTS_CHECK_COMPILE_FLAGS_NEEDED([sse4.1], [-msse4.1], [AC_LANG_PROGRAM([[ + #ifdef __x86_64__ + #include "x86intrin.h" + #endif + ]],[[ + #ifdef 
__x86_64__ + __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1); + __m128i c = _mm_max_epu32(a, b); + return *((char *) &c); + #endif + ]])], [ + hts_cflags_sse4="$flags_needed $hts_cflags_sse4" + AC_DEFINE([HAVE_SSE4_1],1,[Defined to 1 if rANS source using SSE4.1 can be compiled. +]) +dnl Propagate HTSlib's unaligned access preference to htscodecs + AH_VERBATIM([UBSAN],[ +/* Prevent unaligned access in htscodecs SSE4 rANS codec */ +#if defined(HTS_ALLOW_UNALIGNED) && HTS_ALLOW_UNALIGNED == 0 +#undef UBSAN +#endif]) + AC_DEFINE([UBSAN],1,[]) +]) +AC_SUBST([hts_cflags_sse4]) + +dnl Options for rANS32x16 avx2 version +HTS_CHECK_COMPILE_FLAGS_NEEDED([avx2], [-mavx2], [AC_LANG_PROGRAM([[ + #ifdef __x86_64__ + #include "x86intrin.h" + #endif + ]],[[ + #ifdef __x86_64__ + __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + __m256i b = _mm256_add_epi32(a, a); + long long c = _mm256_extract_epi64(b, 0); + return (int) c; + #endif + ]])], [ + hts_cflags_avx2="$flags_needed" + AC_SUBST([hts_cflags_avx2]) + AC_DEFINE([HAVE_AVX2],1,[Defined to 1 if rANS source using AVX2 can be compiled.]) +]) + +dnl Options for rANS32x16 avx512 version +HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f], [AC_LANG_PROGRAM([[ + #ifdef __x86_64__ + #include "x86intrin.h" + #endif + ]],[[ + #ifdef __x86_64__ + __m512i a = _mm512_set1_epi32(1); + __m512i b = _mm512_add_epi32(a, a); + return *((char *) &b); + #endif + ]])], [ + hts_cflags_avx512="$flags_needed" + AC_SUBST([hts_cflags_avx512]) + AC_DEFINE([HAVE_AVX512],1,[Defined to 1 if rANS source using AVX512F can be compiled.]) +]) + dnl Avoid chicken-and-egg problem where pkg-config supplies the dnl PKG_PROG_PKG_CONFIG macro, but we want to use it to check dnl for pkg-config... 
@@ -80,6 +187,11 @@ static_LDFLAGS=$LDFLAGS static_LIBS='-lpthread -lz -lm' private_LIBS=$LDFLAGS +AC_ARG_ENABLE([versioned-symbols], + [AS_HELP_STRING([--disable-versioned-symbols], + [disable versioned symbols in shared library])], + [], [enable_versioned_symbols=yes]) + AC_ARG_ENABLE([bz2], [AS_HELP_STRING([--disable-bz2], [omit support for BZ2-compressed CRAM files])], @@ -108,6 +220,12 @@ AC_ARG_ENABLE([plugins], [], [enable_plugins=no]) AC_SUBST(enable_plugins) +AC_ARG_WITH([external-htscodecs], + [AS_HELP_STRING([--with-external-htscodecs], + [get htscodecs functions from a shared library])], + [], [with_external_htscodecs=no]) +AC_SUBST(with_external_htscodecs) + AC_ARG_WITH([libdeflate], [AS_HELP_STRING([--with-libdeflate], [use libdeflate for faster crc and deflate algorithms])], @@ -137,9 +255,9 @@ AC_ARG_ENABLE([s3], [support Amazon AWS S3 URLs])], [], [enable_s3=check]) -test -n "$host_alias" || host_alias=unknown-`uname -s` -AC_MSG_CHECKING([shared library type for $host_alias]) -case $host_alias in +basic_host=${host_alias:-unknown-`uname -s`} +AC_MSG_CHECKING([shared library type for $basic_host]) +case $basic_host in *-cygwin* | *-CYGWIN*) host_result="Cygwin DLL" PLATFORM=CYGWIN @@ -157,7 +275,9 @@ case $host_alias in # This also sets __USE_MINGW_ANSI_STDIO which in turn makes PRId64, # %lld and %z printf formats work. It also enforces the snprintf to # be C99 compliant so it returns the correct values (in kstring.c). - CPPFLAGS="$CPPCFLAGS -D_XOPEN_SOURCE=600" + + # Now set by default, so no need to do it here. 
+ # CPPFLAGS="$CPPFLAGS -D_XOPEN_SOURCE=600" ;; *) host_result="plain .so" @@ -168,6 +288,24 @@ esac AC_MSG_RESULT([$host_result]) AC_SUBST([PLATFORM]) +dnl Check for versioned symbol support +dnl Only try for .so shared libraries as other types won't work +AS_IF([test x"$PLATFORM" = xdefault && test x"$enable_versioned_symbols" = xyes], + [AC_CACHE_CHECK([whether the linker supports versioned symbols], + [hts_cv_have_versioned_symbols], [ + save_LDFLAGS=$LDFLAGS + LDFLAGS="-Wl,-version-script,$srcdir/htslib.map $LDFLAGS" + AC_LINK_IFELSE([AC_LANG_PROGRAM()], + [hts_cv_have_versioned_symbols=yes], + [hts_cv_have_versioned_symbols=no]) + LDFLAGS=$save_LDFLAGS + ]) + AS_IF([test "x$hts_cv_have_versioned_symbols" = xyes],[ + VERSION_SCRIPT_LDFLAGS='-Wl,-version-script,$(srcprefix)htslib.map' + AC_SUBST([VERSION_SCRIPT_LDFLAGS]) + ]) +]) + dnl Try to get more control over which symbols are exported in the shared dnl library. HTS_HIDE_DYNAMIC_SYMBOLS @@ -228,7 +366,16 @@ AC_SEARCH_LIBS([recv], [socket ws2_32], [ if test "$ac_cv_search_recv" != "none required" then static_LIBS="$static_LIBS $ac_cv_search_recv" -fi], [MSG_ERROR([unable to find the recv() function])]) +fi], + dnl on MinGW-i686, checking recv() linking requires an annotated declaration + [AC_MSG_CHECKING([for library containing recv using declaration]) + LIBS="-lws2_32 $LIBS" + AC_LINK_IFELSE( + [AC_LANG_PROGRAM([[#include <winsock2.h>]], [[recv(0, 0, 0, 0);]])], + [AC_MSG_RESULT([-lws2_32]) + static_LIBS="$static_LIBS -lws2_32"], + [AC_MSG_RESULT([no]) + MSG_ERROR([unable to find the recv() function])])]) if test "$enable_bz2" != no; then bz2_devel=ok @@ -280,6 +427,47 @@ produced elsewhere unreadable) or resolve this error to build HTSlib.]) static_LIBS="$static_LIBS -llzma" fi +AS_IF([test "x$with_external_htscodecs" != "xno"], + [libhtscodecs=ok + AC_CHECK_HEADER([htscodecs/rANS_static4x16.h],[], + [libhtscodecs='missing header'],[;]) + AC_CHECK_LIB([htscodecs],[rans_compress_bound_4x16], +
[:],[libhtscodecs='missing library']) + AS_IF([test "$libhtscodecs" = "ok"], + [AC_DEFINE([HAVE_EXTERNAL_LIBHTSCODECS], 1, [Define if using an external libhtscodecs]) + LIBS="-lhtscodecs $LIBS" + private_LIBS="-lhtscodecs $private_LIBS" + static_LIBS="-lhtscodecs $static_LIBS" + selected_htscodecs_mk="htscodecs_external.mk"], + [MSG_ERROR([libhtscodecs development files not found: $libhtscodecs + +You asked to use an external htscodecs library, but do not have the +required header / library files. You either need to supply these and +if necessary set CPPFLAGS and LDFLAGS so the compiler can find them; +or configure using --without-external-htscodecs to build the required +functions from the htscodecs submodule. +])])], + [AC_MSG_CHECKING([whether htscodecs files are present]) + AS_IF([test -e "$srcdir/htscodecs/htscodecs/rANS_static4x16.h"], + [AC_MSG_RESULT([yes]) + selected_htscodecs_mk="htscodecs_bundled.mk"], + [AC_MSG_RESULT([no]) + AS_IF([test -e "$srcdir/.git"], + [MSG_ERROR([htscodecs submodule files not present. + +HTSlib uses some functions from the htscodecs project, which is normally +included as a submodule. Try running: + + git submodule update --init --recursive + +in the top-level htslib directory to update it, and then re-run configure. +])], + [MSG_ERROR([htscodecs submodule files not present. + +You have an incomplete distribution. 
Please try downloading one of the +official releases from https://www.htslib.org +])])])]) + AS_IF([test "x$with_libdeflate" != "xno"], [libdeflate=ok AC_CHECK_HEADER([libdeflate.h],[],[libdeflate='missing header'],[;]) @@ -303,26 +491,31 @@ HTSlib.])])])]) libcurl=disabled if test "$enable_libcurl" != no; then - AC_CHECK_LIB([curl], [curl_easy_pause], - [AC_DEFINE([HAVE_LIBCURL], 1, [Define if libcurl file access is enabled.]) - libcurl=enabled], + libcurl_devel=ok + AC_CHECK_HEADER([curl/curl.h], [], [libcurl_devel="headers not found"], [;]) + AC_CHECK_LIB([curl], [curl_easy_pause], [:], [AC_CHECK_LIB([curl], [curl_easy_init], - [message="library is too old (7.18+ required)"], - [message="library not found"]) - case "$enable_libcurl" in - check) AC_MSG_WARN([libcurl not enabled: $message]) ;; - *) MSG_ERROR([libcurl $message + [libcurl_devel="library is too old (7.18+ required)"], + [libcurl_devel="library not found"])]) + + if test "$libcurl_devel" = ok; then + AC_DEFINE([HAVE_LIBCURL], 1, [Define if libcurl file access is enabled.]) + libcurl=enabled + elif test "$enable_libcurl" = check; then + AC_MSG_WARN([libcurl not enabled: $libcurl_devel]) + else + MSG_ERROR([libcurl $libcurl_devel Support for HTTPS and other SSL-based URLs requires routines from the libcurl -library . Building HTSlib with libcurl enabled +library . Building HTSlib with libcurl enabled requires libcurl development files to be installed on the build machine; you may need to ensure a package such as libcurl4-{gnutls,nss,openssl}-dev (on Debian or Ubuntu Linux) or libcurl-devel (on RPM-based Linux distributions or Cygwin) is installed. 
Either configure with --disable-libcurl or resolve this error to build HTSlib.]) - ;; - esac]) + fi + dnl -lcurl is only needed for static linking if hfile_libcurl is not a plugin if test "$libcurl" = enabled ; then if test "$enable_plugins" != yes ; then @@ -399,9 +592,13 @@ dnl Only need to add to static_LIBS if not building as a plugin fi fi +dnl Look for regcomp in various libraries (needed on windows/mingw). +AC_SEARCH_LIBS(regcomp, regex, [libregex=needed], []) + dnl Look for PTHREAD_MUTEX_RECURSIVE. dnl This is normally in pthread.h except on some broken glibc implementations. -AC_CHECK_DECL(PTHREAD_MUTEX_RECURSIVE, [], [AC_DEFINE([_XOPEN_SOURCE],[600], [Needed for PTHREAD_MUTEX_RECURSIVE])], [[#include <pthread.h>]]) +dnl Now set by default +dnl AC_CHECK_DECL(PTHREAD_MUTEX_RECURSIVE, [], [AC_DEFINE([_XOPEN_SOURCE],[600], [Needed for PTHREAD_MUTEX_RECURSIVE])], [[#include <pthread.h>]]) if test "$s3" = enabled ; then AC_DEFINE([ENABLE_S3], 1, [Define if HTSlib should enable S3 support.]) @@ -419,4 +616,27 @@ AC_SUBST([static_LDFLAGS]) AC_SUBST([static_LIBS]) AC_CONFIG_FILES([config.mk htslib.pc.tmp:htslib.pc.in]) +AC_CONFIG_LINKS([htscodecs.mk:$selected_htscodecs_mk]) + +if test "$srcdir" != .; then + # Set up for a separate build directory. As HTSlib uses a non-recursive + # makefile, we need to create additional build subdirectories explicitly.
+ AC_CONFIG_LINKS([Makefile:Makefile htslib.mk:htslib.mk]) + AC_CONFIG_FILES([htslib_vars.mk:builddir_vars.mk.in]) + AC_CONFIG_COMMANDS([mkdir], + [AS_MKDIR_P([cram]) + AS_MKDIR_P([htscodecs/htscodecs]) + AS_MKDIR_P([htscodecs/tests]) + AS_MKDIR_P([test/fuzz]) + AS_MKDIR_P([test/longrefs]) + AS_MKDIR_P([test/tabix])]) +fi + +# @HTSDIRslash_if_relsrcdir@ will be empty when $srcdir is absolute +case "$srcdir" in + /*) HTSDIRslash_if_relsrcdir= ;; + *) HTSDIRslash_if_relsrcdir='$(HTSDIR)/' ;; +esac +AC_SUBST([HTSDIRslash_if_relsrcdir]) + AC_OUTPUT diff --git a/cram/README b/cram/README new file mode 100644 index 000000000..135438227 --- /dev/null +++ b/cram/README @@ -0,0 +1,214 @@ +CRAM encoding internals +======================= + +A quick summary of functions involved. + +The encoder works by accumulating a bunch of BAM records (via the +cram_put_bam_seq function), and at a certain point (eg counter of +records, or switching reference) the array of BAM records is turned +into a container, which in turn creates slices, holding CRAM +data-series in blocks. The function that turns an array of BAM +objects into the container is below. + +cram_encode_container func: + Validate references MD5 against header, unless no_ref mode + If embed_ref <= 1, fetch ref + Switch to embed_ref=2 if failed + + Foreach slice: + If embed_ref == 2 + call cram_generate_reference + if failed switch to no_ref mode + Foreach sequence + call process_one_read to append BAM onto each data series (DS) + call cram_stats_add for each DS to gather metrics + call cram_encode_aux + + # We now have cram DS, per slice + call cram_encoder_init, per DS (based on cram_stats_add data) + + Foreach slice: + call cram_encode_slice to turn DS to blocks + call cram_compress_slice + + call cram_encode_compression_header + +Threading +--------- + +CRAM can be multi-threaded, but this brings complications.
+ +The above function is the main CPU user, so it is this bit which can +be executed in parallel from multiple threads. To understand this we +need to now look at how the primary loop works when writing a CRAM: + +Encoding main thread: + repeatedly calls cram_put_bam_seq + calls cram_new_container on first time through to initialise + calls cram_next_container when current is full or we need to flush + calls cram_flush_container_mt to flush last container + pushes BAM object onto current container + +If non-threaded, cram_flush_container_mt does: + call cram_flush_container + call cram_encode_container to go from BAM to CRAM data-series + call cram_flush_container2 (writes it out) + +If threaded, cram_flush_container_mt does: + Main: Dispatch cram_flush_thread job + Thread: call cram_encode_container to go from BAM to CRAM data-series + Main: Call cram_flush_result to drain queue of encoded containers + Main: Call cram_flush_container2 (writes it out); + + + +Decisions on when to create new containers, detection of sorted vs unsorted, +switching to multi-seq mode, etc occur at the main thread in +cram_put_bam_seq. + +We can change our mind on container parameters at any point up until +the cram_encode_container call. At that point these parameters get +baked into a container compression header and all data-series +generated need to be in sync with the parameters. + +It is possible that some parameter changes can get detected while +encoding the container, as it is there where we fetch references. Eg +the need to enable embedded reference or switch to non-ref mode. + +While encoding a container, we can change the parameters for *this* +container, and we can also set the default parameter for subsequent +new containers via the global cram fd to avoid spamming attempts to +load a reference which doesn't exist, but we cannot change other +containers that are being processed in parallel. They'll fend for +themselves.
+ +References +---------- + +To avoid spamming the reference servers, there is a shared cache of +references being currently used by all the worker threads (leading to +confusing terminology of reference-counting of references). So each +container fetches its section of reference, but the memory for that is +handled via its own layer. + +The shared references and ref meta-data are held in cram_fd -> refs (a +refs_t pointer): + + // References structure. + struct refs_t { + string_alloc_t *pool; // String pool for holding filenames and SN vals + + khash_t(refs) *h_meta; // ref_entry*, index by name + ref_entry **ref_id; // ref_entry*, index by ID + int nref; // number of ref_entry + + char *fn; // current file opened + BGZF *fp; // and the hFILE* to go with it. + + int count; // how many cram_fd sharing this refs struct + + pthread_mutex_t lock; // Mutex for multi-threaded updating + ref_entry *last; // Last queried sequence + int last_id; // Used in cram_ref_decr_locked to delay free + }; + +Within this, ref_entry is the per-reference information: + + typedef struct ref_entry { + char *name; + char *fn; + int64_t length; + int64_t offset; + int bases_per_line; + int line_length; + int64_t count; // for shared references so we know to dealloc seq + char *seq; + mFILE *mf; + int is_md5; // Reference comes from a raw seq found by MD5 + int validated_md5; + } ref_entry; + +Sharing of references to track use between threads is via +cram_ref_incr* and cram_ref_decr* (with locked and unlocked +variants). We free a reference when the usage count hits zero. To +avoid spamming discard and reload in single-thread creation of a +pos-sorted CRAM, we keep track of the last reference in cram_fd and +delay discard by one loop iteration. + +There are complexities here around whether the references come from a +single ref.fa file, are from a local MD5sum cache with one file per +reference (mmapped), or whether they're fetched from some remote +REF_PATH query such as the EBI.
(This latter case typically downloads +to a local md5 based ref-cache first and mmaps from there.) + +The refs struct starts off by being populated from the SAM header. We +have M5 tag and name known, maybe a filename, but length is 0 and seq +is NULL. This is done by cram_load_reference: + +cram_load_reference (cram_fd, filename): + if filename non-NULL + call refs_load_fai + Populates ref_entry with filename, name, length, line-len, etc + sanitise_SQ_lines + If no refs loaded + call refs_from_header + populates ref_entry with name. + Sets length=0 as marker for not-yet-loaded + +The main interface used from the code is cram_get_ref(). It takes a +reference ID, start and end coordinate and returns a pointer to the +relevant sub-sequence. + +cram_get_ref: + r = fd->refs->ref_id[id]; // current ref + call cram_populate_ref if stored length is 0 (ie ref.fa set) + search REF_PATH / REF_CACHE + call bgzf_open if local_path + call open_path_mfile otherwise + copy to local REF_CACHE if required (eg remote fetch) + + If start = 1 and end = ref-length + If ref seq unknown + call cram_ref_load to load entire ref and use that + + If ref seq now known, return it + + // Otherwise known via .fai or we've errored by now. + call load_ref_portion to return a sub-seq from index fasta + +The encoder asks for the entire reference rather than a small portion +of it as we're usually encoding a large amount. The decoder may be +dealing with small range queries, so it only asks for the relevant +sub-section of reference as specified in the cram slice headers. + + +TODO +==== + +- Multi-ref mode is enabled when we have too many small containers in + a row. + + Instead of firing off new containers when we switch reference, we + could always make a new container after N records, separating off + M <= N to make the container such that all M are the same reference, + and shuffling any remaining N-M down as the start of the next.
+ + This means we can detect how many new containers we would create, + and enable multi-ref mode straight away rather than keeping a recent + history of how many small containers we've emitted. + +- The cache of references currently being used is a better place to + track the global embed-ref and non-ref logic. Better than cram_fd. + Cram_fd is a one-way change, as once we enable non-ref we'll stick + with it. + + However if it was per-ref in the ref-cache then we'd probe and try + each reference once, and then all new containers for that ref would + honour the per-ref parameters. So a single missing reference in the + middle of a large file wouldn't change behaviour for all subsequent + references. + + Optionally we could still do meta-analysis on how many references + are failing, and switch the global cram_fd params to avoid repeated + testing of reference availability if it's becoming obvious that none + of them are known. diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 9f339c723..7d9cf128d 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020 Genome Research Ltd. +Copyright (c) 2012-2021,2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -42,6 +42,23 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include #include #include +#include + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +#include "../fuzz_settings.h" +#endif + +#include "../htslib/hts_endian.h" + +#if defined(HAVE_EXTERNAL_LIBHTSCODECS) +#include +#include +#include +#else +#include "../htscodecs/htscodecs/varint.h" +#include "../htscodecs/htscodecs/pack.h" +#include "../htscodecs/htscodecs/rle.h" +#endif #include "cram.h" @@ -149,8 +166,8 @@ static void store_bytes_MSB(cram_block *block, char *bytes, int len) { #endif /* Local optimised copy for inlining */ -static inline unsigned int get_bits_MSB(cram_block *block, int nbits) { - unsigned int val = 0; +static inline int64_t get_bits_MSB(cram_block *block, int nbits) { + uint64_t val = 0; int i; #if 0 @@ -202,20 +219,20 @@ static inline unsigned int get_bits_MSB(cram_block *block, int nbits) { } switch(nbits) { -// case 15: GET_BIT_MSB(block, val); -// case 14: GET_BIT_MSB(block, val); -// case 13: GET_BIT_MSB(block, val); -// case 12: GET_BIT_MSB(block, val); -// case 11: GET_BIT_MSB(block, val); -// case 10: GET_BIT_MSB(block, val); -// case 9: GET_BIT_MSB(block, val); - case 8: GET_BIT_MSB(block, val); - case 7: GET_BIT_MSB(block, val); - case 6: GET_BIT_MSB(block, val); - case 5: GET_BIT_MSB(block, val); - case 4: GET_BIT_MSB(block, val); - case 3: GET_BIT_MSB(block, val); - case 2: GET_BIT_MSB(block, val); +// case 15: GET_BIT_MSB(block, val); // fall through +// case 14: GET_BIT_MSB(block, val); // fall through +// case 13: GET_BIT_MSB(block, val); // fall through +// case 12: GET_BIT_MSB(block, val); // fall through +// case 11: GET_BIT_MSB(block, val); // fall through +// case 10: GET_BIT_MSB(block, val); // fall through +// case 9: GET_BIT_MSB(block, val); // fall through + case 8: GET_BIT_MSB(block, val); // fall through + case 7: GET_BIT_MSB(block, val); // fall through + case 6: GET_BIT_MSB(block, val); // fall through + case 5: GET_BIT_MSB(block, val); // fall through + case 4: GET_BIT_MSB(block, val); // fall through + case 3: 
GET_BIT_MSB(block, val); // fall through + case 2: GET_BIT_MSB(block, val); // fall through case 1: GET_BIT_MSB(block, val); break; @@ -239,7 +256,7 @@ static inline unsigned int get_bits_MSB(cram_block *block, int nbits) { * characters with exactly the correct frequency distribution we check * for it elsewhere.) */ -static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) { +static int store_bits_MSB(cram_block *block, uint64_t val, int nbits) { //fprintf(stderr, " store_bits: %02x %d\n", val, nbits); /* @@ -248,15 +265,15 @@ static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) { */ unsigned int mask; - if (block->byte+4 >= block->alloc) { + if (block->byte+8 >= block->alloc) { if (block->byte) { block->alloc *= 2; - block->data = realloc(block->data, block->alloc + 4); + block->data = realloc(block->data, block->alloc + 8); if (!block->data) return -1; } else { block->alloc = 1024; - block->data = realloc(block->data, block->alloc + 4); + block->data = realloc(block->data, block->alloc + 8); if (!block->data) return -1; block->data[0] = 0; // initialise first byte of buffer @@ -311,10 +328,27 @@ static char *cram_extract_block(cram_block *b, int size) { /* * --------------------------------------------------------------------------- * EXTERNAL + * + * In CRAM 3.0 and earlier, E_EXTERNAL use the data type to determine the + * size of the object being returned. This type is hard coded in the + * spec document (changing from uint32 to uint64 requires a spec change) + * and there is no data format introspection so implementations have + * to determine which size to use based on version numbers. It also + * doesn't support signed data. + * + * With CRAM 4.0 onwards the size and sign of the data is no longer stated + * explicitly in the specification. Instead EXTERNAL is replaced by three + * new encodings, for bytes and signed / unsigned integers which used a + * variable sized encoding. 
+ * + * For simplicity we use the same encode and decode functions for + * bytes (CRAM4) and external (CRAM3). Given we already had code to + * replace codec + type into a function pointer it makes little + * difference how we ended up at that function. However we disallow + * this codec to operate on integer data for CRAM4 onwards. */ int cram_external_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - int l; char *cp; cram_block *b; @@ -325,16 +359,16 @@ int cram_external_decode_int(cram_slice *slice, cram_codec *c, cp = (char *)b->data + b->idx; // E_INT and E_LONG are guaranteed single item queries - l = safe_itf8_get(cp, (char *)b->data + b->uncomp_size, (int32_t *)out); - b->idx += l; + int err = 0; + *(int32_t *)out = c->vv->varint_get32(&cp, (char *)b->data + b->uncomp_size, &err); + b->idx = cp - (char *)b->data; *out_size = 1; - return l > 0 ? 0 : -1; + return err ? -1 : 0; } int cram_external_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - int64_t l; char *cp; cram_block *b; @@ -345,11 +379,12 @@ int cram_external_decode_long(cram_slice *slice, cram_codec *c, cp = (char *)b->data + b->idx; // E_INT and E_LONG are guaranteed single item queries - l = safe_ltf8_get(cp, (char *)b->data + b->uncomp_size, (int64_t *)out); - b->idx += l; + int err = 0; + *(int64_t *)out = c->vv->varint_get64(&cp, (char *)b->data + b->uncomp_size, &err); + b->idx = cp - (char *)b->data; *out_size = 1; - return l > 0 ? 0 : -1; + return err ? 
-1 : 0; } int cram_external_decode_char(cram_slice *slice, cram_codec *c, @@ -400,9 +435,32 @@ void cram_external_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_external_decode_init(char *data, int size, + +int cram_external_decode_size(cram_slice *slice, cram_codec *c) { + cram_block *b; + + /* Find the external block */ + b = cram_get_block_by_id(slice, c->u.external.content_id); + if (!b) + return -1; + + return b->uncomp_size; +} + +cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) { + return cram_get_block_by_id(slice, c->u.external.content_id); +} + +int cram_external_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "EXTERNAL(id=%d)", + c->u.external.content_id) < 0 ? -1 : 0; +} + +cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c = NULL; char *cp = data; @@ -413,17 +471,41 @@ cram_codec *cram_external_decode_init(char *data, int size, return NULL; c->codec = E_EXTERNAL; - if (option == E_INT) - c->decode = cram_external_decode_int; - else if (option == E_LONG) - c->decode = cram_external_decode_long; - else if (option == E_BYTE_ARRAY || option == E_BYTE) - c->decode = cram_external_decode_char; - else - c->decode = cram_external_decode_block; + if (CRAM_MAJOR_VERS(version) >= 4) { + // Version 4 does not permit integer data to be encoded as a + // series of bytes. This is used purely for bytes, either + // singular or declared as arrays + switch (codec) { + case E_EXTERNAL: + if (option == E_BYTE_ARRAY_BLOCK) + c->decode = cram_external_decode_block; + else if (option == E_BYTE || option == E_BYTE_ARRAY) + c->decode = cram_external_decode_char; + else + goto malformed; + break; + default: + goto malformed; + } + } else { + // CRAM 3 and earlier encodes integers as EXTERNAL. 
We need + // use the option field to indicate the input data format so + // we know which serialisation format to use. + if (option == E_INT) + c->decode = cram_external_decode_int; + else if (option == E_LONG) + c->decode = cram_external_decode_long; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode = cram_external_decode_char; + else + c->decode = cram_external_decode_block; + } c->free = cram_external_decode_free; + c->size = cram_external_decode_size; + c->get_block = cram_external_get_block; + c->describe = cram_external_describe; - cp += safe_itf8_get(cp, data + size, &c->u.external.content_id); + c->u.external.content_id = vv->varint_get32(&cp, data+size, NULL); if (cp - data != size) goto malformed; @@ -441,16 +523,25 @@ cram_codec *cram_external_decode_init(char *data, int size, int cram_external_encode_int(cram_slice *slice, cram_codec *c, char *in, int in_size) { uint32_t *i32 = (uint32_t *)in; + return c->vv->varint_put32_blk(c->out, *i32) >= 0 ? 0 : -1; +} - return itf8_put_blk(c->out, *i32) >= 0 ? 0 : -1; +int cram_external_encode_sint(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int32_t *i32 = (int32_t *)in; + return c->vv->varint_put32s_blk(c->out, *i32) >= 0 ? 0 : -1; } int cram_external_encode_long(cram_slice *slice, cram_codec *c, char *in, int in_size) { uint64_t *i64 = (uint64_t *)in; + return c->vv->varint_put64_blk(c->out, *i64) >= 0 ? 0 : -1; +} - ltf8_put_blk(c->out, *i64); - return 0; +int cram_external_encode_slong(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *i64 = (int64_t *)in; + return c->vv->varint_put64s_blk(c->out, *i64) >= 0 ? 
0 : -1; } int cram_external_encode_char(cram_slice *slice, cram_codec *c, @@ -470,7 +561,7 @@ void cram_external_encode_free(cram_codec *c) { int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, int version) { - char tmp[99], *tp = tmp; + char tmp[99], *tp = tmp, *tpend = tmp+99; int len = 0, r = 0, n; if (prefix) { @@ -479,9 +570,9 @@ int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, len += l; } - tp += itf8_put(tp, c->u.e_external.content_id); - len += (n = itf8_put_blk(b, c->codec)); r |= n; - len += (n = itf8_put_blk(b, tp-tmp)); r |= n; + tp += c->vv->varint_put32(tp, tpend, c->u.e_external.content_id); + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + len += (n = c->vv->varint_put32_blk(b, tp-tmp)); r |= n; BLOCK_APPEND(b, tmp, tp-tmp); len += tp-tmp; @@ -493,9 +584,10 @@ int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, } cram_codec *cram_external_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { cram_codec *c; c = malloc(sizeof(*c)); @@ -503,15 +595,34 @@ cram_codec *cram_external_encode_init(cram_stats *st, return NULL; c->codec = E_EXTERNAL; c->free = cram_external_encode_free; - if (option == E_INT) - c->encode = cram_external_encode_int; - else if (option == E_LONG) - c->encode = cram_external_encode_long; - else if (option == E_BYTE_ARRAY || option == E_BYTE) - c->encode = cram_external_encode_char; - else - abort(); + if (CRAM_MAJOR_VERS(version) >= 4) { + // Version 4 does not permit integer data to be encoded as a + // series of bytes. This is used purely for bytes, either + // singular or declared as arrays + switch (codec) { + case E_EXTERNAL: + if (option != E_BYTE && option != E_BYTE_ARRAY) + return NULL; + c->encode = cram_external_encode_char; + break; + default: + return NULL; + } + } else { + // CRAM 3 and earlier encodes integers as EXTERNAL. 
We need + // use the option field to indicate the input data format so + // we know which serialisation format to use. + if (option == E_INT) + c->encode = cram_external_encode_int; + else if (option == E_LONG) + c->encode = cram_external_encode_long; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->encode = cram_external_encode_char; + else + abort(); + } c->store = cram_external_encode_store; + c->flush = NULL; c->u.e_external.content_id = (size_t)dat; @@ -520,113 +631,1640 @@ cram_codec *cram_external_encode_init(cram_stats *st, /* * --------------------------------------------------------------------------- - * BETA + * VARINT + * + * In CRAM 3.0 and earlier, E_EXTERNAL stored both integers in ITF8 + * format as well as bytes. In CRAM 4 EXTERNAL is only for bytes and + * byte arrays, with two dedicated encodings for integers: + * VARINT_SIGNED and VARINT_UNSIGNED. These also differ a little to + * EXTERNAL with the addition of an offset field, meaning we can store + * values in, say, the range -2 to 1 million without needing to use + * a signed zig-zag transformation. 
*/ -int cram_beta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - int64_t *out_i = (int64_t *)out; - int i, n = *out_size; +int cram_varint_decode_int(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; - if (c->u.beta.nbits) { - if (cram_not_enough_bits(in, c->u.beta.nbits * n)) - return -1; + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return *out_size?-1:0; - for (i = 0; i < n; i++) - out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; - } else { - for (i = 0; i < n; i++) - out_i[i] = -c->u.beta.offset; + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int32_t *)out = c->vv->varint_get32(&cp, + (char *)b->data + b->uncomp_size, + &err) + c->u.varint.offset; + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? -1 : 0; +} + +int cram_varint_decode_sint(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int32_t *)out = c->vv->varint_get32s(&cp, + (char *)b->data + b->uncomp_size, + &err) + c->u.varint.offset; + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? 
-1 : 0; +} + +int cram_varint_decode_long(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int64_t *)out = c->vv->varint_get64(&cp, + (char *)b->data + b->uncomp_size, + &err) + c->u.varint.offset; + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? -1 : 0; +} + +int cram_varint_decode_slong(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int64_t *)out = c->vv->varint_get64s(&cp, + (char *)b->data + b->uncomp_size, + &err) + c->u.varint.offset; + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? -1 : 0; +} + +void cram_varint_decode_free(cram_codec *c) { + if (c) + free(c); +} + +int cram_varint_decode_size(cram_slice *slice, cram_codec *c) { + cram_block *b; + + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return -1; + + return b->uncomp_size; +} + +cram_block *cram_varint_get_block(cram_slice *slice, cram_codec *c) { + return cram_get_block_by_id(slice, c->u.varint.content_id); +} + +int cram_varint_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "VARINT(id=%d,offset=%"PRId64",type=%d)", + c->u.varint.content_id, + c->u.varint.offset, + c->u.varint.type) + < 0 ? 
-1 : 0; +} + +cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data, *cp_end = data+size; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = codec; + + // Function pointer choice is theoretically by codec type. + // Given we have some vars as int32 and some as int64 we + // use option too for sizing, although on disk format + // does not change. + switch(codec) { + case E_VARINT_UNSIGNED: + c->decode = (option == E_INT) + ? cram_varint_decode_int + : cram_varint_decode_long; + break; + case E_VARINT_SIGNED: + c->decode = (option == E_INT) + ? cram_varint_decode_sint + : cram_varint_decode_slong; + break; + default: + return NULL; + } + + c->free = cram_varint_decode_free; + c->size = cram_varint_decode_size; + c->get_block = cram_varint_get_block; + c->describe = cram_varint_describe; + + c->u.varint.content_id = vv->varint_get32 (&cp, cp_end, NULL); + c->u.varint.offset = vv->varint_get64s(&cp, cp_end, NULL); + + if (cp - data != size) { + fprintf(stderr, "Malformed varint header stream\n"); + free(c); + return NULL; + } + + c->u.varint.type = option; + + return c; +} + +int cram_varint_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + uint32_t *i32 = (uint32_t *)in; + return c->vv->varint_put32_blk(c->out, *i32 - c->u.varint.offset) >= 0 + ? 0 : -1; +} + +int cram_varint_encode_sint(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int32_t *i32 = (int32_t *)in; + return c->vv->varint_put32s_blk(c->out, *i32 - c->u.varint.offset) >= 0 + ? 0 : -1; +} + +int cram_varint_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + uint64_t *i64 = (uint64_t *)in; + return c->vv->varint_put64_blk(c->out, *i64 - c->u.varint.offset) >= 0 + ? 
0 : -1; +} + +int cram_varint_encode_slong(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *i64 = (int64_t *)in; + return c->vv->varint_put64s_blk(c->out, *i64 - c->u.varint.offset) >= 0 + ? 0 : -1; +} + +void cram_varint_encode_free(cram_codec *c) { + if (!c) + return; + free(c); +} + +int cram_varint_encode_store(cram_codec *c, cram_block *b, char *prefix, + int version) { + char tmp[99], *tp = tmp; + int len = 0; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + tp += c->vv->varint_put32 (tp, NULL, c->u.e_varint.content_id); + tp += c->vv->varint_put64s(tp, NULL, c->u.e_varint.offset); + len += c->vv->varint_put32_blk(b, c->codec); + len += c->vv->varint_put32_blk(b, tp-tmp); + BLOCK_APPEND(b, tmp, tp-tmp); + len += tp-tmp; + + return len; + + block_err: + return -1; +} + +cram_codec *cram_varint_encode_init(cram_stats *st, + enum cram_encoding codec, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->u.e_varint.offset = 0; + if (st) { + // Marginal difference so far! Not worth the hassle? + if (st->min_val < 0 && st->min_val >= -127 + && st->max_val / -st->min_val > 100) { + c->u.e_varint.offset = -st->min_val; + codec = E_VARINT_UNSIGNED; + } else if (st->min_val > 0) { + c->u.e_varint.offset = -st->min_val; + } + } + + c->codec = codec; + c->free = cram_varint_encode_free; + + // Function pointer choice is theoretically by codec type. + // Given we have some vars as int32 and some as int64 we + // use option too for sizing, although on disk format + // does not change. + switch (codec) { + case E_VARINT_UNSIGNED: + c->encode = (option == E_INT) + ? cram_varint_encode_int + : cram_varint_encode_long; + break; + case E_VARINT_SIGNED: + c->encode = (option == E_INT) + ? 
cram_varint_encode_sint + : cram_varint_encode_slong; + break; + default: + return NULL; } + c->store = cram_varint_encode_store; + c->flush = NULL; + + c->u.e_varint.content_id = (size_t)dat; + + return c; +} +/* + * --------------------------------------------------------------------------- + * CONST_BYTE and CONST_INT + */ +int cram_const_decode_byte(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int i, n; + + for (i = 0, n = *out_size; i < n; i++) + out[i] = c->u.xconst.val; return 0; } -int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { +int cram_const_decode_int(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { int32_t *out_i = (int32_t *)out; - int i, n = *out_size; + int i, n; - if (c->u.beta.nbits) { - if (cram_not_enough_bits(in, c->u.beta.nbits * n)) - return -1; + for (i = 0, n = *out_size; i < n; i++) + out_i[i] = c->u.xconst.val; - for (i = 0; i < n; i++) - out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; - } else { - for (i = 0; i < n; i++) - out_i[i] = -c->u.beta.offset; - } + return 0; +} + +int cram_const_decode_long(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n; + + for (i = 0, n = *out_size; i < n; i++) + out_i[i] = c->u.xconst.val; return 0; } -int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - int i, n = *out_size; +void cram_const_decode_free(cram_codec *c) { + if (c) + free(c); +} +int cram_const_decode_size(cram_slice *slice, cram_codec *c) { + return 0; +} - if (c->u.beta.nbits) { - if (cram_not_enough_bits(in, c->u.beta.nbits * n)) - return -1; +int cram_const_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "CONST(val=%"PRId64")", + c->u.xconst.val) < 0 ? 
-1 : 0; +} - if (out) - for (i = 0; i < n; i++) - out[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; - else - for (i = 0; i < n; i++) - get_bits_MSB(in, c->u.beta.nbits); - } else { - if (out) - for (i = 0; i < n; i++) - out[i] = -c->u.beta.offset; +cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = codec; + if (codec == E_CONST_BYTE) + c->decode = cram_const_decode_byte; + else if (option == E_INT) + c->decode = cram_const_decode_int; + else + c->decode = cram_const_decode_long; + c->free = cram_const_decode_free; + c->size = cram_const_decode_size; + c->get_block = NULL; + c->describe = cram_const_describe; + + c->u.xconst.val = vv->varint_get64s(&cp, data+size, NULL); + + if (cp - data != size) { + fprintf(stderr, "Malformed const header stream\n"); + free(c); + return NULL; + } + + return c; +} + +int cram_const_encode(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + return 0; +} + +int cram_const_encode_store(cram_codec *c, cram_block *b, char *prefix, + int version) { + char tmp[99], *tp = tmp; + int len = 0; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + tp += c->vv->varint_put64s(tp, NULL, c->u.xconst.val); + len += c->vv->varint_put32_blk(b, c->codec); + len += c->vv->varint_put32_blk(b, tp-tmp); + BLOCK_APPEND(b, tmp, tp-tmp); + len += tp-tmp; + + return len; + + block_err: + return -1; +} + +cram_codec *cram_const_encode_init(cram_stats *st, + enum cram_encoding codec, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = codec; + c->free = cram_const_decode_free; // as as decode + c->encode = cram_const_encode; // a nop + c->store = 
cram_const_encode_store; + c->flush = NULL; + c->u.e_xconst.val = st->min_val; + + return c; +} + +/* + * --------------------------------------------------------------------------- + * BETA + */ +int cram_beta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n = *out_size; + + if (c->u.beta.nbits) { + if (cram_not_enough_bits(in, c->u.beta.nbits * n)) + return -1; + + for (i = 0; i < n; i++) + out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; + } else { + for (i = 0; i < n; i++) + out_i[i] = -c->u.beta.offset; + } + + return 0; +} + +int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int32_t *out_i = (int32_t *)out; + int i, n = *out_size; + + if (c->u.beta.nbits) { + if (cram_not_enough_bits(in, c->u.beta.nbits * n)) + return -1; + + for (i = 0; i < n; i++) + out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; + } else { + for (i = 0; i < n; i++) + out_i[i] = -c->u.beta.offset; + } + + return 0; +} + +int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int i, n = *out_size; + + + if (c->u.beta.nbits) { + if (cram_not_enough_bits(in, c->u.beta.nbits * n)) + return -1; + + if (out) + for (i = 0; i < n; i++) + out[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; + else + for (i = 0; i < n; i++) + get_bits_MSB(in, c->u.beta.nbits); + } else { + if (out) + for (i = 0; i < n; i++) + out[i] = -c->u.beta.offset; + } + + return 0; +} + +void cram_beta_decode_free(cram_codec *c) { + if (c) + free(c); +} + +int cram_beta_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "BETA(offset=%d, nbits=%d)", + c->u.beta.offset, c->u.beta.nbits) + < 0 ? 
-1 : 0; +} + +cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_BETA; + if (option == E_INT || option == E_SINT) + c->decode = cram_beta_decode_int; + else if (option == E_LONG || option == E_SLONG) + c->decode = cram_beta_decode_long; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode = cram_beta_decode_char; + else { + hts_log_error("BYTE_ARRAYs not supported by this codec"); + free(c); + return NULL; + } + c->free = cram_beta_decode_free; + c->describe = cram_beta_describe; + + c->u.beta.nbits = -1; + c->u.beta.offset = vv->varint_get32(&cp, data + size, NULL); + if (cp < data + size) // Ensure test below works + c->u.beta.nbits = vv->varint_get32(&cp, data + size, NULL); + + if (cp - data != size + || c->u.beta.nbits < 0 || c->u.beta.nbits > 8 * sizeof(int)) { + hts_log_error("Malformed beta header stream"); + free(c); + return NULL; + } + + return c; +} + +int cram_beta_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { + int len = 0, r = 0, n; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + // codec length + len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_beta.offset) + + c->vv->varint_size(c->u.e_beta.nbits))); + r |= n; + len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.offset)); r |= n; + len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.nbits)); r |= n; + + if (r > 0) return len; + + block_err: + return -1; +} + +int cram_beta_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *syms = (int64_t *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, + 
c->u.e_beta.nbits); + + return r; +} + +int cram_beta_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int *syms = (int *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, + c->u.e_beta.nbits); + + return r; +} + +int cram_beta_encode_char(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + unsigned char *syms = (unsigned char *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, + c->u.e_beta.nbits); + + return r; +} + +void cram_beta_encode_free(cram_codec *c) { + if (c) free(c); +} + +cram_codec *cram_beta_encode_init(cram_stats *st, + enum cram_encoding codec, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + int min_val, max_val, len = 0; + int64_t range; + + c = malloc(sizeof(*c)); + if (!c) + return NULL; + c->codec = E_BETA; + c->free = cram_beta_encode_free; + if (option == E_INT || option == E_SINT) + c->encode = cram_beta_encode_int; + else if (option == E_LONG || option == E_SLONG) + c->encode = cram_beta_encode_long; + else + c->encode = cram_beta_encode_char; + c->store = cram_beta_encode_store; + c->flush = NULL; + + if (dat) { + min_val = ((int *)dat)[0]; + max_val = ((int *)dat)[1]; + } else { + min_val = INT_MAX; + max_val = INT_MIN; + int i; + for (i = 0; i < MAX_STAT_VAL; i++) { + if (!st->freqs[i]) + continue; + if (min_val > i) + min_val = i; + max_val = i; + } + if (st->h) { + khint_t k; + + for (k = kh_begin(st->h); k != kh_end(st->h); k++) { + if (!kh_exist(st->h, k)) + continue; + + i = kh_key(st->h, k); + if (min_val > i) + min_val = i; + if (max_val < i) + max_val = i; + } + } + } + + assert(max_val >= min_val); + c->u.e_beta.offset = -min_val; + range = (int64_t) max_val - min_val; + while (range) { + len++; + range >>= 1; + } + c->u.e_beta.nbits = len; + + return c; +} + +/* + * 
--------------------------------------------------------------------------- + * XPACK: Packing multiple values into a single byte. A fast transform that + * reduces time taken by entropy encoder and may also improve compression. + * + * This also has the additional requirement that the data series is not + * interleaved with another, permitting efficient encoding and decoding + * of all elements enmasse instead of needing to only extract the bits + * necessary per item. + */ +int cram_xpack_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n = *out_size; + + if (c->u.xpack.nbits) { + for (i = 0; i < n; i++) + out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)]; + } else { + for (i = 0; i < n; i++) + out_i[i] = c->u.xpack.rmap[0]; + } + + return 0; +} + +int cram_xpack_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int32_t *out_i = (int32_t *)out; + int i, n = *out_size; + + if (c->u.xpack.nbits) { + if (cram_not_enough_bits(in, c->u.xpack.nbits * n)) + return -1; + + for (i = 0; i < n; i++) + out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)]; + } else { + for (i = 0; i < n; i++) + out_i[i] = c->u.xpack.rmap[0]; + } + + return 0; +} + +static int cram_xpack_decode_expand_char(cram_slice *slice, cram_codec *c) { + cram_block *b = slice->block_by_id[512 + c->codec_id]; + if (b) + return 0; + + // get sub-codec data. 
+ cram_block *sub_b = c->u.xpack.sub_codec->get_block(slice, c->u.xpack.sub_codec); + if (!sub_b) + return -1; + + // Allocate local block to expand into + b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0); + if (!b) + return -1; + int n = sub_b->uncomp_size * 8/c->u.xpack.nbits; + BLOCK_GROW(b, n); + b->uncomp_size = n; + + uint8_t p[256]; + int z; + for (z = 0; z < 256; z++) + p[z] = c->u.xpack.rmap[z]; + hts_unpack(sub_b->data, sub_b->uncomp_size, b->data, b->uncomp_size, + 8 / c->u.xpack.nbits, p); + + return 0; + + block_err: + return -1; +} + +int cram_xpack_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + // FIXME: we need to ban data-series interleaving in the spec for this to work. + + // Remember this may be called when threaded and multi-slice per container. + // Hence one cram_codec instance, multiple slices, multiple blocks. + // We therefore have to cache appropriate block info in slice and not codec. + // b = cram_get_block_by_id(slice, c->external.content_id); + if (c->u.xpack.nval > 1) { + cram_xpack_decode_expand_char(slice, c); + cram_block *b = slice->block_by_id[512 + c->codec_id]; + if (!b) + return -1; + + if (out) + memcpy(out, b->data + b->byte, *out_size); + b->byte += *out_size; + } else { + memset(out, c->u.xpack.rmap[0], *out_size); + } + + return 0; +} + +void cram_xpack_decode_free(cram_codec *c) { + if (!c) return; + + if (c->u.xpack.sub_codec) + c->u.xpack.sub_codec->free(c->u.xpack.sub_codec); + + //free(slice->block_by_id[512 + c->codec_id]); + //slice->block_by_id[512 + c->codec_id] = 0; + + free(c); +} + +int cram_xpack_decode_size(cram_slice *slice, cram_codec *c) { + cram_xpack_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]->uncomp_size; +} + +cram_block *cram_xpack_get_block(cram_slice *slice, cram_codec *c) { + cram_xpack_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]; +} + +cram_codec 
*cram_xpack_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data; + char *endp = data+size; + + if (!(c = calloc(1, sizeof(*c)))) + return NULL; + + c->codec = E_XPACK; + if (option == E_LONG) + c->decode = cram_xpack_decode_long; + else if (option == E_INT) + c->decode = cram_xpack_decode_int; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode = cram_xpack_decode_char; + else { + fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n"); + goto malformed; + } + c->free = cram_xpack_decode_free; + c->size = cram_xpack_decode_size; + c->get_block = cram_xpack_get_block; + c->describe = NULL; + + c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL); + c->u.xpack.nval = vv->varint_get32(&cp, endp, NULL); + if (c->u.xpack.nbits >= 8 || c->u.xpack.nbits < 0 || + c->u.xpack.nval > 256 || c->u.xpack.nval < 0) + goto malformed; + int i; + for (i = 0; i < c->u.xpack.nval; i++) { + uint32_t v = vv->varint_get32(&cp, endp, NULL); + if (v >= 256) + goto malformed; + c->u.xpack.rmap[i] = v; // reverse map: e.g 0-3 to P,A,C,K + } + + int encoding = vv->varint_get32(&cp, endp, NULL); + int sub_size = vv->varint_get32(&cp, endp, NULL); + if (sub_size < 0 || endp - cp < sub_size) + goto malformed; + c->u.xpack.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size, + option, version, vv); + if (c->u.xpack.sub_codec == NULL) + goto malformed; + cp += sub_size; + + if (cp - data != size + || c->u.xpack.nbits < 0 || c->u.xpack.nbits > 8 * sizeof(int64_t)) { + malformed: + fprintf(stderr, "Malformed xpack header stream\n"); + cram_xpack_decode_free(c); + return NULL; + } + + return c; +} + +int cram_xpack_encode_flush(cram_codec *c) { + // Pack the buffered up data + int meta_len; + uint64_t out_len; + uint8_t out_meta[1024]; + uint8_t *out = hts_pack(BLOCK_DATA(c->out), BLOCK_SIZE(c->out), + out_meta, &meta_len, 
&out_len); + + // We now need to pass this through the next layer of transform + if (c->u.e_xpack.sub_codec->encode(NULL, // also indicates flush incoming + c->u.e_xpack.sub_codec, + (char *)out, out_len)) + return -1; + + int r = 0; + if (c->u.e_xpack.sub_codec->flush) + r = c->u.e_xpack.sub_codec->flush(c->u.e_xpack.sub_codec); + + free(out); + return r; +} + +int cram_xpack_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { + int len = 0, r = 0, n; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + // Store sub-codec + cram_codec *tc = c->u.e_xpack.sub_codec; + cram_block *tb = cram_new_block(0, 0); + if (!tb) + return -1; + int len2 = tc->store(tc, tb, NULL, version); + + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + + // codec length + int len1 = 0, i; + for (i = 0; i < c->u.e_xpack.nval; i++) + len1 += (n = c->vv->varint_size(c->u.e_xpack.rmap[i])), r |= n; + len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xpack.nbits) + + c->vv->varint_size(c->u.e_xpack.nval) + + len1 + len2)); r |= n; + + // The map and sub-codec + len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nbits)); r |= n; + len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nval)); r |= n; + for (i = 0; i < c->u.e_xpack.nval; i++) + len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.rmap[i])), r |= n; + + BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb)); + + cram_free_block(tb); + + return r > 0 ? 
len + len2 : -1; + + block_err: + return -1; +} + +// Same as cram_beta_encode_long +int cram_xpack_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *syms = (int64_t *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits); + + return r; +} + +int cram_xpack_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int *syms = (int *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits); + + return r; +} + +int cram_xpack_encode_char(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + BLOCK_APPEND(c->out, in, in_size); + return 0; + + block_err: + return -1; +} + +void cram_xpack_encode_free(cram_codec *c) { + if (!c) return; + + if (c->u.e_xpack.sub_codec) + c->u.e_xpack.sub_codec->free(c->u.e_xpack.sub_codec); + + cram_free_block(c->out); + + free(c); +} + +cram_codec *cram_xpack_encode_init(cram_stats *st, + enum cram_encoding codec, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_XPACK; + c->free = cram_xpack_encode_free; + if (option == E_LONG) + c->encode = cram_xpack_encode_long; + else if (option == E_INT) + c->encode = cram_xpack_encode_int; + else + c->encode = cram_xpack_encode_char; + c->store = cram_xpack_encode_store; + c->flush = cram_xpack_encode_flush; + + cram_xpack_encoder *e = (cram_xpack_encoder *)dat; + c->u.e_xpack.nbits = e->nbits; + c->u.e_xpack.nval = e->nval; + c->u.e_xpack.sub_codec = cram_encoder_init(e->sub_encoding, NULL, + E_BYTE_ARRAY, e->sub_codec_dat, + version, vv); + + // Initialise fwd and rev maps + memcpy(c->u.e_xpack.map, e->map, sizeof(e->map)); // P,A,C,K to 0,1,2,3 + int i, n; + for (i = n = 0; i < 256; i++) + if (e->map[i] != -1) + c->u.e_xpack.rmap[n++] = i; // 0,1,2,3 to P,A,C,K + if (n != 
e->nval) { + fprintf(stderr, "Incorrectly specified number of map items in PACK\n"); + return NULL; + } + + return c; +} + +/* + * --------------------------------------------------------------------------- + * XDELTA: subtract successive values, zig-zag to turn +/- to + only, + * and then var-int encode the result. + * + * This also has the additional requirement that the data series is not + * interleaved with another, permitting efficient encoding and decoding + * of all elements enmasse instead of needing to only extract the bits + * necessary per item. + */ + +static uint8_t zigzag8 (int8_t x) { return (x << 1) ^ (x >> 7); } +static uint16_t zigzag16(int16_t x) { return (x << 1) ^ (x >> 15); } +static uint32_t zigzag32(int32_t x) { return (x << 1) ^ (x >> 31); } + +//static int8_t unzigzag8 (uint8_t x) { return (x >> 1) ^ -(x & 1); } +static int16_t unzigzag16(uint16_t x) { return (x >> 1) ^ -(x & 1); } +static int32_t unzigzag32(uint32_t x) { return (x >> 1) ^ -(x & 1); } + +int cram_xdelta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + return -1; +} + +int cram_xdelta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + // Slow value-by-value method for now + uint32_t *out32 = (uint32_t *)out; + int i; + for (i = 0; i < *out_size; i++) { + uint32_t v; + int one = 1; + if (c->u.e_xdelta.sub_codec->decode(slice, c->u.e_xdelta.sub_codec, in, + (char *)&v, &one) < 0) + return -1; + uint32_t d = unzigzag32(v); + c->u.xdelta.last = out32[i] = d + c->u.xdelta.last; + } + + return 0; +} + +static int cram_xdelta_decode_expand_char(cram_slice *slice, cram_codec *c) { + return -1; +} + +int cram_xdelta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + return -1; +} + +static inline int16_t le_int2(int16_t i) { + int16_t s; + i16_to_le(i, (uint8_t *)&s); + return s; +} + +int cram_xdelta_decode_block(cram_slice *slice, cram_codec *c, cram_block 
*in, + char *out_, int *out_size) { + cram_block *out = (cram_block *)out_; + cram_block *b = c->u.e_xdelta.sub_codec->get_block(slice, c->u.e_xdelta.sub_codec); + int i = 0; + + const int w = c->u.xdelta.word_size; + uint32_t npad = (w - *out_size%w)%w; + uint32_t out_sz = *out_size + npad; + c->u.xdelta.last = 0; // reset for each new array + + for (i = 0; i < out_sz; i += w) { + uint16_t v; + // Need better interface + char *cp = (char *)b->data + b->byte; + char *cp_end = (char *)b->data + b->uncomp_size; + int err = 0; + v = c->vv->varint_get32(&cp, cp_end, &err); + if (err) + return -1; + b->byte = cp - (char *)b->data; + + switch(w) { + case 2: { + int16_t d = unzigzag16(v), z; + c->u.xdelta.last = d + c->u.xdelta.last; + z = le_int2(c->u.xdelta.last); + BLOCK_APPEND(out, &z, 2-npad); + npad = 0; + break; + } + default: + fprintf(stderr, "Unsupported word size by XDELTA\n"); + return -1; + } + } + + return 0; + + block_err: + return -1; +} + +void cram_xdelta_decode_free(cram_codec *c) { + if (!c) return; + + if (c->u.xdelta.sub_codec) + c->u.xdelta.sub_codec->free(c->u.xdelta.sub_codec); + + free(c); +} + +int cram_xdelta_decode_size(cram_slice *slice, cram_codec *c) { + cram_xdelta_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]->uncomp_size; +} + +cram_block *cram_xdelta_get_block(cram_slice *slice, cram_codec *c) { + cram_xdelta_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]; +} + +cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data; + char *endp = data+size; + + if (!(c = calloc(1, sizeof(*c)))) + return NULL; + + c->codec = E_XDELTA; + if (option == E_LONG) + c->decode = cram_xdelta_decode_long; + else if (option == E_INT) + c->decode = cram_xdelta_decode_int; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode 
= cram_xdelta_decode_char; + else if (option == E_BYTE_ARRAY_BLOCK) { + option = E_BYTE_ARRAY; + c->decode = cram_xdelta_decode_block; + } else { + free(c); + return NULL; + } + c->free = cram_xdelta_decode_free; + c->size = cram_xdelta_decode_size; + c->get_block = cram_xdelta_get_block; + c->describe = NULL; + + c->u.xdelta.word_size = vv->varint_get32(&cp, endp, NULL); + c->u.xdelta.last = 0; + + int encoding = vv->varint_get32(&cp, endp, NULL); + int sub_size = vv->varint_get32(&cp, endp, NULL); + if (sub_size < 0 || endp - cp < sub_size) + goto malformed; + c->u.xdelta.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size, + option, version, vv); + if (c->u.xdelta.sub_codec == NULL) + goto malformed; + cp += sub_size; + + if (cp - data != size) { + malformed: + fprintf(stderr, "Malformed xdelta header stream\n"); + cram_xdelta_decode_free(c); + return NULL; + } + + return c; +} + +int cram_xdelta_encode_flush(cram_codec *c) { + int r = -1; + cram_block *b = cram_new_block(0, 0); + if (!b) + return -1; + + switch (c->u.e_xdelta.word_size) { + case 2: { + // Delta + zigzag transform. + // Subtracting two 8-bit values has a 9-bit result (-255 to 255). + // However think of it as turning a wheel clockwise or anti-clockwise. + // If it has 256 gradations then a -ve rotation followed by a +ve + // rotation of the same amount reverses it regardless. + // + // Similarly the zig-zag transformation doesn't invent any extra bits, + // so the entire thing can be done in-situ. This may permit faster + // SIMD loops if we break apart the steps. + + // uint16_t last = 0, d; + // for (i = 0; i < n; i++) { + // d = io[i] - last; + // last = io[i]; + // io[i] = zigzag16(vd); + // } + + // --- vs --- + + // for (i = n-1; i >= 1; i--) + // io[i] -= io[i-1]; + // for (i = 0; i < n; i++) + // io[i] = zigzag16(io[i]); + + // varint: need array variant for speed here. 
+ // With zig-zag + int i, n = BLOCK_SIZE(c->out)/2;; + uint16_t *dat = (uint16_t *)BLOCK_DATA(c->out), last = 0; + + if (n*2 < BLOCK_SIZE(c->out)) { + // half word + last = *(uint8_t *)dat; + c->vv->varint_put32_blk(b, zigzag16(last)); + dat = (uint16_t *)(((uint8_t *)dat)+1); + } + + for (i = 0; i < n; i++) { + uint16_t d = dat[i] - last; // possibly unaligned + last = dat[i]; + c->vv->varint_put32_blk(b, zigzag16(d)); + } + + break; + } + + case 4: { + int i, n = BLOCK_SIZE(c->out)/4;; + uint32_t *dat = (uint32_t *)BLOCK_DATA(c->out), last = 0; + + for (i = 0; i < n; i++) { + uint32_t d = dat[i] - last; + last = dat[i]; + c->vv->varint_put32_blk(b, zigzag32(d)); + } + + break; + } + + case 1: { + int i, n = BLOCK_SIZE(c->out);; + uint8_t *dat = (uint8_t *)BLOCK_DATA(c->out), last = 0; + + for (i = 0; i < n; i++) { + uint32_t d = dat[i] - last; + last = dat[i]; + c->vv->varint_put32_blk(b, zigzag8(d)); + } + + break; + } + + default: + goto err; + } + + if (c->u.e_xdelta.sub_codec->encode(NULL, c->u.e_xdelta.sub_codec, + (char *)b->data, b->byte)) + goto err; + + r = 0; + + err: + cram_free_block(b); + return r; + +} + +int cram_xdelta_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { + int len = 0, r = 0, n; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + // Store sub-codec + cram_codec *tc = c->u.e_xdelta.sub_codec; + cram_block *tb = cram_new_block(0, 0); + if (!tb) + return -1; + int len2 = tc->store(tc, tb, NULL, version); + + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + + // codec length + len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xdelta.word_size) + + len2)); r |= n; + + // This and sub-codec + len += (n = c->vv->varint_put32_blk(b, c->u.e_xdelta.word_size)); r |= n; + BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb)); + + cram_free_block(tb); + + return r > 0 ? 
len + len2 : -1; + + block_err: + return -1; +} + +// Same as cram_beta_encode_long +int cram_xdelta_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + return -1; +} + +int cram_xdelta_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + return -1; +} + +int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + char *dat = malloc(in_size*5); + if (!dat) + return -1; + char *cp = dat, *cp_end = dat + in_size*5; + + c->u.e_xdelta.last = 0; // reset for each new array + if (c->u.e_xdelta.word_size == 2) { + int i, part; + + part = in_size%2; + if (part) { + uint16_t z = in[0]; + c->u.e_xdelta.last = le_int2(z); + cp += c->vv->varint_put32(cp, cp_end, zigzag16(c->u.e_xdelta.last)); + } + + uint16_t *in16 = (uint16_t *)(in+part); + for (i = 0; i < in_size/2; i++) { + uint16_t d = le_int2(in16[i]) - c->u.e_xdelta.last; + c->u.e_xdelta.last = le_int2(in16[i]); + cp += c->vv->varint_put32(cp, cp_end, zigzag16(d)); + } + } + if (c->u.e_xdelta.sub_codec->encode(slice, c->u.e_xdelta.sub_codec, + (char *)dat, cp-dat)) { + free(dat); + return -1; + } + + free(dat); + return 0; +} + +void cram_xdelta_encode_free(cram_codec *c) { + if (!c) return; + + if (c->u.e_xdelta.sub_codec) + c->u.e_xdelta.sub_codec->free(c->u.e_xdelta.sub_codec); + + cram_free_block(c->out); + + free(c); +} + +cram_codec *cram_xdelta_encode_init(cram_stats *st, + enum cram_encoding codec, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_XDELTA; + c->free = cram_xdelta_encode_free; + if (option == E_LONG) + c->encode = cram_xdelta_encode_long; + else if (option == E_INT) + c->encode = cram_xdelta_encode_int; + else + c->encode = cram_xdelta_encode_char; + c->store = cram_xdelta_encode_store; + c->flush = cram_xdelta_encode_flush; + + cram_xdelta_encoder *e = (cram_xdelta_encoder *)dat; + c->u.e_xdelta.word_size = 
e->word_size; + c->u.e_xdelta.last = 0; + c->u.e_xdelta.sub_codec = cram_encoder_init(e->sub_encoding, NULL, + E_BYTE_ARRAY, + e->sub_codec_dat, + version, vv); + + return c; +} + +/* + * --------------------------------------------------------------------------- + * XRLE + * + * This also has the additional requirement that the data series is not + * interleaved with another, permitting efficient encoding and decoding + * of all elements enmasse instead of needing to only extract the bits + * necessary per item. + */ +int cram_xrle_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + // TODO if and when needed + return -1; +} + +int cram_xrle_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + // TODO if and when needed + return -1; +} + +// Expands an XRLE transform and caches result in slice->block_by_id[] +static int cram_xrle_decode_expand_char(cram_slice *slice, cram_codec *c) { + cram_block *b = slice->block_by_id[512 + c->codec_id]; + if (b) + return 0; + + b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0); + if (!b) + return -1; + cram_block *lit_b = c->u.xrle.lit_codec->get_block(slice, c->u.xrle.lit_codec); + if (!lit_b) + return -1; + unsigned char *lit_dat = lit_b->data; + unsigned int lit_sz = lit_b->uncomp_size; + unsigned int len_sz = c->u.xrle.len_codec->size(slice, c->u.xrle.len_codec); + + cram_block *len_b = c->u.xrle.len_codec->get_block(slice, c->u.xrle.len_codec); + if (!len_b) + return -1; + unsigned char *len_dat = len_b->data; + + uint8_t rle_syms[256]; + int rle_nsyms = 0; + int i; + for (i = 0; i < 256; i++) { + if (c->u.xrle.rep_score[i] > 0) + rle_syms[rle_nsyms++] = i; + } + + uint64_t out_sz; + int nb = var_get_u64(len_dat, len_dat+len_sz, &out_sz); + if (!(b->data = malloc(out_sz))) + return -1; + hts_rle_decode(lit_dat, lit_sz, + len_dat+nb, len_sz-nb, + rle_syms, rle_nsyms, + b->data, &out_sz); + b->uncomp_size = out_sz; + + return 0; +} + 
+int cram_xrle_decode_size(cram_slice *slice, cram_codec *c) { + cram_xrle_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]->uncomp_size; +} + +cram_block *cram_xrle_get_block(cram_slice *slice, cram_codec *c) { + cram_xrle_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]; +} + +int cram_xrle_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int n = *out_size; + + cram_xrle_decode_expand_char(slice, c); + cram_block *b = slice->block_by_id[512 + c->codec_id]; + + memcpy(out, b->data + b->idx, n); + b->idx += n; + return 0; + + // Old code when not cached + while (n > 0) { + if (c->u.xrle.cur_len == 0) { + unsigned char lit; + int one = 1; + if (c->u.xrle.lit_codec->decode(slice, c->u.xrle.lit_codec, in, + (char *)&lit, &one) < 0) + return -1; + c->u.xrle.cur_lit = lit; + + if (c->u.xrle.rep_score[lit] > 0) { + if (c->u.xrle.len_codec->decode(slice, c->u.xrle.len_codec, in, + (char *)&c->u.xrle.cur_len, &one) < 0) + return -1; + } // else cur_len still zero + //else fprintf(stderr, "%d\n", lit); + + c->u.xrle.cur_len++; + } + + if (n >= c->u.xrle.cur_len) { + memset(out, c->u.xrle.cur_lit, c->u.xrle.cur_len); + out += c->u.xrle.cur_len; + n -= c->u.xrle.cur_len; + c->u.xrle.cur_len = 0; + } else { + memset(out, c->u.xrle.cur_lit, n); + out += n; + c->u.xrle.cur_len -= n; + n = 0; + } } return 0; } -void cram_beta_decode_free(cram_codec *c) { - if (c) - free(c); +void cram_xrle_decode_free(cram_codec *c) { + if (!c) return; + + if (c->u.xrle.len_codec) + c->u.xrle.len_codec->free(c->u.xrle.len_codec); + + if (c->u.xrle.lit_codec) + c->u.xrle.lit_codec->free(c->u.xrle.lit_codec); + + free(c); } -cram_codec *cram_beta_decode_init(char *data, int size, +cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c; char *cp 
= data; + char *endp = data+size; + int err = 0; - if (!(c = malloc(sizeof(*c)))) + if (!(c = calloc(1, sizeof(*c)))) return NULL; - c->codec = E_BETA; - if (option == E_INT) - c->decode = cram_beta_decode_int; - else if (option == E_LONG) - c->decode = cram_beta_decode_long; + c->codec = E_XRLE; + if (option == E_LONG) + c->decode = cram_xrle_decode_long; + else if (option == E_INT) + c->decode = cram_xrle_decode_int; else if (option == E_BYTE_ARRAY || option == E_BYTE) - c->decode = cram_beta_decode_char; + c->decode = cram_xrle_decode_char; else { - hts_log_error("BYTE_ARRAYs not supported by this codec"); + fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n"); free(c); return NULL; } - c->free = cram_beta_decode_free; + c->free = cram_xrle_decode_free; + c->size = cram_xrle_decode_size; + c->get_block = cram_xrle_get_block; + c->describe = NULL; + c->u.xrle.cur_len = 0; + c->u.xrle.cur_lit = -1; + + // RLE map + int i, j, nrle = vv->varint_get32(&cp, endp, &err); + memset(c->u.xrle.rep_score, 0, 256*sizeof(*c->u.xrle.rep_score)); + for (i = 0; i < nrle && i < 256; i++) { + j = vv->varint_get32(&cp, endp, &err); + if (j >= 0 && j < 256) + c->u.xrle.rep_score[j] = 1; + } - c->u.beta.nbits = -1; - cp += safe_itf8_get(cp, data + size, &c->u.beta.offset); - if (cp < data + size) // Ensure test below works - cp += safe_itf8_get(cp, data + size, &c->u.beta.nbits); + // Length and literal sub encodings + c->u.xrle.len_encoding = vv->varint_get32(&cp, endp, &err); + int sub_size = vv->varint_get32(&cp, endp, &err); + if (sub_size < 0 || endp - cp < sub_size) + goto malformed; + c->u.xrle.len_codec = cram_decoder_init(hdr, c->u.xrle.len_encoding, + cp, sub_size, E_INT, version, vv); + if (c->u.xrle.len_codec == NULL) + goto malformed; + cp += sub_size; - if (cp - data != size - || c->u.beta.nbits < 0 || c->u.beta.nbits > 8 * sizeof(int)) { - hts_log_error("Malformed beta header stream"); - free(c); - return NULL; - } + c->u.xrle.lit_encoding = 
vv->varint_get32(&cp, endp, &err); + sub_size = vv->varint_get32(&cp, endp, &err); + if (sub_size < 0 || endp - cp < sub_size) + goto malformed; + c->u.xrle.lit_codec = cram_decoder_init(hdr, c->u.xrle.lit_encoding, + cp, sub_size, option, version, vv); + if (c->u.xrle.lit_codec == NULL) + goto malformed; + cp += sub_size; + + if (err) + goto malformed; return c; + + malformed: + fprintf(stderr, "Malformed xrle header stream\n"); + cram_xrle_decode_free(c); + return NULL; } -int cram_beta_encode_store(cram_codec *c, cram_block *b, - char *prefix, int version) { +int cram_xrle_encode_flush(cram_codec *c) { + uint8_t *out_lit, *out_len; + uint64_t out_lit_size, out_len_size; + uint8_t rle_syms[256]; + int rle_nsyms = 0, i; + + for (i = 0; i < 256; i++) + if (c->u.e_xrle.rep_score[i] > 0) + rle_syms[rle_nsyms++] = i; + + if (!c->u.e_xrle.to_flush) { + c->u.e_xrle.to_flush = (char *)BLOCK_DATA(c->out); + c->u.e_xrle.to_flush_size = BLOCK_SIZE(c->out); + } + + out_len = malloc(c->u.e_xrle.to_flush_size+8); + if (!out_len) + return -1; + + int nb = var_put_u64(out_len, NULL, c->u.e_xrle.to_flush_size); + + out_lit = hts_rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size, + out_len+nb, &out_len_size, + rle_syms, &rle_nsyms, + NULL, &out_lit_size); + out_len_size += nb; + + + // TODO: can maybe "gift" the sub codec the data block, to remove + // one level of memcpy. 
+ if (c->u.e_xrle.len_codec->encode(NULL, + c->u.e_xrle.len_codec, + (char *)out_len, out_len_size)) + return -1; + + if (c->u.e_xrle.lit_codec->encode(NULL, + c->u.e_xrle.lit_codec, + (char *)out_lit, out_lit_size)) + return -1; + + free(out_len); + free(out_lit); + + return 0; +} + +int cram_xrle_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { int len = 0, r = 0, n; + cram_codec *tc; + cram_block *b_rle, *b_len, *b_lit; if (prefix) { size_t l = strlen(prefix); @@ -634,118 +2272,135 @@ int cram_beta_encode_store(cram_codec *c, cram_block *b, len += l; } - len += (n = itf8_put_blk(b, c->codec)); r |= n; - len += (n = itf8_put_blk(b, itf8_size(c->u.e_beta.offset) - + itf8_size(c->u.e_beta.nbits))); // codec length - r |= n; - len += (n = itf8_put_blk(b, c->u.e_beta.offset)); r |= n; - len += (n = itf8_put_blk(b, c->u.e_beta.nbits)); r |= n; + // List of symbols to RLE + b_rle = cram_new_block(0, 0); + if (!b_rle) + return -1; + int i, nrle = 0, len1 = 0; + for (i = 0; i < 256; i++) { + if (c->u.e_xrle.rep_score[i] > 0) { + nrle++; + len1 += (n = c->vv->varint_put32_blk(b_rle,i)); r |= n; + } + } - if (r > 0) return len; + // Store length and literal sub-codecs to get encoded length + tc = c->u.e_xrle.len_codec; + b_len = cram_new_block(0, 0); + if (!b_len) + return -1; + int len2 = tc->store(tc, b_len, NULL, version); + + tc = c->u.e_xrle.lit_codec; + b_lit = cram_new_block(0, 0); + if (!b_lit) + return -1; + int len3 = tc->store(tc, b_lit, NULL, version); + + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + len += (n = c->vv->varint_put32_blk(b, len1 + len2 + len3 + + c->vv->varint_size(nrle))); r |= n; + len += (n = c->vv->varint_put32_blk(b, nrle)); r |= n; + BLOCK_APPEND(b, BLOCK_DATA(b_rle), BLOCK_SIZE(b_rle)); + BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len)); + BLOCK_APPEND(b, BLOCK_DATA(b_lit), BLOCK_SIZE(b_lit)); + + cram_free_block(b_rle); + cram_free_block(b_len); + cram_free_block(b_lit); + + if (r > 0) + 
return len + len1 + len2 + len3; block_err: return -1; } -int cram_beta_encode_long(cram_slice *slice, cram_codec *c, - char *in, int in_size) { - int64_t *syms = (int64_t *)in; - int i, r = 0; - - for (i = 0; i < in_size; i++) - r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, - c->u.e_beta.nbits); +int cram_xrle_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + // TODO if and when needed + return -1; +} - return r; +int cram_xrle_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + // TODO if and when needed + return -1; } -int cram_beta_encode_int(cram_slice *slice, cram_codec *c, - char *in, int in_size) { - int *syms = (int *)in; - int i, r = 0; +int cram_xrle_encode_char(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + if (c->u.e_xrle.to_flush) { + if (!c->out && !(c->out = cram_new_block(0, 0))) + return -1; + BLOCK_APPEND(c->out, c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size); + c->u.e_xrle.to_flush = NULL; + c->u.e_xrle.to_flush_size = 0; + } - for (i = 0; i < in_size; i++) - r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, - c->u.e_beta.nbits); + if (c->out && BLOCK_SIZE(c->out) > 0) { + // Gathering data + BLOCK_APPEND(c->out, in, in_size); + return 0; + } - return r; + // else cache copy of the data we're about to send to flush instead. 
+ c->u.e_xrle.to_flush = in; + c->u.e_xrle.to_flush_size = in_size; + return 0; + + block_err: + return -1; } -int cram_beta_encode_char(cram_slice *slice, cram_codec *c, - char *in, int in_size) { - unsigned char *syms = (unsigned char *)in; - int i, r = 0; +void cram_xrle_encode_free(cram_codec *c) { + if (!c) return; - for (i = 0; i < in_size; i++) - r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, - c->u.e_beta.nbits); + if (c->u.e_xrle.len_codec) + c->u.e_xrle.len_codec->free(c->u.e_xrle.len_codec); + if (c->u.e_xrle.lit_codec) + c->u.e_xrle.lit_codec->free(c->u.e_xrle.lit_codec); - return r; -} + cram_free_block(c->out); -void cram_beta_encode_free(cram_codec *c) { - if (c) free(c); + free(c); } -cram_codec *cram_beta_encode_init(cram_stats *st, +cram_codec *cram_xrle_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { cram_codec *c; - int min_val, max_val, len = 0; - int64_t range; - c = malloc(sizeof(*c)); - if (!c) + if (!(c = malloc(sizeof(*c)))) return NULL; - c->codec = E_BETA; - c->free = cram_beta_encode_free; - if (option == E_INT) - c->encode = cram_beta_encode_int; - else if (option == E_LONG) - c->encode = cram_beta_encode_long; - else - c->encode = cram_beta_encode_char; - c->store = cram_beta_encode_store; - if (dat) { - min_val = ((int *)dat)[0]; - max_val = ((int *)dat)[1]; - } else { - min_val = INT_MAX; - max_val = INT_MIN; - int i; - for (i = 0; i < MAX_STAT_VAL; i++) { - if (!st->freqs[i]) - continue; - if (min_val > i) - min_val = i; - max_val = i; - } - if (st->h) { - khint_t k; + c->codec = E_XRLE; + c->free = cram_xrle_encode_free; + if (option == E_LONG) + c->encode = cram_xrle_encode_long; + else if (option == E_INT) + c->encode = cram_xrle_encode_int; + else + c->encode = cram_xrle_encode_char; + c->store = cram_xrle_encode_store; + c->flush = cram_xrle_encode_flush; - for (k = kh_begin(st->h); k != kh_end(st->h); k++) { - if 
(!kh_exist(st->h, k)) - continue; + cram_xrle_encoder *e = (cram_xrle_encoder *)dat; - i = kh_key(st->h, k); - if (min_val > i) - min_val = i; - if (max_val < i) - max_val = i; - } - } - } + c->u.e_xrle.len_codec = cram_encoder_init(e->len_encoding, NULL, + E_BYTE, e->len_dat, + version, vv); + c->u.e_xrle.lit_codec = cram_encoder_init(e->lit_encoding, NULL, + E_BYTE, e->lit_dat, + version, vv); + c->u.e_xrle.cur_lit = -1; + c->u.e_xrle.cur_len = -1; + c->u.e_xrle.to_flush = NULL; + c->u.e_xrle.to_flush_size = 0; - assert(max_val >= min_val); - c->u.e_beta.offset = -min_val; - range = (int64_t) max_val - min_val; - while (range) { - len++; - range >>= 1; - } - c->u.e_beta.nbits = len; + memcpy(c->u.e_xrle.rep_score, e->rep_score, 256*sizeof(*c->u.e_xrle.rep_score)); return c; } @@ -803,9 +2458,18 @@ void cram_subexp_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_subexp_decode_init(char *data, int size, +int cram_subexp_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "SUBEXP(offset=%d,k=%d)", + c->u.subexp.offset, + c->u.subexp.k) + < 0 ? 
-1 : 0; +} + +cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c; char *cp = data; @@ -820,10 +2484,11 @@ cram_codec *cram_subexp_decode_init(char *data, int size, c->codec = E_SUBEXP; c->decode = cram_subexp_decode; c->free = cram_subexp_decode_free; + c->describe = cram_subexp_describe; c->u.subexp.k = -1; - cp += safe_itf8_get(cp, data + size, &c->u.subexp.offset); - cp += safe_itf8_get(cp, data + size, &c->u.subexp.k); + c->u.subexp.offset = vv->varint_get32(&cp, data + size, NULL); + c->u.subexp.k = vv->varint_get32(&cp, data + size, NULL); if (cp - data != size || c->u.subexp.k < 0) { hts_log_error("Malformed subexp header stream"); @@ -867,9 +2532,16 @@ void cram_gamma_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_gamma_decode_init(char *data, int size, +int cram_gamma_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "GAMMA(offset=%d)", c->u.subexp.offset) + < 0 ? 
-1 : 0; +} + +cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c = NULL; char *cp = data; @@ -887,8 +2559,9 @@ cram_codec *cram_gamma_decode_init(char *data, int size, c->codec = E_GAMMA; c->decode = cram_gamma_decode; c->free = cram_gamma_decode_free; + c->describe = cram_gamma_describe; - cp += safe_itf8_get(cp, data + size, &c->u.gamma.offset); + c->u.gamma.offset = vv->varint_get32(&cp, data+size, NULL); if (cp - data != size) goto malformed; @@ -1079,12 +2752,30 @@ int cram_huffman_decode_long(cram_slice *slice, cram_codec *c, return 0; } +int cram_huffman_describe(cram_codec *c, kstring_t *ks) { + int r = 0, n; + r |= ksprintf(ks, "HUFFMAN(codes={") < 0; + for (n = 0; n < c->u.huffman.ncodes; n++) { + r |= ksprintf(ks, "%s%"PRId64, n?",":"", + c->u.huffman.codes[n].symbol); + } + r |= ksprintf(ks, "},lengths={") < 0; + for (n = 0; n < c->u.huffman.ncodes; n++) { + r |= ksprintf(ks, "%s%d", n?",":"", + c->u.huffman.codes[n].len); + } + r |= ksprintf(ks, "})") < 0; + return r; +} + /* * Initialises a huffman decoder from an encoding data stream. 
*/ -cram_codec *cram_huffman_decode_init(char *data, int size, +cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { int32_t ncodes = 0, i, j; char *cp = data, *data_end = &data[size]; cram_codec *h; @@ -1092,14 +2783,14 @@ cram_codec *cram_huffman_decode_init(char *data, int size, int32_t val, last_len, max_len = 0; uint32_t max_val; // needs one more bit than val const int max_code_bits = sizeof(val) * 8 - 1; - int l; + int err = 0; if (option == E_BYTE_ARRAY_BLOCK) { hts_log_error("BYTE_ARRAYs not supported by this codec"); return NULL; } - cp += safe_itf8_get(cp, data_end, &ncodes); + ncodes = vv->varint_get32(&cp, data_end, &err); if (ncodes < 0) { hts_log_error("Invalid number of symbols in huffman stream"); return NULL; @@ -1108,7 +2799,12 @@ cram_codec *cram_huffman_decode_init(char *data, int size, errno = ENOMEM; return NULL; } - +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (ncodes > FUZZ_ALLOC_LIMIT / sizeof(*codes)) { + errno = ENOMEM; + return NULL; + } +#endif h = calloc(1, sizeof(*h)); if (!h) return NULL; @@ -1117,6 +2813,7 @@ cram_codec *cram_huffman_decode_init(char *data, int size, h->free = cram_huffman_decode_free; h->u.huffman.ncodes = ncodes; + h->u.huffman.option = option; if (ncodes) { codes = h->u.huffman.codes = malloc(ncodes * sizeof(*codes)); if (!codes) { @@ -1129,21 +2826,19 @@ cram_codec *cram_huffman_decode_init(char *data, int size, /* Read symbols and bit-lengths */ if (option == E_LONG) { - for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) { - l = safe_ltf8_get(cp, data_end, &codes[i].symbol); - } + for (i = 0; i < ncodes; i++) + codes[i].symbol = vv->varint_get64(&cp, data_end, &err); + } else if (option == E_INT || option == E_BYTE) { + for (i = 0; i < ncodes; i++) + codes[i].symbol = vv->varint_get32(&cp, data_end, &err); } else { - for (i = 0, l = 1; i < ncodes && 
l > 0; i++, cp += l) { - int32_t i32; - l = safe_itf8_get(cp, data_end, &i32); - codes[i].symbol = i32; - } + goto malformed; } - if (l < 1) + if (err) goto malformed; - cp += safe_itf8_get(cp, data_end, &i); + i = vv->varint_get32(&cp, data_end, &err); if (i != ncodes) goto malformed; @@ -1154,9 +2849,9 @@ cram_codec *cram_huffman_decode_init(char *data, int size, return h; } - for (i = 0, l = 1; i < ncodes; i++, cp += l) { - l = safe_itf8_get(cp, data_end, &codes[i].len); - if (l < 1) + for (i = 0; i < ncodes; i++) { + codes[i].len = vv->varint_get32(&cp, data_end, &err); + if (err) break; if (codes[i].len < 0) { hts_log_error("Huffman code length (%d) is negative", codes[i].len); @@ -1165,16 +2860,14 @@ cram_codec *cram_huffman_decode_init(char *data, int size, if (max_len < codes[i].len) max_len = codes[i].len; } - if (l < 1 || cp - data != size || max_len >= ncodes) + if (err || cp - data != size || max_len >= ncodes) goto malformed; /* 31 is max. bits available in val */ if (max_len > max_code_bits) { hts_log_error("Huffman code length (%d) is greater " "than maximum supported (%d)", max_len, max_code_bits); - free(h); - free(codes); - return NULL; + goto malformed; } /* Sort by bit length and then by symbol value */ @@ -1229,12 +2922,12 @@ cram_codec *cram_huffman_decode_init(char *data, int size, h->decode = cram_huffman_decode_char0; else h->decode = cram_huffman_decode_char; - } else if (option == E_LONG) { + } else if (option == E_LONG || option == E_SLONG) { if (h->u.huffman.codes[0].len == 0) h->decode = cram_huffman_decode_long0; else h->decode = cram_huffman_decode_long; - } else if (option == E_INT) { + } else if (option == E_INT || option == E_SINT || option == E_BYTE) { if (h->u.huffman.codes[0].len == 0) h->decode = cram_huffman_decode_int0; else @@ -1242,6 +2935,7 @@ cram_codec *cram_huffman_decode_init(char *data, int size, } else { return NULL; } + h->describe = cram_huffman_describe; return (cram_codec *)h; @@ -1389,7 +3083,7 @@ int 
cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory */ char *tmp = malloc(6*c->u.e_huffman.nvals+16); - char *tp = tmp; + char *tp = tmp, *tpend = tmp+6*c->u.e_huffman.nvals+16; if (!tmp) return -1; @@ -1400,24 +3094,33 @@ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, len += l; } - tp += itf8_put(tp, c->u.e_huffman.nvals); + tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals); if (c->u.e_huffman.option == E_LONG) { for (i = 0; i < c->u.e_huffman.nvals; i++) { - tp += ltf8_put(tp, codes[i].symbol); + tp += c->vv->varint_put64(tp, tpend, codes[i].symbol); } - } else { + } else if (c->u.e_huffman.option == E_SLONG) { + for (i = 0; i < c->u.e_huffman.nvals; i++) { + tp += c->vv->varint_put64s(tp, tpend, codes[i].symbol); + } + } else if (c->u.e_huffman.option == E_INT || c->u.e_huffman.option == E_BYTE) { + for (i = 0; i < c->u.e_huffman.nvals; i++) { + tp += c->vv->varint_put32(tp, tpend, codes[i].symbol); + } + } else if (c->u.e_huffman.option == E_SINT) { for (i = 0; i < c->u.e_huffman.nvals; i++) { - tp += itf8_put(tp, codes[i].symbol); + tp += c->vv->varint_put32s(tp, tpend, codes[i].symbol); } + } else { + return -1; } - tp += itf8_put(tp, c->u.e_huffman.nvals); - for (i = 0; i < c->u.e_huffman.nvals; i++) { - tp += itf8_put(tp, codes[i].len); - } + tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals); + for (i = 0; i < c->u.e_huffman.nvals; i++) + tp += c->vv->varint_put32(tp, tpend, codes[i].len); - len += (n = itf8_put_blk(b, c->codec)); r |= n; - len += (n = itf8_put_blk(b, tp-tmp)); r |= n; + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + len += (n = c->vv->varint_put32_blk(b, tp-tmp)); r |= n; BLOCK_APPEND(b, tmp, tp-tmp); len += tp-tmp; @@ -1431,12 +3134,14 @@ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, } cram_codec *cram_huffman_encode_init(cram_stats *st, + enum cram_encoding codec, enum 
cram_external_type option, void *dat, - int version) { - int *vals = NULL, *freqs = NULL, vals_alloc = 0, *lens = NULL, code, len; + int version, varint_vec *vv) { + int *vals = NULL, *freqs = NULL, *lens = NULL, code, len; int *new_vals, *new_freqs; - int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k; + int i, max_val = 0, min_val = INT_MAX, k; + size_t nvals, vals_alloc = 0; cram_codec *c; cram_huffman_code *codes; @@ -1461,7 +3166,6 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, vals[nvals] = i; freqs[nvals] = st->freqs[i]; assert(st->freqs[i] > 0); - ntot += freqs[nvals]; if (max_val < i) max_val = i; if (min_val > i) min_val = i; nvals++; @@ -1484,7 +3188,6 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, vals[nvals]= kh_key(st->h, k); freqs[nvals] = kh_val(st->h, k); assert(freqs[nvals] > 0); - ntot += freqs[nvals]; if (max_val < i) max_val = i; if (min_val > i) min_val = i; nvals++; @@ -1594,18 +3297,21 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, c->encode = cram_huffman_encode_char0; else c->encode = cram_huffman_encode_char; - } else if (option == E_INT) { + } else if (option == E_INT || option == E_SINT) { if (c->u.e_huffman.codes[0].len == 0) c->encode = cram_huffman_encode_int0; else c->encode = cram_huffman_encode_int; - } else if (option == E_LONG) { + } else if (option == E_LONG || option == E_SLONG) { if (c->u.e_huffman.codes[0].len == 0) c->encode = cram_huffman_encode_long0; else c->encode = cram_huffman_encode_long; + } else { + return NULL; } c->store = cram_huffman_encode_store; + c->flush = NULL; return c; @@ -1658,14 +3364,30 @@ void cram_byte_array_len_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_byte_array_len_decode_init(char *data, int size, +int cram_byte_array_len_describe(cram_codec *c, kstring_t *ks) { + int r = 0; + r |= ksprintf(ks, "BYTE_ARRAY_LEN(len_codec={") < 0; + cram_byte_array_len_decoder *l = &c->u.byte_array_len; + r |= l->len_codec->describe + ? 
l->len_codec->describe(l->len_codec, ks) + : (ksprintf(ks, "?")<0); + r |= ksprintf(ks, "},val_codec={") < 0; + r |= l->val_codec->describe + ? l->val_codec->describe(l->val_codec, ks) + : (ksprintf(ks, "?")<0); + r |= ksprintf(ks, "}") < 0; + + return r; +} + +cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c; char *cp = data; char *endp = data + size; - int32_t encoding = 0; - int32_t sub_size = -1; if (!(c = malloc(sizeof(*c)))) return NULL; @@ -1673,26 +3395,26 @@ cram_codec *cram_byte_array_len_decode_init(char *data, int size, c->codec = E_BYTE_ARRAY_LEN; c->decode = cram_byte_array_len_decode; c->free = cram_byte_array_len_decode_free; + c->describe = cram_byte_array_len_describe; c->u.byte_array_len.len_codec = NULL; c->u.byte_array_len.val_codec = NULL; - cp += safe_itf8_get(cp, endp, &encoding); - cp += safe_itf8_get(cp, endp, &sub_size); + int encoding = vv->varint_get32(&cp, endp, NULL); + int sub_size = vv->varint_get32(&cp, endp, NULL); if (sub_size < 0 || endp - cp < sub_size) goto malformed; - c->u.byte_array_len.len_codec = cram_decoder_init(encoding, cp, sub_size, - E_INT, version); + c->u.byte_array_len.len_codec = cram_decoder_init(hdr, encoding, cp, sub_size, + E_INT, version, vv); if (c->u.byte_array_len.len_codec == NULL) goto no_codec; cp += sub_size; - sub_size = -1; - cp += safe_itf8_get(cp, endp, &encoding); - cp += safe_itf8_get(cp, endp, &sub_size); + encoding = vv->varint_get32(&cp, endp, NULL); + sub_size = vv->varint_get32(&cp, endp, NULL); if (sub_size < 0 || endp - cp < sub_size) goto malformed; - c->u.byte_array_len.val_codec = cram_decoder_init(encoding, cp, sub_size, - option, version); + c->u.byte_array_len.val_codec = cram_decoder_init(hdr, encoding, cp, sub_size, + option, version, vv); if (c->u.byte_array_len.val_codec == NULL) goto no_codec; cp += 
sub_size; @@ -1760,8 +3482,8 @@ int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b, len3 = tc->store(tc, b_val, NULL, version); if (len3 < 0) goto block_err; - len += (n = itf8_put_blk(b, c->codec)); r |= n; - len += (n = itf8_put_blk(b, len2+len3)); r |= n; + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + len += (n = c->vv->varint_put32_blk(b, len2+len3)); r |= n; BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len)); BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val)); @@ -1778,9 +3500,10 @@ int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b, } cram_codec *cram_byte_array_len_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { cram_codec *c; cram_byte_array_len_encoder *e = (cram_byte_array_len_encoder *)dat; @@ -1791,15 +3514,16 @@ cram_codec *cram_byte_array_len_encode_init(cram_stats *st, c->free = cram_byte_array_len_encode_free; c->encode = cram_byte_array_len_encode; c->store = cram_byte_array_len_encode_store; + c->flush = NULL; c->u.e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding, st, E_INT, e->len_dat, - version); + version, vv); c->u.e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding, NULL, E_BYTE_ARRAY, e->val_dat, - version); + version, vv); if (!c->u.e_byte_array_len.len_codec || !c->u.e_byte_array_len.val_codec) { @@ -1855,8 +3579,8 @@ int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, int *out_size) { cram_block *b; cram_block *out = (cram_block *)out_; - char *cp, *out_cp, *cp_end; - char stop; + unsigned char *cp, *cp_end; + unsigned char stop; b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id); if (!b) @@ -1864,25 +3588,25 @@ int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, if (b->idx >= b->uncomp_size) return -1; - cp = (char *)b->data + b->idx; - cp_end = (char *)b->data + b->uncomp_size; - out_cp = (char 
*)BLOCK_END(out); + cp = b->data + b->idx; + cp_end = b->data + b->uncomp_size; stop = c->u.byte_array_stop.stop; if (cp_end - cp < out->alloc - out->byte) { + unsigned char *out_cp = BLOCK_END(out); while (cp != cp_end && *cp != stop) *out_cp++ = *cp++; - BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out); + BLOCK_SIZE(out) = out_cp - BLOCK_DATA(out); } else { - char *cp_start; + unsigned char *cp_start; for (cp_start = cp; cp != cp_end && *cp != stop; cp++) ; BLOCK_APPEND(out, cp_start, cp - cp_start); BLOCK_GROW(out, cp - cp_start); } - *out_size = cp - (char *)(b->data + b->idx); - b->idx = cp - (char *)b->data + 1; + *out_size = cp - (b->data + b->idx); + b->idx = cp - b->data + 1; return 0; @@ -1896,11 +3620,21 @@ void cram_byte_array_stop_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_byte_array_stop_decode_init(char *data, int size, +int cram_byte_array_stop_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "BYTE_ARRAY_STOP(stop=%d,id=%d)", + c->u.byte_array_stop.stop, + c->u.byte_array_stop.content_id) + < 0 ? -1 : 0; +} + +cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c = NULL; unsigned char *cp = (unsigned char *)data; + int err = 0; if (size < (CRAM_MAJOR_VERS(version) == 1 ? 
5 : 2)) goto malformed; @@ -1922,6 +3656,7 @@ cram_codec *cram_byte_array_stop_decode_init(char *data, int size, return NULL; } c->free = cram_byte_array_stop_decode_free; + c->describe = cram_byte_array_stop_describe; c->u.byte_array_stop.stop = *cp++; if (CRAM_MAJOR_VERS(version) == 1) { @@ -1929,11 +3664,10 @@ cram_codec *cram_byte_array_stop_decode_init(char *data, int size, + ((unsigned int) cp[3]<<24); cp += 4; } else { - cp += safe_itf8_get((char *) cp, data + size, - &c->u.byte_array_stop.content_id); + c->u.byte_array_stop.content_id = vv->varint_get32((char **)&cp, data+size, &err); } - if ((char *)cp - data != size) + if ((char *)cp - data != size || err) goto malformed; return c; @@ -1971,19 +3705,20 @@ int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b, len += l; } - cp += itf8_put(cp, c->codec); + cp += c->vv->varint_put32(cp, buf+20, c->codec); if (CRAM_MAJOR_VERS(version) == 1) { - cp += itf8_put(cp, 5); + cp += c->vv->varint_put32(cp, buf+20, 5); *cp++ = c->u.e_byte_array_stop.stop; *cp++ = (c->u.e_byte_array_stop.content_id >> 0) & 0xff; *cp++ = (c->u.e_byte_array_stop.content_id >> 8) & 0xff; *cp++ = (c->u.e_byte_array_stop.content_id >> 16) & 0xff; *cp++ = (c->u.e_byte_array_stop.content_id >> 24) & 0xff; } else { - cp += itf8_put(cp, 1 + itf8_size(c->u.e_byte_array_stop.content_id)); + cp += c->vv->varint_put32(cp, buf+20, 1 + + c->vv->varint_size(c->u.e_byte_array_stop.content_id)); *cp++ = c->u.e_byte_array_stop.stop; - cp += itf8_put(cp, c->u.e_byte_array_stop.content_id); + cp += c->vv->varint_put32(cp, buf+20, c->u.e_byte_array_stop.content_id); } BLOCK_APPEND(b, buf, cp-buf); @@ -1996,9 +3731,10 @@ int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b, } cram_codec *cram_byte_array_stop_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { cram_codec *c; c = malloc(sizeof(*c)); @@ -2008,6 +3744,7 @@ cram_codec 
*cram_byte_array_stop_encode_init(cram_stats *st, c->free = cram_byte_array_stop_encode_free; c->encode = cram_byte_array_stop_encode; c->store = cram_byte_array_stop_encode_store; + c->flush = NULL; c->u.e_byte_array_stop.stop = ((int *)dat)[0]; c->u.e_byte_array_stop.content_id = ((int *)dat)[1]; @@ -2031,33 +3768,68 @@ const char *cram_encoding2str(enum cram_encoding t) { case E_SUBEXP: return "SUBEXP"; case E_GOLOMB_RICE: return "GOLOMB_RICE"; case E_GAMMA: return "GAMMA"; + + case E_VARINT_UNSIGNED: return "VARINT_UNSIGNED"; + case E_VARINT_SIGNED: return "VARINT_SIGNED"; + case E_CONST_BYTE: return "CONST_BYTE"; + case E_CONST_INT: return "CONST_INT"; + case E_NUM_CODECS: default: return "?"; } } -static cram_codec *(*decode_init[])(char *data, +static cram_codec *(*decode_init[])(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, enum cram_external_type option, - int version) = { - NULL, + int version, varint_vec *vv) = { + // CRAM 3.0 valid codecs + NULL, // null codec cram_external_decode_init, - NULL, + NULL, // golomb cram_huffman_decode_init, cram_byte_array_len_decode_init, cram_byte_array_stop_decode_init, cram_beta_decode_init, cram_subexp_decode_init, - NULL, + NULL, // golomb rice cram_gamma_decode_init, + + // Gap between CRAM 3 and CRAM 4; 9 to 39 inclusive + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + NULL, // was xbyte + cram_varint_decode_init, // varint unsigned + cram_varint_decode_init, // varint signed + cram_const_decode_init, // const byte + cram_const_decode_init, // const int + + // Gap to CRAM 4 transfomrations; 45 to 49 inclusive + NULL, NULL, NULL, NULL, NULL, + + NULL, // xhuffman + cram_xpack_decode_init, + cram_xrle_decode_init, + cram_xdelta_decode_init, }; -cram_codec *cram_decoder_init(enum cram_encoding codec, +cram_codec 
*cram_decoder_init(cram_block_compression_hdr *hdr, + enum cram_encoding codec, char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { if (codec >= E_NULL && codec < E_NUM_CODECS && decode_init[codec]) { - return decode_init[codec](data, size, option, version); + cram_codec *r = decode_init[codec](hdr, data, size, codec, + option, version, vv); + if (r) { + r->vv = vv; + r->codec_id = hdr->ncodecs++; + } + return r; } else { hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec)); return NULL; @@ -2065,33 +3837,70 @@ cram_codec *cram_decoder_init(enum cram_encoding codec, } static cram_codec *(*encode_init[])(cram_stats *stx, + enum cram_encoding codec, enum cram_external_type option, void *opt, - int version) = { - NULL, - cram_external_encode_init, - NULL, + int version, varint_vec *vv) = { + // CRAM 3.0 valid codecs + NULL, // null codec + cram_external_encode_init, // int/bytes in cram 3, byte only in cram 4 + NULL, // golomb cram_huffman_encode_init, cram_byte_array_len_encode_init, cram_byte_array_stop_encode_init, cram_beta_encode_init, - NULL, //cram_subexp_encode_init, - NULL, - NULL, //cram_gamma_encode_init, + NULL, // subexponential (we support decode only) + NULL, // golomb rice + NULL, // gamma (we support decode only) + + // Gap between CRAM 3 and CRAM 4; 10 to 39 inclusive + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + NULL, // was xbyte + cram_varint_encode_init, // varint unsigned + cram_varint_encode_init, // varint signed + cram_const_encode_init, // const byte + cram_const_encode_init, // const int + + // Gap to CRAM 4 transformations; 45 to 49 inclusive + NULL, NULL, NULL, NULL, NULL, + + NULL, // xhuffman + cram_xpack_encode_init, + cram_xrle_encode_init, + cram_xdelta_encode_init, }; cram_codec *cram_encoder_init(enum cram_encoding
codec, cram_stats *st, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { if (st && !st->nvals) return NULL; + // cram_stats_encoding assumes integer data, but if option + // is E_BYTE then tweak the requested encoding. This ought + // to be fixed in cram_stats_encoding instead. + if (option == E_BYTE || option == E_BYTE_ARRAY || + option == E_BYTE_ARRAY_BLOCK) { + if (codec == E_VARINT_SIGNED || codec == E_VARINT_UNSIGNED) + codec = E_EXTERNAL; + else if (codec == E_CONST_INT) + codec = E_CONST_BYTE; + } + if (encode_init[codec]) { cram_codec *r; - if ((r = encode_init[codec](st, option, dat, version))) + if ((r = encode_init[codec](st, codec, option, dat, version, vv))) r->out = NULL; + if (!r) { + hts_log_error("Unable to initialise codec of type %s", cram_encoding2str(codec)); + return NULL; + } + r->vv = vv; return r; } else { hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec)); @@ -2108,29 +3917,43 @@ int cram_codec_to_id(cram_codec *c, int *id2) { int bnum1, bnum2 = -2; switch (c->codec) { + case E_CONST_INT: + case E_CONST_BYTE: + bnum1 = -2; // no blocks used + break; + case E_HUFFMAN: bnum1 = c->u.huffman.ncodes == 1 ? 
-2 : -1; break; + case E_GOLOMB: case E_BETA: case E_SUBEXP: case E_GOLOMB_RICE: case E_GAMMA: + // CORE block bnum1 = -1; break; + case E_EXTERNAL: + case E_VARINT_UNSIGNED: + case E_VARINT_SIGNED: bnum1 = c->u.external.content_id; break; + case E_BYTE_ARRAY_LEN: bnum1 = cram_codec_to_id(c->u.byte_array_len.len_codec, NULL); bnum2 = cram_codec_to_id(c->u.byte_array_len.val_codec, NULL); break; + case E_BYTE_ARRAY_STOP: bnum1 = c->u.byte_array_stop.content_id; break; + case E_NULL: bnum1 = -2; break; + default: hts_log_error("Unknown codec type %d", c->codec); bnum1 = -1; @@ -2157,6 +3980,12 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { int j; switch (c->codec) { + case E_CONST_INT: + case E_CONST_BYTE: + // shares struct with decode + c->store = cram_const_encode_store; + break; + case E_EXTERNAL: // shares struct with decode c->free = cram_external_encode_free; @@ -2167,6 +3996,25 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { c->encode = cram_external_encode_long; else if (c->decode == cram_external_decode_char) c->encode = cram_external_encode_char; + else if (c->decode == cram_external_decode_block) + c->encode = cram_external_encode_char; + else + return -1; + break; + + case E_VARINT_SIGNED: + case E_VARINT_UNSIGNED: + // shares struct with decode + c->free = cram_varint_encode_free; + c->store = cram_varint_encode_store; + if (c->decode == cram_varint_decode_int) + c->encode = cram_varint_encode_int; + else if (c->decode == cram_varint_decode_sint) + c->encode = cram_varint_encode_sint; + else if (c->decode == cram_varint_decode_long) + c->encode = cram_varint_encode_long; + else if (c->decode == cram_varint_decode_slong) + c->encode = cram_varint_encode_slong; else return -1; break; @@ -2177,11 +4025,13 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { // unify this. 
cram_codec *t = malloc(sizeof(*t)); if (!t) return -1; + t->vv = c->vv; t->codec = E_HUFFMAN; t->free = cram_huffman_encode_free; t->store = cram_huffman_encode_store; t->u.e_huffman.codes = c->u.huffman.codes; t->u.e_huffman.nvals = c->u.huffman.ncodes; + t->u.e_huffman.option = c->u.huffman.option; for (j = 0; j < t->u.e_huffman.nvals; j++) { int32_t sym = t->u.e_huffman.codes[j].symbol; if (sym >= -1 && sym < MAX_HUFF) @@ -2223,10 +4073,31 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { return -1; break; + case E_XPACK: { + // shares struct with decode + cram_codec t = *c; + t.free = cram_xpack_encode_free; + t.store = cram_xpack_encode_store; + if (t.decode == cram_xpack_decode_long) + t.encode = cram_xpack_encode_long; + else if (t.decode == cram_xpack_decode_int) + t.encode = cram_xpack_encode_int; + else if (t.decode == cram_xpack_decode_char) + t.encode = cram_xpack_encode_char; + else + return -1; + t.u.e_xpack.sub_codec = t.u.xpack.sub_codec; + if (cram_codec_decoder2encoder(fd, t.u.e_xpack.sub_codec) == -1) + return -1; + *c = t; + break; + } + case E_BYTE_ARRAY_LEN: { cram_codec *t = malloc(sizeof(*t)); if (!t) return -1; - t->codec = E_BYTE_ARRAY_LEN; + t->vv = c->vv; + t->codec = E_BYTE_ARRAY_LEN; t->free = cram_byte_array_len_encode_free; t->store = cram_byte_array_len_encode_store; t->encode = cram_byte_array_len_encode; @@ -2241,6 +4112,7 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { // {len,val}_{encoding,dat} are undefined, but unused. // Leaving them unset here means we can test that assertion. 
*c = *t; + free(t); break; } @@ -2257,3 +4129,10 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { return 0; } + +int cram_codec_describe(cram_codec *c, kstring_t *ks) { + if (c && c->describe) + return c->describe(c, ks); + else + return ksprintf(ks, "?"); +} diff --git a/cram/cram_codecs.h b/cram/cram_codecs.h index 31a170031..d93d9955c 100644 --- a/cram/cram_codecs.h +++ b/cram/cram_codecs.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2015, 2018 Genome Research Ltd. +Copyright (c) 2012-2015, 2018, 2020, 2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -58,6 +58,7 @@ typedef struct { typedef struct { int ncodes; cram_huffman_code *codes; + int option; } cram_huffman_decoder; #define MAX_HUFF 128 @@ -73,6 +74,46 @@ typedef struct { int32_t nbits; } cram_beta_decoder; +// A PACK transform, packing multiple values into a single byte +typedef struct { + int32_t nbits; + enum cram_encoding sub_encoding; + void *sub_codec_dat; + struct cram_codec *sub_codec; + int nval; // number of items in maps + uint32_t rmap[256]; // 0,1,2,3 -> P,A,C,K + int map[256]; // P,A,C,K -> 0,1,2,3 // NB: max input is uint8_tb? Or use hash? +} cram_xpack_decoder; +typedef cram_xpack_decoder cram_xpack_encoder; + +// Transforms symbols X,Y,Z to bytes 0,1,2. +typedef struct { + enum cram_encoding len_encoding; + enum cram_encoding lit_encoding; + void *len_dat; + void *lit_dat; + struct cram_codec *len_codec; + struct cram_codec *lit_codec; + int cur_len; + int cur_lit; + int rep_score[256]; + char *to_flush; + size_t to_flush_size; +} cram_xrle_decoder; +typedef cram_xrle_decoder cram_xrle_encoder; + +// DELTA + zigzag + varint encoding +typedef struct { + // FIXME: define endian here too. Require little endian? 
+ int64_t last; + uint8_t word_size; // 1, 2, 4, 8 + //uint8_t sign; // true if input data is already signed + enum cram_encoding sub_encoding; + void *sub_codec_dat; + struct cram_codec *sub_codec; +} cram_xdelta_decoder; +typedef cram_xdelta_decoder cram_xdelta_encoder; + typedef struct { int32_t offset; } cram_gamma_decoder; @@ -87,6 +128,12 @@ typedef struct { enum cram_external_type type; } cram_external_decoder; +typedef struct { + int32_t content_id; + int64_t offset; + enum cram_external_type type; +} cram_varint_decoder; + typedef struct { struct cram_codec *len_codec; struct cram_codec *val_codec; @@ -106,12 +153,18 @@ typedef struct { struct cram_codec *val_codec; } cram_byte_array_len_encoder; +typedef struct { + int64_t val; +} cram_const_codec; + /* * A generic codec structure. */ -typedef struct cram_codec { +struct cram_codec { enum cram_encoding codec; cram_block *out; + varint_vec *vv; + int codec_id; void (*free)(struct cram_codec *codec); int (*decode)(cram_slice *slice, struct cram_codec *codec, cram_block *in, char *out, int *out_size); @@ -119,6 +172,10 @@ typedef struct cram_codec { char *in, int in_size); int (*store)(struct cram_codec *codec, cram_block *b, char *prefix, int version); + int (*size)(cram_slice *slice, struct cram_codec *codec); + int (*flush)(struct cram_codec *codec); + cram_block *(*get_block)(cram_slice *slice, struct cram_codec *codec); + int (*describe)(struct cram_codec *codec, kstring_t *ks); union { cram_huffman_decoder huffman; @@ -128,23 +185,34 @@ typedef struct cram_codec { cram_subexp_decoder subexp; cram_byte_array_len_decoder byte_array_len; cram_byte_array_stop_decoder byte_array_stop; + cram_xpack_decoder xpack; + cram_xrle_decoder xrle; + cram_xdelta_decoder xdelta; + cram_const_codec xconst; + cram_varint_decoder varint; cram_huffman_encoder e_huffman; cram_external_decoder e_external; cram_byte_array_stop_decoder e_byte_array_stop; cram_byte_array_len_encoder e_byte_array_len; cram_beta_decoder e_beta; + 
cram_xpack_decoder e_xpack; + cram_xrle_decoder e_xrle; + cram_xdelta_decoder e_xdelta; + cram_const_codec e_xconst; + cram_varint_decoder e_varint; } u; -} cram_codec; +}; const char *cram_encoding2str(enum cram_encoding t); -cram_codec *cram_decoder_init(enum cram_encoding codec, char *data, int size, +cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr, + enum cram_encoding codec, char *data, int size, enum cram_external_type option, - int version); + int version, varint_vec *vv); cram_codec *cram_encoder_init(enum cram_encoding codec, cram_stats *st, enum cram_external_type option, void *dat, - int version); + int version, varint_vec *vv); //int cram_decode(void *codes, char *in, int in_size, char *out, int *out_size); //void cram_decoder_free(void *codes); diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 6aeb0aa37..26c7c1fac 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020 Genome Research Ltd. +Copyright (c) 2012-2020, 2022-2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -66,12 +66,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* Returns number of bytes decoded on success * -1 on failure */ -int cram_decode_TD(char *cp, const char *endp, cram_block_compression_hdr *h) { +int cram_decode_TD(cram_fd *fd, char *cp, const char *endp, + cram_block_compression_hdr *h) { char *op = cp; unsigned char *dat; cram_block *b; int32_t blk_size = 0; - int nTL, i, sz; + int nTL, i, sz, err = 0; if (!(b = cram_new_block(0, 0))) return -1; @@ -85,14 +86,14 @@ int cram_decode_TD(char *cp, const char *endp, cram_block_compression_hdr *h) { } /* Decode */ - cp += safe_itf8_get(cp, endp, &blk_size); + blk_size = fd->vv.varint_get32(&cp, endp, &err); if (!blk_size) { h->nTL = 0; cram_free_block(b); return cp - op; } - if (blk_size < 0 || endp - cp < blk_size) { + if (err || blk_size < 0 || endp - cp < blk_size) { cram_free_block(b); return -1; } @@ -143,7 +144,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, cram_block *b) { char *cp, *endp, *cp_copy; cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr)); - int i; + int i, err = 0; int32_t map_size = 0, map_count = 0; if (!hdr) @@ -160,27 +161,16 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, endp = cp + b->uncomp_size; if (CRAM_MAJOR_VERS(fd->version) == 1) { - int32_t i32; - cp += safe_itf8_get(cp, endp, &hdr->ref_seq_id); -/* - * LARGE_POS used in this code is purely a debugging mechanism for testing - * whether the htslib API can cope with 64-bit quantities. These are - * possible in SAM, but not *yet* in BAM or CRAM. - * - * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. - * - * At some point it is expected these ifdefs will become a version check - * instead. 
- */ -#ifdef LARGE_POS - cp += safe_ltf8_get(cp, endp, &hdr->ref_seq_start); - cp += safe_ltf8_get(cp, endp, &hdr->ref_seq_span); -#else - cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_start=i32; - cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_span=i32; -#endif - cp += safe_itf8_get(cp, endp, &hdr->num_records); - cp += safe_itf8_get(cp, endp, &hdr->num_landmarks); + hdr->ref_seq_id = fd->vv.varint_get32(&cp, endp, &err); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + hdr->ref_seq_start = fd->vv.varint_get64(&cp, endp, &err); + hdr->ref_seq_span = fd->vv.varint_get64(&cp, endp, &err); + } else { + hdr->ref_seq_start = fd->vv.varint_get32(&cp, endp, &err); + hdr->ref_seq_span = fd->vv.varint_get32(&cp, endp, &err); + } + hdr->num_records = fd->vv.varint_get32(&cp, endp, &err); + hdr->num_landmarks = fd->vv.varint_get32(&cp, endp, &err); if (hdr->num_landmarks < 0 || hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t) || endp - cp < hdr->num_landmarks) { @@ -191,9 +181,8 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, free(hdr); return NULL; } - for (i = 0; i < hdr->num_landmarks; i++) { - cp += safe_itf8_get(cp, endp, &hdr->landmark[i]); - } + for (i = 0; i < hdr->num_landmarks; i++) + hdr->landmark[i] = fd->vv.varint_get32(&cp, endp, &err);; } hdr->preservation_map = kh_init(map); @@ -211,11 +200,12 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, /* Initialise defaults for preservation map */ hdr->read_names_included = 0; hdr->AP_delta = 1; + hdr->qs_seq_orient = 1; memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20); /* Preservation map */ - cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp; - cp += safe_itf8_get(cp, endp, &map_count); + map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp; + map_count = fd->vv.varint_get32(&cp, endp, &err); for (i = 0; i < map_count; i++) { pmap_t hd; khint_t k; @@ -269,6 +259,18 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd 
*fd, hdr->no_ref = !hd.i; break; + case CRAM_KEY('Q','O'): + hd.i = *cp++; + k = kh_put(map, hdr->preservation_map, "QO", &r); + if (-1 == r) { + cram_free_compression_header(hdr); + return NULL; + } + + kh_val(hdr->preservation_map, k) = hd; + hdr->qs_seq_orient = hd.i; + break; + case CRAM_KEY('S','M'): if (endp - cp < 5) { cram_free_compression_header(hdr); @@ -311,7 +313,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, break; case CRAM_KEY('T','D'): { - int sz = cram_decode_TD(cp, endp, hdr); // tag dictionary + int sz = cram_decode_TD(fd, cp, endp, hdr); // tag dictionary if (sz < 0) { cram_free_compression_header(hdr); return NULL; @@ -342,8 +344,9 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } /* Record encoding map */ - cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp; - cp += safe_itf8_get(cp, endp, &map_count); + map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp; + map_count = fd->vv.varint_get32(&cp, endp, &err); + int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0; for (i = 0; i < map_count; i++) { char *key = cp; int32_t encoding = E_NULL; @@ -359,8 +362,8 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } cp += 2; - cp += safe_itf8_get(cp, endp, &encoding); - cp += safe_itf8_get(cp, endp, &size); + encoding = fd->vv.varint_get32(&cp, endp, &err); + size = fd->vv.varint_get32(&cp, endp, &err); offset = cp - (char *)b->data; @@ -394,31 +397,20 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, ds_id = DS_RL; type = E_INT; } else if (key[0] == 'A' && key[1] == 'P') { ds_id = DS_AP; -#ifdef LARGE_POS - type = E_LONG, -#else - type = E_INT; -#endif + type = is_v4 ? 
E_SLONG : E_INT; } else if (key[0] == 'R' && key[1] == 'G') { - ds_id = DS_RG; type = E_INT; + ds_id = DS_RG; + type = E_INT; } else if (key[0] == 'M' && key[1] == 'F') { ds_id = DS_MF; type = E_INT; } else if (key[0] == 'N' && key[1] == 'S') { ds_id = DS_NS; type = E_INT; } else if (key[0] == 'N' && key[1] == 'P') { ds_id = DS_NP; -#ifdef LARGE_POS - type = E_LONG, -#else - type = E_INT; -#endif + type = is_v4 ? E_LONG : E_INT; } else if (key[0] == 'T' && key[1] == 'S') { ds_id = DS_TS; -#ifdef LARGE_POS - type = E_LONG, -#else - type = E_INT; -#endif + type = is_v4 ? E_SLONG : E_INT; } else if (key[0] == 'N' && key[1] == 'F') { ds_id = DS_NF; type = E_INT; } else if (key[0] == 'T' && key[1] == 'C') { @@ -471,8 +463,8 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, key); hdr->codecs[ds_id]->free(hdr->codecs[ds_id]); } - hdr->codecs[ds_id] = cram_decoder_init(encoding, cp, size, - type, fd->version); + hdr->codecs[ds_id] = cram_decoder_init(hdr, encoding, cp, size, + type, fd->version, &fd->vv); if (!hdr->codecs[ds_id]) { cram_free_compression_header(hdr); return NULL; @@ -502,13 +494,13 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } /* Tag encoding map */ - cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp; - cp += safe_itf8_get(cp, endp, &map_count); + map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp; + map_count = fd->vv.varint_get32(&cp, endp, &err); for (i = 0; i < map_count; i++) { int32_t encoding = E_NULL; int32_t size = 0; cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc - uint8_t *key; + uint8_t key[3]; if (!m || endp - cp < 6) { free(m); @@ -516,19 +508,19 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, return NULL; } - key = (uint8_t *) cp + 1; - m->key = (key[0]<<16)|(key[1]<<8)|key[2]; - - cp += 4; // Strictly ITF8, but this suffices - cp += safe_itf8_get(cp, endp, &encoding); - cp += safe_itf8_get(cp, endp, &size); + m->key = 
fd->vv.varint_get32(&cp, endp, &err); + key[0] = m->key>>16; + key[1] = m->key>>8; + key[2] = m->key; + encoding = fd->vv.varint_get32(&cp, endp, &err); + size = fd->vv.varint_get32(&cp, endp, &err); m->encoding = encoding; m->size = size; m->offset = cp - (char *)b->data; if (size < 0 || endp - cp < size || - !(m->codec = cram_decoder_init(encoding, cp, size, - E_BYTE_ARRAY_BLOCK, fd->version))) { + !(m->codec = cram_decoder_init(hdr, encoding, cp, size, + E_BYTE_ARRAY_BLOCK, fd->version, &fd->vv))) { cram_free_compression_header(hdr); free(m); return NULL; @@ -539,7 +531,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])]; hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m; } - if (cp - cp_copy != map_size) { + if (err || cp - cp_copy != map_size) { cram_free_compression_header(hdr); return NULL; } @@ -676,6 +668,13 @@ int cram_dependent_data_series(cram_fd *fd, s->data_series |= CRAM_CF | CRAM_NF; if (s->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ)) s->data_series |= CRAM_BF | CRAM_CF | CRAM_RL; + if (s->data_series & CRAM_FN) { + // The CRAM_FN loop checks for reference length boundaries, + // which needs a working seq_pos. Some fields are fixed size + // irrespective of if we decode (BS), but others need to know + // the size of the string fetched back (SC, IN, BB). + s->data_series |= CRAM_SC | CRAM_IN | CRAM_BB; + } orig_ds = s->data_series; @@ -955,7 +954,7 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { cram_block_slice_hdr *hdr; unsigned char *cp; unsigned char *cp_end; - int i; + int i, err = 0; if (b->method != RAW) { /* Spec. 
says slice header should be RAW, but we can future-proof @@ -976,17 +975,14 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { hdr->content_type = b->content_type; if (b->content_type == MAPPED_SLICE) { - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_id); -#ifdef LARGE_POS - cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_start); - cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_span); -#else - int32_t i32; - cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32); - hdr->ref_seq_start = i32; - cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32); - hdr->ref_seq_span = i32; -#endif + hdr->ref_seq_id = fd->vv.varint_get32s((char **)&cp, (char *)cp_end, &err); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + hdr->ref_seq_start = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); + hdr->ref_seq_span = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); + } else { + hdr->ref_seq_start = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); + hdr->ref_seq_span = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); + } if (hdr->ref_seq_start < 0 || hdr->ref_seq_span < 0) { free(hdr); hts_log_error("Negative values not permitted for header " @@ -994,19 +990,15 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { return NULL; } } - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_records); + hdr->num_records = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err); hdr->record_counter = 0; if (CRAM_MAJOR_VERS(fd->version) == 2) { - int32_t i32 = 0; - cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32); - hdr->record_counter = i32; + hdr->record_counter = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { - cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->record_counter); + hdr->record_counter = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); } - - cp += safe_itf8_get((char *)cp, 
(char *)cp_end, &hdr->num_blocks); - - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_content_ids); + hdr->num_blocks = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); + hdr->num_content_ids = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); if (hdr->num_content_ids < 1 || hdr->num_content_ids >= SIZE_MAX / sizeof(int32_t)) { /* Slice must have at least one data block, @@ -1020,20 +1012,18 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { return NULL; } - for (i = 0; i < hdr->num_content_ids; i++) { - int l = safe_itf8_get((char *)cp, (char *)cp_end, - &hdr->block_content_ids[i]); - if (l <= 0) { - free(hdr->block_content_ids); - free(hdr); - return NULL; - } - cp += l; + for (i = 0; i < hdr->num_content_ids; i++) + hdr->block_content_ids[i] = fd->vv.varint_get32((char **)&cp, + (char *)cp_end, + &err); + if (err) { + free(hdr->block_content_ids); + free(hdr); + return NULL; } - if (b->content_type == MAPPED_SLICE) { - cp += safe_itf8_get((char *)cp, (char *) cp_end, &hdr->ref_base_id); - } + if (b->content_type == MAPPED_SLICE) + hdr->ref_base_id = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err); if (CRAM_MAJOR_VERS(fd->version) != 1) { if (cp_end - cp < 16) { @@ -1046,7 +1036,12 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { memset(hdr->md5, 0, 16); } - return hdr; + if (!err) + return hdr; + + free(hdr->block_content_ids); + free(hdr); + return NULL; } @@ -1113,11 +1108,18 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, uint32_t nm = 0; int32_t md_dist = 0; int orig_aux = 0; - int decode_md = s->decode_md && s->ref && !has_MD && cr->ref_id >= 0; - int decode_nm = s->decode_md && s->ref && !has_NM && cr->ref_id >= 0; + // CRAM < 4.0 decode_md is off/on + // CRAM >= 4.0 decode_md is auto/on (auto=on if MD* present, off otherwise) + int do_md = CRAM_MAJOR_VERS(fd->version) >= 4 + ? 
(s->decode_md > 0) + : (s->decode_md != 0); + int decode_md = s->ref && cr->ref_id >= 0 && ((do_md && !has_MD) || has_MD < 0); + int decode_nm = s->ref && cr->ref_id >= 0 && ((do_md && !has_NM) || has_NM < 0); uint32_t ds = s->data_series; sam_hrecs_t *bfd = sh->hrecs; + cram_codec **codecs = c->comp_hdr->codecs; + if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { memset(qual, 255, cr->len); } @@ -1127,13 +1129,14 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (decode_md) { orig_aux = BLOCK_SIZE(s->aux_blk); - BLOCK_APPEND(s->aux_blk, "MDZ", 3); + if (has_MD == 0) + BLOCK_APPEND(s->aux_blk, "MDZ", 3); } if (ds & CRAM_FN) { - if (!c->comp_hdr->codecs[DS_FN]) return -1; - r |= c->comp_hdr->codecs[DS_FN]->decode(s,c->comp_hdr->codecs[DS_FN], - blk, (char *)&fn, &out_sz); + if (!codecs[DS_FN]) return -1; + r |= codecs[DS_FN]->decode(s,codecs[DS_FN], + blk, (char *)&fn, &out_sz); if (r) return r; } else { fn = 0; @@ -1145,6 +1148,13 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (!(ds & (CRAM_FC | CRAM_FP))) goto skip_cigar; + if (fn) { + if ((ds & CRAM_FC) && !codecs[DS_FC]) + return -1; + if ((ds & CRAM_FP) && !codecs[DS_FP]) + return -1; + } + for (f = 0; f < fn; f++) { int32_t pos = 0; char op; @@ -1157,22 +1167,20 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (ds & CRAM_FC) { - if (!c->comp_hdr->codecs[DS_FC]) return -1; - r |= c->comp_hdr->codecs[DS_FC]->decode(s, - c->comp_hdr->codecs[DS_FC], - blk, - &op, &out_sz); + r |= codecs[DS_FC]->decode(s, + codecs[DS_FC], + blk, + &op, &out_sz); if (r) return r; } if (!(ds & CRAM_FP)) continue; - if (!c->comp_hdr->codecs[DS_FP]) return -1; - r |= c->comp_hdr->codecs[DS_FP]->decode(s, - c->comp_hdr->codecs[DS_FP], - blk, - (char *)&pos, &out_sz); + r |= codecs[DS_FP]->decode(s, + codecs[DS_FP], + blk, + (char *)&pos, &out_sz); if (r) return r; pos += prev_pos; @@ -1213,26 +1221,33 @@ static int 
cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, // 'N' in both ref and seq is also mismatch for NM/MD if (ref_pos + pos-seq_pos > s->ref_end) goto beyond_slice; + + const char *refp = s->ref + ref_pos - s->ref_start + 1; + const int frag_len = pos - seq_pos; + int do_cpy = 1; if (decode_md || decode_nm) { - int i; - for (i = 0; i < pos - seq_pos; i++) { - // FIXME: not N, but nt16 lookup == 15? - char base = s->ref[ref_pos - s->ref_start + 1 + i]; - if (base == 'N') { - if (add_md_char(s, decode_md, - s->ref[ref_pos - s->ref_start + 1 + i], - &md_dist) < 0) - return -1; - nm++; - } else { - md_dist++; + char *N = memchr(refp, 'N', frag_len); + if (N) { + int i; + for (i = 0; i < frag_len; i++) { + char base = refp[i]; + if (base == 'N') { + if (add_md_char(s, decode_md, + 'N', &md_dist) < 0) + return -1; + nm++; + } else { + md_dist++; + } + seq[seq_pos-1+i] = base; } - seq[seq_pos-1+i] = base; + do_cpy = 0; + } else { + md_dist += frag_len; } - } else { - memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1], - pos - seq_pos); } + if (do_cpy) + memcpy(&seq[seq_pos-1], refp, frag_len); } } #ifdef USE_X @@ -1270,12 +1285,11 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, switch (CRAM_MAJOR_VERS(fd->version)) { case 1: if (ds & CRAM_IN) { - r |= c->comp_hdr->codecs[DS_IN] - ? c->comp_hdr->codecs[DS_IN] - ->decode(s, c->comp_hdr->codecs[DS_IN], - blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz2) + r |= codecs[DS_IN] + ? codecs[DS_IN]->decode(s, codecs[DS_IN], + blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz2) : (seq[pos-1] = 'N', out_sz2 = 1, 0); have_sc = 1; } @@ -1283,22 +1297,20 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 2: default: if (ds & CRAM_SC) { - r |= c->comp_hdr->codecs[DS_SC] - ? c->comp_hdr->codecs[DS_SC] - ->decode(s, c->comp_hdr->codecs[DS_SC], - blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz2) + r |= codecs[DS_SC] + ? 
codecs[DS_SC]->decode(s, codecs[DS_SC], + blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz2) : (seq[pos-1] = 'N', out_sz2 = 1, 0); have_sc = 1; } break; //default: - // r |= c->comp_hdr->codecs[DS_BB] - // ? c->comp_hdr->codecs[DS_BB] - // ->decode(s, c->comp_hdr->codecs[DS_BB], - // blk, &seq[pos-1], &out_sz2) + // r |= codecs[DS_BB] + // ? codecs[DS_BB]->decode(s, codecs[DS_BB], + // blk, &seq[pos-1], &out_sz2) // : (seq[pos-1] = 'N', out_sz2 = 1, 0); } if (have_sc) { @@ -1318,10 +1330,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_BS) { - if (!c->comp_hdr->codecs[DS_BS]) return -1; - r |= c->comp_hdr->codecs[DS_BS] - ->decode(s, c->comp_hdr->codecs[DS_BS], blk, - (char *)&base, &out_sz); + if (!codecs[DS_BS]) return -1; + r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk, + (char *)&base, &out_sz); if (pos-1 < cr->len) seq[pos-1] = 'N'; // FIXME look up BS=base value } @@ -1333,10 +1344,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_BS) { - if (!c->comp_hdr->codecs[DS_BS]) return -1; - r |= c->comp_hdr->codecs[DS_BS] - ->decode(s, c->comp_hdr->codecs[DS_BS], blk, - (char *)&base, &out_sz); + if (!codecs[DS_BS]) return -1; + r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk, + (char *)&base, &out_sz); if (r) return -1; if (cr->ref_id < 0 || ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) { if (pos-1 < cr->len) @@ -1375,10 +1385,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_DL) { - if (!c->comp_hdr->codecs[DS_DL]) return -1; - r |= c->comp_hdr->codecs[DS_DL] - ->decode(s, c->comp_hdr->codecs[DS_DL], blk, - (char *)&i32, &out_sz); + if (!codecs[DS_DL]) return -1; + r |= codecs[DS_DL]->decode(s, codecs[DS_DL], blk, + (char *)&i32, &out_sz); if (r) return r; if (decode_md || decode_nm) { if (ref_pos + i32 > s->ref_end) @@ -1430,11 +1439,10 @@ static int cram_decode_seq(cram_fd *fd, 
cram_container *c, cram_slice *s, } if (ds & CRAM_IN) { - if (!c->comp_hdr->codecs[DS_IN]) return -1; - r |= c->comp_hdr->codecs[DS_IN] - ->decode(s, c->comp_hdr->codecs[DS_IN], blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz2); + if (!codecs[DS_IN]) return -1; + r |= codecs[DS_IN]->decode(s, codecs[DS_IN], blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz2); if (r) return r; cig_op = BAM_CINS; cig_len += out_sz2; @@ -1451,11 +1459,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_BA) { - if (!c->comp_hdr->codecs[DS_BA]) return -1; - r |= c->comp_hdr->codecs[DS_BA] - ->decode(s, c->comp_hdr->codecs[DS_BA], blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz); + if (!codecs[DS_BA]) return -1; + r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz); if (r) return r; } cig_op = BAM_CINS; @@ -1474,11 +1481,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (ds & CRAM_BB) { - if (!c->comp_hdr->codecs[DS_BB]) return -1; - r |= c->comp_hdr->codecs[DS_BB] - ->decode(s, c->comp_hdr->codecs[DS_BB], blk, - cr->len ? &seq[pos-1] : NULL, - &len); + if (!codecs[DS_BB]) return -1; + r |= codecs[DS_BB]->decode(s, codecs[DS_BB], blk, + cr->len ? &seq[pos-1] : NULL, + &len); if (r) return r; if (decode_md || decode_nm) { @@ -1525,13 +1531,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (ds & CRAM_QQ) { - if (!c->comp_hdr->codecs[DS_QQ]) return -1; + if (!codecs[DS_QQ]) return -1; if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ? 
- r |= c->comp_hdr->codecs[DS_QQ] - ->decode(s, c->comp_hdr->codecs[DS_QQ], blk, - (char *)&qual[pos-1], &len); + r |= codecs[DS_QQ]->decode(s, codecs[DS_QQ], blk, + (char *)&qual[pos-1], &len); if (r) return r; } @@ -1554,11 +1559,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } #endif if (ds & CRAM_BA) { - if (!c->comp_hdr->codecs[DS_BA]) return -1; - r |= c->comp_hdr->codecs[DS_BA] - ->decode(s, c->comp_hdr->codecs[DS_BA], blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz); + if (!codecs[DS_BA]) return -1; + r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz); if (decode_md || decode_nm) { if (md_dist >= 0 && decode_md) @@ -1578,13 +1582,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } } if (ds & CRAM_QS) { - if (!c->comp_hdr->codecs[DS_QS]) return -1; + if (!codecs[DS_QS]) return -1; if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ASCII ?. Same as htsjdk - r |= c->comp_hdr->codecs[DS_QS] - ->decode(s, c->comp_hdr->codecs[DS_QS], blk, - (char *)&qual[pos-1], &out_sz); + r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, + (char *)&qual[pos-1], &out_sz); } #ifdef USE_X cig_op = BAM_CBASE_MISMATCH; @@ -1600,13 +1603,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'Q': { // Quality score; QS if (ds & CRAM_QS) { - if (!c->comp_hdr->codecs[DS_QS]) return -1; + if (!codecs[DS_QS]) return -1; if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ? 
- r |= c->comp_hdr->codecs[DS_QS] - ->decode(s, c->comp_hdr->codecs[DS_QS], blk, - (char *)&qual[pos-1], &out_sz); + r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, + (char *)&qual[pos-1], &out_sz); //printf(" %d: QS = %d (ret %d)\n", f, qc, r); } break; @@ -1618,10 +1620,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_HC) { - if (!c->comp_hdr->codecs[DS_HC]) return -1; - r |= c->comp_hdr->codecs[DS_HC] - ->decode(s, c->comp_hdr->codecs[DS_HC], blk, - (char *)&i32, &out_sz); + if (!codecs[DS_HC]) return -1; + r |= codecs[DS_HC]->decode(s, codecs[DS_HC], blk, + (char *)&i32, &out_sz); if (r) return r; cig_op = BAM_CHARD_CLIP; cig_len += i32; @@ -1635,10 +1636,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_PD) { - if (!c->comp_hdr->codecs[DS_PD]) return -1; - r |= c->comp_hdr->codecs[DS_PD] - ->decode(s, c->comp_hdr->codecs[DS_PD], blk, - (char *)&i32, &out_sz); + if (!codecs[DS_PD]) return -1; + r |= codecs[DS_PD]->decode(s, codecs[DS_PD], blk, + (char *)&i32, &out_sz); if (r) return r; cig_op = BAM_CPAD; cig_len += i32; @@ -1652,10 +1652,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_RS) { - if (!c->comp_hdr->codecs[DS_RS]) return -1; - r |= c->comp_hdr->codecs[DS_RS] - ->decode(s, c->comp_hdr->codecs[DS_RS], blk, - (char *)&i32, &out_sz); + if (!codecs[DS_RS]) return -1; + r |= codecs[DS_RS]->decode(s, codecs[DS_RS], blk, + (char *)&i32, &out_sz); if (r) return r; cig_op = BAM_CREF_SKIP; cig_len += i32; @@ -1702,31 +1701,32 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (cr->len - seq_pos + 1 > 0) { if (ref_pos + cr->len-seq_pos +1 > s->ref_end) goto beyond_slice; + int remainder = cr->len - (seq_pos-1); + int j = ref_pos - s->ref_start + 1; if (decode_md || decode_nm) { - int i, j = ref_pos - s->ref_start + 1; - // FIXME: Update this to match spec 
once we're also - // ready to update samtools calmd. (N vs any ambig) - if (memchr(&s->ref[j], 'N', cr->len - (seq_pos-1))) { - for (i = seq_pos-1, j -= i; i < cr->len; i++) { - char base = s->ref[j+i]; + int i; + char *N = memchr(&s->ref[j], 'N', remainder); + if (!N) { + // short cut the common case + md_dist += cr->len - (seq_pos-1); + } else { + char *refp = &s->ref[j-(seq_pos-1)]; + md_dist += N-&s->ref[j]; + int i_start = seq_pos-1 + (N - &s->ref[j]); + for (i = i_start; i < cr->len; i++) { + char base = refp[i]; if (base == 'N') { - if (add_md_char(s, decode_md, 'N', &md_dist) < 0) + if (add_md_char(s, decode_md, 'N', + &md_dist) < 0) return -1; nm++; } else { md_dist++; } - seq[i] = base; } - } else { - // faster than above code - memcpy(&seq[seq_pos-1], &s->ref[j], cr->len - (seq_pos-1)); - md_dist += cr->len - (seq_pos-1); } - } else { - memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1], - cr->len - (seq_pos-1)); } + memcpy(&seq[seq_pos-1], &s->ref[j], remainder); } ref_pos += cr->len - seq_pos + 1; } @@ -1776,15 +1776,14 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } cr->ncigar = ncigar - cr->cigar; - cr->aend = ref_pos; + cr->aend = ref_pos > cr->apos ? ref_pos : cr->apos; //printf("2: %.*s %d .. 
%d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos); if (ds & CRAM_MQ) { - if (!c->comp_hdr->codecs[DS_MQ]) return -1; - r |= c->comp_hdr->codecs[DS_MQ] - ->decode(s, c->comp_hdr->codecs[DS_MQ], blk, - (char *)&cr->mqual, &out_sz); + if (!codecs[DS_MQ]) return -1; + r |= codecs[DS_MQ]->decode(s, codecs[DS_MQ], blk, + (char *)&cr->mqual, &out_sz); } else { cr->mqual = 40; } @@ -1792,10 +1791,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { int32_t out_sz2 = cr->len; - if (!c->comp_hdr->codecs[DS_QS]) return -1; - r |= c->comp_hdr->codecs[DS_QS] - ->decode(s, c->comp_hdr->codecs[DS_QS], blk, - qual, &out_sz2); + if (!codecs[DS_QS]) return -1; + r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, + qual, &out_sz2); } s->cigar = cigar; @@ -1807,32 +1805,67 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (decode_md) { BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z: - cr->aux_size += BLOCK_SIZE(s->aux_blk) - orig_aux; + size_t sz = BLOCK_SIZE(s->aux_blk) - orig_aux; + if (has_MD < 0) { + // has_MD < 0; already have MDZ allocated in aux at -has_MD, + // but wrote MD to end of aux (at orig_aux). + // We need some memmoves to shuffle it around. + char tmp_MD_[1024], *tmp_MD = tmp_MD_; + unsigned char *orig_aux_p = BLOCK_DATA(s->aux_blk) + orig_aux; + if (sz > 1024) { + tmp_MD = malloc(sz); + if (!tmp_MD) + return -1; + } + memcpy(tmp_MD, orig_aux_p, sz); + memmove(&BLOCK_DATA(s->aux_blk)[-has_MD] + sz, + &BLOCK_DATA(s->aux_blk)[-has_MD], + orig_aux_p - &BLOCK_DATA(s->aux_blk)[-has_MD]); + memcpy(&BLOCK_DATA(s->aux_blk)[-has_MD], tmp_MD, sz); + if (tmp_MD != tmp_MD_) + free(tmp_MD); + + if (-has_NM > -has_MD) + // we inserted before NM, so move it up a bit + has_NM -= sz; + } + // else has_MD == 0 and we've already appended MD to the end. 
+ + cr->aux_size += sz; } if (decode_nm) { - char buf[7]; - size_t buf_size; - buf[0] = 'N'; buf[1] = 'M'; - if (nm <= UINT8_MAX) { - buf_size = 4; - buf[2] = 'C'; - buf[3] = (nm>> 0) & 0xff; - } else if (nm <= UINT16_MAX) { - buf_size = 5; - buf[2] = 'S'; - buf[3] = (nm>> 0) & 0xff; - buf[4] = (nm>> 8) & 0xff; + if (has_NM == 0) { + char buf[7]; + size_t buf_size; + buf[0] = 'N'; buf[1] = 'M'; + if (nm <= UINT8_MAX) { + buf_size = 4; + buf[2] = 'C'; + buf[3] = (nm>> 0) & 0xff; + } else if (nm <= UINT16_MAX) { + buf_size = 5; + buf[2] = 'S'; + buf[3] = (nm>> 0) & 0xff; + buf[4] = (nm>> 8) & 0xff; + } else { + buf_size = 7; + buf[2] = 'I'; + buf[3] = (nm>> 0) & 0xff; + buf[4] = (nm>> 8) & 0xff; + buf[5] = (nm>>16) & 0xff; + buf[6] = (nm>>24) & 0xff; + } + BLOCK_APPEND(s->aux_blk, buf, buf_size); + cr->aux_size += buf_size; } else { - buf_size = 7; - buf[2] = 'I'; - buf[3] = (nm>> 0) & 0xff; - buf[4] = (nm>> 8) & 0xff; - buf[5] = (nm>>16) & 0xff; - buf[6] = (nm>>24) & 0xff; + // Preallocated space for NM at -has_NM into aux block + unsigned char *buf = BLOCK_DATA(s->aux_blk) + -has_NM; + buf[0] = (nm>> 0) & 0xff; + buf[1] = (nm>> 8) & 0xff; + buf[2] = (nm>>16) & 0xff; + buf[3] = (nm>>24) & 0xff; } - BLOCK_APPEND(s->aux_blk, buf, buf_size); - cr->aux_size += buf_size; } return r; @@ -1914,7 +1947,11 @@ static int cram_decode_aux_1_0(cram_container *c, cram_slice *s, return -1; } -static int cram_decode_aux(cram_container *c, cram_slice *s, +// has_MD and has_NM are filled out with 0 for none present, +// 1 for present and verbatim, and -pos for present as placeholder +// (MD*, NM*) to be generated and filled out at offset +pos. 
+static int cram_decode_aux(cram_fd *fd, + cram_container *c, cram_slice *s, cram_block *blk, cram_record *cr, int *has_MD, int *has_NM) { int i, r = 0, out_sz = 1; @@ -1946,29 +1983,89 @@ static int cram_decode_aux(cram_container *c, cram_slice *s, for (i = 0; i < cr->ntags; i++) { int32_t id, out_sz = 1; - unsigned char tag_data[3]; + unsigned char tag_data[7]; cram_map *m; if (TN[0] == 'M' && TN[1] == 'D' && has_MD) - *has_MD = 1; + *has_MD = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1); if (TN[0] == 'N' && TN[1] == 'M' && has_NM) - *has_NM = 1; + *has_NM = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);; //printf("Tag %d/%d\n", i+1, cr->ntags); - tag_data[0] = *TN++; - tag_data[1] = *TN++; - tag_data[2] = *TN++; + tag_data[0] = TN[0]; + tag_data[1] = TN[1]; + tag_data[2] = TN[2]; id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2]; - m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id); - if (!m) - return -1; - BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); + if (CRAM_MAJOR_VERS(fd->version) >= 4 && TN[2] == '*') { + // Place holder, fill out contents later. + int tag_data_size; + if (TN[0] == 'N' && TN[1] == 'M') { + // Use a fixed size, so we can allocate room for it now. + memcpy(&tag_data[2], "I\0\0\0\0", 5); + tag_data_size = 7; + } else if (TN[0] == 'R' && TN[1] == 'G') { + // RG is variable size, but known already. Insert now + TN += 3; + // Equiv to fd->header->hrecs->rg[cr->rg], but this is the + // new header API equivalent. 
+ const char *rg = sam_hdr_line_name(fd->header, "RG", cr->rg); + if (!rg) + continue; - if (!m->codec) return -1; - r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz); - if (r) break; - cr->aux_size += out_sz + 3; + size_t rg_len = strlen(rg); + tag_data[2] = 'Z'; + BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); + BLOCK_APPEND(s->aux_blk, rg, rg_len); + BLOCK_APPEND_CHAR(s->aux_blk, '\0'); + cr->aux_size += 3 + rg_len + 1; + cr->rg = -1; // prevents auto-add later + continue; + } else { + // Unknown size. We'll insert MD into stream later. + tag_data[2] = 'Z'; + tag_data_size = 3; + } + BLOCK_APPEND(s->aux_blk, (char *)tag_data, tag_data_size); + cr->aux_size += tag_data_size; + TN += 3; + } else { + TN += 3; + m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id); + if (!m) + return -1; + + BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); + + if (!m->codec) return -1; + r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz); + if (r) break; + cr->aux_size += out_sz + 3; + + // cF CRAM flags. + if (TN[-3]=='c' && TN[-2]=='F' && TN[-1]=='C' && out_sz == 1) { + // Remove cF tag + uint8_t cF = BLOCK_END(s->aux_blk)[-1]; + BLOCK_SIZE(s->aux_blk) -= out_sz+3; + cr->aux_size -= out_sz+3; + + // bit 1 => don't auto-decode MD. + // Pretend MD is present verbatim, so we don't auto-generate + if ((cF & 1) && has_MD && *has_MD == 0) + *has_MD = 1; + + // bit 1 => don't auto-decode NM + if ((cF & 2) && has_NM && *has_NM == 0) + *has_NM = 1; + } + } + + // We could go to 2^32 fine, but we shouldn't be hitting this anyway, + // and it's protecting against memory hogs too. + if (BLOCK_SIZE(s->aux_blk) > (1u<<31)) { + hts_log_error("CRAM->BAM aux block size overflow"); + goto block_err; + } } return r; @@ -2011,7 +2108,7 @@ static int cram_decode_slice_xref(cram_slice *s, int required_fields) { * Or do we just admit defeat and output 0 for tlen? It's the * safe option... 
*/ - if (cr->tlen == INT_MIN) { + if (cr->tlen == INT64_MIN) { int id1 = rec, id2 = rec; int64_t aleft = cr->apos, aright = cr->aend; int64_t tlen; @@ -2120,9 +2217,16 @@ static int cram_decode_slice_xref(cram_slice *s, int required_fields) { cr->mate_ref_id = -1; } - if (cr->tlen == INT_MIN) - cr->tlen = 0; // Just in case + if (cr->tlen == INT64_MIN) + cr->tlen = 0; // Just incase } + + for (rec = 0; rec < s->hdr->num_records; rec++) { + cram_record *cr = &s->crecs[rec]; + if (cr->explicit_tlen != INT64_MIN) + cr->tlen = cr->explicit_tlen; + } + return 0; } @@ -2137,6 +2241,31 @@ static char *md5_print(unsigned char *md5, char *out) { return out; } +/* + * Utility function to decode tlen (ISIZE), as it's called + * in multiple places. + * + * Returns codec return value (0 on success). + */ +static int cram_decode_tlen(cram_fd *fd, cram_container *c, cram_slice *s, + cram_block *blk, int64_t *tlen) { + int out_sz = 1, r = 0; + + if (!c->comp_hdr->codecs[DS_TS]) return -1; + if (CRAM_MAJOR_VERS(fd->version) < 4) { + int32_t i32; + r |= c->comp_hdr->codecs[DS_TS] + ->decode(s, c->comp_hdr->codecs[DS_TS], blk, + (char *)&i32, &out_sz); + *tlen = i32; + } else { + r |= c->comp_hdr->codecs[DS_TS] + ->decode(s, c->comp_hdr->codecs[DS_TS], blk, + (char *)tlen, &out_sz); + } + return r; +} + /* * Decode an entire slice from container blocks. Fills out s->crecs[] array. * Returns 0 on success @@ -2199,7 +2328,10 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, return -1; ref_id = s->hdr->ref_seq_id; - embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0; + if (CRAM_MAJOR_VERS(fd->version) < 4) + embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0; + else + embed_ref = s->hdr->ref_base_id > 0 ? 
1 : 0; if (ref_id >= 0) { if (embed_ref) { @@ -2310,13 +2442,21 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, } } - if ((!s->ref && s->hdr->ref_base_id < 0) - || memcmp(digest, s->hdr->md5, 16) != 0) { + if (!c->comp_hdr->no_ref && + ((!s->ref && s->hdr->ref_base_id < 0) + || memcmp(digest, s->hdr->md5, 16) != 0)) { char M[33]; - hts_log_error("MD5 checksum reference mismatch at #%d:%d-%d", - ref_id, s->ref_start, s->ref_end); - hts_log_error("CRAM: %s", md5_print(s->hdr->md5, M)); - hts_log_error("Ref : %s", md5_print(digest, M)); + const char *rname = sam_hdr_tid2name(sh, ref_id); + if (!rname) rname="?"; // cannot happen normally + hts_log_error("MD5 checksum reference mismatch at %s:%d-%d", + rname, s->ref_start, s->ref_end); + hts_log_error("CRAM : %s", md5_print(s->hdr->md5, M)); + hts_log_error("Ref : %s", md5_print(digest, M)); + kstring_t ks = KS_INITIALIZE; + if (sam_hdr_find_tag_id(sh, "SQ", "SN", rname, "M5", &ks) == 0) + hts_log_error("@SQ M5: %s", ks.s); + hts_log_error("Please check the reference given is correct"); + ks_free(&ks); return -1; } } @@ -2454,20 +2594,28 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (ds & CRAM_AP) { if (!c->comp_hdr->codecs[DS_AP]) goto block_err; -#ifdef LARGE_POS - r |= c->comp_hdr->codecs[DS_AP] - ->decode(s, c->comp_hdr->codecs[DS_AP], blk, - (char *)&cr->apos, &out_sz); -#else - int32_t i32; - r |= c->comp_hdr->codecs[DS_AP] - ->decode(s, c->comp_hdr->codecs[DS_AP], blk, - (char *)&i32, &out_sz); - cr->apos = i32; -#endif - if (r) goto block_err; - if (c->comp_hdr->AP_delta) + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + r |= c->comp_hdr->codecs[DS_AP] + ->decode(s, c->comp_hdr->codecs[DS_AP], blk, + (char *)&cr->apos, &out_sz); + } else { + int32_t i32; + r |= c->comp_hdr->codecs[DS_AP] + ->decode(s, c->comp_hdr->codecs[DS_AP], blk, + (char *)&i32, &out_sz); + cr->apos = i32; + } + if (r) goto block_err;; + if (c->comp_hdr->AP_delta) { + if (cr->apos < 0 && 
c->unsorted == 0) { + // cache locally in c->unsorted so we don't have an + // excessive number of locks + pthread_mutex_lock(&fd->ref_lock); + c->unsorted = fd->unsorted = 1; + pthread_mutex_unlock(&fd->ref_lock); + } cr->apos += s->last_apos; + } s->last_apos= cr->apos; } else { cr->apos = c->ref_seq_start; @@ -2505,6 +2653,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, cr->mate_pos = 0; cr->mate_line = -1; cr->mate_ref_id = -1; + cr->explicit_tlen = INT64_MIN; if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) { if (ds & CRAM_MF) { if (CRAM_MAJOR_VERS(fd->version) == 1) { @@ -2560,39 +2709,30 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, // } if (ds & CRAM_NP) { - if (!c->comp_hdr->codecs[DS_NP]) goto block_err; -#ifdef LARGE_POS - r |= c->comp_hdr->codecs[DS_NP] - ->decode(s, c->comp_hdr->codecs[DS_NP], blk, - (char *)&cr->mate_pos, &out_sz); -#else - int32_t i32; - r |= c->comp_hdr->codecs[DS_NP] - ->decode(s, c->comp_hdr->codecs[DS_NP], blk, - (char *)&i32, &out_sz); - cr->mate_pos = i32; -#endif + if (!c->comp_hdr->codecs[DS_NP]) goto block_err;; + if (CRAM_MAJOR_VERS(fd->version) < 4) { + int32_t i32; + r |= c->comp_hdr->codecs[DS_NP] + ->decode(s, c->comp_hdr->codecs[DS_NP], blk, + (char *)&i32, &out_sz); + cr->mate_pos = i32; + } else { + r |= c->comp_hdr->codecs[DS_NP] + ->decode(s, c->comp_hdr->codecs[DS_NP], blk, + (char *)&cr->mate_pos, &out_sz); + } if (r) goto block_err; } if (ds & CRAM_TS) { if (!c->comp_hdr->codecs[DS_TS]) goto block_err; -#ifdef LARGE_POS - r |= c->comp_hdr->codecs[DS_TS] - ->decode(s, c->comp_hdr->codecs[DS_TS], blk, - (char *)&cr->tlen, &out_sz); -#else - int32_t i32; - r |= c->comp_hdr->codecs[DS_TS] - ->decode(s, c->comp_hdr->codecs[DS_TS], blk, - (char *)&i32, &out_sz); - cr->tlen = i32; -#endif + r = cram_decode_tlen(fd, c, s, blk, &cr->tlen); if (r) goto block_err; } else { - cr->tlen = INT_MIN; + cr->tlen = INT64_MIN; } } else if ((ds & CRAM_CF) && (cf & 
CRAM_FLAG_MATE_DOWNSTREAM)) { + // else not detached if (ds & CRAM_NF) { if (!c->comp_hdr->codecs[DS_NF]) goto block_err; r |= c->comp_hdr->codecs[DS_NF] @@ -2606,15 +2746,32 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, //dstring_nappend(name_ds, name, cr->name_len); cr->mate_ref_id = -1; - cr->tlen = INT_MIN; + cr->tlen = INT64_MIN; cr->mate_pos = 0; } else { cr->mate_flags = 0; - cr->tlen = INT_MIN; + cr->tlen = INT64_MIN; + } + if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) { + if (ds & CRAM_TS) { + r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen); + if (r) return r; + } else { + cr->mate_flags = 0; + cr->tlen = INT64_MIN; + } + } + } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) { + if (ds & CRAM_TS) { + r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen); + if (r) return r; + } else { + cr->mate_flags = 0; + cr->tlen = INT64_MIN; } } else { cr->mate_flags = 0; - cr->tlen = INT_MIN; + cr->tlen = INT64_MIN; } /* else if (!name[0]) { @@ -2634,7 +2791,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (CRAM_MAJOR_VERS(fd->version) == 1) r |= cram_decode_aux_1_0(c, s, blk, cr); else - r |= cram_decode_aux(c, s, blk, cr, &has_MD, &has_NM); + r |= cram_decode_aux(fd, c, s, blk, cr, &has_MD, &has_NM); if (r) goto block_err; /* Fake up dynamic string growth and appending */ @@ -2705,6 +2862,16 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, memset(qual, 255, cr->len); } } + + if (!c->comp_hdr->qs_seq_orient && (ds & CRAM_QS) && (cr->flags & BAM_FREVERSE)) { + int i, j; + for (i = 0, j = cr->len-1; i < j; i++, j--) { + unsigned char c; + c = qual[i]; + qual[i] = qual[j]; + qual[j] = c; + } + } } pthread_mutex_lock(&fd->ref_lock); @@ -2839,10 +3006,10 @@ int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s, */ static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, cram_record *cr, int rec, bam_seq_t **bam) { - int bam_idx, rg_len; + int 
ret, rg_len; char name_a[1024], *name; int name_len; - char *aux, *aux_orig; + char *aux; char *seq, *qual; sam_hrecs_t *bfd = sh->hrecs; @@ -2853,18 +3020,28 @@ static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, name_len = cr->name_len; } else { name = name_a; - name_len = strlen(fd->prefix); - memcpy(name, fd->prefix, name_len); - name += name_len; - *name++ = ':'; - if (cr->mate_line >= 0 && cr->mate_line < rec) - name = (char *)append_uint64((unsigned char *)name, - s->hdr->record_counter + - cr->mate_line + 1); - else - name = (char *)append_uint64((unsigned char *)name, - s->hdr->record_counter + - rec + 1); + if (cr->mate_line >= 0 && cr->mate_line < s->max_rec && + s->crecs[cr->mate_line].name_len > 0) { + // Copy our mate if non-zero. + memcpy(name_a, BLOCK_DATA(s->name_blk)+s->crecs[cr->mate_line].name, + s->crecs[cr->mate_line].name_len); + name = name_a + s->crecs[cr->mate_line].name_len; + } else { + // Otherwise generate a name based on prefix + name_len = strlen(fd->prefix); + memcpy(name, fd->prefix, name_len); + name += name_len; + *name++ = ':'; + if (cr->mate_line >= 0 && cr->mate_line < rec) { + name = (char *)append_uint64((unsigned char *)name, + s->hdr->record_counter + + cr->mate_line + 1); + } else { + name = (char *)append_uint64((unsigned char *)name, + s->hdr->record_counter + + rec + 1); + } + } name_len = name - name_a; name = name_a; } @@ -2887,7 +3064,6 @@ static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, cr->len = 0; } - if (fd->required_fields & SAM_QUAL) { if (!BLOCK_DATA(s->qual_blk)) return -1; @@ -2896,41 +3072,37 @@ static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, qual = NULL; } - bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len, - name, name_len, - cr->flags, - cr->ref_id, - cr->apos, - cr->aend, - cr->mqual, - cr->ncigar, &s->cigar[cr->cigar], - cr->mate_ref_id, - cr->mate_pos, - cr->tlen, - cr->len, - seq, - qual); - if (bam_idx == -1) - return -1; + ret = bam_set1(*bam, 
+ name_len, name, + cr->flags, cr->ref_id, cr->apos - 1, cr->mqual, + cr->ncigar, &s->cigar[cr->cigar], + cr->mate_ref_id, cr->mate_pos - 1, cr->tlen, + cr->len, seq, qual, + cr->aux_size + rg_len); + if (ret < 0) { + return ret; + } - aux = aux_orig = (char *)bam_aux(*bam); + aux = (char *)bam_aux(*bam); /* Auxiliary strings */ if (cr->aux_size != 0) { memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size); aux += cr->aux_size; + (*bam)->l_data += cr->aux_size; } /* RG:Z: */ - if (cr->rg != -1) { - int len = bfd->rg[cr->rg].name_len; + if (rg_len > 0) { *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z'; + int len = bfd->rg[cr->rg].name_len; memcpy(aux, bfd->rg[cr->rg].name, len); aux += len; *aux++ = 0; + (*bam)->l_data += rg_len; } - return bam_idx + (aux - aux_orig); + return (*bam)->l_data; } /* @@ -3086,7 +3258,8 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { } // position beyond end of range; bail out - if (c_next->ref_seq_start > fd->range.end) { + if (fd->range.refid != -1 && + c_next->ref_seq_start > fd->range.end) { cram_free_container(c_next); fd->ctr_mt = NULL; fd->ooc = 1; @@ -3094,7 +3267,8 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { } // before start of range; skip to next container - if (c_next->ref_seq_start + c_next->ref_seq_span-1 < + if (fd->range.refid != -1 && + c_next->ref_seq_start + c_next->ref_seq_span-1 < fd->range.start) { c_next->curr_slice_mt = c_next->max_slice; cram_seek(fd, c_next->length, SEEK_CUR); @@ -3159,7 +3333,8 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { } // position beyond end of range; bail out - if (s_next->hdr->ref_seq_start > fd->range.end) { + if (fd->range.refid != -1 && + s_next->hdr->ref_seq_start > fd->range.end) { fd->ooc = 1; cram_free_slice(s_next); c_next->slice = s_next = NULL; @@ -3167,7 +3342,8 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { } // before start of range; skip to next slice - if 
(s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 < + if (fd->range.refid != -1 && + s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 < fd->range.start) { cram_free_slice(s_next); c_next->slice = s_next = NULL; diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 66f27eebf..9797fa7a8 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020 Genome Research Ltd. +Copyright (c) 2012-2020, 2022-2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -42,12 +42,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include #include "cram.h" #include "os.h" #include "../sam_internal.h" // for nibble2base #include "../htslib/hts.h" #include "../htslib/hts_endian.h" +#include "../textutils_internal.h" KHASH_MAP_INIT_STR(m_s2u64, uint64_t) @@ -58,7 +60,8 @@ KHASH_MAP_INIT_STR(m_s2u64, uint64_t) static int process_one_read(cram_fd *fd, cram_container *c, cram_slice *s, cram_record *cr, - bam_seq_t *b, int rnum, kstring_t *MD); + bam_seq_t *b, int rnum, kstring_t *MD, + int embed_ref, int no_ref); /* * Returns index of val into key. 
@@ -67,7 +70,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, static int sub_idx(char *key, char val) { int i; - for (i = 0; *key && *key++ != val; i++); + for (i = 0; i < 4 && *key++ != val; i++); return i; } @@ -78,11 +81,14 @@ static int sub_idx(char *key, char val) { * NULL on failure */ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, - cram_block_compression_hdr *h) { + cram_block_compression_hdr *h, + int embed_ref) { cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0); cram_block *map = cram_new_block(COMPRESSION_HEADER, 0); int i, mc, r = 0; + int no_ref = c->no_ref; + if (!cb || !map) return NULL; @@ -95,27 +101,11 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, * the total size (stored as a variable length string). */ -/* - * LARGE_POS used in this code is purely a debugging mechanism for testing - * whether the htslib API can cope with 64-bit quantities. These are - * possible in SAM, but not *yet* in BAM or CRAM. - * - * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. - * - * At some point it is expected these ifdefs will become a version check - * instead. 
- */ - // Duplicated from container itself, and removed in 1.1 if (CRAM_MAJOR_VERS(fd->version) == 1) { r |= itf8_put_blk(cb, h->ref_seq_id); -#ifdef LARGE_POS - r |= ltf8_put_blk(cb, h->ref_seq_start); - r |= ltf8_put_blk(cb, h->ref_seq_span); -#else r |= itf8_put_blk(cb, h->ref_seq_start); r |= itf8_put_blk(cb, h->ref_seq_span); -#endif r |= itf8_put_blk(cb, h->num_records); r |= itf8_put_blk(cb, h->num_landmarks); for (i = 0; i < h->num_landmarks; i++) { @@ -123,12 +113,14 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, } } - if (h->preservation_map) + if (h->preservation_map) { kh_destroy(map, h->preservation_map); + h->preservation_map = NULL; + } /* Create in-memory preservation map */ /* FIXME: should create this when we create the container */ - { + if (c->num_records > 0) { khint_t k; int r; @@ -166,7 +158,13 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, if (-1 == r) return NULL; kh_val(h->preservation_map, k).i = h->AP_delta; - if (fd->no_ref || fd->embed_ref) { + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + k = kh_put(map, h->preservation_map, "QO", &r); + if (-1 == r) return NULL; + kh_val(h->preservation_map, k).i = h->qs_seq_orient; + } + + if (no_ref || embed_ref>0) { // Reference Required == No k = kh_put(map, h->preservation_map, "RR", &r); if (-1 == r) return NULL; @@ -196,62 +194,55 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, switch(CRAM_KEY(key[0], key[1])) { case CRAM_KEY('M','I'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('U','I'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('P','I'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('A','P'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('R','N'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('R','R'): + case CRAM_KEY('Q','O'): BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); break; 
case CRAM_KEY('S','M'): { char smat[5], *mp = smat; + // Output format is for order ACGTN (minus ref base) + // to store the code value 0-3 for each symbol. + // + // Note this is different to storing the symbols in order + // that the codes occur from 0-3, which is what we used to + // do. (It didn't matter as we always had a fixed table in + // the order.) *mp++ = - (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) | - (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) | - (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) | - (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0); + (sub_idx(h->substitution_matrix[0], 'C') << 6) | + (sub_idx(h->substitution_matrix[0], 'G') << 4) | + (sub_idx(h->substitution_matrix[0], 'T') << 2) | + (sub_idx(h->substitution_matrix[0], 'N') << 0); *mp++ = - (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) | - (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) | - (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) | - (sub_idx("AGTN", h->substitution_matrix[1][3]) << 0); + (sub_idx(h->substitution_matrix[1], 'A') << 6) | + (sub_idx(h->substitution_matrix[1], 'G') << 4) | + (sub_idx(h->substitution_matrix[1], 'T') << 2) | + (sub_idx(h->substitution_matrix[1], 'N') << 0); *mp++ = - (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) | - (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) | - (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) | - (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0); + (sub_idx(h->substitution_matrix[2], 'A') << 6) | + (sub_idx(h->substitution_matrix[2], 'C') << 4) | + (sub_idx(h->substitution_matrix[2], 'T') << 2) | + (sub_idx(h->substitution_matrix[2], 'N') << 0); *mp++ = - (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) | - (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) | - (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) | - (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0); + (sub_idx(h->substitution_matrix[3], 'A') << 6) | + (sub_idx(h->substitution_matrix[3], 
'C') << 4) | + (sub_idx(h->substitution_matrix[3], 'G') << 2) | + (sub_idx(h->substitution_matrix[3], 'N') << 0); *mp++ = - (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) | - (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) | - (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) | - (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0); + (sub_idx(h->substitution_matrix[4], 'A') << 6) | + (sub_idx(h->substitution_matrix[4], 'C') << 4) | + (sub_idx(h->substitution_matrix[4], 'G') << 2) | + (sub_idx(h->substitution_matrix[4], 'T') << 0); BLOCK_APPEND(map, smat, 5); break; } case CRAM_KEY('T','D'): { - r |= itf8_put_blk(map, BLOCK_SIZE(h->TD_blk)); + r |= (fd->vv.varint_put32_blk(map, BLOCK_SIZE(h->TD_blk)) <= 0); BLOCK_APPEND(map, BLOCK_DATA(h->TD_blk), BLOCK_SIZE(h->TD_blk)); @@ -266,8 +257,8 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, mc++; } } - r |= itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); - r |= itf8_put_blk(cb, mc); + r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0); + r |= (fd->vv.varint_put32_blk(cb, mc) <= 0); BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); /* rec encoding map */ @@ -467,33 +458,11 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, return NULL; mc++; } - r |= itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); - r |= itf8_put_blk(cb, mc); + r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0); + r |= (fd->vv.varint_put32_blk(cb, mc) <= 0); BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); /* tag encoding map */ -#if 0 - mp = map; mc = 0; - if (h->tag_encoding_map) { - HashItem *hi; - HashIter *iter = HashTableIterCreate(); - if (!iter) - return NULL; - - while ((hi = HashTableIterNext(h->tag_encoding_map, iter))) { - cram_map *m = hi->data.p; - int sz; - - mp += itf8_put(mp, (hi->key[0]<<16)|(hi->key[1]<<8)|hi->key[2]); - if (-1 == (sz = m->codec->store(m->codec, mp, NULL, fd->version))) - return 
NULL; - mp += sz; - mc++; - } - - HashTableIterDestroy(iter); - } -#else mc = 0; BLOCK_SIZE(map) = 0; if (c->tags_used) { @@ -507,16 +476,16 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, key = kh_key(c->tags_used, k); cram_codec *cd = kh_val(c->tags_used, k)->codec; - r |= itf8_put_blk(map, key); + r |= (fd->vv.varint_put32_blk(map, key) <= 0); if (-1 == cd->store(cd, map, NULL, fd->version)) return NULL; mc++; } } -#endif - r |= itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); - r |= itf8_put_blk(cb, mc); + + r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0); + r |= (fd->vv.varint_put32_blk(cb, mc) <= 0); BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); hts_log_info("Wrote compression block header in %d bytes", (int)BLOCK_SIZE(cb)); @@ -548,38 +517,38 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { if (!b) return NULL; - cp = buf = malloc(16+5*(8+s->hdr->num_blocks)); + cp = buf = malloc(22+16+5*(8+s->hdr->num_blocks)); if (NULL == buf) { cram_free_block(b); return NULL; } - cp += itf8_put(cp, s->hdr->ref_seq_id); -#ifdef LARGE_POS - cp += ltf8_put(cp, s->hdr->ref_seq_start); - cp += ltf8_put(cp, s->hdr->ref_seq_span); -#else - cp += itf8_put(cp, s->hdr->ref_seq_start); - cp += itf8_put(cp, s->hdr->ref_seq_span); -#endif - cp += itf8_put(cp, s->hdr->num_records); + cp += fd->vv.varint_put32s(cp, NULL, s->hdr->ref_seq_id); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start); + cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span); + } else { + cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_start); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_span); + } + cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) - cp += itf8_put(cp, s->hdr->record_counter); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->record_counter); else if (CRAM_MAJOR_VERS(fd->version) >= 3) - cp += 
ltf8_put(cp, s->hdr->record_counter); - cp += itf8_put(cp, s->hdr->num_blocks); - cp += itf8_put(cp, s->hdr->num_content_ids); + cp += fd->vv.varint_put64(cp, NULL, s->hdr->record_counter); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_blocks); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_content_ids); for (j = 0; j < s->hdr->num_content_ids; j++) { - cp += itf8_put(cp, s->hdr->block_content_ids[j]); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->block_content_ids[j]); } if (s->hdr->content_type == MAPPED_SLICE) - cp += itf8_put(cp, s->hdr->ref_base_id); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_base_id); if (CRAM_MAJOR_VERS(fd->version) != 1) { memcpy(cp, s->hdr->md5, 16); cp += 16; } - assert(cp-buf <= 16+5*(8+s->hdr->num_blocks)); + assert(cp-buf <= 22+16+5*(8+s->hdr->num_blocks)); b->data = (unsigned char *)buf; b->comp_size = b->uncomp_size = cp-buf; @@ -602,6 +571,7 @@ static int cram_encode_slice_read(cram_fd *fd, int64_t *last_pos) { int r = 0; int32_t i32; + int64_t i64; unsigned char uc; //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name); @@ -620,24 +590,22 @@ static int cram_encode_slice_read(cram_fd *fd, r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1); if (c->pos_sorted) { -#ifdef LARGE_POS - int64_t i64; - i64 = cr->apos - *last_pos; - r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); -#else - i32 = cr->apos - *last_pos; - r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); -#endif + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + i64 = cr->apos - *last_pos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); + } else { + i32 = cr->apos - *last_pos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); + } *last_pos = cr->apos; } else { -#ifdef LARGE_POS - int64_t i64; - i64 = cr->apos; - r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); -#else - i32 = 
cr->apos; - r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); -#endif + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + i64 = cr->apos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); + } else { + i32 = cr->apos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); + } } r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1); @@ -649,23 +617,30 @@ static int cram_encode_slice_read(cram_fd *fd, r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS], (char *)&cr->mate_ref_id, 1); -#ifdef LARGE_POS - r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], - (char *)&cr->mate_pos, 1); - - r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], - (char *)&cr->tlen, 1); -#else - i32 = cr->mate_pos; - r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], - (char *)&i32, 1); - i32 = cr->tlen; - r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], - (char *)&i32, 1); -#endif - } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { - r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF], - (char *)&cr->mate_line, 1); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], + (char *)&cr->mate_pos, 1); + r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], + (char *)&cr->tlen, 1); + } else { + i32 = cr->mate_pos; + r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], + (char *)&i32, 1); + i32 = cr->tlen; + r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], + (char *)&i32, 1); + } + } else { + if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { + r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF], + (char *)&cr->mate_line, 1); + } + if (cr->cram_flags & CRAM_FLAG_EXPLICIT_TLEN) { + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], + (char *)&cr->tlen, 1); + } + } } /* Aux tags */ @@ -716,7 +691,7 @@ static int cram_encode_slice_read(cram_fd *fd, // BLOCK_DATA(s->soft_blk) + f->S.seq_idx, // f->S.len); - //if (IS_CRAM_3_VERS(fd)) { + //if 
(CRAM_MAJOR_VERS(fd->version) >= 3) { // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], // BLOCK_DATA(s->seqs_blk) + f->S.seq_idx, // f->S.len); @@ -726,7 +701,7 @@ static int cram_encode_slice_read(cram_fd *fd, //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN], // seq, f->S.len); - //if (IS_CRAM_3_VERS(fd)) { + //if (CRAM_MAJOR_VERS(fd->version) >= 3) { // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], // BLOCK_DATA(s->seqs_blk) + f->I.seq_idx, // f->I.len); @@ -822,89 +797,148 @@ static int cram_encode_slice_read(cram_fd *fd, static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { int level = fd->level, i; int method = 1<version >= (3<<8)+1); /* Compress the CORE Block too, with minimal zlib level */ if (level > 5 && s->block[0]->uncomp_size > 500) - cram_compress_block(fd, s->block[0], NULL, 1<block[0], NULL, 1<use_bz2) method |= 1<use_rans) - method |= (1<use_rans) { + method_ranspr = (1< 1) + method_ranspr |= + (1< 5) + method_ranspr |= (1<use_rans) { + methodF |= v31_or_above ? method_ranspr : method_rans; + method |= v31_or_above ? 
method_ranspr : method_rans; + } + + int method_arith = 0; + if (fd->use_arith) { + method_arith = (1< 1) + method_arith |= + (1<use_arith && v31_or_above) { + methodF |= method_arith; + method |= method_arith; + } if (fd->use_lzma) method |= (1<= 6) + if (level >= 5) { + method |= 1<use_fqz) { + qmethod |= 1<level > 4) { + qmethod |= 1<level > 6) { + qmethod |= (1<metrics_lock); + for (i = 0; i < DS_END; i++) + if (c->stats[i] && c->stats[i]->nvals > 16) + fd->m[i]->unpackable = 1; + pthread_mutex_unlock(&fd->metrics_lock); /* Specific compression methods for certain block types */ - if (cram_compress_block(fd, s->block[DS_IN], fd->m[DS_IN], //IN (seq) - method, level)) + if (cram_compress_block2(fd, s, s->block[DS_IN], fd->m[DS_IN], //IN (seq) + method, level)) return -1; if (fd->level == 0) { /* Do nothing */ } else if (fd->level == 1) { - if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], - methodF, 1)) + if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS], + qmethodF, 1)) return -1; for (i = DS_aux; i <= DS_aux_oz; i++) { if (s->block[i]) - if (cram_compress_block(fd, s->block[i], fd->m[i], - method, 1)) + if (cram_compress_block2(fd, s, s->block[i], fd->m[i], + method, 1)) return -1; } } else if (fd->level < 3) { - if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], - method, 1)) + if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS], + qmethod, 1)) return -1; - if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA], - method, 1)) + if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA], + method, 1)) return -1; if (s->block[DS_BB]) - if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB], - method, 1)) + if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB], + method, 1)) return -1; for (i = DS_aux; i <= DS_aux_oz; i++) { if (s->block[i]) - if (cram_compress_block(fd, s->block[i], fd->m[i], - method, level)) + if (cram_compress_block2(fd, s, s->block[i], fd->m[i], + method, level)) return -1; } } else { 
- if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], - method, level)) + if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS], + qmethod, level)) return -1; - if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA], - method, level)) + if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA], + method, level)) return -1; if (s->block[DS_BB]) - if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB], - method, level)) + if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB], + method, level)) return -1; for (i = DS_aux; i <= DS_aux_oz; i++) { if (s->block[i]) - if (cram_compress_block(fd, s->block[i], fd->m[i], - method, level)) + if (cram_compress_block2(fd, s, s->block[i], fd->m[i], + method, level)) return -1; } } // NAME: best is generally xz, bzip2, zlib then rans1 - if (cram_compress_block(fd, s->block[DS_RN], fd->m[DS_RN], - method & ~(1<version >= (3<<8)+1 && fd->use_tok) + method_rn |= fd->use_arith ? (1<block[DS_RN], fd->m[DS_RN], + method_rn, level)) return -1; // NS shows strong local correlation as rearrangements are localised - if (s->block[DS_NS] != s->block[0]) - if (cram_compress_block(fd, s->block[DS_NS], fd->m[DS_NS], - method, level)) + if (s->block[DS_NS] && s->block[DS_NS] != s->block[0]) + if (cram_compress_block2(fd, s, s->block[DS_NS], fd->m[DS_NS], + method, level)) return -1; @@ -920,8 +954,8 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { if (s->aux_block[i]->method != RAW) continue; - if (cram_compress_block(fd, s->aux_block[i], s->aux_block[i]->m, - method, level)) + if (cram_compress_block2(fd, s, s->aux_block[i], s->aux_block[i]->m, + method, level)) return -1; } } @@ -938,7 +972,7 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { if (s->block[i]->method != RAW) continue; - if (cram_compress_block(fd, s->block[i], fd->m[i], + if (cram_compress_block2(fd, s, s->block[i], fd->m[i], methodF, level)) return -1; } @@ -947,6 +981,106 @@ 
static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { return 0; } +/* + * Allocates a block associated with the cram codec associated with + * data series ds_id or the internal codec_id (depending on codec + * type). + * + * The ds_ids are what end up written to disk as an external block. + * The c_ids are internal and used when daisy-chaining transforms + * such as MAP and RLE. These blocks are also allocated, but + * are ephemeral in nature. (The codecs themselves cannot allocate + * these as the same codec pointer may be operating on multiple slices + * if we're using a multi-slice container.) + * + * Returns 0 on success + * -1 on failure + */ +static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) { + if (!codec) + return 0; + + switch(codec->codec) { + // Codecs which are hard-coded to use the CORE block + case E_GOLOMB: + case E_HUFFMAN: + case E_BETA: + case E_SUBEXP: + case E_GOLOMB_RICE: + case E_GAMMA: + codec->out = s->block[0]; + break; + + // Codecs which don't use external blocks + case E_CONST_BYTE: + case E_CONST_INT: + codec->out = NULL; + break; + + // Codecs that emit directly to external blocks + case E_EXTERNAL: + case E_VARINT_UNSIGNED: + case E_VARINT_SIGNED: + if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id))) + return -1; + codec->u.external.content_id = ds_id; + codec->out = s->block[ds_id]; + break; + + case E_BYTE_ARRAY_STOP: // Why no sub-codec? 
+ if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id))) + return -1; + codec->u.byte_array_stop.content_id = ds_id; + codec->out = s->block[ds_id]; + break; + + + // Codecs that contain sub-codecs which may in turn emit to external blocks + case E_BYTE_ARRAY_LEN: { + cram_codec *bal = codec->u.e_byte_array_len.len_codec; + if (cram_allocate_block(bal, s, bal->u.external.content_id)) + return -1; + bal = codec->u.e_byte_array_len.val_codec; + if (cram_allocate_block(bal, s, bal->u.external.content_id)) + return -1; + + break; + } + + case E_XRLE: + if (cram_allocate_block(codec->u.e_xrle.len_codec, s, ds_id)) + //ds_id == DS_QS ? DS_QS_len : ds_id)) + return -1; + if (cram_allocate_block(codec->u.e_xrle.lit_codec, s, ds_id)) + return -1; + + break; + + case E_XPACK: + if (cram_allocate_block(codec->u.e_xpack.sub_codec, s, ds_id)) + return -1; + codec->out = cram_new_block(0, 0); // ephemeral + if (!codec->out) + return -1; + + break; + + case E_XDELTA: + if (cram_allocate_block(codec->u.e_xdelta.sub_codec, s, ds_id)) + return -1; + codec->out = cram_new_block(0, 0); // ephemeral + if (!codec->out) + return -1; + + break; + + default: + break; + } + + return 0; +} + /* * Encodes a single slice from a container * @@ -954,14 +1088,12 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { * -1 on failure */ static int cram_encode_slice(cram_fd *fd, cram_container *c, - cram_block_compression_hdr *h, cram_slice *s) { + cram_block_compression_hdr *h, cram_slice *s, + int embed_ref) { int rec, r = 0; int64_t last_pos; - int embed_ref; enum cram_DS_ID id; - embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 1 : 0; - /* * Slice external blocks: * ID 0 => base calls (insertions, soft-clip) @@ -974,12 +1106,14 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, */ /* Create cram slice header */ - s->hdr->ref_base_id = embed_ref ? DS_ref : -1; + s->hdr->ref_base_id = embed_ref>0 && s->hdr->ref_seq_span > 0 + ? 
DS_ref + : (CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : -1); s->hdr->record_counter = c->num_records + c->record_counter; c->num_records += s->hdr->num_records; int ntags = c->tags_used ? c->tags_used->n_occupied : 0; - s->block = calloc(DS_END + ntags, sizeof(s->block[0])); + s->block = calloc(DS_END + ntags*2, sizeof(s->block[0])); s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t)); if (!s->block || !s->hdr->block_content_ids) return -1; @@ -1000,7 +1134,7 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, } // Embedded reference - if (embed_ref) { + if (embed_ref>0) { if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref))) return -1; s->ref_id = DS_ref; // needed? @@ -1012,55 +1146,9 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, /* * All the data-series blocks if appropriate. */ - for (id = DS_BF; id < DS_TN; id++) { - if (h->codecs[id] && (h->codecs[id]->codec == E_EXTERNAL || - h->codecs[id]->codec == E_BYTE_ARRAY_STOP || - h->codecs[id]->codec == E_BYTE_ARRAY_LEN)) { - switch (h->codecs[id]->codec) { - case E_EXTERNAL: - if (!(s->block[id] = cram_new_block(EXTERNAL, id))) - return -1; - h->codecs[id]->u.external.content_id = id; - break; - - case E_BYTE_ARRAY_STOP: - if (!(s->block[id] = cram_new_block(EXTERNAL, id))) - return -1; - h->codecs[id]->u.byte_array_stop.content_id = id; - break; - - case E_BYTE_ARRAY_LEN: { - cram_codec *cc; - - cc = h->codecs[id]->u.e_byte_array_len.len_codec; - if (cc->codec == E_EXTERNAL) { - int eid = cc->u.external.content_id; - if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) - return -1; - cc->u.external.content_id = eid; - cc->out = s->block[eid]; - } - - cc = h->codecs[id]->u.e_byte_array_len.val_codec; - if (cc->codec == E_EXTERNAL) { - int eid = cc->u.external.content_id; - if (!s->block[eid]) - if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) - return -1; - cc->u.external.content_id = eid; - cc->out = s->block[eid]; - } - break; - } - default: - break; - } - } else { - 
if (!(id == DS_BB && !h->codecs[DS_BB])) - s->block[id] = s->block[0]; - } - if (h->codecs[id]) - h->codecs[id]->out = s->block[id]; + for (id = DS_QS; id < DS_TN; id++) { + if (cram_allocate_block(h->codecs[id], s, id) < 0) + return -1; } /* @@ -1085,11 +1173,21 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, s->block[0]->comp_size = s->block[0]->uncomp_size; // Make sure the fixed blocks point to the correct sources + if (s->block[DS_IN]) cram_free_block(s->block[DS_IN]); s->block[DS_IN] = s->base_blk; s->base_blk = NULL; + if (s->block[DS_QS]) cram_free_block(s->block[DS_QS]); s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL; + if (s->block[DS_RN]) cram_free_block(s->block[DS_RN]); s->block[DS_RN] = s->name_blk; s->name_blk = NULL; + if (s->block[DS_SC]) cram_free_block(s->block[DS_SC]); s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL; + // Finalise any data transforms. + for (id = DS_QS; id < DS_TN; id++) { + if (h->codecs[id] && h->codecs[id]->flush) + h->codecs[id]->flush(h->codecs[id]); + } + // Ensure block sizes are up to date. 
for (id = 1; id < s->hdr->num_blocks; id++) { if (!s->block[id] || s->block[id] == s->block[0]) @@ -1285,12 +1383,21 @@ static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s, cr->name = BLOCK_SIZE(s->name_blk); if ((cr->cram_flags & CRAM_FLAG_DETACHED) || keep_names) { - BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b)); - cr->name_len = bam_name_len(b); + if (CRAM_MAJOR_VERS(fd->version) >= 4 + && (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) + && cr->mate_line) { + // Dedup read names in V4 + BLOCK_APPEND(s->name_blk, "\0", 1); + cr->name_len = 1; + } else { + BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b)); + cr->name_len = bam_name_len(b); + } } else { // Can only discard duplicate names if not detached cr->name_len = 0; } + if (cram_stats_add(c->stats[DS_RN], cr->name_len) < 0) goto block_err; } @@ -1301,6 +1408,343 @@ static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s, return -1; } +// CRAM version >= 3.1 +#define CRAM_ge31(v) ((v) >= 0x301) + +// Returns the next cigar op code: one of the BAM_C* codes, +// or -1 if no more are present. +static inline +int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos, + uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) { + for(;;) { + while (*cig_len == 0) { + if (*cig_ind < ncigar) { + *cig_op = cigar[*cig_ind] & BAM_CIGAR_MASK; + *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT; + (*cig_ind)++; + } else { + return -1; + } + } + + if (skip[*cig_op]) { + *spos += (bam_cigar_type(*cig_op)&1) * *cig_len; + *cig_len = 0; + continue; + } + + (*cig_len)--; + break; + } + + return *cig_op; +} + +// Ensure ref and hist are large enough. +static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos, + hts_pos_t ref_start, hts_pos_t *ref_end) { + if (pos < ref_start) + return -1; + if (pos < *ref_end) + return 0; + + // realloc + hts_pos_t old_end = *ref_end ? 
*ref_end : ref_start; + hts_pos_t new_end = *ref_end = ref_start + 1000 + (pos-ref_start)*1.5; + + char *tmp = realloc(*ref, *ref_end-ref_start); + if (!tmp) + return -1; + *ref = tmp; + + uint32_t (*tmp5)[5] = realloc(**hist, + (*ref_end - ref_start)*sizeof(**hist)); + if (!tmp5) + return -1; + *hist = tmp5; + *ref_end = new_end; + + // initialise + old_end -= ref_start; + new_end -= ref_start; + memset(&(*ref)[old_end], 0, new_end-old_end); + memset(&(*hist)[old_end], 0, (new_end-old_end)*sizeof(**hist)); + + return 0; +} + +// Walk through MD + seq to generate ref +static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], + hts_pos_t ref_start, hts_pos_t *ref_end, + const uint8_t *MD) { + uint8_t *seq = bam_get_seq(b); + uint32_t *cigar = bam_get_cigar(b); + uint32_t ncigar = b->core.n_cigar; + uint32_t cig_op = 0, cig_len = 0, cig_ind = 0; + + int iseq = 0, next_op; + hts_pos_t iref = b->core.pos - ref_start; + + // Skip INS, REF_SKIP, *CLIP, PAD. and BACK. + static int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1}; + while (iseq < b->core.l_qseq && *MD) { + if (isdigit(*MD)) { + // match + int overflow = 0; + int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow); + if (overflow || + extend_ref(ref, hist, iref+ref_start + len, + ref_start, ref_end) < 0) + return -1; + while (iseq < b->core.l_qseq && len) { + // rewrite to have internal loops? 
+ if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CMATCH && + next_op != BAM_CEQUAL) { + hts_log_info("MD:Z and CIGAR are incompatible for " + "record %s", bam_get_qname(b)); + return -1; + } + + // Short-cut loop over same cigar op for efficiency + cig_len++; + do { + cig_len--; + (*ref)[iref++] = seq_nt16_str[bam_seqi(seq, iseq)]; + iseq++; + len--; + } while (cig_len && iseq < b->core.l_qseq && len); + } + if (len > 0) + return -1; // MD is longer than seq + } else if (*MD == '^') { + // deletion + MD++; + while (isalpha(*MD)) { + if (extend_ref(ref, hist, iref+ref_start, ref_start, + ref_end) < 0) + return -1; + if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CDEL) { + hts_log_info("MD:Z and CIGAR are incompatible"); + return -1; + } + + (*ref)[iref++] = *MD++ & ~0x20; + } + } else { + // substitution + if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0) + return -1; + if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) { + hts_log_info("MD:Z and CIGAR are incompatible"); + return -1; + } + + (*ref)[iref++] = *MD++ & ~0x20; + iseq++; + } + } + + return 1; +} + +// Append a sequence to a ref/consensus structure. +// We maintain both an absolute reference (ACGTN where MD:Z is +// present) and a 5-way frequency array for when no MD:Z is known. +// We then subsequently convert the 5-way frequencies to a consensus +// ref in a second pass. 
+// +// Returns >=0 on success, +// -1 on failure (eg inconsistent data) +static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5], + hts_pos_t ref_start, hts_pos_t *ref_end) { + const uint8_t *MD = bam_aux_get(b, "MD"); + int ret = 0; + if (MD && *MD == 'Z') { + // We can use MD to directly compute the reference + int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1); + + if (ret > 0) + return ret; + } + + // Otherwise we just use SEQ+CIGAR and build a consensus which we later + // turn into a fake reference + uint32_t *cigar = bam_get_cigar(b); + uint32_t ncigar = b->core.n_cigar; + uint32_t i, j; + hts_pos_t iseq = 0, iref = b->core.pos - ref_start; + uint8_t *seq = bam_get_seq(b); + for (i = 0; i < ncigar; i++) { + switch (bam_cigar_op(cigar[i])) { + case BAM_CSOFT_CLIP: + case BAM_CINS: + iseq += bam_cigar_oplen(cigar[i]); + break; + + case BAM_CMATCH: + case BAM_CEQUAL: + case BAM_CDIFF: { + int len = bam_cigar_oplen(cigar[i]); + // Maps an nt16 (A=1 C=2 G=4 T=8 bits) to 0123 plus N=4 + static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4}; + + if (extend_ref(ref, hist, iref+ref_start + len, + ref_start, ref_end) < 0) + return -1; + if (iseq + len <= b->core.l_qseq) { + // Nullify failed MD:Z if appropriate + if (ret < 0) + memset(&(*ref)[iref], 0, len); + + for (j = 0; j < len; j++, iref++, iseq++) + (*hist)[iref][L16[bam_seqi(seq, iseq)]]++; + } else { + // Probably a 2ndary read with seq "*" + iseq += len; + iref += len; + } + break; + } + + case BAM_CDEL: + case BAM_CREF_SKIP: + iref += bam_cigar_oplen(cigar[i]); + } + } + + return 1; +} + +// Automatically generates the reference and stashes it in c->ref, also +// setting c->ref_start and c->ref_end. +// +// If we have MD:Z tags then we use them to directly infer the reference, +// along with SEQ + CIGAR. Otherwise we use SEQ/CIGAR only to build up +// a consensus and then assume the reference as the majority rule. 
+// +// In this latter scenario we need to be wary of auto-generating MD and NM +// during decode, but that's handled elsewhere via an additional aux tag. +// +// Returns 0 on success, +// -1 on failure +static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { + // TODO: if we can find an external reference then use it, even if the + // user told us to do embed_ref=2. + char *ref = NULL; + uint32_t (*hist)[5] = NULL; + hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0; + + // initial allocation + if (extend_ref(&ref, &hist, + c->bams[r1 + s->hdr->num_records-1]->core.pos + + c->bams[r1 + s->hdr->num_records-1]->core.l_qseq, + ref_start, &ref_end) < 0) + return -1; + + // Add each bam file to the reference/consensus arrays + int r2; + hts_pos_t last_pos = -1; + for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) { + if (c->bams[r1]->core.pos < last_pos) { + hts_log_error("Cannot build reference with unsorted data"); + goto err; + } + last_pos = c->bams[r1]->core.pos; + if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0) + goto err; + } + + // Compute the consensus + hts_pos_t i; + for (i = 0; i < ref_end-ref_start; i++) { + if (!ref[i]) { + int max_v = 0, max_j = 4, j; + for (j = 0; j < 4; j++) + // don't call N (j==4) unless no coverage + if (max_v < hist[i][j]) + max_v = hist[i][j], max_j = j; + ref[i] = "ACGTN"[max_j]; + } + } + free(hist); + + // Put the reference in place so it appears to be an external + // ref file. + c->ref = ref; + c->ref_start = ref_start+1; + c->ref_end = ref_end+1; + + return 0; + + err: + free(ref); + free(hist); + return -1; +} + +// Check if the SQ M5 tag matches the reference we've loaded. +static int validate_md5(cram_fd *fd, int ref_id) { + if (fd->ignore_md5 || ref_id < 0 || ref_id >= fd->refs->nref) + return 0; + + // Have we already checked this ref? + if (fd->refs->ref_id[ref_id]->validated_md5) + return 0; + + // Check if we have the MD5 known. 
+ // We should, but maybe we're using embedded references? + sam_hrecs_t *hrecs = fd->header->hrecs; + sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, "SQ", "SN", + hrecs->ref[ref_id].name); + if (!ty) + return 0; + + sam_hrec_tag_t *m5tag = sam_hrecs_find_key(ty, "M5", NULL); + if (!m5tag) + return 0; + + // It's known, so compute md5 on the loaded reference sequence. + char *ref = fd->refs->ref_id[ref_id]->seq; + int64_t len = fd->refs->ref_id[ref_id]->length; + hts_md5_context *md5; + char unsigned buf[16]; + char buf2[33]; + + if (!(md5 = hts_md5_init())) + return -1; + hts_md5_update(md5, ref, len); + hts_md5_final(buf, md5); + hts_md5_destroy(md5); + hts_md5_hex(buf2, buf); + + // Compare it to header @SQ M5 tag + if (strcmp(m5tag->str+3, buf2)) { + hts_log_error("SQ header M5 tag discrepancy for reference '%s'", + hrecs->ref[ref_id].name); + hts_log_error("Please use the correct reference, or " + "consider using embed_ref=2"); + return -1; + } + fd->refs->ref_id[ref_id]->validated_md5 = 1; + + return 0; +} + /* * Encodes all slices in a container into blocks. 
* Returns 0 on success @@ -1311,50 +1755,102 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { cram_block_compression_hdr *h = c->comp_hdr; cram_block *c_hdr; int multi_ref = 0; - int r1, r2, sn, nref; + int r1, r2, sn, nref, embed_ref, no_ref; spare_bams *spares; + if (CRAM_MAJOR_VERS(fd->version) == 1) + goto err; + //#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;} #define goto_err goto err + restart: /* Cache references up-front if we have unsorted access patterns */ pthread_mutex_lock(&fd->ref_lock); nref = fd->refs->nref; pthread_mutex_unlock(&fd->ref_lock); - - if (!fd->no_ref && c->refs_used) { - for (i = 0; i < nref; i++) { - if (c->refs_used[i]) - cram_get_ref(fd, i, 1, 0); - } - } + embed_ref = c->embed_ref; + no_ref = c->no_ref; /* To create M5 strings */ /* Fetch reference sequence */ - if (!fd->no_ref) { + if (!no_ref) { if (!c->bams || !c->bams[0]) goto_err; bam_seq_t *b = c->bams[0]; - char *ref = cram_get_ref(fd, bam_ref(b), 1, 0); - if (!ref && bam_ref(b) >= 0) { - hts_log_error("Failed to load reference #%d", bam_ref(b)); - return -1; - } - if ((c->ref_id = bam_ref(b)) >= 0) { - c->ref_seq_id = c->ref_id; - c->ref = fd->refs->ref_id[c->ref_seq_id]->seq; - c->ref_start = 1; - c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; + if (embed_ref <= 1) { + char *ref = cram_get_ref(fd, bam_ref(b), 1, 0); + if (!ref && bam_ref(b) >= 0) { + if (!c->pos_sorted) { + // TODO: maybe also check fd->no_ref? 
+ hts_log_warning("Failed to load reference #%d", + bam_ref(b)); + hts_log_warning("Switching to non-ref mode"); + + pthread_mutex_lock(&fd->ref_lock); + c->embed_ref = fd->embed_ref = 0; + c->no_ref = fd->no_ref = 1; + pthread_mutex_unlock(&fd->ref_lock); + goto restart; + } + + if (c->multi_seq || embed_ref == 0) { + hts_log_error("Failed to load reference #%d", bam_ref(b)); + return -1; + } + hts_log_warning("Failed to load reference #%d", bam_ref(b)); + hts_log_warning("Enabling embed_ref=2 mode to auto-generate" + " reference"); + if (embed_ref <= 0) + hts_log_warning("NOTE: the CRAM file will be bigger than" + " using an external reference"); + pthread_mutex_lock(&fd->ref_lock); + embed_ref = c->embed_ref = fd->embed_ref = 2; + pthread_mutex_unlock(&fd->ref_lock); + goto auto_ref; + } else if (ref) { + if (validate_md5(fd, c->ref_seq_id) < 0) + goto_err; + } + if ((c->ref_id = bam_ref(b)) >= 0) { + c->ref_seq_id = c->ref_id; + c->ref = fd->refs->ref_id[c->ref_seq_id]->seq; + c->ref_start = 1; + c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; + } } else { - c->ref_seq_id = c->ref_id; // FIXME remove one var! + auto_ref: + // Auto-embed ref. + // This starts as 'N' and is amended on-the-fly as we go + // based on MD:Z tags. 
+ if ((c->ref_id = bam_ref(b)) >= 0) { + c->ref_free = 1; + c->ref = NULL; + } } + c->ref_seq_id = c->ref_id; } else { c->ref_id = bam_ref(c->bams[0]); cram_ref_incr(fd->refs, c->ref_id); c->ref_seq_id = c->ref_id; } + if (!no_ref && c->refs_used) { + for (i = 0; i < nref; i++) { + if (c->refs_used[i]) { + if (cram_get_ref(fd, i, 1, 0)) { + if (validate_md5(fd, i) < 0) + goto_err; + } else { + hts_log_warning("Failed to find reference, " + "switching to non-ref mode"); + no_ref = c->no_ref = 1; + } + } + } + } + /* Turn bams into cram_records and gather basic stats */ for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) { cram_slice *s = c->slices[sn]; @@ -1375,6 +1871,33 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // is done within process_one_read(). kstring_t MD = {0}; + // Embed consensus / MD-generated ref + if (embed_ref == 2) { + if (cram_generate_reference(c, s, r1) < 0) { + // Should this be a permanent thing via fd->no_ref? + // Doing so means we cannot easily switch back again should + // things fix themselves later on. This is likely not a + // concern though as failure to generate a reference implies + // unsorted data which is rarely recovered from. + + // Only if sn == 0. We're hosed if we're on the 2nd slice and + // the first worked, as no-ref is a container global param. + if (sn > 0) { + hts_log_error("Failed to build reference, " + "switching to non-ref mode"); + return -1; + } else { + hts_log_warning("Failed to build reference, " + "switching to non-ref mode"); + } + pthread_mutex_lock(&fd->ref_lock); + c->embed_ref = fd->embed_ref = 0; + c->no_ref = fd->no_ref = 1; + pthread_mutex_unlock(&fd->ref_lock); + goto restart; + } + } + // Iterate through records creating the cram blocks for some // fields and just gathering stats for others. 
for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) { @@ -1382,7 +1905,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { bam_seq_t *b = c->bams[r1]; /* If multi-ref we need to cope with changing reference per seq */ - if (c->multi_seq && !fd->no_ref) { + if (c->multi_seq && !no_ref) { if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) { if (c->ref_seq_id >= 0) cram_ref_decr(fd->refs, c->ref_seq_id); @@ -1392,6 +1915,8 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { free(MD.s); return -1; } + if (validate_md5(fd, bam_ref(b)) < 0) + return -1; c->ref_seq_id = bam_ref(b); // overwritten later by -2 if (!fd->refs->ref_id[c->ref_seq_id]->seq) @@ -1402,7 +1927,8 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } } - if (process_one_read(fd, c, s, cr, b, r2, &MD) != 0) { + if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref, + no_ref) != 0) { free(MD.s); return -1; } @@ -1413,6 +1939,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (last_base < cr->aend) last_base = cr->aend; } + free(MD.s); // Process_one_read doesn't add read names as it can change @@ -1427,6 +1954,12 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { s->hdr->ref_seq_id = -2; s->hdr->ref_seq_start = 0; s->hdr->ref_seq_span = 0; + } else if (c->ref_id == -1 && CRAM_ge31(fd->version)) { + // Spec states span=0, but it broke our range queries. + // See commit message for this and prior. + s->hdr->ref_seq_id = -1; + s->hdr->ref_seq_start = 0; + s->hdr->ref_seq_span = 0; } else { s->hdr->ref_seq_id = c->ref_id; s->hdr->ref_seq_start = first_base; @@ -1438,7 +1971,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // slice can start aggregating them from the start again. 
if (c->tags_used->n_occupied) { int ntags = c->tags_used->n_occupied; - s->aux_block = calloc(ntags, sizeof(*s->aux_block)); + s->aux_block = calloc(ntags*2, sizeof(*s->aux_block)); if (!s->aux_block) return -1; @@ -1454,12 +1987,15 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (!tm->blk) continue; s->aux_block[s->naux_block++] = tm->blk; tm->blk = NULL; + if (!tm->blk2) continue; + s->aux_block[s->naux_block++] = tm->blk2; + tm->blk2 = NULL; } - assert(s->naux_block <= c->tags_used->n_occupied); + assert(s->naux_block <= 2*c->tags_used->n_occupied); } } - if (c->multi_seq && !fd->no_ref) { + if (c->multi_seq && !no_ref) { if (c->ref_seq_id >= 0) cram_ref_decr(fd->refs, c->ref_seq_id); } @@ -1491,11 +2027,14 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { /* Compute MD5s */ + no_ref = c->no_ref; + int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0; + for (i = 0; i < c->curr_slice; i++) { cram_slice *s = c->slices[i]; if (CRAM_MAJOR_VERS(fd->version) != 1) { - if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) { + if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !no_ref) { hts_md5_context *md5 = hts_md5_init(); if (!md5) return -1; @@ -1517,13 +2056,13 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== BF ===\n"); h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]), c->stats[DS_BF], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_BF]->nvals && !h->codecs[DS_BF]) goto_err; //fprintf(stderr, "=== CF ===\n"); h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]), c->stats[DS_CF], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_CF]->nvals && !h->codecs[DS_CF]) goto_err; //fprintf(stderr, "=== RN ===\n"); @@ -1532,114 +2071,125 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // fd->version); //fprintf(stderr, "=== AP ===\n"); - if (c->pos_sorted) { - h->codecs[DS_AP] = 
cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), - c->stats[DS_AP], -#ifdef LARGE_POS - E_LONG, -#else - E_INT, -#endif - NULL, fd->version); + if (c->pos_sorted || CRAM_MAJOR_VERS(fd->version) >= 4) { + if (c->pos_sorted) + h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), + c->stats[DS_AP], + is_v4 ? E_LONG : E_INT, + NULL, fd->version, &fd->vv); + else + // Unsorted data has no stats, but hard-code VARINT_SIGNED / EXT. + h->codecs[DS_AP] = cram_encoder_init(is_v4 ? E_VARINT_SIGNED + : E_EXTERNAL, + NULL, + is_v4 ? E_LONG : E_INT, + NULL, fd->version, &fd->vv); } else { + // Removed BETA in v4.0. + // Should we consider dropping use of it for 3.0 too? int p[2] = {0, c->max_apos}; - h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p, - fd->version); + h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, + is_v4 ? E_LONG : E_INT, + p, fd->version, &fd->vv); +// cram_xdelta_encoder e; +// e.word_size = is_v4 ? 8 : 4; +// e.sub_encoding = E_EXTERNAL; +// e.sub_codec_dat = (void *)DS_AP; +// +// h->codecs[DS_AP] = cram_encoder_init(E_XDELTA, NULL, +// is_v4 ? 
E_LONG : E_INT, +// &e, fd->version, &fd->vv); } if (!h->codecs[DS_AP]) goto_err; //fprintf(stderr, "=== RG ===\n"); h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]), - c->stats[DS_RG], E_INT, NULL, - fd->version); + c->stats[DS_RG], + E_INT, + NULL, + fd->version, &fd->vv); if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err; //fprintf(stderr, "=== MQ ===\n"); h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]), c->stats[DS_MQ], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_MQ]->nvals && !h->codecs[DS_MQ]) goto_err; //fprintf(stderr, "=== NS ===\n"); h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]), c->stats[DS_NS], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err; //fprintf(stderr, "=== MF ===\n"); h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]), c->stats[DS_MF], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_MF]->nvals && !h->codecs[DS_MF]) goto_err; //fprintf(stderr, "=== TS ===\n"); h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]), c->stats[DS_TS], -#ifdef LARGE_POS - E_LONG, -#else - E_INT, -#endif - NULL, fd->version); + is_v4 ? E_LONG : E_INT, + NULL, fd->version, &fd->vv); if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err; //fprintf(stderr, "=== NP ===\n"); h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]), c->stats[DS_NP], -#ifdef LARGE_POS - E_LONG, -#else - E_INT, -#endif - NULL, fd->version); + is_v4 ? 
E_LONG : E_INT, + NULL, fd->version, &fd->vv); if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err; //fprintf(stderr, "=== NF ===\n"); h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]), c->stats[DS_NF], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_NF]->nvals && !h->codecs[DS_NF]) goto_err; //fprintf(stderr, "=== RL ===\n"); h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]), c->stats[DS_RL], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_RL]->nvals && !h->codecs[DS_RL]) goto_err; //fprintf(stderr, "=== FN ===\n"); h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]), c->stats[DS_FN], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_FN]->nvals && !h->codecs[DS_FN]) goto_err; //fprintf(stderr, "=== FC ===\n"); h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]), c->stats[DS_FC], E_BYTE, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_FC]->nvals && !h->codecs[DS_FC]) goto_err; //fprintf(stderr, "=== FP ===\n"); h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]), c->stats[DS_FP], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_FP]->nvals && !h->codecs[DS_FP]) goto_err; //fprintf(stderr, "=== DL ===\n"); h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]), c->stats[DS_DL], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_DL]->nvals && !h->codecs[DS_DL]) goto_err; //fprintf(stderr, "=== BA ===\n"); h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]), c->stats[DS_BA], E_BYTE, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_BA]->nvals && !h->codecs[DS_BA]) goto_err; if (CRAM_MAJOR_VERS(fd->version) >= 3) { cram_byte_array_len_encoder e; - e.len_encoding = E_EXTERNAL; + e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4 + ? 
E_VARINT_UNSIGNED + : E_EXTERNAL; e.len_dat = (void *)DS_BB_len; //e.len_dat = (void *)DS_BB; @@ -1648,7 +2198,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); if (!h->codecs[DS_BB]) goto_err; } else { h->codecs[DS_BB] = NULL; @@ -1657,7 +2207,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== BS ===\n"); h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]), c->stats[DS_BS], E_BYTE, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_BS]->nvals && !h->codecs[DS_BS]) goto_err; if (CRAM_MAJOR_VERS(fd->version) == 1) { @@ -1671,13 +2221,13 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== TC ===\n"); h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]), c->stats[DS_TC], E_BYTE, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_TC]->nvals && !h->codecs[DS_TC]) goto_err; //fprintf(stderr, "=== TN ===\n"); h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]), c->stats[DS_TN], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_TN]->nvals && !h->codecs[DS_TN]) goto_err; } else { h->codecs[DS_TC] = NULL; @@ -1686,32 +2236,32 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== TL ===\n"); h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]), c->stats[DS_TL], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_TL]->nvals && !h->codecs[DS_TL]) goto_err; //fprintf(stderr, "=== RI ===\n"); h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]), c->stats[DS_RI], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err; //fprintf(stderr, "=== RS ===\n"); h->codecs[DS_RS] = 
cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]), c->stats[DS_RS], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_RS]->nvals && !h->codecs[DS_RS]) goto_err; //fprintf(stderr, "=== PD ===\n"); h->codecs[DS_PD] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]), c->stats[DS_PD], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_PD]->nvals && !h->codecs[DS_PD]) goto_err; //fprintf(stderr, "=== HC ===\n"); h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]), c->stats[DS_HC], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_HC]->nvals && !h->codecs[DS_HC]) goto_err; //fprintf(stderr, "=== SC ===\n"); @@ -1720,14 +2270,16 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, E_BYTE_ARRAY, (void *)i2, - fd->version); + fd->version, &fd->vv); } else { // Appears to be no practical benefit to using this method, // but it may work better if we start mixing SC, IN and BB // elements into the same external block. cram_byte_array_len_encoder e; - e.len_encoding = E_EXTERNAL; + e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4 + ? 
E_VARINT_UNSIGNED + : E_EXTERNAL; e.len_dat = (void *)DS_SC_len; e.val_encoding = E_EXTERNAL; @@ -1735,7 +2287,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); } if (!h->codecs[DS_SC]) goto_err; } @@ -1745,19 +2297,19 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { int i2[2] = {0, DS_IN}; h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, E_BYTE_ARRAY, (void *)i2, - fd->version); + fd->version, &fd->vv); if (!h->codecs[DS_IN]) goto_err; } h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE, (void *)DS_QS, - fd->version); + fd->version, &fd->vv); if (!h->codecs[DS_QS]) goto_err; { int i2[2] = {0, DS_RN}; h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, E_BYTE_ARRAY, (void *)i2, - fd->version); + fd->version, &fd->vv); if (!h->codecs[DS_RN]) goto_err; } @@ -1766,7 +2318,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { for (i = 0; i < c->curr_slice; i++) { hts_log_info("Encode slice %d", i); - if (cram_encode_slice(fd, c, h, c->slices[i]) != 0) + int local_embed_ref = + embed_ref>0 && c->slices[i]->hdr->ref_seq_id != -1 ? 1 : 0; + if (cram_encode_slice(fd, c, h, c->slices[i], local_embed_ref) != 0) return -1; } @@ -1776,10 +2330,12 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->ref_seq_start = c->ref_seq_start; h->ref_seq_span = c->ref_seq_span; h->num_records = c->num_records; - h->AP_delta = c->pos_sorted; + h->qs_seq_orient = c->qs_seq_orient; + // slight misnomer - sorted or treat as-if sorted (ap_delta force to 1) + h->AP_delta = c->pos_sorted; memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20); - if (!(c_hdr = cram_encode_compression_header(fd, c, h))) + if (!(c_hdr = cram_encode_compression_header(fd, c, h, embed_ref))) return -1; } @@ -1799,14 +2355,21 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { ? 
c_hdr->uncomp_size : c_hdr->comp_size; slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(c_hdr->content_id) + - itf8_size(c_hdr->comp_size) + - itf8_size(c_hdr->uncomp_size); + fd->vv.varint_size(c_hdr->content_id) + + fd->vv.varint_size(c_hdr->comp_size) + + fd->vv.varint_size(c_hdr->uncomp_size); } c->ref_seq_id = c->slices[0]->hdr->ref_seq_id; - c->ref_seq_start = c->slices[0]->hdr->ref_seq_start; - c->ref_seq_span = c->slices[0]->hdr->ref_seq_span; + if (c->ref_seq_id == -1 && CRAM_ge31(fd->version)) { + // Spec states span=0, but it broke our range queries. + // See commit message for this and prior. + c->ref_seq_start = 0; + c->ref_seq_span = 0; + } else { + c->ref_seq_start = c->slices[0]->hdr->ref_seq_start; + c->ref_seq_span = c->slices[0]->hdr->ref_seq_span; + } for (i = 0; i < c->curr_slice; i++) { cram_slice *s = c->slices[i]; @@ -1824,15 +2387,15 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { : s->hdr_block->comp_size; slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(s->hdr_block->content_id) + - itf8_size(s->hdr_block->comp_size) + - itf8_size(s->hdr_block->uncomp_size); + fd->vv.varint_size(s->hdr_block->content_id) + + fd->vv.varint_size(s->hdr_block->comp_size) + + fd->vv.varint_size(s->hdr_block->uncomp_size); for (j = 0; j < s->hdr->num_blocks; j++) { slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(s->block[j]->content_id) + - itf8_size(s->block[j]->comp_size) + - itf8_size(s->block[j]->uncomp_size); + fd->vv.varint_size(s->block[j]->content_id) + + fd->vv.varint_size(s->block[j]->comp_size) + + fd->vv.varint_size(s->block[j]->uncomp_size); slice_offset += s->block[j]->method == RAW ? 
s->block[j]->uncomp_size @@ -1848,7 +2411,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } /* Cache references up-front if we have unsorted access patterns */ - if (!fd->no_ref && c->refs_used) { + if (!no_ref && c->refs_used) { for (i = 0; i < fd->refs->nref; i++) { if (c->refs_used[i]) cram_ref_decr(fd->refs, i); @@ -2074,160 +2637,21 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r, return -1; } -/* - * Encodes auxiliary data, CRAM 1.0 format. - * Returns the read-group parsed out of the BAM aux fields on success - * NULL on failure or no rg present (FIXME) - */ -static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, - cram_slice *s, cram_record *cr) { - char *aux, *tmp, *rg = NULL; - int aux_size = bam_blk_size(b) - - ((char *)bam_aux(b) - (char *)&bam_ref(b)); - - /* Worst case is 1 nul char on every ??:Z: string, so +33% */ - BLOCK_GROW(s->aux_blk, aux_size*1.34+1); - tmp = (char *)BLOCK_END(s->aux_blk); - - aux = (char *)bam_aux(b); - cr->TN_idx = s->nTN; - - while (aux[0] != 0) { - int32_t i32; - int r; - - if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { - rg = &aux[3]; - while (*aux++); - continue; - } - if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { - while (*aux++); - continue; - } - if (aux[0] == 'N' && aux[1] == 'M') { - switch(aux[2]) { - case 'A': case 'C': case 'c': aux+=4; break; - case 'I': case 'i': case 'f': aux+=7; break; - default: - hts_log_error("Unhandled type code for NM tag"); - return NULL; - } - continue; - } - - cr->ntags++; - - i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2]; - kh_put(m_tagmap, c->tags_used, i32, &r); - if (-1 == r) - return NULL; - - if (s->nTN >= s->aTN) { - s->aTN = s->aTN ? 
s->aTN*2 : 1024; - if (!(s->TN = realloc(s->TN, s->aTN * sizeof(*s->TN)))) - return NULL; - } - s->TN[s->nTN++] = i32; - if (cram_stats_add(c->stats[DS_TN], i32) < 0) - goto block_err; - - switch(aux[2]) { - case 'A': case 'C': case 'c': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; - break; - - case 'S': case 's': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'I': case 'i': case 'f': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'd': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'Z': case 'H': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - while ((*tmp++=*aux++)); - *tmp++ = '\t'; // stop byte - break; - - case 'B': { - int type = aux[3], blen; - uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + - (((unsigned char *)aux)[5]<< 8) + - (((unsigned char *)aux)[6]<<16) + - (((unsigned char *)aux)[7]<<24)); - // skip TN field - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - - // We use BYTE_ARRAY_LEN with external length, so store that first - switch (type) { - case 'c': case 'C': - blen = count; - break; - case 's': case 'S': - blen = 2*count; - break; - case 'i': case 'I': case 'f': - blen = 4*count; - break; - default: - hts_log_error("Unknown sub-type '%c' for aux type 'B'", type); - return NULL; - } - - tmp += itf8_put(tmp, blen+5); - - *tmp++=*aux++; // sub-type & length - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - - // The tag data itself - memcpy(tmp, aux, blen); tmp += blen; aux += blen; - - //cram_stats_add(c->aux_B_stats, blen); - break; - } - default: - hts_log_error("Unknown aux type '%c'", aux[2]); - return NULL; - } - } - if (cram_stats_add(c->stats[DS_TC], cr->ntags) < 0) - 
goto block_err; - - cr->aux = BLOCK_SIZE(s->aux_blk); - cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux); - BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk); - assert(s->aux_blk->byte <= s->aux_blk->alloc); - - return rg; - - block_err: - return NULL; -} - /* * Encodes auxiliary data. Largely duplicated from above, but done so to * keep it simple and avoid a myriad of version ifs. * - * Returns the read-group parsed out of the BAM aux fields on success + * Returns the RG header line pointed to by the BAM aux fields on success, * NULL on failure or no rg present, also sets "*err" to non-zero */ -static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, - cram_slice *s, cram_record *cr, - int verbatim_NM, int verbatim_MD, - int NM, kstring_t *MD, - int *err) { - char *aux, *orig, *rg = NULL; +static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b, + cram_container *c, + cram_slice *s, cram_record *cr, + int verbatim_NM, int verbatim_MD, + int NM, kstring_t *MD, int cf_tag, + int no_ref, int *err) { + char *aux, *orig; + sam_hrec_rg_t *brg = NULL; int aux_size = bam_get_l_aux(b); cram_block *td_b = c->comp_hdr->TD_blk; int TD_blk_size = BLOCK_SIZE(td_b), new; @@ -2238,22 +2662,50 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, orig = aux = (char *)bam_aux(b); + + // cF:i => Extra CRAM bit flags. + // 1: Don't auto-decode MD (may be invalid) + // 2: Don't auto-decode NM (may be invalid) + if (cf_tag && CRAM_MAJOR_VERS(fd->version) < 4) { + // Temporary copy of aux so we can amend it.
+ aux = malloc(aux_size+4); + if (!aux) + return NULL; + + memcpy(aux, orig, aux_size); + aux[aux_size++] = 'c'; + aux[aux_size++] = 'F'; + aux[aux_size++] = 'C'; + aux[aux_size++] = cf_tag; + orig = aux; + } + // Copy aux keys to td_b and aux values to slice aux blocks while (aux - orig < aux_size && aux[0] != 0) { int r; // RG:Z if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { - rg = &aux[3]; - while (*aux++); - continue; + char *rg = &aux[3]; + brg = sam_hrecs_find_rg(fd->header->hrecs, rg); + if (brg) { + while (*aux++); + if (CRAM_MAJOR_VERS(fd->version) >= 4) + BLOCK_APPEND(td_b, "RG*", 3); + continue; + } else { + // RG:Z tag will be stored verbatim + hts_log_warning("Missing @RG header for RG \"%s\"", rg); + } } // MD:Z if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { - if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) { + if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) { if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) { while (*aux++); + if (CRAM_MAJOR_VERS(fd->version) >= 4) + BLOCK_APPEND(td_b, "MD*", 3); continue; } } @@ -2261,7 +2713,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, // NM:i if (aux[0] == 'N' && aux[1] == 'M') { - if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) { + if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) { int NM_ = bam_aux2i((uint8_t *)aux+2); if (NM_ == NM) { switch(aux[2]) { @@ -2272,6 +2724,8 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, hts_log_error("Unhandled type code for NM tag"); return NULL; } + if (CRAM_MAJOR_VERS(fd->version) >= 4) + BLOCK_APPEND(td_b, "NM*", 3); continue; } } @@ -2325,7 +2779,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, // string as byte_array_stop c = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, E_BYTE_ARRAY, (void *)i2, - fd->version); + fd->version, &fd->vv); break; case 
'A': case 'c': case 'C': { @@ -2333,8 +2787,13 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, cram_byte_array_len_encoder e; cram_stats st; - e.len_encoding = E_HUFFMAN; - e.len_dat = NULL; + if (CRAM_MAJOR_VERS(fd->version) <= 3) { + e.len_encoding = E_HUFFMAN; + e.len_dat = NULL; // will get codes from st + } else { + e.len_encoding = E_CONST_INT; + e.len_dat = NULL; // will get codes from st + } memset(&st, 0, sizeof(st)); if (cram_stats_add(&st, 1) < 0) goto block_err; cram_stats_encoding(fd, &st); @@ -2344,7 +2803,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); break; } @@ -2353,8 +2812,13 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, cram_byte_array_len_encoder e; cram_stats st; - e.len_encoding = E_HUFFMAN; - e.len_dat = NULL; + if (CRAM_MAJOR_VERS(fd->version) <= 3) { + e.len_encoding = E_HUFFMAN; + e.len_dat = NULL; // will get codes from st + } else { + e.len_encoding = E_CONST_INT; + e.len_dat = NULL; // will get codes from st + } memset(&st, 0, sizeof(st)); if (cram_stats_add(&st, 2) < 0) goto block_err; cram_stats_encoding(fd, &st); @@ -2364,7 +2828,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); break; } case 'i': case 'I': case 'f': { @@ -2372,8 +2836,13 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, cram_byte_array_len_encoder e; cram_stats st; - e.len_encoding = E_HUFFMAN; - e.len_dat = NULL; + if (CRAM_MAJOR_VERS(fd->version) <= 3) { + e.len_encoding = E_HUFFMAN; + e.len_dat = NULL; // will get codes from st + } else { + e.len_encoding = E_CONST_INT; + e.len_dat = NULL; // will get codes from st + } memset(&st, 0, sizeof(st)); if (cram_stats_add(&st, 4) < 0) goto 
block_err; cram_stats_encoding(fd, &st); @@ -2383,7 +2852,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); break; } @@ -2395,7 +2864,9 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, // too. cram_byte_array_len_encoder e; - e.len_encoding = E_EXTERNAL; + e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4 + ? E_VARINT_UNSIGNED + : E_EXTERNAL; e.len_dat = (void *)sk; // or key+128 for len? e.val_encoding = E_EXTERNAL; @@ -2403,7 +2874,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, c = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); break; } @@ -2508,8 +2979,15 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (!tm->blk) { if (!(tm->blk = cram_new_block(EXTERNAL, key))) return NULL; - codec->u.e_byte_array_len.len_codec->out = tm->blk; - codec->u.e_byte_array_len.val_codec->out = tm->blk; + if (codec->u.e_byte_array_len.val_codec->codec == E_XDELTA) { + if (!(tm->blk2 = cram_new_block(EXTERNAL, key+128))) + return NULL; + codec->u.e_byte_array_len.len_codec->out = tm->blk2; + codec->u.e_byte_array_len.val_codec->u.e_xdelta.sub_codec->out = tm->blk; + } else { + codec->u.e_byte_array_len.len_codec->out = tm->blk; + codec->u.e_byte_array_len.val_codec->out = tm->blk; + } } // skip TN field @@ -2570,11 +3048,17 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0) goto block_err; + if (orig != (char *)bam_aux(b)) + free(orig); + if (err) *err = 0; - return rg; + + return brg; err: block_err: + if (orig != (char *)bam_aux(b)) + free(orig); return NULL; } @@ -2585,12 +3069,18 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, * * See cram_next_container() and cram_close(). 
*/ -void cram_update_curr_slice(cram_container *c) { +void cram_update_curr_slice(cram_container *c, int version) { cram_slice *s = c->slice; if (c->multi_seq) { s->hdr->ref_seq_id = -2; s->hdr->ref_seq_start = 0; s->hdr->ref_seq_span = 0; + } else if (c->curr_ref == -1 && CRAM_ge31(version)) { + // Spec states span=0, but it broke our range queries. + // See commit message for this and prior. + s->hdr->ref_seq_id = -1; + s->hdr->ref_seq_start = 0; + s->hdr->ref_seq_span = 0; } else { s->hdr->ref_seq_id = c->curr_ref; s->hdr->ref_seq_start = c->first_base; @@ -2625,7 +3115,7 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { c->curr_ref = bam_ref(b); if (c->slice) - cram_update_curr_slice(c); + cram_update_curr_slice(c, fd->version); /* Flush container */ if (c->curr_slice == c->max_slice || @@ -2657,7 +3147,12 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { fd->slices_per_container); if (!c) return NULL; + + pthread_mutex_lock(&fd->ref_lock); + c->no_ref = fd->no_ref; + c->embed_ref = fd->embed_ref; c->record_counter = fd->record_counter; + pthread_mutex_unlock(&fd->ref_lock); c->curr_ref = bam_ref(b); } @@ -2684,9 +3179,16 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { c->s_num_bases = 0; c->n_mapped = 0; + // QO field: 0 implies original orientation, 1 implies sequence orientation + // 1 is often preferable for NovaSeq, but impact is slight. ~0.5% diff. + // Conversely other data sets it's often better than 1% saving for 0. + // Short of trying both and learning, for now we use 0 for V4, 1 for V3. + c->qs_seq_orient = CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : 1; + return c; } + /* * Converts a single bam record into a cram record. * Possibly used within a thread.
@@ -2696,9 +3198,10 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { */ static int process_one_read(cram_fd *fd, cram_container *c, cram_slice *s, cram_record *cr, - bam_seq_t *b, int rnum, kstring_t *MD) { + bam_seq_t *b, int rnum, kstring_t *MD, + int embed_ref, int no_ref) { int i, fake_qual = -1, NM = 0; - char *cp, *rg; + char *cp; char *ref, *seq, *qual; // Any places with N in seq and/or reference can lead to ambiguous @@ -2711,16 +3214,24 @@ static int process_one_read(cram_fd *fd, cram_container *c, // FIXME: multi-ref containers - ref = c->ref; cr->flags = bam_flag(b); cr->len = bam_seq_len(b); - if (!bam_aux_get(b, "MD")) + uint8_t *md; + if (!(md = bam_aux_get(b, "MD"))) MD = NULL; else MD->l = 0; + int cf_tag = 0; + + if (embed_ref == 2) { + cf_tag = MD ? 0 : 1; // No MD + cf_tag |= bam_aux_get(b, "NM") ? 0 : 2; // No NM + } + //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg); + ref = c->ref ? c->ref - (c->ref_start-1) : NULL; cr->ref_id = bam_ref(b); if (cram_stats_add(c->stats[DS_RI], cr->ref_id) < 0) goto block_err; @@ -2729,7 +3240,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, // Non reference based encoding means storing the bases verbatim as features, which in // turn means every base also has a quality already stored. 
- if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3) + if (!no_ref || CRAM_MAJOR_VERS(fd->version) >= 3) cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES; if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3) @@ -2739,7 +3250,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, c->num_bases += cr->len; cr->apos = bam_pos(b)+1; if (c->pos_sorted) { - if (cr->apos < s->last_apos) { + if (cr->apos < s->last_apos && !fd->ap_delta) { c->pos_sorted = 0; } else { if (cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos) < 0) @@ -2769,6 +3280,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, qual = cp = (char *)bam_qual(b); + /* Copy and parse */ if (!(cr->flags & BAM_FUNMAP)) { uint32_t *cig_to, *cig_from; @@ -2806,7 +3318,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n", // cig_len, &ref[apos], cig_len, &seq[spos]); l = 0; - if (!fd->no_ref && cr->len) { + if (!no_ref && cr->len) { int end = cig_len+apos < c->ref_end ? cig_len : c->ref_end - apos; char *sp = &seq[spos]; @@ -2834,6 +3346,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (!sp[l]) break; if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) { +#if 0 // Disabled for the time being as it doesn't // seem to gain us much. int ol=l; @@ -2851,6 +3364,43 @@ static int process_one_read(cram_fd *fd, cram_container *c, qp[l], rp[l])) return -1; } +#else + // With urmap pushed to the limit and lots + // of unaligned data (should be soft-clipped) + // this saves ~2-7%. Worth it? 
+ int nl = l; + int max_end = nl, max_score = 0, score = 0; + while (nl < end) { + if (rp[nl] != sp[nl]) { + score += 3; + if (max_score < score) { + max_score = score; + max_end = nl; + } + } else { + score--; + if (score < -2 || + max_score - score > 7) + break; + } + nl++; + } + if (max_score > 20) { + cram_add_bases(fd, c, s, cr, spos+l, + max_end-l, &seq[spos+l]); + l = max_end-1; + } else { + while (l < nl) { + if (rp[l] != sp[l]) + cram_add_substitution(fd, c, s, + cr, spos+l, + sp[l], qp[l], + rp[l]); + l++; + } + l--; + } +#endif } else { if (cram_add_substitution(fd, c, s, cr, spos+l, sp[l], qp[l], rp[l])) @@ -2863,7 +3413,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, } if (l < cig_len && cr->len) { - if (fd->no_ref) { + if (no_ref) { if (CRAM_MAJOR_VERS(fd->version) == 3) { if (cram_add_bases(fd, c, s, cr, spos, cig_len-l, &seq[spos])) @@ -2922,7 +3472,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (cram_add_insertion(c, s, cr, spos, cig_len, cr->len ? &seq[spos] : NULL)) return -1; - if (fd->no_ref && cr->len) { + if (no_ref && cr->len) { for (l = 0; l < cig_len; l++, spos++) { cram_add_quality(fd, c, s, cr, spos, qual[spos]); } @@ -2938,7 +3488,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, fd->version)) return -1; - if (fd->no_ref && + if (no_ref && !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { if (cr->len) { for (l = 0; l < cig_len; l++, spos++) { @@ -2974,7 +3524,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, return -1; } fake_qual = spos; - cr->aend = fd->no_ref ? apos : MIN(apos, c->ref_end); + cr->aend = no_ref ? 
apos : MIN(apos, c->ref_end); if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0) goto block_err; @@ -2995,20 +3545,19 @@ static int process_one_read(cram_fd *fd, cram_container *c, cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags); int err = 0; - if (CRAM_MAJOR_VERS(fd->version) == 1) - rg = cram_encode_aux_1_0(fd, b, c, s, cr); - else - rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, &err); + sam_hrec_rg_t *brg = + cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, + cf_tag, no_ref, &err); if (err) goto block_err; /* Read group, identified earlier */ - if (rg) { - sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, rg); - cr->rg = brg ? brg->id : -1; + if (brg) { + cr->rg = brg->id; } else if (CRAM_MAJOR_VERS(fd->version) == 1) { sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN"); - assert(brg); + if (!brg) goto block_err; + cr->rg = brg->id; } else { cr->rg = -1; } @@ -3033,7 +3582,19 @@ static int process_one_read(cram_fd *fd, cram_container *c, char *from = (char *)&bam_qual(b)[0]; char *to = &cp[0]; memcpy(to, from, cr->len); - //for (i = 0; i < cr->len; i++) cp[i] = from[i]; + + // Store quality in original orientation for better compression. + if (!c->qs_seq_orient) { + if (cr->flags & BAM_FREVERSE) { + int i, j; + for (i = 0, j = cr->len-1; i < j; i++, j--) { + unsigned char c; + c = to[i]; + to[i] = to[j]; + to[j] = c; + } + } + } } BLOCK_SIZE(s->qual_blk) += cr->len; } else { @@ -3086,11 +3647,6 @@ static int process_one_read(cram_fd *fd, cram_container *c, // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/- // a small amount, if appropriate options set. 
- if ((bam_ins_size(b) && - llabs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) || - (!bam_ins_size(b) && !fd->tlen_zero)) - goto detached; - if ((!fd->tlen_zero && MAX(bam_mate_pos(b)+1, 0) != p->apos) && !(fd->tlen_zero && bam_mate_pos(b) == 0)) goto detached; @@ -3109,10 +3665,6 @@ static int process_one_read(cram_fd *fd, cram_container *c, !(fd->tlen_zero && p->ref_id == -1)) goto detached; - if ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1)) > fd->tlen_approx) || - (!p->tlen && !fd->tlen_zero)) - goto detached; - if (p->mate_pos != cr->apos && !(fd->tlen_zero && p->mate_pos == 0)) goto detached; @@ -3138,6 +3690,29 @@ static int process_one_read(cram_fd *fd, cram_container *c, !((p->cram_flags & CRAM_FLAG_DISCARD_NAME)))) goto detached; + // Now check TLEN. We do this last as sometimes it's the + // only thing that differs. In CRAM4 we have a better way + // of handling this that doesn't break detached status + int explicit_tlen = 0; + int tflag1 = ((bam_ins_size(b) && + llabs(bam_ins_size(b) - sign*(aright-aleft+1)) + > fd->tlen_approx) + || (!bam_ins_size(b) && !fd->tlen_zero)); + + int tflag2 = ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1)) + > fd->tlen_approx) + || (!p->tlen && !fd->tlen_zero)); + + if (tflag1 || tflag2) { + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + explicit_tlen = CRAM_FLAG_EXPLICIT_TLEN; + } else { + // Still do detached for unmapped data in CRAM4 as this + // also impacts RNEXT calculation. + goto detached; + } + } + /* * The fields below are unused when encoding this read as it is * no longer detached. In theory they may get referred to when @@ -3148,7 +3723,9 @@ static int process_one_read(cram_fd *fd, cram_container *c, * not emitted. */ cr->mate_pos = p->apos; - cr->tlen = sign*(aright-aleft+1); + cram_stats_add(c->stats[DS_NP], cr->mate_pos); + cr->tlen = explicit_tlen ?
bam_ins_size(b) : sign*(aright-aleft+1); + cram_stats_add(c->stats[DS_TS], cr->tlen); cr->mate_flags = ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE; @@ -3157,7 +3734,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (p->cram_flags & CRAM_FLAG_STATS_ADDED) { cram_stats_del(c->stats[DS_NP], p->mate_pos); cram_stats_del(c->stats[DS_MF], p->mate_flags); - cram_stats_del(c->stats[DS_TS], p->tlen); + if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN)) + cram_stats_del(c->stats[DS_TS], p->tlen); cram_stats_del(c->stats[DS_NS], p->mate_ref_id); } @@ -3173,6 +3751,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, // Clear detached from cr flags cr->cram_flags &= ~CRAM_FLAG_DETACHED; + cr->cram_flags |= explicit_tlen; if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0) goto block_err; @@ -3183,7 +3762,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, } p->cram_flags &= ~CRAM_FLAG_DETACHED; - p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM; + p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM | explicit_tlen;; if (cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK) < 0) goto block_err; @@ -3261,12 +3840,19 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { if (!fd->ctr) return -1; fd->ctr->record_counter = fd->record_counter; + + pthread_mutex_lock(&fd->ref_lock); + fd->ctr->no_ref = fd->no_ref; + fd->ctr->embed_ref = fd->embed_ref; + pthread_mutex_unlock(&fd->ref_lock); } c = fd->ctr; + int embed_ref = c->embed_ref; + if (!c->slice || c->curr_rec == c->max_rec || (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) || - (c->s_num_bases >= fd->bases_per_slice)) { + (c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice)) { int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1; int curr_ref = c->slice ? 
c->curr_ref : bam_ref(b); @@ -3281,7 +3867,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { */ if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 && fd->last_slice && fd->last_slice < c->max_rec/4+10 && - !fd->embed_ref) { + embed_ref<=0) { if (!c->multi_seq) hts_log_info("Multi-ref enabled for next container"); multi_seq = 1; @@ -3299,11 +3885,11 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { if (CRAM_MAJOR_VERS(fd->version) == 1 || c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice || - c->s_num_bases >= fd->bases_per_slice) { + c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice) { if (NULL == (c = cram_next_container(fd, b))) { if (fd->ctr) { // prevent cram_close attempting to flush - cram_free_container(fd->ctr); + fd->ctr_mt = fd->ctr; // delay free when threading fd->ctr = NULL; } return -1; @@ -3324,7 +3910,28 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { // We detected we need multi-seq fd->multi_seq = 1; c->multi_seq = 1; - c->pos_sorted = 0; // required atm for multi_seq slices + c->pos_sorted = 0; + + // Cram_next_container may end up flushing an existing one and + // triggering fd->embed_ref=2 if no reference is found. + // Embedded refs are incompatible with multi-seq, so we bail + // out and switch to no_ref in this scenario. We do this + // within the container only, as multi_seq may be temporary + // and we switch back away from it again. + pthread_mutex_lock(&fd->ref_lock); + if (fd->embed_ref > 0 && c->curr_rec == 0 && c->curr_slice == 0) { + hts_log_warning("Changing from embed_ref to no_ref mode"); + // Should we update fd->embed_ref and no_ref here too? + // Doing so means if we go into multi-seq and back out + // again, e.g. due to a cluster of tiny refs in the middle of + // much larger ones, then we bake in no-ref mode. + // + // However for unsorted data we're realistically not + // going to switch back. + c->embed_ref = fd->embed_ref = 0; // or -1 for auto?
+ c->no_ref = fd->no_ref = 1; + } + pthread_mutex_unlock(&fd->ref_lock); if (!c->refs_used) { pthread_mutex_lock(&fd->ref_lock); @@ -3339,8 +3946,8 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { c->slice_rec = c->curr_rec; // Have we seen this reference before? - if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref && - !fd->unsorted && multi_seq) { + if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref && + embed_ref<=0 && !fd->unsorted && multi_seq) { if (!c->refs_used) { pthread_mutex_lock(&fd->ref_lock); @@ -3351,8 +3958,8 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { } else if (c->refs_used && c->refs_used[bam_ref(b)]) { pthread_mutex_lock(&fd->ref_lock); fd->unsorted = 1; - pthread_mutex_unlock(&fd->ref_lock); fd->multi_seq = 1; + pthread_mutex_unlock(&fd->ref_lock); } } @@ -3390,6 +3997,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { c->curr_rec++; c->curr_c_rec++; c->s_num_bases += bam_seq_len(b); + c->s_aux_bytes += bam_get_l_aux(b); c->n_mapped += (bam_flag(b) & BAM_FUNMAP) ? 0 : 1; fd->record_counter++; diff --git a/cram/cram_encode.h b/cram/cram_encode.h index c779b46a7..03b8054e8 100644 --- a/cram/cram_encode.h +++ b/cram/cram_encode.h @@ -74,7 +74,8 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b); * NULL on failure */ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, - cram_block_compression_hdr *h); + cram_block_compression_hdr *h, + int embed_ref); /*! INTERNAL: * Encodes a slice compression header. @@ -106,7 +107,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c); * * See cram_next_container() and cram_close(). 
*/ -void cram_update_curr_slice(cram_container *c); +void cram_update_curr_slice(cram_container *c, int version); #ifdef __cplusplus } diff --git a/cram/cram_external.c b/cram/cram_external.c index d0fd48cc8..7455185ad 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2015, 2018-2019 Genome Research Ltd. +Copyright (c) 2015, 2018-2020, 2022-2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -40,6 +40,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include +#include + +#if defined(HAVE_EXTERNAL_LIBHTSCODECS) +#include +#else +#include "../htscodecs/htscodecs/rANS_static4x16.h" +#endif #include "../htslib/hfile.h" #include "cram.h" @@ -82,6 +89,14 @@ void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) { c->num_blocks = num_blocks; } +int32_t cram_container_get_num_records(cram_container *c) { + return c->num_records; +} + +int64_t cram_container_get_num_bases(cram_container *c) { + return c->num_bases; +} + /* Returns the landmarks[] array and the number of elements * in num_landmarks. 
@@ -180,6 +195,294 @@ int cram_block_compression_hdr_decoder2encoder(cram_fd *fd, return 0; } +typedef struct { + cram_block_compression_hdr *hdr; + cram_map *curr_map; + int idx; + int is_tag; // phase 2 using tag_encoding_map +} cram_codec_iter; + +static void cram_codec_iter_init(cram_block_compression_hdr *hdr, + cram_codec_iter *iter) { + iter->hdr = hdr; + iter->curr_map = NULL; + iter->idx = 0; + iter->is_tag = 0; +} + +// See enum cram_DS_ID in cram/cram_structs +static int cram_ds_to_key(enum cram_DS_ID ds) { + switch(ds) { + case DS_RN: return 256*'R'+'N'; + case DS_QS: return 256*'Q'+'S'; + case DS_IN: return 256*'I'+'N'; + case DS_SC: return 256*'S'+'C'; + case DS_BF: return 256*'B'+'F'; + case DS_CF: return 256*'C'+'F'; + case DS_AP: return 256*'A'+'P'; + case DS_RG: return 256*'R'+'G'; + case DS_MQ: return 256*'M'+'Q'; + case DS_NS: return 256*'N'+'S'; + case DS_MF: return 256*'M'+'F'; + case DS_TS: return 256*'T'+'S'; + case DS_NP: return 256*'N'+'P'; + case DS_NF: return 256*'N'+'F'; + case DS_RL: return 256*'R'+'L'; + case DS_FN: return 256*'F'+'N'; + case DS_FC: return 256*'F'+'C'; + case DS_FP: return 256*'F'+'P'; + case DS_DL: return 256*'D'+'L'; + case DS_BA: return 256*'B'+'A'; + case DS_BS: return 256*'B'+'S'; + case DS_TL: return 256*'T'+'L'; + case DS_RI: return 256*'R'+'I'; + case DS_RS: return 256*'R'+'S'; + case DS_PD: return 256*'P'+'D'; + case DS_HC: return 256*'H'+'C'; + case DS_BB: return 256*'B'+'B'; + case DS_QQ: return 256*'Q'+'Q'; + case DS_TN: return 256*'T'+'N'; + case DS_TC: return 256*'T'+'C'; + case DS_TM: return 256*'T'+'M'; + case DS_TV: return 256*'T'+'V'; + default: break; + } + + return -1; // unknown +} + +static cram_codec *cram_codec_iter_next(cram_codec_iter *iter, + int *key) { + cram_codec *cc = NULL; + cram_block_compression_hdr *hdr = iter->hdr; + + if (!iter->is_tag) { + // 1: Iterating through main data-series + do { + cc = hdr->codecs[iter->idx++]; + } while(!cc && iter->idx < DS_END); + if (cc) { + *key = 
cram_ds_to_key(iter->idx-1); + return cc; + } + + // Reset index for phase 2 + iter->idx = 0; + iter->is_tag = 1; + } + + do { + if (!iter->curr_map) + iter->curr_map = hdr->tag_encoding_map[iter->idx++]; + + cc = iter->curr_map ? iter->curr_map->codec : NULL; + if (cc) { + *key = iter->curr_map->key; + iter->curr_map = iter->curr_map->next; + return cc; + } + } while (iter->idx <= CRAM_MAP_HASH); + + // End of codecs + return NULL; +} + +/* + * A list of data-series, used to create a linked list threaded through + * a single array. + */ +typedef struct ds_list { + int data_series; + int next; +} ds_list; + +KHASH_MAP_INIT_INT(cid, int64_t) + +// Opaque struct for the CRAM block content-id -> data-series map. +struct cram_cid2ds_t { + ds_list *ds; // array of data-series with linked lists threading through it + int ds_size; + int ds_idx; + khash_t(cid) *hash; // key=content_id, value=index to ds array + int *ds_a; // serialised array of data-series returned by queries. +}; + +void cram_cid2ds_free(cram_cid2ds_t *cid2ds) { + if (cid2ds) { + if (cid2ds->hash) + kh_destroy(cid, cid2ds->hash); + free(cid2ds->ds); + free(cid2ds->ds_a); + free(cid2ds); + } +} + +/* + * Map cram block numbers to data-series. It's normally a 1:1 mapping, + * but in rare cases it can be 1:many (or even many:many). + * The key is the block number and the value is an index into the data-series + * array, which we iterate over until reaching a negative value. + * + * Provide cid2ds as NULL to allocate a new map or pass in an existing one + * to append to this map. The new (or existing) map is returned. + * + * Returns the cid2ds (newly allocated or as provided) on success, + * NULL on failure. + */ +cram_cid2ds_t *cram_update_cid2ds_map(cram_block_compression_hdr *hdr, + cram_cid2ds_t *cid2ds) { + cram_cid2ds_t *c2d = cid2ds; + if (!c2d) { + c2d = calloc(1, sizeof(*c2d)); + if (!c2d) + return NULL; + + c2d->hash = kh_init(cid); + if (!c2d->hash) + goto err; + } + + // Iterate through codecs. 
Initially primary two-letter ones in + // rec_encoding_map, and then the three-letter ones in tag_encoding_map. + cram_codec_iter citer; + cram_codec_iter_init(hdr, &citer); + cram_codec *codec; + int key; + + while ((codec = cram_codec_iter_next(&citer, &key))) { + // Having got a codec, we can then use cram_codec_to_id to get + // the block IDs utilised by that codec. This is then our + // map for allocating data blocks to data series, but for shared + // blocks we can't separate out how much is used by each DS. + int bnum[2]; + cram_codec_get_content_ids(codec, bnum); + + khiter_t k; + int ret, i; + for (i = 0; i < 2; i++) { + if (bnum[i] > -2) { + k = kh_put(cid, c2d->hash, bnum[i], &ret); + if (ret < 0) + goto err; + + if (c2d->ds_idx >= c2d->ds_size) { + c2d->ds_size += 100; + c2d->ds_size *= 2; + ds_list *ds_new = realloc(c2d->ds, + c2d->ds_size * sizeof(*ds_new)); + if (!ds_new) + goto err; + c2d->ds = ds_new; + } + + if (ret == 0) { + // Shared content_id, so add to list of DS + + // Maybe data-series should be part of the hash key? + // + // So top-32 bit is content-id, bot-32 bit is key. + // Sort hash by key and then can group all the data-series + // known together. ?? + // + // Brute force for now, scan to see if recorded. + // Typically this is minimal effort as we almost always + // have 1 data-series per block content-id, so the list to + // search is of size 1.
+ int dsi = kh_value(c2d->hash, k); + while (dsi >= 0) { + if (c2d->ds[dsi].data_series == key) + break; + dsi = c2d->ds[dsi].next; + } + + if (dsi == -1) { + // Block content_id seen before, but not with this DS + c2d->ds[c2d->ds_idx].data_series = key; + c2d->ds[c2d->ds_idx].next = kh_value(c2d->hash, k); + kh_value(c2d->hash, k) = c2d->ds_idx; + c2d->ds_idx++; + } + } else { + // First time this content id has been used + c2d->ds[c2d->ds_idx].data_series = key; + c2d->ds[c2d->ds_idx].next = -1; + kh_value(c2d->hash, k) = c2d->ds_idx; + c2d->ds_idx++; + } + } + } + } + + return c2d; + + err: + if (c2d != cid2ds) + cram_cid2ds_free(c2d); + return NULL; +} + +/* + * Return a list of data series observed as belonging to a block with + * the specified content_id. *n is the number of data series + * returned, or 0 if block is unused. + * Block content_id of -1 is used to indicate the CORE block. + * + * The pointer returned is owned by the cram_cid2ds state and should + * not be freed by the caller. + */ +int *cram_cid2ds_query(cram_cid2ds_t *c2d, int content_id, int *n) { + *n = 0; + if (!c2d || !c2d->hash) + return NULL; + + khiter_t k = kh_get(cid, c2d->hash, content_id); + if (k == kh_end(c2d->hash)) + return NULL; + + if (!c2d->ds_a) { + c2d->ds_a = malloc(c2d->ds_idx * sizeof(int)); + if (!c2d->ds_a) + return NULL; + } + + int dsi = kh_value(c2d->hash, k); // initial ds array index from hash + int idx = 0; + while (dsi >= 0) { + c2d->ds_a[idx++] = c2d->ds[dsi].data_series; + dsi = c2d->ds[dsi].next; // iterate over list within ds array + } + + *n = idx; + return c2d->ds_a; +} + +/* + * Produces a description of the record and tag encodings held within + * a compression header and appends to 'ks'. + * + * Returns 0 on success, + * <0 on failure. 
+ */ +int cram_describe_encodings(cram_block_compression_hdr *hdr, kstring_t *ks) { + cram_codec_iter citer; + cram_codec_iter_init(hdr, &citer); + cram_codec *codec; + int key, r = 0; + + while ((codec = cram_codec_iter_next(&citer, &key))) { + char key_s[4] = {0}; + int key_i = 0; + if (key>>16) key_s[key_i++] = key>>16; + key_s[key_i++] = (key>>8)&0xff; + key_s[key_i++] = key&0xff; + r |= ksprintf(ks, "\t%s\t", key_s) < 0; + r |= cram_codec_describe(codec, ks) < 0; + r |= kputc('\n', ks) < 0; + } + + return r ? -1 : 0; +} + /* *----------------------------------------------------------------------------- * cram_slice @@ -188,17 +491,35 @@ int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr) { return hdr->num_blocks; } +int cram_slice_hdr_get_embed_ref_id(cram_block_slice_hdr *h) { + return h->ref_base_id; +} + +void cram_slice_hdr_get_coords(cram_block_slice_hdr *h, + int *refid, hts_pos_t *start, hts_pos_t *span) { + if (refid) + *refid = h->ref_seq_id; + if (start) + *start = h->ref_seq_start; + if (span) + *span = h->ref_seq_span; +} /* *----------------------------------------------------------------------------- * cram_block */ -int32_t cram_block_get_content_id(cram_block *b) { return b->content_id; } +int32_t cram_block_get_content_id(cram_block *b) { + return b->content_type == CORE ? 
-1 : b->content_id; +} int32_t cram_block_get_comp_size(cram_block *b) { return b->comp_size; } int32_t cram_block_get_uncomp_size(cram_block *b) { return b->uncomp_size; } int32_t cram_block_get_crc32(cram_block *b) { return b->crc32; } void * cram_block_get_data(cram_block *b) { return BLOCK_DATA(b); } int32_t cram_block_get_size(cram_block *b) { return BLOCK_SIZE(b); } +enum cram_block_method cram_block_get_method(cram_block *b) { + return (enum cram_block_method)b->orig_method; +} enum cram_content_type cram_block_get_content_type(cram_block *b) { return b->content_type; } @@ -223,6 +544,122 @@ void cram_block_update_size(cram_block *b) { BLOCK_UPLEN(b); } size_t cram_block_get_offset(cram_block *b) { return BLOCK_SIZE(b); } void cram_block_set_offset(cram_block *b, size_t offset) { BLOCK_SIZE(b) = offset; } +/* + * Given a compressed block of data in a specified compression method, + * fill out the 'cm' field with meta-data gleaned from the compressed + * block. + * + * If comp is CRAM_COMP_UNKNOWN, we attempt to auto-detect the compression + * format, but this doesn't work for all methods. + * + * Returns a newly allocated cram_method_details recording the detected + * or specified comp method, or NULL if the allocation fails. + */ +cram_method_details *cram_expand_method(uint8_t *data, int32_t size, + enum cram_block_method comp) { + cram_method_details *cm = calloc(1, sizeof(*cm)); + if (!cm) + return NULL; + + const char *xz_header = "\xFD""7zXZ"; // including nul + + if (comp == CRAM_COMP_UNKNOWN) { + // Auto-detect + if (size > 1 && data[0] == 0x1f && data[1] == 0x8b) + comp = CRAM_COMP_GZIP; + else if (size > 3 && data[1] == 'B' && data[2] == 'Z' + && data[3] == 'h') + comp = CRAM_COMP_BZIP2; + else if (size > 6 && memcmp(xz_header, data, 6) == 0) + comp = CRAM_COMP_LZMA; + else + comp = CRAM_COMP_UNKNOWN; + } + cm->method = comp; + + // Interrogate the compressed data stream to fill out additional fields.
+ switch (comp) { + case CRAM_COMP_GZIP: + if (size > 8) { + if (data[8] == 4) + cm->level = 1; + else if (data[8] == 2) + cm->level = 9; + else + cm->level = 5; + } + break; + + case CRAM_COMP_BZIP2: + if (size > 3 && data[3] >= '1' && data[3] <= '9') + cm->level = data[3]-'0'; + break; + + case CRAM_COMP_RANS4x8: + cm->Nway = 4; + if (size > 0 && data[0] == 1) + cm->order = 1; + else + cm->order = 0; + break; + + case CRAM_COMP_RANSNx16: + if (size > 0) { + cm->order = data[0] & 1; + cm->Nway = data[0] & RANS_ORDER_X32 ? 32 : 4; + cm->rle = data[0] & RANS_ORDER_RLE ? 1 : 0; + cm->pack = data[0] & RANS_ORDER_PACK ? 1 : 0; + cm->cat = data[0] & RANS_ORDER_CAT ? 1 : 0; + cm->stripe = data[0] & RANS_ORDER_STRIPE ? 1 : 0; + cm->nosz = data[0] & RANS_ORDER_NOSZ ? 1 : 0; + } + break; + + case CRAM_COMP_ARITH: + if (size > 0) { + // Not in a public header, but the same transforms as rANSNx16 + cm->order = data[0] & 3; + cm->rle = data[0] & RANS_ORDER_RLE ? 1 : 0; + cm->pack = data[0] & RANS_ORDER_PACK ? 1 : 0; + cm->cat = data[0] & RANS_ORDER_CAT ? 1 : 0; + cm->stripe = data[0] & RANS_ORDER_STRIPE ? 1 : 0; + cm->nosz = data[0] & RANS_ORDER_NOSZ ? 1 : 0; + cm->ext = data[0] & 4 /*external*/ ? 1 : 0; + } + break; + + case CRAM_COMP_TOK3: + if (size > 8) { + if (data[8] == 1) + cm->level = 11; + else if (data[8] == 0) + cm->level = 1; + } + break; + + default: + break; + } + + return cm; +} + +/* + *----------------------------------------------------------------------------- + * cram_codecs + */ + +// -2 is unused. 
+// -1 is CORE +// >= 0 is the block with that Content ID +void cram_codec_get_content_ids(cram_codec *c, int ids[2]) { + ids[0] = cram_codec_to_id(c, &ids[1]); +} + +/* + *----------------------------------------------------------------------------- + * Utility functions + */ /* * Copies the blocks representing the next num_slice slices from a @@ -318,7 +755,7 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out, return -1; if (cram_block_compression_hdr_decoder2encoder(in, ch) != 0) return -1; - n_blk = cram_encode_compression_header(in, c, ch); + n_blk = cram_encode_compression_header(in, c, ch, in->embed_ref); cram_free_compression_header(ch); /* @@ -333,15 +770,17 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out, char *op = cp; char *endp = cp + cram_block_get_uncomp_size(o_blk); //fprintf(stderr, "sz = %d\n", (int)(endp-cp)); - int32_t i32; + int32_t i32, err = 0; - cp += safe_itf8_get(cp, endp, &i32); + i32 = in->vv.varint_get32(&cp, endp, &err); cp += i32; - cp += safe_itf8_get(cp, endp, &i32); + i32 = in->vv.varint_get32(&cp, endp, &err); cp += i32; op = cp; - cp += safe_itf8_get(cp, endp, &i32); + i32 = in->vv.varint_get32(&cp, endp, &err); i32 += (cp-op); + if (err) + return -2; //fprintf(stderr, "remaining %d bytes\n", i32); cram_block_set_size(n_blk, cram_block_get_size(n_blk)-2); diff --git a/cram/cram_index.c b/cram/cram_index.c index b567d3e05..b775e9431 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2013-2020 Genome Research Ltd. +Copyright (c) 2013-2020, 2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -72,7 +72,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void dump_index_(cram_index *e, int level) { int i, n; n = printf("%*s%d / %d .. 
%d, ", level*4, "", e->refid, e->start, e->end); - printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset); + printf("%*soffset %"PRId64" %p %p\n", MAX(0,50-n), "", e->offset, e, e->e_next); for (i = 0; i < e->nslice; i++) { dump_index_(&e->e[i], level+1); } @@ -86,6 +86,37 @@ static void dump_index(cram_fd *fd) { } #endif +// Thread a linked list through the nested containment list. +// This makes navigating it and finding the "next" index entry +// trivial. +static cram_index *link_index_(cram_index *e, cram_index *e_last) { + int i; + if (e_last) + e_last->e_next = e; + + // We don't want to link in the top-level cram_index with + // offset=0 and start/end = INT_MIN/INT_MAX. + if (e->offset) + e_last = e; + + for (i = 0; i < e->nslice; i++) + e_last = link_index_(&e->e[i], e_last); + + return e_last; +} + +static void link_index(cram_fd *fd) { + int i; + cram_index *e_last = NULL; + + for (i = 0; i < fd->index_sz; i++) { + e_last = link_index_(&fd->index[i], e_last); + } + + if (e_last) + e_last->e_next = NULL; +} + static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) { int sign = 1; int32_t val = 0; @@ -173,6 +204,11 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { idx_stack[idx_stack_ptr] = idx; + // Support pathX.cram##idx##pathY.crai + const char *fn_delim = strstr(fn, HTS_IDX_DELIM); + if (fn_delim && !fn_idx) + fn_idx = fn_delim + strlen(HTS_IDX_DELIM); + if (!fn_idx) { if (hts_idx_check_local(fn, HTS_FMT_CRAI, &tfn_idx) == 0 && hisremote(fn)) tfn_idx = hts_idx_getfn(fn, ".crai"); @@ -270,7 +306,8 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { idx_stack[(idx_stack_ptr = 0)] = idx; } - while (!(e.start >= idx->start && e.end <= idx->end) || idx->end == 0) { + while (!(e.start >= idx->start && e.end <= idx->end) || + (idx->start == 0 && idx->refid == -1)) { idx = idx_stack[--idx_stack_ptr]; } @@ -308,7 +345,10 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { 
free(kstr.s); free(tfn_idx); - // dump_index(fd); + // Convert NCList to linear linked list + link_index(fd); + + //dump_index(fd); return 0; @@ -351,7 +391,7 @@ void cram_index_free(cram_fd *fd) { * entries, but we require at least one per reference.) * * If the index finds multiple slices overlapping this position we - * return the first one only. Subsequent calls should specifying + * return the first one only. Subsequent calls should specify * "from" as the last slice we checked to find the next one. Otherwise * set "from" to be NULL to find the first one. * @@ -366,6 +406,17 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos, int i, j, k; cram_index *e; + if (from) { + // Continue from a previous search. + // We switch to just scanning the linked list, as the nested + // lists are typically short. + e = from->e_next; + if (e && e->refid == refid && e->start <= pos) + return e; + else + return NULL; + } + switch(refid) { case HTS_IDX_NONE: case HTS_IDX_REST: @@ -395,8 +446,7 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos, return NULL; } - if (!from) - from = &fd->index[refid+1]; + from = &fd->index[refid+1]; // Ref with nothing aligned against it. if (!from->e) @@ -461,55 +511,42 @@ cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) { slice = fd->index[refid+1].nslice - 1; - return &from->e[slice]; + // e is the last entry in the nested containment list, but it may + // contain further slices within it. + cram_index *e = &from->e[slice]; + while (e->e_next) + e = e->e_next; + + return e; } +/* + * Find the last container overlapping pos 'end', and the file offset of + * its end (equivalent to the start offset of the container following it). 
+ */ cram_index *cram_index_query_last(cram_fd *fd, int refid, hts_pos_t end) { - cram_index *first = cram_index_query(fd, refid, end, NULL); - cram_index *last = cram_index_last(fd, refid, NULL); - if (!first || !last) - return NULL; - - while (first < last && (first+1)->start <= end) - first++; + cram_index *e = NULL, *prev_e; + do { + prev_e = e; + e = cram_index_query(fd, refid, end, prev_e); + } while (e); - while (first->e) { - int count = 0; - int nslices = first->nslice; - first = first->e; - while (++count < nslices && (first+1)->start <= end) - first++; - } + if (!prev_e) + return NULL; + e = prev_e; - // Compute the start location of next container. + // Note: offset of e and e->e_next may be the same if we're using a + // multi-ref container where a single container generates multiple + // index entries. // - // This is useful for stitching containers together in the multi-region - // iterator. Sadly we can't compute this from the single index line. - // - // Note we can have neighbouring index entries at the same location - // for when we have multi-reference mode and/or multiple slices per - // container. - cram_index *next = first; + // We need to keep iterating until offset differs in order to find + // the genuine file offset for the end of container. do { - if (next >= last) { - // Next non-empty reference - while (++refid+1 < fd->index_sz) - if (fd->index[refid+1].nslice) - break; - if (refid+1 >= fd->index_sz) { - next = NULL; - } else { - next = fd->index[refid+1].e; - last = fd->index[refid+1].e + fd->index[refid+1].nslice; - } - } else { - next++; - } - } while (next && next->offset == first->offset); - - first->next = next ? 
next->offset : 0; + prev_e = e; + e = e->e_next; + } while (e && e->offset == prev_e->offset); - return first; + return prev_e; } /* @@ -625,9 +662,10 @@ static int cram_index_build_multiref(cram_fd *fd, } if (ref != -2) { - sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", - ref, ref_start, ref_end - ref_start + 1, - (int64_t)cpos, landmark, sz); + snprintf(buf, sizeof(buf), + "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", + ref, ref_start, ref_end - ref_start + 1, + (int64_t)cpos, landmark, sz); if (bgzf_write(fp, buf, strlen(buf)) < 0) return -4; } @@ -638,9 +676,10 @@ static int cram_index_build_multiref(cram_fd *fd, } if (ref != -2) { - sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", - ref, ref_start, ref_end - ref_start + 1, - (int64_t)cpos, landmark, sz); + snprintf(buf, sizeof(buf), + "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", + ref, ref_start, ref_end - ref_start + 1, + (int64_t)cpos, landmark, sz); if (bgzf_write(fp, buf, strlen(buf)) < 0) return -4; } @@ -670,9 +709,10 @@ int cram_index_slice(cram_fd *fd, if (s->hdr->ref_seq_id == -2) { ret = cram_index_build_multiref(fd, c, s, fp, cpos, spos, sz); } else { - sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", - s->hdr->ref_seq_id, s->hdr->ref_seq_start, - s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz); + snprintf(buf, sizeof(buf), + "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", + s->hdr->ref_seq_id, s->hdr->ref_seq_start, + s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz); ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4; } diff --git a/cram/cram_io.c b/cram/cram_io.c index 1904fa6cb..20e0a6555 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020 Genome Research Ltd. +Copyright (c) 2012-2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -69,11 +69,30 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define crc32(a,b,c) libdeflate_crc32((a),(b),(c)) #endif +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +#include "../fuzz_settings.h" +#endif + #include "cram.h" #include "os.h" #include "../htslib/hts.h" #include "open_trace_file.h" -#include "rANS_static.h" + +#if defined(HAVE_EXTERNAL_LIBHTSCODECS) +#include +#include +#include +#include +#include +#include // CRAM v4.0 variable-size integers +#else +#include "../htscodecs/htscodecs/rANS_static.h" +#include "../htscodecs/htscodecs/rANS_static4x16.h" +#include "../htscodecs/htscodecs/arith_dynamic.h" +#include "../htscodecs/htscodecs/tokenise_name3.h" +#include "../htscodecs/htscodecs/fqzcomp_qual.h" +#include "../htscodecs/htscodecs/varint.h" +#endif //#define REF_DEBUG @@ -95,9 +114,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PATH_MAX FILENAME_MAX #endif -#define TRIAL_SPAN 50 +#define TRIAL_SPAN 70 #define NTRIALS 3 +#define CRAM_DEFAULT_LEVEL 5 /* ---------------------------------------------------------------------- * ITF8 encoding and decoding. @@ -247,6 +267,112 @@ int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) { return 5; } +/* + * Stores a value to memory in ITF-8 format. + * + * Returns the number of bytes required to store the number. + * This is a maximum of 5 bytes. 
+ */ +static inline int itf8_put(char *cp, int32_t val) { + unsigned char *up = (unsigned char *)cp; + if (!(val & ~0x00000007f)) { // 1 byte + *up = val; + return 1; + } else if (!(val & ~0x00003fff)) { // 2 byte + *up++ = (val >> 8 ) | 0x80; + *up = val & 0xff; + return 2; + } else if (!(val & ~0x01fffff)) { // 3 byte + *up++ = (val >> 16) | 0xc0; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 3; + } else if (!(val & ~0x0fffffff)) { // 4 byte + *up++ = (val >> 24) | 0xe0; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 4; + } else { // 5 byte + *up++ = 0xf0 | ((val>>28) & 0xff); + *up++ = (val >> 20) & 0xff; + *up++ = (val >> 12) & 0xff; + *up++ = (val >> 4 ) & 0xff; + *up = val & 0x0f; + return 5; + } +} + + +/* 64-bit itf8 variant */ +static inline int ltf8_put(char *cp, int64_t val) { + unsigned char *up = (unsigned char *)cp; + if (!(val & ~((1LL<<7)-1))) { + *up = val; + return 1; + } else if (!(val & ~((1LL<<(6+8))-1))) { + *up++ = (val >> 8 ) | 0x80; + *up = val & 0xff; + return 2; + } else if (!(val & ~((1LL<<(5+2*8))-1))) { + *up++ = (val >> 16) | 0xc0; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 3; + } else if (!(val & ~((1LL<<(4+3*8))-1))) { + *up++ = (val >> 24) | 0xe0; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 4; + } else if (!(val & ~((1LL<<(3+4*8))-1))) { + *up++ = (val >> 32) | 0xf0; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 5; + } else if (!(val & ~((1LL<<(2+5*8))-1))) { + *up++ = (val >> 40) | 0xf8; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 6; + } else if (!(val & ~((1LL<<(1+6*8))-1))) { + *up++ = (val >> 48) | 0xfc; + *up++ = (val >> 40) & 0xff; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 
8 ) & 0xff; + *up = val & 0xff; + return 7; + } else if (!(val & ~((1LL<<(7*8))-1))) { + *up++ = (val >> 56) | 0xfe; + *up++ = (val >> 48) & 0xff; + *up++ = (val >> 40) & 0xff; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 8; + } else { + *up++ = 0xff; + *up++ = (val >> 56) & 0xff; + *up++ = (val >> 48) & 0xff; + *up++ = (val >> 40) & 0xff; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 9; + } +} + /* * Encodes and writes a single integer in ITF-8 format. * Returns 0 on success @@ -514,6 +640,361 @@ int ltf8_put_blk(cram_block *blk, int64_t val) { return -1; } +static int64_t safe_itf8_get(char **cp, const char *endp, int *err) { + const unsigned char *up = (unsigned char *)*cp; + + if (endp && endp - *cp < 5 && + (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) { + if (err) *err = 1; + return 0; + } + + if (up[0] < 0x80) { + (*cp)++; + return up[0]; + } else if (up[0] < 0xc0) { + (*cp)+=2; + return ((up[0] <<8) | up[1]) & 0x3fff; + } else if (up[0] < 0xe0) { + (*cp)+=3; + return ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff; + } else if (up[0] < 0xf0) { + (*cp)+=4; + uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff; + return (int32_t)uv; + } else { + (*cp)+=5; + uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f); + return (int32_t)uv; + } +} + +static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) { + unsigned char *up = (unsigned char *)*cp; + + if (endp && endp - *cp < 9 && + (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) { + if (err) *err = 1; + return 0; + } + + if (up[0] < 0x80) { + (*cp)++; + return up[0]; + } else if (up[0] < 0xc0) { + (*cp)+=2; + return (((uint64_t)up[0]<< 8) | + (uint64_t)up[1]) & (((1LL<<(6+8)))-1); + } else if (up[0] < 0xe0) { + 
(*cp)+=3; + return (((uint64_t)up[0]<<16) | + ((uint64_t)up[1]<< 8) | + (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); + } else if (up[0] < 0xf0) { + (*cp)+=4; + return (((uint64_t)up[0]<<24) | + ((uint64_t)up[1]<<16) | + ((uint64_t)up[2]<< 8) | + (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); + } else if (up[0] < 0xf8) { + (*cp)+=5; + return (((uint64_t)up[0]<<32) | + ((uint64_t)up[1]<<24) | + ((uint64_t)up[2]<<16) | + ((uint64_t)up[3]<< 8) | + (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); + } else if (up[0] < 0xfc) { + (*cp)+=6; + return (((uint64_t)up[0]<<40) | + ((uint64_t)up[1]<<32) | + ((uint64_t)up[2]<<24) | + ((uint64_t)up[3]<<16) | + ((uint64_t)up[4]<< 8) | + (uint64_t)up[5]) & ((1LL<<(2+5*8))-1); + } else if (up[0] < 0xfe) { + (*cp)+=7; + return (((uint64_t)up[0]<<48) | + ((uint64_t)up[1]<<40) | + ((uint64_t)up[2]<<32) | + ((uint64_t)up[3]<<24) | + ((uint64_t)up[4]<<16) | + ((uint64_t)up[5]<< 8) | + (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); + } else if (up[0] < 0xff) { + (*cp)+=8; + return (((uint64_t)up[1]<<48) | + ((uint64_t)up[2]<<40) | + ((uint64_t)up[3]<<32) | + ((uint64_t)up[4]<<24) | + ((uint64_t)up[5]<<16) | + ((uint64_t)up[6]<< 8) | + (uint64_t)up[7]) & ((1LL<<(7*8))-1); + } else { + (*cp)+=9; + return (((uint64_t)up[1]<<56) | + ((uint64_t)up[2]<<48) | + ((uint64_t)up[3]<<40) | + ((uint64_t)up[4]<<32) | + ((uint64_t)up[5]<<24) | + ((uint64_t)up[6]<<16) | + ((uint64_t)up[7]<< 8) | + (uint64_t)up[8]); + } +} + +// Wrapper for now +static int safe_itf8_put(char *cp, char *cp_end, int32_t val) { + return itf8_put(cp, val); +} + +static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) { + return ltf8_put(cp, val); +} + +static int itf8_size(int64_t v) { + return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5); +} + +//----------------------------------------------------------------------------- + +// CRAM v4.0 onwards uses a different variable sized integer encoding +// that is size agnostic. 
+ +// Local interface to varint.h inline version, so we can use in func ptr. +// Note a lot of these use the unsigned interface but take signed int64_t. +// This is because the old CRAM ITF8 inteface had signed -1 as unsigned +// 0xffffffff. +static int uint7_size(int64_t v) { + return var_size_u64(v); +} + +static int64_t uint7_get_32(char **cp, const char *endp, int *err) { + uint32_t val; + int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val); + (*cp) += nb; + if (!nb && err) *err = 1; + return val; +} + +static int64_t sint7_get_32(char **cp, const char *endp, int *err) { + int32_t val; + int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val); + (*cp) += nb; + if (!nb && err) *err = 1; + return val; +} + +static int64_t uint7_get_64(char **cp, const char *endp, int *err) { + uint64_t val; + int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val); + (*cp) += nb; + if (!nb && err) *err = 1; + return val; +} + +static int64_t sint7_get_64(char **cp, const char *endp, int *err) { + int64_t val; + int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val); + (*cp) += nb; + if (!nb && err) *err = 1; + return val; +} + +static int uint7_put_32(char *cp, char *endp, int32_t val) { + return var_put_u32((uint8_t *)cp, (uint8_t *)endp, val); +} + +static int sint7_put_32(char *cp, char *endp, int32_t val) { + return var_put_s32((uint8_t *)cp, (uint8_t *)endp, val); +} + +static int uint7_put_64(char *cp, char *endp, int64_t val) { + return var_put_u64((uint8_t *)cp, (uint8_t *)endp, val); +} + +static int sint7_put_64(char *cp, char *endp, int64_t val) { + return var_put_s64((uint8_t *)cp, (uint8_t *)endp, val); +} + +// Put direct to to cram_block +static int uint7_put_blk_32(cram_block *blk, int32_t v) { + uint8_t buf[10]; + int sz = var_put_u32(buf, buf+10, v); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; +} + +static int sint7_put_blk_32(cram_block *blk, int32_t v) { + uint8_t buf[10]; + int sz 
= var_put_s32(buf, buf+10, v); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; +} + +static int uint7_put_blk_64(cram_block *blk, int64_t v) { + uint8_t buf[10]; + int sz = var_put_u64(buf, buf+10, v); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; +} + +static int sint7_put_blk_64(cram_block *blk, int64_t v) { + uint8_t buf[10]; + int sz = var_put_s64(buf, buf+10, v); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; +} + +// Decode 32-bits with CRC update from cram_fd +static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) { + uint8_t b[5], i = 0; + int c; + uint32_t v = 0; + +#ifdef VARINT2 + b[0] = hgetc(fd->fp); + if (b[0] < 177) { + } else if (b[0] < 241) { + b[1] = hgetc(fd->fp); + } else if (b[0] < 249) { + b[1] = hgetc(fd->fp); + b[2] = hgetc(fd->fp); + } else { + int n = b[0]+2, z = 1; + while (n-- >= 249) + b[z++] = hgetc(fd->fp); + } + i = var_get_u32(b, NULL, &v); +#else +// // Little endian +// int s = 0; +// do { +// b[i++] = c = hgetc(fd->fp); +// if (c < 0) +// return -1; +// v |= (c & 0x7f) << s; +// s += 7; +// } while (i < 5 && (c & 0x80)); + + // Big endian, see also htscodecs/varint.h + do { + b[i++] = c = hgetc(fd->fp); + if (c < 0) + return -1; + v = (v<<7) | (c & 0x7f); + } while (i < 5 && (c & 0x80)); +#endif + *crc = crc32(*crc, b, i); + + *val_p = v; + return i; +} + +// Decode 32-bits with CRC update from cram_fd +static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) { + uint8_t b[5], i = 0; + int c; + uint32_t v = 0; + +#ifdef VARINT2 + b[0] = hgetc(fd->fp); + if (b[0] < 177) { + } else if (b[0] < 241) { + b[1] = hgetc(fd->fp); + } else if (b[0] < 249) { + b[1] = hgetc(fd->fp); + b[2] = hgetc(fd->fp); + } else { + int n = b[0]+2, z = 1; + while (n-- >= 249) + b[z++] = hgetc(fd->fp); + } + i = var_get_u32(b, NULL, &v); +#else +// // Little endian +// int s = 0; +// do { +// b[i++] = c = hgetc(fd->fp); +// if (c < 0) +// return -1; 
+// v |= (c & 0x7f) << s; +// s += 7; +// } while (i < 5 && (c & 0x80)); + + // Big endian, see also htscodecs/varint.h + do { + b[i++] = c = hgetc(fd->fp); + if (c < 0) + return -1; + v = (v<<7) | (c & 0x7f); + } while (i < 5 && (c & 0x80)); +#endif + *crc = crc32(*crc, b, i); + + *val_p = (v>>1) ^ -(v&1); + return i; +} + + +// Decode 64-bits with CRC update from cram_fd +static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) { + uint8_t b[10], i = 0; + int c; + uint64_t v = 0; + +#ifdef VARINT2 + b[0] = hgetc(fd->fp); + if (b[0] < 177) { + } else if (b[0] < 241) { + b[1] = hgetc(fd->fp); + } else if (b[0] < 249) { + b[1] = hgetc(fd->fp); + b[2] = hgetc(fd->fp); + } else { + int n = b[0]+2, z = 1; + while (n-- >= 249) + b[z++] = hgetc(fd->fp); + } + i = var_get_u64(b, NULL, &v); +#else +// // Little endian +// int s = 0; +// do { +// b[i++] = c = hgetc(fd->fp); +// if (c < 0) +// return -1; +// v |= (c & 0x7f) << s; +// s += 7; +// } while (i < 10 && (c & 0x80)); + + // Big endian, see also htscodecs/varint.h + do { + b[i++] = c = hgetc(fd->fp); + if (c < 0) + return -1; + v = (v<<7) | (c & 0x7f); + } while (i < 5 && (c & 0x80)); +#endif + *crc = crc32(*crc, b, i); + + *val_p = v; + return i; +} + +//----------------------------------------------------------------------------- + /* * Decodes a 32-bit little endian value from fd and stores in val. * @@ -631,10 +1112,13 @@ char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size, int level, int strat) { level = level > 0 ? 
level : 6; // libdeflate doesn't honour -1 as default - level *= 1.2; // NB levels go up to 12 here; 5 onwards is +1 - if (level >= 8) level += level/8; // 8->10, 9->12 + level *= 1.23; // NB levels go up to 12 here; 5 onwards is +1 + level += level>=8; // 5,6,7->6,7,8 8->10 9->12 if (level > 12) level = 12; + if (strat == Z_RLE) // not supported by libdeflate + level = 1; + struct libdeflate_compressor *z = libdeflate_alloc_compressor(level); if (!z) { hts_log_error("Call to libdeflate_alloc_compressor failed"); @@ -733,6 +1217,7 @@ char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { } #endif +#if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8) static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size, int level, int strat) { z_stream s; @@ -789,6 +1274,7 @@ static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size, } return (char *)cdata; } +#endif #ifdef HAVE_LIBLZMA /* ------------------------------------------------------------------------ */ @@ -864,7 +1350,7 @@ static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) { r = lzma_code(&strm, LZMA_FINISH); if (r != LZMA_OK && r != LZMA_STREAM_END) { hts_log_error("Call to lzma_code failed with error %d", r); - return NULL; + goto fail; } new_out = realloc(out, strm.total_out > 0 ? 
strm.total_out : 1); @@ -912,6 +1398,9 @@ cram_block *cram_new_block(enum cram_content_type content_type, b->alloc = 0; b->byte = 0; b->bit = 7; // MSB + b->crc32 = 0; + b->idx = 0; + b->m = NULL; return b; } @@ -934,9 +1423,9 @@ cram_block *cram_read_block(cram_fd *fd) { c = b->method; crc = crc32(crc, &c, 1); if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; } c = b->content_type; crc = crc32(crc, &c, 1); - if (-1 == itf8_decode_crc(fd, &b->content_id, &crc)) { free(b); return NULL; } - if (-1 == itf8_decode_crc(fd, &b->comp_size, &crc)) { free(b); return NULL; } - if (-1 == itf8_decode_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; } + if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc)) { free(b); return NULL; } + if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc)) { free(b); return NULL; } + if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; } //fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n", // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size); @@ -1015,13 +1504,18 @@ uint32_t cram_block_size(cram_block *b) { * -1 on failure */ int cram_write_block(cram_fd *fd, cram_block *b) { + char vardata[100]; + int vardata_o = 0; + assert(b->method != RAW || (b->comp_size == b->uncomp_size)); if (hputc(b->method, fd->fp) == EOF) return -1; if (hputc(b->content_type, fd->fp) == EOF) return -1; - if (itf8_encode(fd, b->content_id) == -1) return -1; - if (itf8_encode(fd, b->comp_size) == -1) return -1; - if (itf8_encode(fd, b->uncomp_size) == -1) return -1; + vardata_o += fd->vv.varint_put32(vardata , vardata+100, b->content_id); + vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->comp_size); + vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size); + if (vardata_o != hwrite(fd->fp, vardata, vardata_o)) + return -1; if (b->data) { if (b->method == RAW) { @@ -1037,15 +1531,15 @@ int 
cram_write_block(cram_fd *fd, cram_block *b) { } if (CRAM_MAJOR_VERS(fd->version) >= 3) { - unsigned char dat[100], *cp = dat;; + char dat[100], *cp = (char *)dat; uint32_t crc; *cp++ = b->method; *cp++ = b->content_type; - cp += itf8_put((char*)cp, b->content_id); - cp += itf8_put((char*)cp, b->comp_size); - cp += itf8_put((char*)cp, b->uncomp_size); - crc = crc32(0L, dat, cp-dat); + cp += fd->vv.varint_put32(cp, dat+100, b->content_id); + cp += fd->vv.varint_put32(cp, dat+100, b->comp_size); + cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size); + crc = crc32(0L, (uc *)dat, cp-dat); if (b->method == RAW) { b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size); @@ -1078,6 +1572,11 @@ int cram_uncompress_block(cram_block *b) { char *uncomp; size_t uncomp_size = 0; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Pretend the CRC was OK so the fuzzer doesn't have to get it right + b->crc32_checked = 1; +#endif + if (b->crc32_checked == 0) { uint32_t crc = crc32(b->crc_part, b->data ? 
b->data : (uc *)"", b->alloc); b->crc32_checked = 1; @@ -1177,6 +1676,73 @@ int cram_uncompress_block(cram_block *b) { break; } + case FQZ: { + uncomp_size = b->uncomp_size; + uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0); + if (!uncomp) + return -1; + free(b->data); + b->data = (unsigned char *)uncomp; + b->alloc = uncomp_size; + b->method = RAW; + b->uncomp_size = uncomp_size; + break; + } + + case RANS_PR0: { + unsigned int usize = b->uncomp_size, usize2; + uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2); + if (!uncomp) + return -1; + if (usize != usize2) { + free(uncomp); + return -1; + } + b->orig_method = RANS_PR0 + (b->data[0]&1) + + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); + free(b->data); + b->data = (unsigned char *)uncomp; + b->alloc = usize2; + b->method = RAW; + b->uncomp_size = usize2; // Just incase it differs + //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); + break; + } + + case ARITH_PR0: { + unsigned int usize = b->uncomp_size, usize2; + uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2); + if (!uncomp) + return -1; + if (usize != usize2) { + free(uncomp); + return -1; + } + b->orig_method = ARITH_PR0 + (b->data[0]&1) + + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); + free(b->data); + b->data = (unsigned char *)uncomp; + b->alloc = usize2; + b->method = RAW; + b->uncomp_size = usize2; // Just incase it differs + //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); + break; + } + + case TOK3: { + uint32_t out_len; + uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len); + if (!cp) + return -1; + b->orig_method = TOK3; + b->method = RAW; + free(b->data); + b->data = cp; + b->alloc = out_len; + b->uncomp_size = out_len; + break; + } + default: return -1; } @@ -1184,12 +1750,14 @@ int cram_uncompress_block(cram_block *b) { return 0; } -static char *cram_compress_by_method(char *in, size_t in_size, +static 
char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, int content_id, size_t *out_size, - enum cram_block_method method, + enum cram_block_method_int method, int level, int strat) { switch (method) { case GZIP: + case GZIP_RLE: + case GZIP_1: // Read names bizarrely benefit from zlib over libdeflate for // mid-range compression levels. Focusing purely of ratio or // speed, libdeflate still wins. It also seems to win for @@ -1197,9 +1765,11 @@ static char *cram_compress_by_method(char *in, size_t in_size, // // Eg RN at level 5; libdeflate=55.9MB zlib=51.6MB #ifdef HAVE_LIBDEFLATE +# if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8)) if (content_id == DS_RN && level >= 4 && level <= 7) return zlib_mem_deflate(in, in_size, out_size, level, strat); else +# endif return libdeflate_deflate(in, in_size, out_size, level, strat); #else return zlib_mem_deflate(in, in_size, out_size, level, strat); @@ -1225,6 +1795,32 @@ static char *cram_compress_by_method(char *in, size_t in_size, #endif } + case FQZ: + case FQZ_b: + case FQZ_c: + case FQZ_d: { + // Extract the necessary portion of the slice into an fqz_slice struct. + // These previously were the same thing, but this permits us to detach + // the codec from the rest of this CRAM implementation. + fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice)); + if (!f) + return NULL; + f->num_records = s->hdr->num_records; + f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice)); + f->flags = f->len + s->hdr->num_records; + int i; + for (i = 0; i < s->hdr->num_records; i++) { + f->flags[i] = s->crecs[i].flags; + f->len[i] = (i+1 < s->hdr->num_records + ? 
s->crecs[i+1].qual - s->crecs[i].qual + : s->block[DS_QS]->uncomp_size - s->crecs[i].qual); + } + char *comp = fqz_compress(strat & 0xff /* cram vers */, f, + in, in_size, out_size, strat >> 8, NULL); + free(f); + return comp; + } + case LZMA: #ifdef HAVE_LIBLZMA return lzma_mem_deflate(in, in_size, out_size, level); @@ -1232,23 +1828,68 @@ static char *cram_compress_by_method(char *in, size_t in_size, return NULL; #endif - case RANS0: { + case RANS0: + case RANS1: { + unsigned int out_size_i; + unsigned char *cp; + cp = rans_compress((unsigned char *)in, in_size, &out_size_i, + method == RANS0 ? 0 : 1); + *out_size = out_size_i; + return (char *)cp; + } + + case RANS_PR0: + case RANS_PR1: + case RANS_PR64: + case RANS_PR9: + case RANS_PR128: + case RANS_PR129: + case RANS_PR192: + case RANS_PR193: { unsigned int out_size_i; unsigned char *cp; - cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 0); + + // see enum cram_block. We map RANS_* methods to order bit-fields + static int methmap[] = { 1, 64,9, 128,129, 192,193 }; + + int m = method == RANS_PR0 ? 0 : methmap[method - RANS_PR1]; + cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i, + m | RANS_ORDER_SIMD_AUTO); *out_size = out_size_i; return (char *)cp; } - case RANS1: { + case ARITH_PR0: + case ARITH_PR1: + case ARITH_PR64: + case ARITH_PR9: + case ARITH_PR128: + case ARITH_PR129: + case ARITH_PR192: + case ARITH_PR193: { unsigned int out_size_i; unsigned char *cp; - cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 1); + // see enum cram_block. We map ARITH_* methods to order bit-fields + static int methmap[] = { 1, 64,9, 128,129, 192,193 }; + + cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i, + method == ARITH_PR0 ? 
0 : methmap[method - ARITH_PR1]); *out_size = out_size_i; return (char *)cp; } + case TOK3: + case TOKA: { + int out_len; + int lev = level; + if (method == TOK3 && lev > 3) + lev = 3; + uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL); + *out_size = out_len; + return (char *)cp; + } + case RAW: break; @@ -1270,13 +1911,37 @@ static char *cram_compress_by_method(char *in, size_t in_size, * * Method and level -1 implies defaults, as specified in cram_fd. */ -int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, - int method, int level) { +int cram_compress_block2(cram_fd *fd, cram_slice *s, + cram_block *b, cram_metrics *metrics, + int method, int level) { + + if (!b) + return 0; char *comp = NULL; size_t comp_size = 0; int strat; + // Internally we have parameterised methods that externally map + // to the same CRAM method value. + // See enum_cram_block_method_int in cram_structs.h. + int methmap[] = { + // Externally defined values + RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3, + + // Reserved for possible expansion + 0, 0, + + // Internally parameterised versions matching back to above + // external values + GZIP, GZIP, + FQZ, FQZ, FQZ, + RANS, + RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, + TOK3, + ARITH, ARITH, ARITH, ARITH, ARITH, ARITH, ARITH, + }; + if (b->method != RAW) { // Maybe already compressed if s->block[0] was compressed and // we have e.g. s->block[DS_BA] set to s->block[0] due to only @@ -1306,19 +1971,41 @@ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, return 0; } +#ifndef ABS +# define ABS(a) ((a)>=0?(a):-(a)) +#endif + if (metrics) { pthread_mutex_lock(&fd->metrics_lock); + // Sudden changes in size trigger a retrial. These are mainly + // triggered when switching to sorted / unsorted, where the number + // of elements in a slice radically changes. + // + // We also get large fluctuations based on genome coordinate for + // e.g. 
SA:Z and SC series, but we consider the typical scale of + // delta between blocks and use this to look for abnormality. + if (metrics->input_avg_sz && + (b->uncomp_size + 1000 > 4*(metrics->input_avg_sz+1000) || + b->uncomp_size + 1000 < (metrics->input_avg_sz+1000)/4) && + ABS(b->uncomp_size-metrics->input_avg_sz) + > 10*metrics->input_avg_delta) { + metrics->next_trial = 0; + } + if (metrics->trial > 0 || --metrics->next_trial <= 0) { - size_t sz_best = INT_MAX; - size_t sz_gz_rle = 0; - size_t sz_gz_def = 0; - size_t sz_rans0 = 0; - size_t sz_rans1 = 0; - size_t sz_bzip2 = 0; - size_t sz_lzma = 0; - int method_best = 0; + int m, unpackable = metrics->unpackable; + size_t sz_best = b->uncomp_size; + size_t sz[CRAM_MAX_METHOD] = {0}; + int method_best = 0; // RAW char *c_best = NULL, *c = NULL; + metrics->input_avg_delta = + 0.9 * (metrics->input_avg_delta + + ABS(b->uncomp_size - metrics->input_avg_sz)); + + metrics->input_avg_sz += b->uncomp_size*.2; + metrics->input_avg_sz *= 0.8; + if (metrics->revised_method) method = metrics->revised_method; else @@ -1327,174 +2014,190 @@ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, if (metrics->next_trial <= 0) { metrics->next_trial = TRIAL_SPAN; metrics->trial = NTRIALS; - metrics->sz_gz_rle /= 2; - metrics->sz_gz_def /= 2; - metrics->sz_rans0 /= 2; - metrics->sz_rans1 /= 2; - metrics->sz_bzip2 /= 2; - metrics->sz_lzma /= 2; - } - - pthread_mutex_unlock(&fd->metrics_lock); - - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_gz_rle, GZIP, 1, Z_RLE); - if (c && sz_best > sz_gz_rle) { - sz_best = sz_gz_rle; - method_best = GZIP_RLE; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_gz_rle = b->uncomp_size*2+1000; - } - - //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle); + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] /= 2; + metrics->unpackable = 0; } - if (method & (1<data, b->uncomp_size, - 
b->content_id, &sz_gz_def, GZIP, level, - Z_FILTERED); - if (c && sz_best > sz_gz_def) { - sz_best = sz_gz_def; - method_best = GZIP; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_gz_def = b->uncomp_size*2+1000; - } - - //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def); + // Compress this block using the best method + if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) { + // No point trying bit-pack if 17+ symbols. + if (method & (1<data, b->uncomp_size, - b->content_id, &sz_rans0, RANS0, 0, 0); - if (c && sz_best > sz_rans0) { - sz_best = sz_rans0; - method_best = RANS0; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_rans0 = b->uncomp_size*2+1000; - } - } + // Libdeflate doesn't have a Z_RLE strategy. + // We treat it as level 1, but iff we haven't also + // explicitly listed that in the method list. +#ifdef HAVE_LIBDEFLATE + if ((method & (1<data, b->uncomp_size, - b->content_id, &sz_rans1, RANS1, 0, 0); - if (c && sz_best > sz_rans1) { - sz_best = sz_rans1; - method_best = RANS1; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_rans1 = b->uncomp_size*2+1000; - } - } + pthread_mutex_unlock(&fd->metrics_lock); - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_bzip2, BZIP2, level, 0); - if (c && sz_best > sz_bzip2) { - sz_best = sz_bzip2; - method_best = BZIP2; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); + for (m = 0; m < CRAM_MAX_METHOD; m++) { + if (method & (1u<version); break; + case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break; + case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; + case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; + case TOK3: strat = 0; break; + case TOKA: strat = 1; break; + default: strat = 0; + } + + c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, + b->content_id, &sz[m], m, lvl, strat); + + if 
(c && sz_best > sz[m]) { + sz_best = sz[m]; + method_best = m; + if (c_best) + free(c_best); + c_best = c; + } else if (c) { + free(c); + } else { + sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw + } } else { - sz_bzip2 = b->uncomp_size*2+1000; + sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw } } - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_lzma, LZMA, level, 0); - if (c && sz_best > sz_lzma) { - sz_best = sz_lzma; - method_best = LZMA; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_lzma = b->uncomp_size*2+1000; - } + if (c_best) { + free(b->data); + b->data = (unsigned char *)c_best; + b->method = method_best; // adjusted to methmap[method_best] later + b->comp_size = sz_best; } - //fprintf(stderr, "sz_best = %d\n", sz_best); - - free(b->data); - b->data = (unsigned char *)c_best; - //printf("method_best = %s\n", cram_block_method2str(method_best)); - b->method = method_best == GZIP_RLE ? GZIP : method_best; - b->comp_size = sz_best; - + // Accumulate stats for all methods tried pthread_mutex_lock(&fd->metrics_lock); - metrics->sz_gz_rle += sz_gz_rle; - metrics->sz_gz_def += sz_gz_def; - metrics->sz_rans0 += sz_rans0; - metrics->sz_rans1 += sz_rans1; - metrics->sz_bzip2 += sz_bzip2; - metrics->sz_lzma += sz_lzma; + for (m = 0; m < CRAM_MAX_METHOD; m++) + // don't be overly sure on small blocks. + // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1 + // needs to be at least 60 bytes smaller to overcome the + // fixed size addition. + metrics->sz[m] += sz[m]+2000; + + // When enough trials performed, find the best on average if (--metrics->trial == 0) { int best_method = RAW; int best_sz = INT_MAX; - // Scale methods by cost - if (fd->level <= 3) { - metrics->sz_rans1 *= 1.02; - metrics->sz_gz_def *= 1.04; - metrics->sz_bzip2 *= 1.08; - metrics->sz_lzma *= 1.10; + // Relative costs of methods. 
See enum_cram_block_method_int + // and methmap + double meth_cost[32] = { + // Externally defined methods + 1, // 0 raw + 1.04, // 1 gzip (Z_FILTERED) + 1.07, // 2 bzip2 + 1.08, // 3 lzma + 1.00, // 4 rans (O0) + 1.00, // 5 ranspr (O0) + 1.04, // 6 arithpr (O0) + 1.05, // 7 fqz + 1.05, // 8 tok3 (rans) + 1.00, 1.00, // 9,10 reserved + + // Paramterised versions of above + 1.01, // gzip rle + 1.01, // gzip -1 + + 1.05, 1.05, 1.05, // FQZ_b,c,d + + 1.01, // rans O1 + + 1.01, // rans_pr1 + 1.00, // rans_pr64; if smaller, usually fast + 1.03, // rans_pr65/9 + 1.00, // rans_pr128 + 1.01, // rans_pr129 + 1.00, // rans_pr192 + 1.01, // rans_pr193 + + 1.07, // tok3 arith + + 1.04, // arith_pr1 + 1.04, // arith_pr64 + 1.04, // arith_pr9 + 1.03, // arith_pr128 + 1.04, // arith_pr129 + 1.04, // arith_pr192 + 1.04, // arith_pr193 + }; + + // Scale methods by cost based on compression level + if (fd->level <= 1) { + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] *= 1+(meth_cost[m]-1)*4; + } else if (fd->level <= 3) { + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] *= 1+(meth_cost[m]-1); } else if (fd->level <= 6) { - metrics->sz_rans1 *= 1.01; - metrics->sz_gz_def *= 1.02; - metrics->sz_bzip2 *= 1.03; - metrics->sz_lzma *= 1.05; + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] *= 1+(meth_cost[m]-1)/2; + } else if (fd->level <= 7) { + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] *= 1+(meth_cost[m]-1)/3; + } // else cost is ignored + + // Ensure these are never used; BSC and ZSTD + metrics->sz[9] = metrics->sz[10] = INT_MAX; + + for (m = 0; m < CRAM_MAX_METHOD; m++) { + if ((!metrics->sz[m]) || (!(method & (1u< metrics->sz[m]) + best_sz = metrics->sz[m], best_method = m; } - if (method & (1< metrics->sz_gz_rle) - best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE; - - if (method & (1< metrics->sz_gz_def) - best_sz = metrics->sz_gz_def, best_method = GZIP; - - if (method & (1< metrics->sz_rans0) - best_sz = metrics->sz_rans0, best_method = 
RANS0; - - if (method & (1< metrics->sz_rans1) - best_sz = metrics->sz_rans1, best_method = RANS1; - - if (method & (1< metrics->sz_bzip2) - best_sz = metrics->sz_bzip2, best_method = BZIP2; - - if (method & (1< metrics->sz_lzma) - best_sz = metrics->sz_lzma, best_method = LZMA; - - if (best_method == GZIP_RLE) { - metrics->method = GZIP; - metrics->strat = Z_RLE; + if (best_method != metrics->method) { + //metrics->trial = (NTRIALS+1)/2; // be sure + //metrics->next_trial /= 1.5; + metrics->consistency = 0; } else { - metrics->method = best_method; - metrics->strat = Z_FILTERED; + metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0); + metrics->consistency++; + } + + metrics->method = best_method; + switch (best_method) { + case GZIP: strat = Z_FILTERED; break; + case GZIP_1: strat = Z_DEFAULT_STRATEGY; break; + case GZIP_RLE: strat = Z_RLE; break; + case FQZ: strat = CRAM_MAJOR_VERS(fd->version); break; + case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break; + case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; + case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; + case TOK3: strat = 0; break; + case TOKA: strat = 1; break; + default: strat = 0; } + metrics->strat = strat; // If we see at least MAXFAIL trials in a row for a specific // compression method with more than MAXDELTA aggregate @@ -1502,134 +2205,139 @@ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, // for this block type. 
#define MAXDELTA 0.20 #define MAXFAILS 4 - if (best_method == GZIP_RLE) { - metrics->gz_rle_cnt = 0; - metrics->gz_rle_extra = 0; - } else if (best_sz < metrics->sz_gz_rle) { - double r = (double)metrics->sz_gz_rle / best_sz - 1; - if (++metrics->gz_rle_cnt >= MAXFAILS && - (metrics->gz_rle_extra += r) >= MAXDELTA) - method &= ~(1<cnt[m] = 0; + metrics->extra[m] = 0; + } else if (best_sz < metrics->sz[m]) { + double r = (double)metrics->sz[m] / best_sz - 1; + int mul = 1+(fd->level>=7); + if (++metrics->cnt[m] >= MAXFAILS*mul && + (metrics->extra[m] += r) >= MAXDELTA*mul) + method &= ~(1u<sz[m] > best_sz) + method &= ~(1u<gz_def_cnt = 0; - metrics->gz_def_extra = 0; - } else if (best_sz < metrics->sz_gz_def) { - double r = (double)metrics->sz_gz_def / best_sz - 1; - if (++metrics->gz_def_cnt >= MAXFAILS && - (metrics->gz_def_extra += r) >= MAXDELTA) - method &= ~(1<rans0_cnt = 0; - metrics->rans0_extra = 0; - } else if (best_sz < metrics->sz_rans0) { - double r = (double)metrics->sz_rans0 / best_sz - 1; - if (++metrics->rans0_cnt >= MAXFAILS && - (metrics->rans0_extra += r) >= MAXDELTA) - method &= ~(1<rans1_cnt = 0; - metrics->rans1_extra = 0; - } else if (best_sz < metrics->sz_rans1) { - double r = (double)metrics->sz_rans1 / best_sz - 1; - if (++metrics->rans1_cnt >= MAXFAILS && - (metrics->rans1_extra += r) >= MAXDELTA) - method &= ~(1<bzip2_cnt = 0; - metrics->bzip2_extra = 0; - } else if (best_sz < metrics->sz_bzip2) { - double r = (double)metrics->sz_bzip2 / best_sz - 1; - if (++metrics->bzip2_cnt >= MAXFAILS && - (metrics->bzip2_extra += r) >= MAXDELTA) - method &= ~(1<lzma_cnt = 0; - metrics->lzma_extra = 0; - } else if (best_sz < metrics->sz_lzma) { - double r = (double)metrics->sz_lzma / best_sz - 1; - if (++metrics->lzma_cnt >= MAXFAILS && - (metrics->lzma_extra += r) >= MAXDELTA) - method &= ~(1<revised_method) - // fprintf(stderr, "%d: method from %x to %x\n", + //if (fd->verbose > 1 && method != metrics->revised_method) + // fprintf(stderr, "%d: 
revising method from %x to %x\n", // b->content_id, metrics->revised_method, method); metrics->revised_method = method; } pthread_mutex_unlock(&fd->metrics_lock); } else { + metrics->input_avg_delta = + 0.9 * (metrics->input_avg_delta + + ABS(b->uncomp_size - metrics->input_avg_sz)); + + metrics->input_avg_sz += b->uncomp_size*.2; + metrics->input_avg_sz *= 0.8; + strat = metrics->strat; method = metrics->method; pthread_mutex_unlock(&fd->metrics_lock); - comp = cram_compress_by_method((char *)b->data, b->uncomp_size, + comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, b->content_id, &comp_size, method, - level, strat); + method == GZIP_1 ? 1 : level, + strat); if (!comp) return -1; - free(b->data); - b->data = (unsigned char *)comp; - b->comp_size = comp_size; - b->method = method; + + if (comp_size < b->uncomp_size) { + free(b->data); + b->data = (unsigned char *)comp; + b->comp_size = comp_size; + b->method = method; + } else { + free(comp); + } } } else { // no cached metrics, so just do zlib? 
- comp = cram_compress_by_method((char *)b->data, b->uncomp_size, + comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, b->content_id, &comp_size, GZIP, level, Z_FILTERED); if (!comp) { - hts_log_error("Compression failed"); + hts_log_error("Compression failed!"); return -1; } - free(b->data); - b->data = (unsigned char *)comp; - b->comp_size = comp_size; - b->method = GZIP; + + if (comp_size < b->uncomp_size) { + free(b->data); + b->data = (unsigned char *)comp; + b->comp_size = comp_size; + b->method = GZIP; + } else { + free(comp); + } + strat = Z_FILTERED; } hts_log_info("Compressed block ID %d from %d to %d by method %s", b->content_id, b->uncomp_size, b->comp_size, cram_block_method2str(b->method)); - if (b->method == RANS1) - b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing + b->method = methmap[b->method]; return 0; } +int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, + int method, int level) { + return cram_compress_block2(fd, NULL, b, metrics, method, level); +} cram_metrics *cram_new_metrics(void) { cram_metrics *m = calloc(1, sizeof(*m)); if (!m) return NULL; m->trial = NTRIALS-1; - m->next_trial = TRIAL_SPAN; + m->next_trial = TRIAL_SPAN/2; // learn quicker at start m->method = RAW; m->strat = 0; m->revised_method = 0; + m->unpackable = 0; return m; } -char *cram_block_method2str(enum cram_block_method m) { +char *cram_block_method2str(enum cram_block_method_int m) { switch(m) { - case RAW: return "RAW"; - case GZIP: return "GZIP"; - case BZIP2: return "BZIP2"; - case LZMA: return "LZMA"; - case RANS0: return "RANS0"; - case RANS1: return "RANS1"; - case GZIP_RLE: return "GZIP_RLE"; + case RAW: return "RAW"; + case GZIP: return "GZIP"; + case BZIP2: return "BZIP2"; + case LZMA: return "LZMA"; + case RANS0: return "RANS0"; + case RANS1: return "RANS1"; + case GZIP_RLE: return "GZIP_RLE"; + case GZIP_1: return "GZIP_1"; + case FQZ: return "FQZ"; + case FQZ_b: return "FQZ_b"; + case FQZ_c: return 
"FQZ_c"; + case FQZ_d: return "FQZ_d"; + case RANS_PR0: return "RANS_PR0"; + case RANS_PR1: return "RANS_PR1"; + case RANS_PR64: return "RANS_PR64"; + case RANS_PR9: return "RANS_PR9"; + case RANS_PR128: return "RANS_PR128"; + case RANS_PR129: return "RANS_PR129"; + case RANS_PR192: return "RANS_PR192"; + case RANS_PR193: return "RANS_PR193"; + case TOK3: return "TOK3_R"; + case TOKA: return "TOK3_A"; + case ARITH_PR0: return "ARITH_PR0"; + case ARITH_PR1: return "ARITH_PR1"; + case ARITH_PR64: return "ARITH_PR64"; + case ARITH_PR9: return "ARITH_PR9"; + case ARITH_PR128: return "ARITH_PR128"; + case ARITH_PR129: return "ARITH_PR129"; + case ARITH_PR192: return "ARITH_PR192"; + case ARITH_PR193: return "ARITH_PR193"; case BM_ERROR: break; } return "?"; @@ -1833,7 +2541,7 @@ static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) { /* Only the reference file provided. Get the index file name from it */ if (!(r->fn = string_dup(r->pool, fn))) goto err; - sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, fn); + snprintf(fai_fn, PATH_MAX, "%.*s.fai", PATH_MAX-5, fn); } } @@ -1890,6 +2598,7 @@ static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) { e->seq = NULL; e->mf = NULL; e->is_md5 = 0; + e->validated_md5 = 0; k = kh_put(refs, r->h_meta, e->name, &n); if (-1 == n) { @@ -2326,6 +3035,7 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { fd->refs->fp = fp; fd->refs->fn = r->fn; r->is_md5 = 1; + r->validated_md5 = 1; // Fall back to cram_get_ref() where it'll do the actual // reading of the file. @@ -2347,6 +3057,7 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { } r->length = sz; r->is_md5 = 1; + r->validated_md5 = 1; } else { refs_t *refs; const char *fn; @@ -2502,7 +3213,7 @@ void cram_ref_decr(refs_t *r, int id) { } /* - * Used by cram_ref_load and cram_ref_get. The file handle will have + * Used by cram_ref_load and cram_get_ref. The file handle will have * already been opened, so we can catch it. 
The ref_entry *e informs us * of whether this is a multi-line fasta file or a raw MD5 style file. * Either way we create a single contiguous sequence. @@ -2520,6 +3231,10 @@ static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) { /* * Compute locations in file. This is trivial for the MD5 files, but * is still necessary for the fasta variants. + * + * Note the offset here, as with faidx, has the assumption that white- + * space (the diff between line_length and bases_per_line) only occurs + * at the end of a line of text. */ offset = e->line_length ? e->offset + (start-1)/e->bases_per_line * e->line_length + @@ -2548,14 +3263,34 @@ static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) { /* Strip white-space if required. */ if (len != end-start+1) { - int i, j; + hts_pos_t i, j; char *cp = seq; char *cp_to; + // Copy up to the first white-space, and then repeatedly just copy + // bases_per_line verbatim, and use the slow method to end again. + // + // This may seem excessive, but this code can be a significant + // portion of total CRAM decode CPU time for shallow data sets. for (i = j = 0; i < len; i++) { - if (cp[i] >= '!' && cp[i] <= '~') - cp[j++] = toupper_c(cp[i]); + if (!isspace_c(cp[i])) + cp[j++] = cp[i] & ~0x20; + else + break; + } + while (i < len && isspace_c(cp[i])) + i++; + while (i < len - e->line_length) { + hts_pos_t j_end = j + e->bases_per_line; + while (j < j_end) + cp[j++] = cp[i++] & ~0x20; // toupper equiv + i += e->line_length - e->bases_per_line; } + for (; i < len; i++) { + if (!isspace_c(cp[i])) + cp[j++] = cp[i] & ~0x20; + } + cp_to = cp+j; if (cp_to - seq != end-start+1) { @@ -2671,7 +3406,7 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) { char *seq; int ostart = start; - if (id == -1) + if (id == -1 || start < 1) return NULL; /* FIXME: axiomatic query of r->seq being true? 
@@ -2726,8 +3461,11 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) { */ pthread_mutex_lock(&fd->refs->lock); if (r->length == 0) { + if (fd->ref_fn) + hts_log_warning("Reference file given, but ref '%s' not present", + r->name); if (cram_populate_ref(fd, id, r) == -1) { - hts_log_error("Failed to populate reference for id %d", id); + hts_log_warning("Failed to populate reference for id %d", id); pthread_mutex_unlock(&fd->refs->lock); pthread_mutex_unlock(&fd->ref_lock); return NULL; @@ -2747,8 +3485,6 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) { end = r->length; if (end >= r->length) end = r->length; - if (start < 1) - return NULL; if (end - start >= 0.5*r->length || fd->shared_ref) { start = 1; @@ -2867,7 +3603,7 @@ int cram_load_reference(cram_fd *fd, char *fn) { if (fn) { fd->refs = refs_load_fai(fd->refs, fn, - !(fd->embed_ref && fd->mode == 'r')); + !(fd->embed_ref>0 && fd->mode == 'r')); fn = fd->refs ? fd->refs->fn : NULL; if (!fn) ret = -1; @@ -2925,6 +3661,9 @@ cram_container *cram_new_container(int nrec, int nslice) { c->pos_sorted = 1; c->max_apos = 0; c->multi_seq = 0; + c->qs_seq_orient = 1; + c->no_ref = 0; + c->embed_ref = -1; // automatic selection c->bams = NULL; @@ -2944,6 +3683,7 @@ cram_container *cram_new_container(int nrec, int nslice) { if (!(c->tags_used = kh_init(m_tagmap))) goto err; c->refs_used = 0; + c->ref_free = 0; return c; @@ -3016,6 +3756,9 @@ void cram_free_container(cram_container *c) { kh_destroy(m_tagmap, c->tags_used); } + if (c->ref_free) + free(c->ref); + free(c); } @@ -3036,13 +3779,13 @@ cram_container *cram_read_container(cram_fd *fd) { memset(&c2, 0, sizeof(c2)); if (CRAM_MAJOR_VERS(fd->version) == 1) { - if ((s = itf8_decode_crc(fd, &c2.length, &crc)) == -1) { + if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) { fd->eof = fd->empty_container ? 
1 : 2; return NULL; } else { rd+=s; } - } else { + } else if (CRAM_MAJOR_VERS(fd->version) < 4) { uint32_t len; if ((s = int32_decode(fd, &c2.length)) == -1) { if (CRAM_MAJOR_VERS(fd->version) == 2 && @@ -3056,55 +3799,61 @@ cram_container *cram_read_container(cram_fd *fd) { } len = le_int4(c2.length); crc = crc32(0L, (unsigned char *)&len, 4); + } else { + if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) { + fd->eof = fd->empty_container ? 1 : 2; + return NULL; + } else { + rd+=s; + } } - if ((s = itf8_decode_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; -/* - * LARGE_POS used in this code is purely a debugging mechanism for testing - * whether the htslib API can cope with 64-bit quantities. These are - * possible in SAM, but not *yet* in BAM or CRAM. - * - * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. - * - * At some point it is expected these ifdefs will become a version check - * instead. - */ -#ifdef LARGE_POS - if ((s = ltf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s; - if ((s = ltf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s; -#else - int32_t i32; - if ((s = itf8_decode_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s; - c2.ref_seq_start = i32; - if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s; - c2.ref_seq_span = i32; -#endif - if ((s = itf8_decode_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s; + if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + int64_t i64; + if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s; + c2.ref_seq_start = i64; + if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s; + c2.ref_seq_span = i64; + } else { + int32_t i32; + if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s; + c2.ref_seq_start = 
i32; + if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s; + c2.ref_seq_span = i32; + } + if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s; if (CRAM_MAJOR_VERS(fd->version) == 1) { c2.record_counter = 0; c2.num_bases = 0; } else { if (CRAM_MAJOR_VERS(fd->version) >= 3) { - if ((s = ltf8_decode_crc(fd, &c2.record_counter, &crc)) == -1) + if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1) return NULL; else rd += s; } else { int32_t i32; - if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1) + if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd += s; c2.record_counter = i32; } - if ((s = ltf8_decode_crc(fd, &c2.num_bases, &crc))== -1) + if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1) return NULL; else rd += s; } - if ((s = itf8_decode_crc(fd, &c2.num_blocks, &crc)) == -1) return NULL; else rd+=s; - if ((s = itf8_decode_crc(fd, &c2.num_landmarks, &crc))== -1) return NULL; else rd+=s; + if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc)) == -1) + return NULL; + else + rd+=s; + if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1) + return NULL; + else + rd+=s; if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t)) return NULL; @@ -3113,14 +3862,20 @@ cram_container *cram_read_container(cram_fd *fd) { return NULL; *c = c2; - +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (c->num_landmarks > FUZZ_ALLOC_LIMIT/sizeof(int32_t)) { + fd->err = errno = ENOMEM; + cram_free_container(c); + return NULL; + } +#endif if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) { fd->err = errno; cram_free_container(c); return NULL; } for (i = 0; i < c->num_landmarks; i++) { - if ((s = itf8_decode_crc(fd, &c->landmark[i], &crc)) == -1) { + if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) { cram_free_container(c); return NULL; } else { @@ 
-3136,6 +3891,11 @@ cram_container *cram_read_container(cram_fd *fd) { rd+=4; } +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Pretend the CRC was OK so the fuzzer doesn't have to get it right + crc = c->crc32; +#endif + if (crc != c->crc32) { hts_log_error("Container header CRC32 failure"); cram_free_container(c); @@ -3182,7 +3942,7 @@ int cram_container_size(cram_container *c) { */ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) { - unsigned char *cp = (unsigned char *)dat; + char *cp = (char *)dat; int i; // Check the input buffer is large enough according to our stated @@ -3191,41 +3951,39 @@ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) return -1; if (CRAM_MAJOR_VERS(fd->version) == 1) { - cp += itf8_put((char*)cp, c->length); + cp += itf8_put(cp, c->length); } else { *(int32_t *)cp = le_int4(c->length); cp += 4; } if (c->multi_seq) { - cp += itf8_put((char*)cp, -2); - cp += itf8_put((char*)cp, 0); - cp += itf8_put((char*)cp, 0); + cp += fd->vv.varint_put32(cp, NULL, -2); + cp += fd->vv.varint_put32(cp, NULL, 0); + cp += fd->vv.varint_put32(cp, NULL, 0); } else { - cp += itf8_put((char*)cp, c->ref_seq_id); -#ifdef LARGE_POS - cp += ltf8_put((char*)cp, c->ref_seq_start); - cp += ltf8_put((char*)cp, c->ref_seq_span); -#else - cp += itf8_put((char*)cp, c->ref_seq_start); - cp += itf8_put((char*)cp, c->ref_seq_span); -#endif + cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start); + cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span); + } else { + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start); + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span); + } } - cp += itf8_put((char*)cp, c->num_records); + cp += fd->vv.varint_put32(cp, NULL, c->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) { - cp += itf8_put((char*)cp, c->record_counter); - cp += ltf8_put((char*)cp, c->num_bases); + cp += 
fd->vv.varint_put64(cp, NULL, c->record_counter); } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { - cp += ltf8_put((char*)cp, c->record_counter); - cp += ltf8_put((char*)cp, c->num_bases); + cp += fd->vv.varint_put32(cp, NULL, c->record_counter); } - - cp += itf8_put((char*)cp, c->num_blocks); - cp += itf8_put((char*)cp, c->num_landmarks); + cp += fd->vv.varint_put64(cp, NULL, c->num_bases); + cp += fd->vv.varint_put32(cp, NULL, c->num_blocks); + cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks); for (i = 0; i < c->num_landmarks; i++) - cp += itf8_put((char*)cp, c->landmark[i]); + cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]); if (CRAM_MAJOR_VERS(fd->version) >= 3) { - c->crc32 = crc32(0L, (uc *)dat, (char*)cp-dat); + c->crc32 = crc32(0L, (uc *)dat, cp-dat); cp[0] = c->crc32 & 0xff; cp[1] = (c->crc32 >> 8) & 0xff; cp[2] = (c->crc32 >> 16) & 0xff; @@ -3233,7 +3991,7 @@ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) cp += 4; } - *size = (char *)cp-dat; // actual used size + *size = cp-dat; // actual used size return 0; } @@ -3246,50 +4004,51 @@ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) * -1 on failure */ int cram_write_container(cram_fd *fd, cram_container *c) { - char buf_a[1024], *buf = buf_a; - unsigned char *cp; + char buf_a[1024], *buf = buf_a, *cp; int i; - if (55 + c->num_landmarks * 5 >= 1024) - buf = malloc(55 + c->num_landmarks * 5); - cp = (unsigned char *)buf; + if (61 + c->num_landmarks * 10 >= 1024) { + buf = malloc(61 + c->num_landmarks * 10); + if (!buf) + return -1; + } + cp = buf; if (CRAM_MAJOR_VERS(fd->version) == 1) { - cp += itf8_put((char*)cp, c->length); - } else { + cp += itf8_put(cp, c->length); + } else if (CRAM_MAJOR_VERS(fd->version) <= 3) { *(int32_t *)cp = le_int4(c->length); cp += 4; + } else { + cp += fd->vv.varint_put32(cp, NULL, c->length); } if (c->multi_seq) { - cp += itf8_put((char*)cp, -2); - cp += itf8_put((char*)cp, 0); - cp += itf8_put((char*)cp, 
0); + cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2); + cp += fd->vv.varint_put32(cp, NULL, 0); + cp += fd->vv.varint_put32(cp, NULL, 0); } else { - cp += itf8_put((char*)cp, c->ref_seq_id); -#ifdef LARGE_POS - cp += ltf8_put((char*)cp, c->ref_seq_start); - cp += ltf8_put((char*)cp, c->ref_seq_span); -#else - cp += itf8_put((char*)cp, c->ref_seq_start); - cp += itf8_put((char*)cp, c->ref_seq_span); -#endif - } - cp += itf8_put((char*)cp, c->num_records); - if (CRAM_MAJOR_VERS(fd->version) == 2) { - cp += itf8_put((char*)cp, c->record_counter); - cp += ltf8_put((char*)cp, c->num_bases); - } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { - cp += ltf8_put((char*)cp, c->record_counter); - cp += ltf8_put((char*)cp, c->num_bases); + cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start); + cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span); + } else { + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start); + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span); + } } - - cp += itf8_put((char*)cp, c->num_blocks); - cp += itf8_put((char*)cp, c->num_landmarks); + cp += fd->vv.varint_put32(cp, NULL, c->num_records); + if (CRAM_MAJOR_VERS(fd->version) >= 3) + cp += fd->vv.varint_put64(cp, NULL, c->record_counter); + else + cp += fd->vv.varint_put32(cp, NULL, c->record_counter); + cp += fd->vv.varint_put64(cp, NULL, c->num_bases); + cp += fd->vv.varint_put32(cp, NULL, c->num_blocks); + cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks); for (i = 0; i < c->num_landmarks; i++) - cp += itf8_put((char*)cp, c->landmark[i]); + cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]); if (CRAM_MAJOR_VERS(fd->version) >= 3) { - c->crc32 = crc32(0L, (uc *)buf, (char*)cp-buf); + c->crc32 = crc32(0L, (uc *)buf, cp-buf); cp[0] = c->crc32 & 0xff; cp[1] = (c->crc32 >> 8) & 0xff; cp[2] = (c->crc32 >> 16) & 0xff; @@ -3297,7 +4056,7 @@ int cram_write_container(cram_fd *fd, cram_container *c) { 
cp += 4; } - if ((char*)cp-buf != hwrite(fd->fp, buf, (char*)cp-buf)) { + if (cp-buf != hwrite(fd->fp, buf, cp-buf)) { if (buf != buf_a) free(buf); return -1; @@ -3490,13 +4249,9 @@ void reset_metrics(cram_fd *fd) { m->trial = NTRIALS; m->next_trial = TRIAL_SPAN; m->revised_method = 0; + m->unpackable = 0; - m->sz_gz_rle = 0; - m->sz_gz_def = 0; - m->sz_rans0 = 0; - m->sz_rans1 = 0; - m->sz_bzip2 = 0; - m->sz_lzma = 0; + memset(m->sz, 0, sizeof(m->sz)); } } @@ -3737,8 +4492,8 @@ cram_slice *cram_new_slice(enum cram_content_type type, int nrecs) { s->block_by_id = NULL; s->last_apos = 0; if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err; - s->cigar = NULL; - s->cigar_alloc = 0; + s->cigar_alloc = 1024; + if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err; s->ncigar = 0; if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; @@ -3841,8 +4596,8 @@ cram_slice *cram_read_slice(cram_fd *fd) { } /* Initialise encoding/decoding tables */ - s->cigar = NULL; - s->cigar_alloc = 0; + s->cigar_alloc = 1024; + if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err; s->ncigar = 0; if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; @@ -3894,8 +4649,8 @@ cram_file_def *cram_read_file_def(cram_fd *fd) { return NULL; } - if (def->major_version > 3) { - hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x or 3.x, got %d.%d", + if (def->major_version > 4) { + hts_log_error("CRAM version number mismatch. 
Expected 1.x, 2.x, 3.x or 4.x, got %d.%d", def->major_version, def->minor_version); free(def); return NULL; @@ -3945,6 +4700,11 @@ sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) { if (-1 == int32_decode(fd, &header_len)) return NULL; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (header_len > FUZZ_ALLOC_LIMIT) + return NULL; +#endif + /* Alloc and read */ if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1))) return NULL; @@ -3984,9 +4744,9 @@ sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) { } len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(b->content_id) + - itf8_size(b->uncomp_size) + - itf8_size(b->comp_size); + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(b->uncomp_size) + + fd->vv.varint_size(b->comp_size); /* Extract header from 1st block */ if (-1 == int32_get_blk(b, &header_len) || @@ -4013,9 +4773,9 @@ sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) { return NULL; } len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(b->content_id) + - itf8_size(b->uncomp_size) + - itf8_size(b->comp_size); + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(b->uncomp_size) + + fd->vv.varint_size(b->comp_size); cram_free_block(b); } @@ -4082,7 +4842,7 @@ static void full_path(char *out, char *in) { strncpy(out, in, PATH_MAX-1); out[PATH_MAX-1] = 0; } else { - int len; + size_t len; // unable to get dir or out+in is too long if (!getcwd(out, PATH_MAX) || @@ -4092,7 +4852,7 @@ static void full_path(char *out, char *in) { return; } - sprintf(out+len, "/%.*s", PATH_MAX - 2 - len, in); + snprintf(out+len, PATH_MAX - len, "/%s", in); // FIXME: cope with `pwd`/../../../foo.fa ? 
} @@ -4123,8 +4883,13 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { return -1; } + if (-1 == refs_from_header(fd)) + return -1; + if (-1 == refs2id(fd->refs, fd->header)) + return -1; + /* Fix M5 strings */ - if (fd->refs && !fd->no_ref) { + if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) { int i; for (i = 0; i < hdr->hrecs->nref; i++) { sam_hrec_type_t *ty; @@ -4145,17 +4910,32 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { return -1; } rlen = fd->refs->ref_id[i]->length; - if (!(md5 = hts_md5_init())) - return -1; ref = cram_get_ref(fd, i, 1, rlen); - if (NULL == ref) return -1; + if (NULL == ref) { + if (fd->embed_ref == -1) { + // auto embed-ref + hts_log_warning("No M5 tags present and could not " + "find reference"); + hts_log_warning("Enabling embed_ref=2 option"); + hts_log_warning("NOTE: the CRAM file will be bigger " + "than using an external reference"); + pthread_mutex_lock(&fd->ref_lock); + fd->embed_ref = 2; + pthread_mutex_unlock(&fd->ref_lock); + break; + } + return -1; + } rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */ + if (!(md5 = hts_md5_init())) + return -1; hts_md5_update(md5, ref, rlen); hts_md5_final(buf, md5); hts_md5_destroy(md5); cram_ref_decr(fd->refs, i); hts_md5_hex(buf2, buf); + fd->refs->ref_id[i]->validated_md5 = 1; if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL)) return -1; } @@ -4209,9 +4989,9 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { if (blank_block) { c->length = b->comp_size + 2 + 4*is_cram_3 + - itf8_size(b->content_id) + - itf8_size(b->uncomp_size) + - itf8_size(b->comp_size); + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(b->uncomp_size) + + fd->vv.varint_size(b->comp_size); c->num_blocks = 2; c->num_landmarks = 2; @@ -4226,8 +5006,8 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { // Plus extra storage for uncompressed secondary blank block padded_length = MIN(c->length*.5, 10000); c->length += padded_length + 2 
+ 4*is_cram_3 + - itf8_size(b->content_id) + - itf8_size(padded_length)*2; + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(padded_length)*2; } else { // Pad the block instead. c->num_blocks = 1; @@ -4240,9 +5020,9 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { c->length = b->comp_size + padded_length + 2 + 4*is_cram_3 + - itf8_size(b->content_id) + - itf8_size(b->uncomp_size) + - itf8_size(b->comp_size); + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(b->uncomp_size) + + fd->vv.varint_size(b->comp_size); if (NULL == (pads = calloc(1, padded_length))) { cram_free_block(b); @@ -4283,11 +5063,6 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { cram_free_container(c); } - if (-1 == refs_from_header(fd)) - return -1; - if (-1 == refs2id(fd->refs, fd->header)) - return -1; - if (0 != hflush(fd->fp)) return -1; @@ -4303,6 +5078,53 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { * The top-level cram opening, closing and option handling */ +/* + * Sets CRAM variable sized integer decode function tables. + * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64. + * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit + * (or anything inbetween), but also now supports signed values. + * + * Version is the CRAM major version number. 
+ * vv is the vector table (probably &cram_fd->vv) + */ +static void cram_init_varint(varint_vec *vv, int version) { + if (version >= 4) { + vv->varint_get32 = uint7_get_32; // FIXME: varint.h API should be size agnostic + vv->varint_get32s = sint7_get_32; + vv->varint_get64 = uint7_get_64; + vv->varint_get64s = sint7_get_64; + vv->varint_put32 = uint7_put_32; + vv->varint_put32s = sint7_put_32; + vv->varint_put64 = uint7_put_64; + vv->varint_put64s = sint7_put_64; + vv->varint_put32_blk = uint7_put_blk_32; + vv->varint_put32s_blk = sint7_put_blk_32; + vv->varint_put64_blk = uint7_put_blk_64; + vv->varint_put64s_blk = sint7_put_blk_64; + vv->varint_size = uint7_size; + vv->varint_decode32_crc = uint7_decode_crc32; + vv->varint_decode32s_crc = sint7_decode_crc32; + vv->varint_decode64_crc = uint7_decode_crc64; + } else { + vv->varint_get32 = safe_itf8_get; + vv->varint_get32s = safe_itf8_get; + vv->varint_get64 = safe_ltf8_get; + vv->varint_get64s = safe_ltf8_get; + vv->varint_put32 = safe_itf8_put; + vv->varint_put32s = safe_itf8_put; + vv->varint_put64 = safe_ltf8_put; + vv->varint_put64s = safe_ltf8_put; + vv->varint_put32_blk = itf8_put_blk; + vv->varint_put32s_blk = itf8_put_blk; + vv->varint_put64_blk = ltf8_put_blk; + vv->varint_put64s_blk = ltf8_put_blk; + vv->varint_size = itf8_size; + vv->varint_decode32_crc = itf8_decode_crc; + vv->varint_decode32s_crc = itf8_decode_crc; + vv->varint_decode64_crc = ltf8_decode_crc; + } +} + /* * Initialises the lookup tables. 
These could be global statics, but they're * clumsy to setup in a multi-threaded environment unless we generate @@ -4385,6 +5207,8 @@ static void cram_init_tables(cram_fd *fd) { fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2; fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3; } + + cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version)); } // Default version numbers for CRAM @@ -4430,7 +5254,7 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { if (!fd) return NULL; - fd->level = 5; + fd->level = CRAM_DEFAULT_LEVEL; for (i = 0; mode[i]; i++) { if (mode[i] >= '0' && mode[i] <= '9') { fd->level = mode[i] - '0'; @@ -4452,6 +5276,8 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { fd->version = fd->file_def->major_version * 256 + fd->file_def->minor_version; + cram_init_tables(fd); + if (!(fd->header = cram_read_SAM_hdr(fd))) { cram_free_file_def(fd->file_def); goto err; @@ -4475,12 +5301,11 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { strncpy(def->file_id, filename, 20); fd->version = major_version * 256 + minor_version; + cram_init_tables(fd); /* SAM header written later along with this file_def */ } - cram_init_tables(fd); - fd->prefix = strdup((cp = strrchr(filename, '/')) ? 
cp+1 : filename); if (!fd->prefix) goto err; @@ -4499,12 +5324,14 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { fd->seqs_per_slice = SEQS_PER_SLICE; fd->bases_per_slice = BASES_PER_SLICE; fd->slices_per_container = SLICE_PER_CNT; - fd->embed_ref = 0; + fd->embed_ref = -1; // automatic selection fd->no_ref = 0; + fd->ap_delta = 0; fd->ignore_md5 = 0; fd->lossy_read_names = 0; fd->use_bz2 = 0; fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3); + fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1); fd->use_lzma = 0; fd->multi_seq = -1; fd->multi_seq_user = -1; @@ -4594,7 +5421,7 @@ int cram_flush(cram_fd *fd) { if (fd->mode == 'w' && fd->ctr) { if(fd->ctr->slice) - cram_update_curr_slice(fd->ctr); + cram_update_curr_slice(fd->ctr, fd->version); if (-1 == cram_flush_container_mt(fd, fd->ctr)) return -1; @@ -4603,6 +5430,90 @@ int cram_flush(cram_fd *fd) { return 0; } +/* + * Writes an EOF block to a CRAM file. + * + * Returns 0 on success + * -1 on failure + */ +int cram_write_eof_block(cram_fd *fd) { + // EOF block is a container with special values to aid detection + if (CRAM_MAJOR_VERS(fd->version) >= 2) { + // Empty container with + // ref_seq_id -1 + // start pos 0x454f46 ("EOF") + // span 0 + // nrec 0 + // counter 0 + // nbases 0 + // 1 block (landmark 0) + // (CRC32) + cram_container c; + memset(&c, 0, sizeof(c)); + c.ref_seq_id = -1; + c.ref_seq_start = 0x454f46; // "EOF" + c.ref_seq_span = 0; + c.record_counter = 0; + c.num_bases = 0; + c.num_blocks = 1; + int32_t land[1] = {0}; + c.landmark = land; + + // An empty compression header block with + // method raw (0) + // type comp header (1) + // content id 0 + // block contents size 6 + // raw size 6 + // empty preservation map (01 00) + // empty data series map (01 00) + // empty tag map (01 00) + // block CRC + cram_block_compression_hdr ch; + memset(&ch, 0, sizeof(ch)); + c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0); + 
+ c.length = c.comp_hdr_block->byte // Landmark[0] + + 5 // block struct + + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC + if (cram_write_container(fd, &c) < 0 || + cram_write_block(fd, c.comp_hdr_block) < 0) { + cram_close(fd); + cram_free_block(c.comp_hdr_block); + return -1; + } + if (ch.preservation_map) + kh_destroy(map, ch.preservation_map); + cram_free_block(c.comp_hdr_block); + + // V2.1 bytes + // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id + // e0 45 4f 46 00 00 00 // Cont HDR: pos, span, nrec, counter + // 00 01 00 // Cont HDR: nbase, nblk, landmark + // 00 01 00 06 06 // Comp.HDR blk + // 01 00 01 00 01 00 // Comp.HDR blk + + // V3.0 bytes: + // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id + // e0 45 4f 46 00 00 00 // Cont HDR: pos, span, nrec, counter + // 00 01 00 // Cont HDR: nbase, nblk, landmark + // 05 bd d9 4f // CRC32 + // 00 01 00 06 06 // Comp.HDR blk + // 01 00 01 00 01 00 // Comp.HDR blk + // ee 63 01 4b // CRC32 + + // V4.0 bytes: + // 0f 00 00 00 8f ff ff ff // Cont HDR: size, ref seq id + // 82 95 9e 46 00 00 00 // Cont HDR: pos, span, nrec, counter + // 00 01 00 // Cont HDR: nbase, nblk, landmark + // ac d6 05 bc // CRC32 + // 00 01 00 06 06 // Comp.HDR blk + // 01 00 01 00 01 00 // Comp.HDR blk + // ee 63 01 4b // CRC32 + } + + return 0; +} /* * Closes a CRAM file. 
* Returns 0 on success @@ -4617,7 +5528,7 @@ int cram_close(cram_fd *fd) { if (fd->mode == 'w' && fd->ctr) { if(fd->ctr->slice) - cram_update_curr_slice(fd->ctr); + cram_update_curr_slice(fd->ctr, fd->version); if (-1 == cram_flush_container_mt(fd, fd->ctr)) return -1; @@ -4646,25 +5557,8 @@ int cram_close(cram_fd *fd) { if (fd->mode == 'w') { /* Write EOF block */ - if (CRAM_MAJOR_VERS(fd->version) == 3) { - if (38 != hwrite(fd->fp, - "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR - "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR - "\x00\x01\x00" // Cont HDR - "\x05\xbd\xd9\x4f" // CRC32 - "\x00\x01\x00\x06\x06" // Comp.HDR blk - "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk - "\xee\x63\x01\x4b", // CRC32 - 38)) - return -1; - } else { - if (30 != hwrite(fd->fp, - "\x0b\x00\x00\x00\xff\xff\xff\xff" - "\x0f\xe0\x45\x4f\x46\x00\x00\x00" - "\x00\x01\x00\x00\x01\x00\x06\x06" - "\x01\x00\x01\x00\x01\x00", 30)) - return -1; - } + if (0 != cram_write_eof_block(fd)) + return -1; } for (bl = fd->bl; bl; bl = next) { @@ -4788,6 +5682,8 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { case CRAM_OPT_SEQS_PER_SLICE: fd->seqs_per_slice = va_arg(args, int); + if (fd->bases_per_slice == BASES_PER_SLICE) + fd->bases_per_slice = fd->seqs_per_slice * 500; break; case CRAM_OPT_BASES_PER_SLICE: @@ -4806,6 +5702,10 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { fd->no_ref = va_arg(args, int); break; + case CRAM_OPT_POS_DELTA: + fd->ap_delta = va_arg(args, int); + break; + case CRAM_OPT_IGNORE_MD5: fd->ignore_md5 = va_arg(args, int); break; @@ -4828,6 +5728,18 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { fd->use_rans = va_arg(args, int); break; + case CRAM_OPT_USE_TOK: + fd->use_tok = va_arg(args, int); + break; + + case CRAM_OPT_USE_FQZ: + fd->use_fqz = va_arg(args, int); + break; + + case CRAM_OPT_USE_ARITH: + fd->use_arith = va_arg(args, int); + break; + case CRAM_OPT_USE_LZMA: fd->use_lzma = 
va_arg(args, int); break; @@ -4883,15 +5795,29 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { } if (!((major == 1 && minor == 0) || (major == 2 && (minor == 0 || minor == 1)) || - (major == 3 && minor == 0))) { - hts_log_error("Unknown version string; use 1.0, 2.0, 2.1 or 3.0"); + (major == 3 && (minor == 0 || minor == 1)) || + (major == 4 && minor == 0))) { + hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0, 3.1 or 4.0"); errno = EINVAL; return -1; } + + if (major > 3 || (major == 3 && minor > 1)) { + hts_log_warning( + "CRAM version %s is still a draft and subject to change.\n" + "This is a technology demonstration that should not be " + "used for archival data.", s); + } + fd->version = major*256 + minor; - if (CRAM_MAJOR_VERS(fd->version) >= 3) - fd->use_rans = 1; + fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0; + + fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) == 3 && + CRAM_MINOR_VERS(fd->version) >= 1) || + CRAM_MAJOR_VERS(fd->version) >= 4) ? 
1 : 0; + cram_init_tables(fd); + break; } @@ -4955,6 +5881,41 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { fd->level = va_arg(args, int); break; + case HTS_OPT_PROFILE: { + enum hts_profile_option prof = va_arg(args, int); + switch (prof) { + case HTS_PROFILE_FAST: + if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1; + fd->use_tok = 0; + fd->seqs_per_slice = 10000; + break; + + case HTS_PROFILE_NORMAL: + break; + + case HTS_PROFILE_SMALL: + if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6; + fd->use_bz2 = 1; + fd->use_fqz = 1; + fd->seqs_per_slice = 25000; + break; + + case HTS_PROFILE_ARCHIVE: + if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7; + fd->use_bz2 = 1; + fd->use_fqz = 1; + fd->use_arith = 1; + if (fd->level > 7) + fd->use_lzma = 1; + fd->seqs_per_slice = 100000; + break; + } + + if (fd->bases_per_slice == BASES_PER_SLICE) + fd->bases_per_slice = fd->seqs_per_slice * 500; + break; + } + default: hts_log_error("Unknown CRAM option code %d", opt); errno = EINVAL; diff --git a/cram/cram_io.h b/cram/cram_io.h index 194305398..53ae30f59 100644 --- a/cram/cram_io.h +++ b/cram/cram_io.h @@ -68,308 +68,9 @@ extern "C" { */ int itf8_decode(cram_fd *fd, int32_t *val); -static inline int itf8_get(char *cp, int32_t *val_p) { - unsigned char *up = (unsigned char *)cp; - - if (up[0] < 0x80) { - *val_p = up[0]; - return 1; - } else if (up[0] < 0xc0) { - *val_p = ((up[0] <<8) | up[1]) & 0x3fff; - return 2; - } else if (up[0] < 0xe0) { - *val_p = ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff; - return 3; - } else if (up[0] < 0xf0) { - *val_p = ((up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff; - return 4; - } else { - *val_p = ((up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f); - return 5; - } -} - -/* - * Stores a value to memory in ITF-8 format. - * - * Returns the number of bytes required to store the number. - * This is a maximum of 5 bytes. 
- */ -static inline int itf8_put(char *cp, int32_t val) { - unsigned char *up = (unsigned char *)cp; - if (!(val & ~0x00000007f)) { // 1 byte - *up = val; - return 1; - } else if (!(val & ~0x00003fff)) { // 2 byte - *up++ = (val >> 8 ) | 0x80; - *up = val & 0xff; - return 2; - } else if (!(val & ~0x01fffff)) { // 3 byte - *up++ = (val >> 16) | 0xc0; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 3; - } else if (!(val & ~0x0fffffff)) { // 4 byte - *up++ = (val >> 24) | 0xe0; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 4; - } else { // 5 byte - *up++ = 0xf0 | ((val>>28) & 0xff); - *up++ = (val >> 20) & 0xff; - *up++ = (val >> 12) & 0xff; - *up++ = (val >> 4 ) & 0xff; - *up = val & 0x0f; - return 5; - } -} - - -/* 64-bit itf8 variant */ -static inline int ltf8_put(char *cp, int64_t val) { - unsigned char *up = (unsigned char *)cp; - if (!(val & ~((1LL<<7)-1))) { - *up = val; - return 1; - } else if (!(val & ~((1LL<<(6+8))-1))) { - *up++ = (val >> 8 ) | 0x80; - *up = val & 0xff; - return 2; - } else if (!(val & ~((1LL<<(5+2*8))-1))) { - *up++ = (val >> 16) | 0xc0; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 3; - } else if (!(val & ~((1LL<<(4+3*8))-1))) { - *up++ = (val >> 24) | 0xe0; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 4; - } else if (!(val & ~((1LL<<(3+4*8))-1))) { - *up++ = (val >> 32) | 0xf0; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 5; - } else if (!(val & ~((1LL<<(2+5*8))-1))) { - *up++ = (val >> 40) | 0xf8; - *up++ = (val >> 32) & 0xff; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 6; - } else if (!(val & ~((1LL<<(1+6*8))-1))) { - *up++ = (val >> 48) | 0xfc; - *up++ = (val >> 40) & 0xff; - *up++ = (val >> 32) & 0xff; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 
8 ) & 0xff; - *up = val & 0xff; - return 7; - } else if (!(val & ~((1LL<<(7*8))-1))) { - *up++ = (val >> 56) | 0xfe; - *up++ = (val >> 48) & 0xff; - *up++ = (val >> 40) & 0xff; - *up++ = (val >> 32) & 0xff; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 8; - } else { - *up++ = 0xff; - *up++ = (val >> 56) & 0xff; - *up++ = (val >> 48) & 0xff; - *up++ = (val >> 40) & 0xff; - *up++ = (val >> 32) & 0xff; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 9; - } -} - -static inline int ltf8_get(char *cp, int64_t *val_p) { - unsigned char *up = (unsigned char *)cp; - - if (up[0] < 0x80) { - *val_p = up[0]; - return 1; - } else if (up[0] < 0xc0) { - *val_p = (((uint64_t)up[0]<< 8) | - (uint64_t)up[1]) & (((1LL<<(6+8)))-1); - return 2; - } else if (up[0] < 0xe0) { - *val_p = (((uint64_t)up[0]<<16) | - ((uint64_t)up[1]<< 8) | - (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); - return 3; - } else if (up[0] < 0xf0) { - *val_p = (((uint64_t)up[0]<<24) | - ((uint64_t)up[1]<<16) | - ((uint64_t)up[2]<< 8) | - (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); - return 4; - } else if (up[0] < 0xf8) { - *val_p = (((uint64_t)up[0]<<32) | - ((uint64_t)up[1]<<24) | - ((uint64_t)up[2]<<16) | - ((uint64_t)up[3]<< 8) | - (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); - return 5; - } else if (up[0] < 0xfc) { - *val_p = (((uint64_t)up[0]<<40) | - ((uint64_t)up[1]<<32) | - ((uint64_t)up[2]<<24) | - ((uint64_t)up[3]<<16) | - ((uint64_t)up[4]<< 8) | - (uint64_t)up[5]) & ((1LL<<(2+5*8))-1); - return 6; - } else if (up[0] < 0xfe) { - *val_p = (((uint64_t)up[0]<<48) | - ((uint64_t)up[1]<<40) | - ((uint64_t)up[2]<<32) | - ((uint64_t)up[3]<<24) | - ((uint64_t)up[4]<<16) | - ((uint64_t)up[5]<< 8) | - (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); - return 7; - } else if (up[0] < 0xff) { - *val_p = (((uint64_t)up[1]<<48) | - ((uint64_t)up[2]<<40) | - ((uint64_t)up[3]<<32) | - ((uint64_t)up[4]<<24) 
| - ((uint64_t)up[5]<<16) | - ((uint64_t)up[6]<< 8) | - (uint64_t)up[7]) & ((1LL<<(7*8))-1); - return 8; - } else { - *val_p = (((uint64_t)up[1]<<56) | - ((uint64_t)up[2]<<48) | - ((uint64_t)up[3]<<40) | - ((uint64_t)up[4]<<32) | - ((uint64_t)up[5]<<24) | - ((uint64_t)up[6]<<16) | - ((uint64_t)up[7]<< 8) | - (uint64_t)up[8]); - return 9; - } -} - -#define itf8_size(v) ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5) - - -/* Version of itf8_get that checks it hasn't run out of input */ - extern const int itf8_bytes[16]; extern const int ltf8_bytes[256]; -static inline int safe_itf8_get(const char *cp, const char *endp, - int32_t *val_p) { - const unsigned char *up = (unsigned char *)cp; - - if (endp - cp < 5 && - (cp >= endp || endp - cp < itf8_bytes[up[0]>>4])) { - *val_p = 0; - return 0; - } - - if (up[0] < 0x80) { - *val_p = up[0]; - return 1; - } else if (up[0] < 0xc0) { - *val_p = ((up[0] <<8) | up[1]) & 0x3fff; - return 2; - } else if (up[0] < 0xe0) { - *val_p = ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff; - return 3; - } else if (up[0] < 0xf0) { - *val_p = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff; - return 4; - } else { - uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f); - *val_p = uv < 0x80000000UL ? 
(int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1; - return 5; - } -} - -static inline int safe_ltf8_get(const char *cp, const char *endp, - int64_t *val_p) { - unsigned char *up = (unsigned char *)cp; - - if (endp - cp < 9 && - (cp >= endp || endp - cp < ltf8_bytes[up[0]])) return 0; - - if (up[0] < 0x80) { - *val_p = up[0]; - return 1; - } else if (up[0] < 0xc0) { - *val_p = (((uint64_t)up[0]<< 8) | - (uint64_t)up[1]) & (((1LL<<(6+8)))-1); - return 2; - } else if (up[0] < 0xe0) { - *val_p = (((uint64_t)up[0]<<16) | - ((uint64_t)up[1]<< 8) | - (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); - return 3; - } else if (up[0] < 0xf0) { - *val_p = (((uint64_t)up[0]<<24) | - ((uint64_t)up[1]<<16) | - ((uint64_t)up[2]<< 8) | - (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); - return 4; - } else if (up[0] < 0xf8) { - *val_p = (((uint64_t)up[0]<<32) | - ((uint64_t)up[1]<<24) | - ((uint64_t)up[2]<<16) | - ((uint64_t)up[3]<< 8) | - (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); - return 5; - } else if (up[0] < 0xfc) { - *val_p = (((uint64_t)up[0]<<40) | - ((uint64_t)up[1]<<32) | - ((uint64_t)up[2]<<24) | - ((uint64_t)up[3]<<16) | - ((uint64_t)up[4]<< 8) | - (uint64_t)up[5]) & ((1LL<<(2+5*8))-1); - return 6; - } else if (up[0] < 0xfe) { - *val_p = (((uint64_t)up[0]<<48) | - ((uint64_t)up[1]<<40) | - ((uint64_t)up[2]<<32) | - ((uint64_t)up[3]<<24) | - ((uint64_t)up[4]<<16) | - ((uint64_t)up[5]<< 8) | - (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); - return 7; - } else if (up[0] < 0xff) { - *val_p = (((uint64_t)up[1]<<48) | - ((uint64_t)up[2]<<40) | - ((uint64_t)up[3]<<32) | - ((uint64_t)up[4]<<24) | - ((uint64_t)up[5]<<16) | - ((uint64_t)up[6]<< 8) | - (uint64_t)up[7]) & ((1LL<<(7*8))-1); - return 8; - } else { - *val_p = (((uint64_t)up[1]<<56) | - ((uint64_t)up[2]<<48) | - ((uint64_t)up[3]<<40) | - ((uint64_t)up[4]<<32) | - ((uint64_t)up[5]<<24) | - ((uint64_t)up[6]<<16) | - ((uint64_t)up[7]<< 8) | - (uint64_t)up[8]); - return 9; - } -} - /*! Pushes a value in ITF8 format onto the end of a block. 
* * This shouldn't be used for high-volume data as it is not the fastest @@ -467,9 +168,12 @@ int cram_uncompress_block(cram_block *b); */ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, int method, int level); +int cram_compress_block2(cram_fd *fd, cram_slice *s, + cram_block *b, cram_metrics *metrics, + int method, int level); cram_metrics *cram_new_metrics(void); -char *cram_block_method2str(enum cram_block_method m); +char *cram_block_method2str(enum cram_block_method_int m); char *cram_content_type2str(enum cram_content_type t); /* @@ -525,7 +229,7 @@ static inline int block_resize(cram_block *b, size_t len) { size_t alloc = b->alloc; while (alloc <= len) - alloc = alloc ? alloc*1.5 : 1024; + alloc = alloc ? alloc + (alloc>>2) : 1024; return block_resize_exact(b, alloc); } @@ -541,8 +245,10 @@ static inline int block_append(cram_block *b, const void *s, size_t len) { if (block_grow(b, len) < 0) return -1; - memcpy(BLOCK_END(b), s, len); - BLOCK_SIZE(b) += len; + if (len) { + memcpy(BLOCK_END(b), s, len); + BLOCK_SIZE(b) += len; + } return 0; } diff --git a/cram/cram_samtools.c b/cram/cram_samtools.c deleted file mode 100644 index 890a7fad4..000000000 --- a/cram/cram_samtools.c +++ /dev/null @@ -1,123 +0,0 @@ -/* -Copyright (c) 2010-2013, 2017-2019 Genome Research Ltd. -Author: James Bonfield - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - - 3. 
Neither the names Genome Research Ltd and Wellcome Trust Sanger -Institute nor the names of its contributors may be used to endorse or promote -products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*/ - -#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h -#include - -#include -#include -#include - -#include "cram.h" -#include "../htslib/sam.h" -#include "../sam_internal.h" - -/*--------------------------------------------------------------------------- - * Samtools compatibility portion - */ -int bam_construct_seq(bam_seq_t **bp, size_t extra_len, - const char *qname, size_t qname_len, - int flag, - int rname, // Ref ID - int64_t pos, - int64_t end, // aligned start/end coords - int mapq, - uint32_t ncigar, const uint32_t *cigar, - int mrnm, // Mate Ref ID - int64_t mpos, - int64_t isize, - int len, - const char *seq, - const char *qual) { - static const char L[256] = { - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15, 0,15,15, - 15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 
9,15,10,15,15,15,15,15,15, - 15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15 - }; - bam1_t *b = (bam1_t *)*bp; - uint8_t *cp; - int i, qname_nuls, bam_len; - - //b->l_aux = extra_len; // we fill this out later - - qname_nuls = 4 - qname_len%4; - bam_len = qname_len + qname_nuls + ncigar*4 + (len+1)/2 + len + extra_len; - if (realloc_bam_data(b, bam_len) < 0) - return -1; - b->l_data = bam_len; - - b->core.tid = rname; - b->core.pos = pos-1; - b->core.bin = bam_reg2bin(pos-1, end); - b->core.qual = mapq; - b->core.l_qname = qname_len+qname_nuls; - b->core.l_extranul = qname_nuls-1; - b->core.flag = flag; - b->core.n_cigar = ncigar; - b->core.l_qseq = len; - b->core.mtid = mrnm; - b->core.mpos = mpos-1; - b->core.isize = isize; - - cp = b->data; - - strncpy((char *)cp, qname, qname_len); - for (i = 0; i < qname_nuls; i++) - cp[qname_len+i] = '\0'; - cp += qname_len+qname_nuls; - if (ncigar > 0) memcpy(cp, cigar, ncigar*4); - cp += ncigar*4; - - for (i = 0; i+1 < len; i+=2) { - *cp++ = (L[(uc)seq[i]]<<4) + L[(uc)seq[i+1]]; - } - if (i < len) - *cp++ = L[(uc)seq[i]]<<4; - - if (qual) - memcpy(cp, qual, len); - else - memset(cp, '\xff', len); - - return bam_len; -} diff --git a/cram/cram_samtools.h b/cram/cram_samtools.h index 115a96550..a4c9bf5cc 100644 --- a/cram/cram_samtools.h +++ b/cram/cram_samtools.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2010-2013, 2018 Genome Research Ltd. +Copyright (c) 2010-2013, 2018, 2020 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -72,27 +72,4 @@ enum cigar_op { typedef bam1_t bam_seq_t; -#ifdef __cplusplus -extern "C" { -#endif - -int bam_construct_seq(bam_seq_t **bp, size_t extra_len, - const char *qname, size_t qname_len, - int flag, - int rname, // Ref ID - int64_t pos, - int64_t end, // aligned start/end coords - int mapq, - uint32_t ncigar, const uint32_t *cigar, - int mrnm, // Mate Ref ID - int64_t mpos, - int64_t isize, - int len, - const char *seq, - const char *qual); - -#ifdef __cplusplus -} -#endif - #endif /* CRAM_SAMTOOLS_H */ diff --git a/cram/cram_stats.c b/cram/cram_stats.c index 20ef51dc7..3ceda0db1 100644 --- a/cram/cram_stats.c +++ b/cram/cram_stats.c @@ -193,6 +193,8 @@ enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) { } st->nvals = nvals; + st->min_val = min_val; + st->max_val = max_val; assert(ntot == st->nsamp); free(vals); @@ -202,7 +204,19 @@ enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) { * Simple policy that everything is external unless it can be * encoded using zero bits as a unary item huffman table. */ - return nvals <= 1 ? E_HUFFMAN : E_EXTERNAL; + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + // Note, we're assuming integer data here as we don't have the + // type passed in. Cram_encoder_init does know the type and + // will convert to E_CONST_BYTE or E_EXTERNAL as appropriate. + if (nvals == 1) + return E_CONST_INT; + else if (nvals == 0 || min_val < 0) + return E_VARINT_SIGNED; + else + return E_VARINT_UNSIGNED; + } else { + return nvals <= 1 ? E_HUFFMAN : E_EXTERNAL; + } } void cram_stats_free(cram_stats *st) { diff --git a/cram/cram_structs.h b/cram/cram_structs.h index ce7ad665a..160663392 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2016, 2018-2020 Genome Research Ltd. +Copyright (c) 2012-2016, 2018-2020, 2023 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -88,7 +88,7 @@ struct hFILE; #define BASES_PER_SLICE (SEQS_PER_SLICE*500) #define SLICE_PER_CNT 1 -#define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT" +#define CRAM_SUBST_MATRIX "CGTNGTANCATNGCANACGT" #define MAX_STAT_VAL 1024 //#define MAX_STAT_VAL 16 @@ -97,21 +97,36 @@ typedef struct cram_stats { khash_t(m_i2i) *h; int nsamp; // total number of values added int nvals; // total number of unique values added + int64_t min_val, max_val; } cram_stats; /* NB: matches java impl, not the spec */ enum cram_encoding { E_NULL = 0, - E_EXTERNAL = 1, - E_GOLOMB = 2, - E_HUFFMAN = 3, + E_EXTERNAL = 1, // Only for BYTE type in CRAM 4 + E_GOLOMB = 2, // Not in CRAM 4 + E_HUFFMAN = 3, // Not in CRAM 4 E_BYTE_ARRAY_LEN = 4, E_BYTE_ARRAY_STOP = 5, - E_BETA = 6, - E_SUBEXP = 7, - E_GOLOMB_RICE = 8, - E_GAMMA = 9, - E_NUM_CODECS = 10, /* Number of codecs, not a real one. */ + E_BETA = 6, // Not in CRAM 4 + E_SUBEXP = 7, // Not in CRAM 4 + E_GOLOMB_RICE = 8, // Not in CRAM 4 + E_GAMMA = 9, // Not in CRAM 4 + + // CRAM 4 specific codecs + E_VARINT_UNSIGNED = 41, // Specialisation of EXTERNAL + E_VARINT_SIGNED = 42, // Specialisation of EXTERNAL + E_CONST_BYTE = 43, // Alternative to HUFFMAN with 1 symbol + E_CONST_INT = 44, // Alternative to HUFFMAN with 1 symbol + + // More experimental ideas, not documented in spec yet + E_XHUFFMAN = 50, // To external block + E_XPACK = 51, // Transform to sub-codec + E_XRLE = 52, // Transform to sub-codec + E_XDELTA = 53, // Transform to sub-codec + + // Total number of codecs, not a real one. 
+ E_NUM_CODECS, }; enum cram_external_type { @@ -120,6 +135,8 @@ enum cram_external_type { E_BYTE = 3, E_BYTE_ARRAY = 4, E_BYTE_ARRAY_BLOCK = 5, + E_SINT = 6, // signed INT + E_SLONG = 7, // signed LONG }; /* External IDs used by this implementation (only assumed during writing) */ @@ -192,19 +209,61 @@ struct cram_file_def { struct cram_slice; -/* Now in htslib/cram.h -enum cram_block_method { +// Internal version of htslib/cram.h enum. +// Note these have to match the laout of methmap and methcost in +// cram_io.c:cram_compress_block2 +enum cram_block_method_int { + // Public methods as defined in the CRAM spec. BM_ERROR = -1, + + // CRAM 2.x and 3.0 RAW = 0, GZIP = 1, BZIP2 = 2, LZMA = 3, - RANS = 4, // Generic; either order - RANS0 = 4, - RANS1 = 10, // Not externalised; stored as RANS (generic) - GZIP_RLE = 11, // NB: not externalised in CRAM + RANS = 4, RANS0 = RANS, + + // CRAM 3.1 onwards + RANSPR = 5, RANS_PR0 = RANSPR, + ARITH = 6, ARITH_PR0 = ARITH, + FQZ = 7, + TOK3 = 8, + // BSC = 9, ZSTD = 10 + + // Methods not externalised, but used in metrics. + // Externally they become one of the above methods. + GZIP_RLE = 11, + GZIP_1, // Z_DEFAULT_STRATEGY level 1, NB: not externalised in CRAM + + FQZ_b, FQZ_c, FQZ_d, // Various preset FQZ methods + + //RANS0, // Order 0 + RANS1, + + //RANS_PR0, // Order 0 + RANS_PR1, // Order 1 + RANS_PR64, // O0 + RLE + RANS_PR9, // O1 + X4 + RANS_PR128, // O0 + Pack + RANS_PR129, // O1 + Pack + RANS_PR192, // O0 + RLE + pack + RANS_PR193, // O1 + RLE + pack + + //TOK3, // tok+rans + TOKA, // tok+arith + + //ARITH_PR0, // Order 0 + ARITH_PR1, // Order 1 + ARITH_PR64, // O0 + RLE + ARITH_PR9, // O1 + X4 + ARITH_PR128, // O0 + Pack + ARITH_PR129, // O1 + Pack + ARITH_PR192, // O0 + RLE + pack + ARITH_PR193, // O1 + RLE + pack + + // NB: must end on no more than 31 unless we change to a + // 64-bit method type. 
}; -*/ /* Now in htslib/cram.h enum cram_content_type { @@ -218,39 +277,31 @@ enum cram_content_type { }; */ +/* Maximum simultaneous codecs allowed, 1 per bit */ +#define CRAM_MAX_METHOD 32 + /* Compression metrics */ struct cram_metrics { // number of trials and time to next trial int trial; int next_trial; + int consistency; // aggregate sizes during trials - int sz_gz_rle; - int sz_gz_def; - int sz_rans0; - int sz_rans1; - int sz_bzip2; - int sz_lzma; + int sz[CRAM_MAX_METHOD]; + int input_avg_sz, input_avg_delta; // resultant method from trials - int method; + int method, revised_method; int strat; // Revisions of method, to allow culling of continually failing ones. - int gz_rle_cnt; - int gz_def_cnt; - int rans0_cnt; - int rans1_cnt; - int bzip2_cnt; - int lzma_cnt; - int revised_method; - - double gz_rle_extra; - double gz_def_extra; - double rans0_extra; - double rans1_extra; - double bzip2_extra; - double lzma_extra; + int cnt[CRAM_MAX_METHOD]; + + double extra[CRAM_MAX_METHOD]; + + // Not amenable to rANS bit-packing techniques; cardinality > 16 + int unpackable; }; // Hash aux key (XX:i) to cram_metrics @@ -259,7 +310,7 @@ KHASH_MAP_INIT_INT(m_metrics, cram_metrics*) /* Block */ struct cram_block { - enum cram_block_method method, orig_method; + enum cram_block_method_int method, orig_method; enum cram_content_type content_type; int32_t content_id; int32_t comp_size; @@ -301,6 +352,7 @@ struct cram_block_compression_hdr { // indexed by ref-base and subst. code char substitution_matrix[5][4]; int no_ref; + int qs_seq_orient; // 1 => same as seq. 
0 => original orientation // TD Dictionary as a concatenated block cram_block *TD_blk; // Tag Dictionary @@ -317,6 +369,9 @@ struct cram_block_compression_hdr { char *uncomp; // A single block of uncompressed data size_t uncomp_size, uncomp_alloc; + + // Total codec count, used for index to block_by_id for transforms + int ncodecs; }; typedef struct cram_map { @@ -331,6 +386,7 @@ typedef struct cram_map { typedef struct cram_tag_map { struct cram_codec *codec; cram_block *blk; + cram_block *blk2; cram_metrics *m; } cram_tag_map; @@ -396,10 +452,13 @@ struct cram_container { int last_slice; // number of reads in last slice (0 for 1st) int multi_seq; // true if packing multi seqs per cont/slice int unsorted; // true is AP_delta is 0. + int qs_seq_orient; // 1 => same as seq. 0 => original orientation /* Copied from fd before encoding, to allow multi-threading */ int ref_start, first_base, last_base, ref_id, ref_end; char *ref; + int embed_ref; // 1 if embedding ref, 2 if embedding cons + int no_ref; // true if referenceless //struct ref_entry *ref; /* For multi-threading */ @@ -414,8 +473,10 @@ struct cram_container { uint32_t crc32; // CRC32 uint64_t s_num_bases; // number of bases in this slice + uint64_t s_aux_bytes; // number of bytes of aux in BAM uint32_t n_mapped; // Number of mapped reads + int ref_free; // whether 'ref' is owned by us and must be freed. 
}; /* @@ -436,11 +497,12 @@ typedef struct cram_record { int32_t mate_ref_id; int64_t mate_pos; // NP int64_t tlen; // TS + int64_t explicit_tlen;// TS, but PNEXT/RNEXT still need auto-computing // Auxiliary data int32_t ntags; // TC - int32_t aux; // idx to s->aux_blk - int32_t aux_size; // total size of packed ntags in aux_blk + uint32_t aux; // idx to s->aux_blk + uint32_t aux_size; // total size of packed ntags in aux_blk #ifndef TN_external int32_t TN_idx; // TN; idx to s->TN; #else @@ -448,15 +510,15 @@ typedef struct cram_record { #endif int TL; - int32_t seq; // idx to s->seqs_blk - int32_t qual; // idx to s->qual_blk - int32_t cigar; // idx to s->cigar + uint32_t seq; // idx to s->seqs_blk + uint32_t qual; // idx to s->qual_blk + uint32_t cigar; // idx to s->cigar int32_t ncigar; int64_t aend; // alignment end int32_t mqual; // MQ - int32_t feature; // idx to s->feature - int32_t nfeature; // number of features + uint32_t feature; // idx to s->feature + uint32_t nfeature; // number of features int32_t mate_flags; // MF } cram_record; @@ -562,8 +624,8 @@ struct cram_slice { uint32_t ncigar; cram_feature *features; - int nfeatures; - int afeatures; // allocated size of features + uint32_t nfeatures; + uint32_t afeatures; // allocated size of features #ifndef TN_external // TN field (Tag Name) @@ -616,6 +678,7 @@ typedef struct ref_entry { char *seq; mFILE *mf; int is_md5; // Reference comes from a raw seq found by MD5 + int validated_md5; } ref_entry; KHASH_MAP_INIT_STR(refs, ref_entry*) @@ -663,7 +726,10 @@ typedef struct cram_index { int slice; // 1.0 landmark index, 1.1 landmark value int len; // 1.1 - size of slice in bytes int64_t offset; // 1.0 1.1 - int64_t next; // derived: offset of next container. + + // Linked list of cram_index entries. Used to convert recursive + // NCList back to a linear list. 
+ struct cram_index *e_next; } cram_index; typedef struct { @@ -681,6 +747,36 @@ typedef struct spare_bams { struct spare_bams *next; } spare_bams; +struct cram_fd; +typedef struct varint_vec { + // Returns number of bytes decoded from fd, 0 on error + int (*varint_decode32_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc); + int (*varint_decode32s_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc); + int (*varint_decode64_crc)(struct cram_fd *fd, int64_t *val_p, uint32_t *crc); + + // Returns the value and increments *cp. Sets err to 1 iff an error occurs. + // NOTE: Does not set err to 0 on success. + int64_t (*varint_get32) (char **cp, const char *endp, int *err); + int64_t (*varint_get32s)(char **cp, const char *endp, int *err); + int64_t (*varint_get64) (char **cp, const char *endp, int *err); + int64_t (*varint_get64s)(char **cp, const char *endp, int *err); + + // Returns the number of bytes written, <= 0 on error. + int (*varint_put32) (char *cp, char *endp, int32_t val_p); + int (*varint_put32s)(char *cp, char *endp, int32_t val_p); + int (*varint_put64) (char *cp, char *endp, int64_t val_p); + int (*varint_put64s)(char *cp, char *endp, int64_t val_p); + + // Returns the number of bytes written, <= 0 on error. 
+ int (*varint_put32_blk) (cram_block *blk, int32_t val_p); + int (*varint_put32s_blk)(cram_block *blk, int32_t val_p); + int (*varint_put64_blk) (cram_block *blk, int64_t val_p); + int (*varint_put64s_blk)(cram_block *blk, int64_t val_p); + + // Returns number of bytes needed to encode 'val' + int (*varint_size)(int64_t val); +} varint_vec; + struct cram_fd { struct hFILE *fp; int mode; // 'r' or 'w' @@ -703,14 +799,14 @@ struct cram_fd { cram_container *ctr_mt; // positions for encoding or decoding - int first_base, last_base; + int first_base, last_base; // copied to container // cached reference portion refs_t *refs; // ref meta-data structure char *ref, *ref_free; // current portion held in memory - int ref_id; - int ref_start; - int ref_end; + int ref_id; // copied to container + int ref_start; // copied to container + int ref_end; // copied to container char *ref_fn; // reference fasta filename // compression level and metrics @@ -723,12 +819,15 @@ struct cram_fd { int seqs_per_slice; int bases_per_slice; int slices_per_container; - int embed_ref; - int no_ref; + int embed_ref; // copied to container + int no_ref; // copied to container int ignore_md5; int use_bz2; int use_rans; int use_lzma; + int use_fqz; + int use_tok; + int use_arith; int shared_ref; unsigned int required_fields; int store_md; @@ -772,6 +871,16 @@ struct cram_fd { int tlen_zero; // If true, permit tlen 0 (=> tlen calculated) BGZF *idxfp; // File pointer for on-the-fly index creation + + // variable integer decoding callbacks. + // This changed in CRAM4.0 to a data-size agnostic encoding. + varint_vec vv; + + // Force AP delta even on non positional sorted data. + // This can be beneficial for pairs where pairs are nearby each other. + // We suffer with delta to unrelated things (previous pair), but gain + // in delta between them. (Ideal would be a per read setting.) 
+ int ap_delta; }; // Translation of required fields to cram data series @@ -853,7 +962,8 @@ enum cram_fields { #define CRAM_FLAG_DETACHED (1<<1) #define CRAM_FLAG_MATE_DOWNSTREAM (1<<2) #define CRAM_FLAG_NO_SEQ (1<<3) -#define CRAM_FLAG_MASK ((1<<4)-1) +#define CRAM_FLAG_EXPLICIT_TLEN (1<<4) +#define CRAM_FLAG_MASK ((1<<5)-1) /* Internal only */ #define CRAM_FLAG_STATS_ADDED (1<<30) diff --git a/cram/open_trace_file.c b/cram/open_trace_file.c index 1518396d7..4d617b736 100644 --- a/cram/open_trace_file.c +++ b/cram/open_trace_file.c @@ -242,7 +242,7 @@ static char *expand_path(const char *file, char *dirname, int max_s_digits) { /* Special case for "./" or absolute filenames */ if (*file == '/' || (len==1 && *dirname == '.')) { - sprintf(path, "%s", file); + memcpy(path, file, lenf + 1); } else { /* Handle %[0-9]*s expansions, if required */ char *path_end = path; diff --git a/cram/rANS_byte.h b/cram/rANS_byte.h deleted file mode 100644 index f8bcae248..000000000 --- a/cram/rANS_byte.h +++ /dev/null @@ -1,352 +0,0 @@ -/* rans_byte.h originally from https://github.com/rygorous/ryg_rans - * - * This is a public-domain implementation of several rANS variants. rANS is an - * entropy coder from the ANS family, as described in Jarek Duda's paper - * "Asymmetric numeral systems" (http://arxiv.org/abs/1311.2540). - */ - -/*-------------------------------------------------------------------------- */ - -// Simple byte-aligned rANS encoder/decoder - public domain - Fabian 'ryg' Giesen 2014 -// -// Not intended to be "industrial strength"; just meant to illustrate the general -// idea. - -#ifndef RANS_BYTE_HEADER -#define RANS_BYTE_HEADER - -#include - -#ifdef assert -#define RansAssert assert -#else -#define RansAssert(x) -#endif - -// READ ME FIRST: -// -// This is designed like a typical arithmetic coder API, but there's three -// twists you absolutely should be aware of before you start hacking: -// -// 1. You need to encode data in *reverse* - last symbol first. 
rANS works -// like a stack: last in, first out. -// 2. Likewise, the encoder outputs bytes *in reverse* - that is, you give -// it a pointer to the *end* of your buffer (exclusive), and it will -// slowly move towards the beginning as more bytes are emitted. -// 3. Unlike basically any other entropy coder implementation you might -// have used, you can interleave data from multiple independent rANS -// encoders into the same bytestream without any extra signaling; -// you can also just write some bytes by yourself in the middle if -// you want to. This is in addition to the usual arithmetic encoder -// property of being able to switch models on the fly. Writing raw -// bytes can be useful when you have some data that you know is -// incompressible, and is cheaper than going through the rANS encode -// function. Using multiple rANS coders on the same byte stream wastes -// a few bytes compared to using just one, but execution of two -// independent encoders can happen in parallel on superscalar and -// Out-of-Order CPUs, so this can be *much* faster in tight decoding -// loops. -// -// This is why all the rANS functions take the write pointer as an -// argument instead of just storing it in some context struct. - -// -------------------------------------------------------------------------- - -// L ('l' in the paper) is the lower bound of our normalization interval. -// Between this and our byte-aligned emission, we use 31 (not 32!) bits. -// This is done intentionally because exact reciprocals for 31-bit uints -// fit in 32-bit uints: this permits some optimizations during encoding. -#define RANS_BYTE_L (1u << 23) // lower bound of our normalization interval - -// State for a rANS encoder. Yep, that's all there is to it. -typedef uint32_t RansState; - -// Initialize a rANS encoder. -static inline void RansEncInit(RansState* r) -{ - *r = RANS_BYTE_L; -} - -// Renormalize the encoder. Internal function. 
-static inline RansState RansEncRenorm(RansState x, uint8_t** pptr, uint32_t freq, uint32_t scale_bits) -{ - uint32_t x_max = ((RANS_BYTE_L >> scale_bits) << 8) * freq; // this turns into a shift. - if (x >= x_max) { - uint8_t* ptr = *pptr; - do { - *--ptr = (uint8_t) (x & 0xff); - x >>= 8; - } while (x >= x_max); - *pptr = ptr; - } - return x; -} - -// Encodes a single symbol with range start "start" and frequency "freq". -// All frequencies are assumed to sum to "1 << scale_bits", and the -// resulting bytes get written to ptr (which is updated). -// -// NOTE: With rANS, you need to encode symbols in *reverse order*, i.e. from -// beginning to end! Likewise, the output bytestream is written *backwards*: -// ptr starts pointing at the end of the output buffer and keeps decrementing. -static inline void RansEncPut(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits) -{ - // renormalize - RansState x = RansEncRenorm(*r, pptr, freq, scale_bits); - - // x = C(s,x) - *r = ((x / freq) << scale_bits) + (x % freq) + start; -} - -// Flushes the rANS encoder. -static inline void RansEncFlush(RansState* r, uint8_t** pptr) -{ - uint32_t x = *r; - uint8_t* ptr = *pptr; - - ptr -= 4; - ptr[0] = (uint8_t) (x >> 0); - ptr[1] = (uint8_t) (x >> 8); - ptr[2] = (uint8_t) (x >> 16); - ptr[3] = (uint8_t) (x >> 24); - - *pptr = ptr; -} - -// Initializes a rANS decoder. -// Unlike the encoder, the decoder works forwards as you'd expect. -static inline void RansDecInit(RansState* r, uint8_t** pptr) -{ - uint32_t x; - uint8_t* ptr = *pptr; - - x = ((uint32_t) ptr[0]) << 0; - x |= ((uint32_t) ptr[1]) << 8; - x |= ((uint32_t) ptr[2]) << 16; - x |= ((uint32_t) ptr[3]) << 24; - ptr += 4; - - *pptr = ptr; - *r = x; -} - -// Returns the current cumulative frequency (map it to a symbol yourself!) 
-static inline uint32_t RansDecGet(RansState* r, uint32_t scale_bits) -{ - return *r & ((1u << scale_bits) - 1); -} - -// Advances in the bit stream by "popping" a single symbol with range start -// "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits", -// and the resulting bytes get written to ptr (which is updated). -static inline void RansDecAdvance(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits) -{ - uint32_t mask = (1u << scale_bits) - 1; - - // s, x = D(x) - uint32_t x = *r; - x = freq * (x >> scale_bits) + (x & mask) - start; - - // renormalize - if (x < RANS_BYTE_L) { - uint8_t* ptr = *pptr; - do x = (x << 8) | *ptr++; while (x < RANS_BYTE_L); - *pptr = ptr; - } - - *r = x; -} - -// -------------------------------------------------------------------------- - -// That's all you need for a full encoder; below here are some utility -// functions with extra convenience or optimizations. - -// Encoder symbol description -// This (admittedly odd) selection of parameters was chosen to make -// RansEncPutSymbol as cheap as possible. -typedef struct { - uint32_t x_max; // (Exclusive) upper bound of pre-normalization interval - uint32_t rcp_freq; // Fixed-point reciprocal frequency - uint32_t bias; // Bias - uint16_t cmpl_freq; // Complement of frequency: (1 << scale_bits) - freq - uint16_t rcp_shift; // Reciprocal shift -} RansEncSymbol; - -// Decoder symbols are straightforward. -typedef struct { - uint16_t start; // Start of range. - uint16_t freq; // Symbol frequency. -} RansDecSymbol; - -// Initializes an encoder symbol to start "start" and frequency "freq" -static inline void RansEncSymbolInit(RansEncSymbol* s, uint32_t start, uint32_t freq, uint32_t scale_bits) -{ - RansAssert(scale_bits <= 16); - RansAssert(start <= (1u << scale_bits)); - RansAssert(freq <= (1u << scale_bits) - start); - - // Say M := 1 << scale_bits. 
- // - // The original encoder does: - // x_new = (x/freq)*M + start + (x%freq) - // - // The fast encoder does (schematically): - // q = mul_hi(x, rcp_freq) >> rcp_shift (division) - // r = x - q*freq (remainder) - // x_new = q*M + bias + r (new x) - // plugging in r into x_new yields: - // x_new = bias + x + q*(M - freq) - // =: bias + x + q*cmpl_freq (*) - // - // and we can just precompute cmpl_freq. Now we just need to - // set up our parameters such that the original encoder and - // the fast encoder agree. - - s->x_max = ((RANS_BYTE_L >> scale_bits) << 8) * freq; - s->cmpl_freq = (uint16_t) ((1 << scale_bits) - freq); - if (freq < 2) { - // freq=0 symbols are never valid to encode, so it doesn't matter what - // we set our values to. - // - // freq=1 is tricky, since the reciprocal of 1 is 1; unfortunately, - // our fixed-point reciprocal approximation can only multiply by values - // smaller than 1. - // - // So we use the "next best thing": rcp_freq=0xffffffff, rcp_shift=0. - // This gives: - // q = mul_hi(x, rcp_freq) >> rcp_shift - // = mul_hi(x, (1<<32) - 1)) >> 0 - // = floor(x - x/(2^32)) - // = x - 1 if 1 <= x < 2^32 - // and we know that x>0 (x=0 is never in a valid normalization interval). - // - // So we now need to choose the other parameters such that - // x_new = x*M + start - // plug it in: - // x*M + start (desired result) - // = bias + x + q*cmpl_freq (*) - // = bias + x + (x - 1)*(M - 1) (plug in q=x-1, cmpl_freq) - // = bias + 1 + (x - 1)*M - // = x*M + (bias + 1 - M) - // - // so we have start = bias + 1 - M, or equivalently - // bias = start + M - 1. 
- s->rcp_freq = ~0u; - s->rcp_shift = 0; - s->bias = start + (1 << scale_bits) - 1; - } else { - // Alverson, "Integer Division using reciprocals" - // shift=ceil(log2(freq)) - uint32_t shift = 0; - while (freq > (1u << shift)) - shift++; - - s->rcp_freq = (uint32_t) (((1ull << (shift + 31)) + freq-1) / freq); - s->rcp_shift = shift - 1; - - // With these values, 'q' is the correct quotient, so we - // have bias=start. - s->bias = start; - } - - s->rcp_shift += 32; // Avoid the extra >>32 in RansEncPutSymbol -} - -// Initialize a decoder symbol to start "start" and frequency "freq" -static inline void RansDecSymbolInit(RansDecSymbol* s, uint32_t start, uint32_t freq) -{ - RansAssert(start <= (1 << 16)); - RansAssert(freq <= (1 << 16) - start); - s->start = (uint16_t) start; - s->freq = (uint16_t) freq; -} - -// Encodes a given symbol. This is faster than straight RansEnc since we can do -// multiplications instead of a divide. -// -// See RansEncSymbolInit for a description of how this works. -static inline void RansEncPutSymbol(RansState* r, uint8_t** pptr, RansEncSymbol const* sym) -{ - RansAssert(sym->x_max != 0); // can't encode symbol with freq=0 - - // renormalize - uint32_t x = *r; - uint32_t x_max = sym->x_max; - - if (x >= x_max) { - uint8_t* ptr = *pptr; - do { - *--ptr = (uint8_t) (x & 0xff); - x >>= 8; - } while (x >= x_max); - *pptr = ptr; - } - - // x = C(s,x) - // NOTE: written this way so we get a 32-bit "multiply high" when - // available. If you're on a 64-bit platform with cheap multiplies - // (e.g. x64), just bake the +32 into rcp_shift. - //uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> 32) >> sym->rcp_shift; - - // The extra >>32 has already been added to RansEncSymbolInit - uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> sym->rcp_shift); - *r = x + sym->bias + q * sym->cmpl_freq; -} - -// Equivalent to RansDecAdvance that takes a symbol. 
-static inline void RansDecAdvanceSymbol(RansState* r, uint8_t** pptr, RansDecSymbol const* sym, uint32_t scale_bits) -{ - RansDecAdvance(r, pptr, sym->start, sym->freq, scale_bits); -} - -// Advances in the bit stream by "popping" a single symbol with range start -// "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits". -// No renormalization or output happens. -static inline void RansDecAdvanceStep(RansState* r, uint32_t start, uint32_t freq, uint32_t scale_bits) -{ - uint32_t mask = (1u << scale_bits) - 1; - - // s, x = D(x) - uint32_t x = *r; - *r = freq * (x >> scale_bits) + (x & mask) - start; -} - -// Equivalent to RansDecAdvanceStep that takes a symbol. -static inline void RansDecAdvanceSymbolStep(RansState* r, RansDecSymbol const* sym, uint32_t scale_bits) -{ - RansDecAdvanceStep(r, sym->start, sym->freq, scale_bits); -} - -// Renormalize. -static inline void RansDecRenorm(RansState* r, uint8_t** pptr) -{ - // renormalize - uint32_t x = *r; - - if (x < RANS_BYTE_L) { - uint8_t* ptr = *pptr; - x = (x << 8) | *ptr++; - if (x < RANS_BYTE_L) - x = (x << 8) | *ptr++; - *pptr = ptr; - } - - *r = x; -} - -// Renormalize, with extra checks for falling off the end of the input. -static inline void RansDecRenormSafe(RansState* r, uint8_t** pptr, uint8_t *ptr_end) -{ - uint32_t x = *r; - uint8_t* ptr = *pptr; - if (x >= RANS_BYTE_L || ptr >= ptr_end) return; - x = (x << 8) | *ptr++; - if (x < RANS_BYTE_L && ptr < ptr_end) - x = (x << 8) | *ptr++; - *pptr = ptr; - *r = x; -} - - -#endif // RANS_BYTE_HEADER diff --git a/cram/rANS_static.c b/cram/rANS_static.c deleted file mode 100644 index b58fe7189..000000000 --- a/cram/rANS_static.c +++ /dev/null @@ -1,909 +0,0 @@ -/* - * Copyright (c) 2014-2019 Genome Research Ltd. - * Author(s): James Bonfield - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials provided - * with the distribution. - * - * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger - * Institute nor the names of its contributors may be used to endorse - * or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS - * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH - * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Author: James Bonfield, Wellcome Trust Sanger Institute. 2014 - */ - -#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rANS_static.h" -#include "rANS_byte.h" - -#define TF_SHIFT 12 -#define TOTFREQ (1<0?(a):-(a)) -#ifndef BLK_SIZE -# define BLK_SIZE 1024*1024 -#endif - -// Room to allow for expanded BLK_SIZE on worst case compression. 
-#define BLK_SIZE2 ((int)(1.05*BLK_SIZE)) - -/*----------------------------------------------------------------------------- - * Memory to memory compression functions. - * - * These are original versions without any manual loop unrolling. They - * are easier to understand, but can be up to 2x slower. - */ - -unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size, - unsigned int *out_size) { - unsigned char *out_buf = malloc(1.05*in_size + 257*257*3 + 9); - unsigned char *cp, *out_end; - RansEncSymbol syms[256]; - RansState rans0, rans1, rans2, rans3; - uint8_t* ptr; - int F[256] = {0}, i, j, tab_size, rle, x, fsum = 0; - int m = 0, M = 0; - uint64_t tr; - - if (!out_buf) - return NULL; - - ptr = out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9; - - // Compute statistics - for (i = 0; i < in_size; i++) { - F[in[i]]++; - } - tr = ((uint64_t)TOTFREQ<<31)/in_size + (1<<30)/in_size; - normalise_harder: - // Normalise so T[i] == TOTFREQ - for (fsum = m = M = j = 0; j < 256; j++) { - if (!F[j]) - continue; - - if (m < F[j]) - m = F[j], M = j; - - if ((F[j] = (F[j]*tr)>>31) == 0) - F[j] = 1; - fsum += F[j]; - } - - fsum++; - if (fsum < TOTFREQ) { - F[M] += TOTFREQ-fsum; - } else if (fsum-TOTFREQ > F[M]/2) { - // Corner case to avoid excessive frequency reduction - tr = 2104533975; goto normalise_harder; // equiv to *0.98. - } else { - F[M] -= fsum-TOTFREQ; - } - - //printf("F[%d]=%d\n", M, F[M]); - assert(F[M]>0); - - // Encode statistics. 
- cp = out_buf+9; - - for (x = rle = j = 0; j < 256; j++) { - if (F[j]) { - // j - if (rle) { - rle--; - } else { - *cp++ = j; - if (!rle && j && F[j-1]) { - for(rle=j+1; rle<256 && F[rle]; rle++) - ; - rle -= j+1; - *cp++ = rle; - } - //fprintf(stderr, "%d: %d %d\n", j, rle, N[j]); - } - - // F[j] - if (F[j]<128) { - *cp++ = F[j]; - } else { - *cp++ = 128 | (F[j]>>8); - *cp++ = F[j]&0xff; - } - RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT); - x += F[j]; - } - } - *cp++ = 0; - - //write(1, out_buf+4, cp-(out_buf+4)); - tab_size = cp-out_buf; - - RansEncInit(&rans0); - RansEncInit(&rans1); - RansEncInit(&rans2); - RansEncInit(&rans3); - - switch (i=(in_size&3)) { - case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]); - case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]); - case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]); - case 0: - break; - } - for (i=(in_size &~3); i>0; i-=4) { - RansEncSymbol *s3 = &syms[in[i-1]]; - RansEncSymbol *s2 = &syms[in[i-2]]; - RansEncSymbol *s1 = &syms[in[i-3]]; - RansEncSymbol *s0 = &syms[in[i-4]]; - - RansEncPutSymbol(&rans3, &ptr, s3); - RansEncPutSymbol(&rans2, &ptr, s2); - RansEncPutSymbol(&rans1, &ptr, s1); - RansEncPutSymbol(&rans0, &ptr, s0); - } - - RansEncFlush(&rans3, &ptr); - RansEncFlush(&rans2, &ptr); - RansEncFlush(&rans1, &ptr); - RansEncFlush(&rans0, &ptr); - - // Finalise block size and return it - *out_size = (out_end - ptr) + tab_size; - - cp = out_buf; - - *cp++ = 0; // order - *cp++ = ((*out_size-9)>> 0) & 0xff; - *cp++ = ((*out_size-9)>> 8) & 0xff; - *cp++ = ((*out_size-9)>>16) & 0xff; - *cp++ = ((*out_size-9)>>24) & 0xff; - - *cp++ = (in_size>> 0) & 0xff; - *cp++ = (in_size>> 8) & 0xff; - *cp++ = (in_size>>16) & 0xff; - *cp++ = (in_size>>24) & 0xff; - - memmove(out_buf + tab_size, ptr, out_end-ptr); - - return out_buf; -} - -typedef struct { - unsigned char R[TOTFREQ]; -} ari_decoder; - -unsigned char *rans_uncompress_O0(unsigned char *in, unsigned int in_size, - 
unsigned int *out_size) { - /* Load in the static tables */ - unsigned char *cp = in + 9; - unsigned char *cp_end = in + in_size; - int i, j, x, rle; - unsigned int out_sz, in_sz; - char *out_buf; - ari_decoder D; - RansDecSymbol syms[256]; - - if (in_size < 26) // Need at least this many bytes just to start - return NULL; - - if (*in++ != 0) // Order-0 check - return NULL; - - in_sz = ((((uint32_t) in[0])<<0) | (((uint32_t) in[1])<<8) | - (((uint32_t) in[2])<<16) | (((uint32_t) in[3])<<24)); - out_sz = ((((uint32_t) in[4])<<0) | (((uint32_t) in[5])<<8) | - (((uint32_t) in[6])<<16) | (((uint32_t) in[7])<<24)); - if (in_sz != in_size-9) - return NULL; - - if (out_sz >= INT_MAX) - return NULL; // protect against some overflow cases - - // Precompute reverse lookup of frequency. - rle = x = 0; - j = *cp++; - do { - int F, C; - if (cp > cp_end - 16) return NULL; // Not enough input bytes left - if ((F = *cp++) >= 128) { - F &= ~128; - F = ((F & 127) << 8) | *cp++; - } - C = x; - - RansDecSymbolInit(&syms[j], C, F); - - /* Build reverse lookup table */ - if (x + F > TOTFREQ) - return NULL; - memset(&D.R[x], j, F); - - x += F; - - if (!rle && j+1 == *cp) { - j = *cp++; - rle = *cp++; - } else if (rle) { - rle--; - j++; - if (j > 255) - return NULL; - } else { - j = *cp++; - } - } while(j); - - if (x < TOTFREQ-1 || x > TOTFREQ) - return NULL; - if (x < TOTFREQ) // historically we fill 4095, not 4096 - D.R[x] = D.R[x-1]; - - if (cp > cp_end - 16) return NULL; // Not enough input bytes left - - RansState rans0, rans1, rans2, rans3; - uint8_t *ptr = cp; - RansDecInit(&rans0, &ptr); - RansDecInit(&rans1, &ptr); - RansDecInit(&rans2, &ptr); - RansDecInit(&rans3, &ptr); - - out_buf = malloc(out_sz); - if (!out_buf) - return NULL; - - int out_end = (out_sz&~3); - - RansState R[4]; - R[0] = rans0; - R[1] = rans1; - R[2] = rans2; - R[3] = rans3; - uint32_t mask = (1u << TF_SHIFT)-1; - - for (i=0; i < out_end; i+=4) { - uint32_t m[4] = {R[0] & mask, - R[1] & mask, - R[2] & mask, - 
R[3] & mask}; - uint8_t c[4] = {D.R[m[0]], - D.R[m[1]], - D.R[m[2]], - D.R[m[3]]}; - out_buf[i+0] = c[0]; - out_buf[i+1] = c[1]; - out_buf[i+2] = c[2]; - out_buf[i+3] = c[3]; - - // In theory all TOTFREQ elements of D.R are filled out, but it's - // possible this may not be true (invalid input). We could - // check with x == TOTFREQ after filling out D.R matrix, but - // for historical reasons this sums to TOTFREQ-1 leaving one - // byte in D.R uninitialised. Or we could check here that - // syms[c[0..3]].freq > 0 and initialising syms, but that is - // slow. - // - // We take the former approach and accept a potential for garbage in - // -> garbage out in the rare 1 in TOTFREQ case as the overhead of - // continuous validation of freq > 0 is steep on this tight loop. - - // RansDecAdvanceSymbolStep(&R[0], &syms[c[0]], TF_SHIFT); - // RansDecAdvanceSymbolStep(&R[1], &syms[c[1]], TF_SHIFT); - // RansDecAdvanceSymbolStep(&R[2], &syms[c[2]], TF_SHIFT); - // RansDecAdvanceSymbolStep(&R[3], &syms[c[3]], TF_SHIFT); - R[0] = syms[c[0]].freq * (R[0]>>TF_SHIFT); - R[0] += m[0] - syms[c[0]].start; - R[1] = syms[c[1]].freq * (R[1]>>TF_SHIFT); - R[1] += m[1] - syms[c[1]].start; - R[2] = syms[c[2]].freq * (R[2]>>TF_SHIFT); - R[2] += m[2] - syms[c[2]].start; - R[3] = syms[c[3]].freq * (R[3]>>TF_SHIFT); - R[3] += m[3] - syms[c[3]].start; - - if (ptr < cp_end - 8) { // Each renorm reads no more than 2 bytes - RansDecRenorm(&R[0], &ptr); - RansDecRenorm(&R[1], &ptr); - RansDecRenorm(&R[2], &ptr); - RansDecRenorm(&R[3], &ptr); - } else { - RansDecRenormSafe(&R[0], &ptr, cp_end); - RansDecRenormSafe(&R[1], &ptr, cp_end); - RansDecRenormSafe(&R[2], &ptr, cp_end); - RansDecRenormSafe(&R[3], &ptr, cp_end); - } - } - - switch(out_sz&3) { - case 3: - out_buf[out_end+2] = D.R[RansDecGet(&R[2], TF_SHIFT)]; - case 2: - out_buf[out_end+1] = D.R[RansDecGet(&R[1], TF_SHIFT)]; - case 1: - out_buf[out_end] = D.R[RansDecGet(&R[0], TF_SHIFT)]; - default: - break; - } - - *out_size = out_sz; - - 
return (unsigned char *)out_buf; -} - -unsigned char *rans_compress_O1(unsigned char *in, unsigned int in_size, - unsigned int *out_size) { - unsigned char *out_buf = NULL, *out_end, *cp; - unsigned int last_i, tab_size, rle_i, rle_j; - RansEncSymbol (*syms)[256] = NULL; /* syms[256][256] */ - int (*F)[256] = NULL; /* F[256][256] */ - int *T = NULL; /* T[256] */ - int i, j; - unsigned char c; - - if (in_size < 4) - return rans_compress_O0(in, in_size, out_size); - - syms = malloc(256 * sizeof(*syms)); - if (!syms) goto cleanup; - F = calloc(256, sizeof(*F)); - if (!F) goto cleanup; - T = calloc(256, sizeof(*T)); - if (!T) goto cleanup; - out_buf = malloc(1.05*in_size + 257*257*3 + 9); - if (!out_buf) goto cleanup; - - out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9; - cp = out_buf+9; - - //for (last = 0, i=in_size-1; i>=0; i--) { - // F[last][c = in[i]]++; - // T[last]++; - // last = c; - //} - - for (last_i=i=0; i>2)]]++; - F[0][in[2*(in_size>>2)]]++; - F[0][in[3*(in_size>>2)]]++; - T[0]+=3; - - // Normalise so T[i] == TOTFREQ - for (rle_i = i = 0; i < 256; i++) { - int t2, m, M; - unsigned int x; - - if (T[i] == 0) - continue; - - //uint64_t p = (TOTFREQ * TOTFREQ) / t; - double p = ((double)TOTFREQ)/T[i]; - normalise_harder: - for (t2 = m = M = j = 0; j < 256; j++) { - if (!F[i][j]) - continue; - - if (m < F[i][j]) - m = F[i][j], M = j; - - //if ((F[i][j] = (F[i][j] * p) / TOTFREQ) == 0) - if ((F[i][j] *= p) == 0) - F[i][j] = 1; - t2 += F[i][j]; - } - - t2++; - if (t2 < TOTFREQ) { - F[i][M] += TOTFREQ-t2; - } else if (t2-TOTFREQ >= F[i][M]/2) { - // Corner case to avoid excessive frequency reduction - p = .98; goto normalise_harder; - } else { - F[i][M] -= t2-TOTFREQ; - } - - // Store frequency table - // i - if (rle_i) { - rle_i--; - } else { - *cp++ = i; - // FIXME: could use order-0 statistics to observe which alphabet - // symbols are present and base RLE on that ordering instead. 
- if (i && T[i-1]) { - for(rle_i=i+1; rle_i<256 && T[rle_i]; rle_i++) - ; - rle_i -= i+1; - *cp++ = rle_i; - } - } - - int *F_i_ = F[i]; - x = 0; - rle_j = 0; - for (j = 0; j < 256; j++) { - if (F_i_[j]) { - //fprintf(stderr, "F[%d][%d]=%d, x=%d\n", i, j, F_i_[j], x); - - // j - if (rle_j) { - rle_j--; - } else { - *cp++ = j; - if (!rle_j && j && F_i_[j-1]) { - for(rle_j=j+1; rle_j<256 && F_i_[rle_j]; rle_j++) - ; - rle_j -= j+1; - *cp++ = rle_j; - } - } - - // F_i_[j] - if (F_i_[j]<128) { - *cp++ = F_i_[j]; - } else { - *cp++ = 128 | (F_i_[j]>>8); - *cp++ = F_i_[j]&0xff; - } - - RansEncSymbolInit(&syms[i][j], x, F_i_[j], TF_SHIFT); - x += F_i_[j]; - } - } - *cp++ = 0; - } - *cp++ = 0; - - //write(1, out_buf+4, cp-(out_buf+4)); - tab_size = cp - out_buf; - assert(tab_size < 257*257*3); - - RansState rans0, rans1, rans2, rans3; - RansEncInit(&rans0); - RansEncInit(&rans1); - RansEncInit(&rans2); - RansEncInit(&rans3); - - uint8_t* ptr = out_end; - - int isz4 = in_size>>2; - int i0 = 1*isz4-2; - int i1 = 2*isz4-2; - int i2 = 3*isz4-2; - int i3 = 4*isz4-2; - - unsigned char l0 = in[i0+1]; - unsigned char l1 = in[i1+1]; - unsigned char l2 = in[i2+1]; - unsigned char l3 = in[i3+1]; - - // Deal with the remainder - l3 = in[in_size-1]; - for (i3 = in_size-2; i3 > 4*isz4-2; i3--) { - unsigned char c3 = in[i3]; - RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]); - l3 = c3; - } - - for (; i0 >= 0; i0--, i1--, i2--, i3--) { - unsigned char c0, c1, c2, c3; - RansEncSymbol *s3 = &syms[c3 = in[i3]][l3]; - RansEncSymbol *s2 = &syms[c2 = in[i2]][l2]; - RansEncSymbol *s1 = &syms[c1 = in[i1]][l1]; - RansEncSymbol *s0 = &syms[c0 = in[i0]][l0]; - - RansEncPutSymbol(&rans3, &ptr, s3); - RansEncPutSymbol(&rans2, &ptr, s2); - RansEncPutSymbol(&rans1, &ptr, s1); - RansEncPutSymbol(&rans0, &ptr, s0); - - l0 = c0; - l1 = c1; - l2 = c2; - l3 = c3; - } - - RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]); - RansEncPutSymbol(&rans2, &ptr, &syms[0][l2]); - RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]); 
- RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]); - - RansEncFlush(&rans3, &ptr); - RansEncFlush(&rans2, &ptr); - RansEncFlush(&rans1, &ptr); - RansEncFlush(&rans0, &ptr); - - *out_size = (out_end - ptr) + tab_size; - - cp = out_buf; - *cp++ = 1; // order - - *cp++ = ((*out_size-9)>> 0) & 0xff; - *cp++ = ((*out_size-9)>> 8) & 0xff; - *cp++ = ((*out_size-9)>>16) & 0xff; - *cp++ = ((*out_size-9)>>24) & 0xff; - - *cp++ = (in_size>> 0) & 0xff; - *cp++ = (in_size>> 8) & 0xff; - *cp++ = (in_size>>16) & 0xff; - *cp++ = (in_size>>24) & 0xff; - - memmove(out_buf + tab_size, ptr, out_end-ptr); - - cleanup: - free(syms); - free(F); - free(T); - - return out_buf; -} - -unsigned char *rans_uncompress_O1(unsigned char *in, unsigned int in_size, - unsigned int *out_size) { - /* Load in the static tables */ - unsigned char *cp = in + 9; - unsigned char *ptr_end = in + in_size; - int i, j = -999, x, rle_i, rle_j; - unsigned int out_sz, in_sz; - char *out_buf = NULL; - ari_decoder *D = NULL; /* D[256] */ - RansDecSymbol (*syms)[256] = NULL; /* syms[256][256] */ - - if (in_size < 27) // Need at least this many bytes to start - return NULL; - - if (*in++ != 1) // Order-1 check - return NULL; - - in_sz = ((((uint32_t) in[0])<<0) | (((uint32_t) in[1])<<8) | - (((uint32_t) in[2])<<16) | (((uint32_t) in[3])<<24)); - out_sz = ((((uint32_t) in[4])<<0) | (((uint32_t) in[5])<<8) | - (((uint32_t) in[6])<<16) | (((uint32_t) in[7])<<24)); - if (in_sz != in_size-9) - return NULL; - - if (out_sz >= INT_MAX) - return NULL; // protect against some overflow cases - - // calloc may add 2% overhead to CRAM decode, but on linux with glibc it's - // often the same thing due to using mmap. - D = calloc(256, sizeof(*D)); - if (!D) goto cleanup; - syms = malloc(256 * sizeof(*syms)); - if (!syms) goto cleanup; - /* These memsets prevent illegal memory access in syms due to - broken compressed data. As D is calloc'd, all illegal transitions - will end up in either row or column 0 of syms. 
*/ - memset(&syms[0], 0, sizeof(syms[0])); - for (i = 1; i < 256; i++) memset(&syms[i][0], 0, sizeof(syms[0][0])); - - //fprintf(stderr, "out_sz=%d\n", out_sz); - - //i = *cp++; - rle_i = 0; - i = *cp++; - do { - rle_j = x = 0; - j = *cp++; - do { - int F, C; - if (cp > ptr_end - 16) goto cleanup; // Not enough input bytes left - if ((F = *cp++) >= 128) { - F &= ~128; - F = ((F & 127) << 8) | *cp++; - } - C = x; - - //fprintf(stderr, "i=%d j=%d F=%d C=%d\n", i, j, F, C); - - if (!F) - F = TOTFREQ; - - RansDecSymbolInit(&syms[i][j], C, F); - - /* Build reverse lookup table */ - if (x + F > TOTFREQ) - goto cleanup; - memset(&D[i].R[x], j, F); - - x += F; - - if (!rle_j && j+1 == *cp) { - j = *cp++; - rle_j = *cp++; - } else if (rle_j) { - rle_j--; - j++; - if (j > 255) - goto cleanup; - } else { - j = *cp++; - } - } while(j); - - if (x < TOTFREQ-1 || x > TOTFREQ) - goto cleanup; - if (x < TOTFREQ) // historically we fill 4095, not 4096 - D[i].R[x] = D[i].R[x-1]; - - if (!rle_i && i+1 == *cp) { - i = *cp++; - rle_i = *cp++; - } else if (rle_i) { - rle_i--; - i++; - if (i > 255) - goto cleanup; - } else { - i = *cp++; - } - } while (i); - - // Precompute reverse lookup of frequency. 
- - RansState rans0, rans1, rans2, rans3; - uint8_t *ptr = cp; - if (ptr > ptr_end - 16) goto cleanup; // Not enough input bytes left - RansDecInit(&rans0, &ptr); if (rans0 < RANS_BYTE_L) goto cleanup; - RansDecInit(&rans1, &ptr); if (rans1 < RANS_BYTE_L) goto cleanup; - RansDecInit(&rans2, &ptr); if (rans2 < RANS_BYTE_L) goto cleanup; - RansDecInit(&rans3, &ptr); if (rans3 < RANS_BYTE_L) goto cleanup; - - int isz4 = out_sz>>2; - int l0 = 0; - int l1 = 0; - int l2 = 0; - int l3 = 0; - int i4[] = {0*isz4, 1*isz4, 2*isz4, 3*isz4}; - - RansState R[4]; - R[0] = rans0; - R[1] = rans1; - R[2] = rans2; - R[3] = rans3; - - /* Allocate output buffer */ - out_buf = malloc(out_sz); - if (!out_buf) goto cleanup; - - for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) { - uint32_t m[4] = {R[0] & ((1u << TF_SHIFT)-1), - R[1] & ((1u << TF_SHIFT)-1), - R[2] & ((1u << TF_SHIFT)-1), - R[3] & ((1u << TF_SHIFT)-1)}; - - uint8_t c[4] = {D[l0].R[m[0]], - D[l1].R[m[1]], - D[l2].R[m[2]], - D[l3].R[m[3]]}; - - out_buf[i4[0]] = c[0]; - out_buf[i4[1]] = c[1]; - out_buf[i4[2]] = c[2]; - out_buf[i4[3]] = c[3]; - - //RansDecAdvanceSymbolStep(&R[0], &syms[l0][c[0]], TF_SHIFT); - //RansDecAdvanceSymbolStep(&R[1], &syms[l1][c[1]], TF_SHIFT); - //RansDecAdvanceSymbolStep(&R[2], &syms[l2][c[2]], TF_SHIFT); - //RansDecAdvanceSymbolStep(&R[3], &syms[l3][c[3]], TF_SHIFT); - - R[0] = syms[l0][c[0]].freq * (R[0]>>TF_SHIFT); - R[0] += m[0] - syms[l0][c[0]].start; - R[1] = syms[l1][c[1]].freq * (R[1]>>TF_SHIFT); - R[1] += m[1] - syms[l1][c[1]].start; - R[2] = syms[l2][c[2]].freq * (R[2]>>TF_SHIFT); - R[2] += m[2] - syms[l2][c[2]].start; - R[3] = syms[l3][c[3]].freq * (R[3]>>TF_SHIFT); - R[3] += m[3] - syms[l3][c[3]].start; - - if (ptr < ptr_end - 8) { // Each renorm reads no more than 2 bytes - RansDecRenorm(&R[0], &ptr); - RansDecRenorm(&R[1], &ptr); - RansDecRenorm(&R[2], &ptr); - RansDecRenorm(&R[3], &ptr); - } else { - RansDecRenormSafe(&R[0], &ptr, ptr_end); - RansDecRenormSafe(&R[1], &ptr, 
ptr_end); - RansDecRenormSafe(&R[2], &ptr, ptr_end); - RansDecRenormSafe(&R[3], &ptr, ptr_end); - } - - l0 = c[0]; - l1 = c[1]; - l2 = c[2]; - l3 = c[3]; - } - - // Remainder - for (; i4[3] < out_sz; i4[3]++) { - unsigned char c3 = D[l3].R[RansDecGet(&R[3], TF_SHIFT)]; - out_buf[i4[3]] = c3; - - uint32_t m = R[3] & ((1u << TF_SHIFT)-1); - R[3] = syms[l3][c3].freq * (R[3]>>TF_SHIFT) + m - syms[l3][c3].start; - RansDecRenormSafe(&R[3], &ptr, ptr_end); - l3 = c3; - } - - *out_size = out_sz; - - cleanup: - if (D) - free(D); - free(syms); - - return (unsigned char *)out_buf; -} - -/*----------------------------------------------------------------------------- - * Simple interface to the order-0 vs order-1 encoders and decoders. - */ -unsigned char *rans_compress(unsigned char *in, unsigned int in_size, - unsigned int *out_size, int order) { - return order - ? rans_compress_O1(in, in_size, out_size) - : rans_compress_O0(in, in_size, out_size); -} - -unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size, - unsigned int *out_size) { - /* Both rans_uncompress functions need to be able to read at least 9 - bytes. */ - if (in_size < 9) - return NULL; - return in[0] - ? rans_uncompress_O1(in, in_size, out_size) - : rans_uncompress_O0(in, in_size, out_size); -} - - -#ifdef TEST_MAIN -/*----------------------------------------------------------------------------- - * Main. - * - * This is a simple command line tool for testing order-0 and order-1 - * compression using the rANS codec. Simply compile with - * - * gcc -DTEST_MAIN -O3 -I. 
cram/rANS_static.c -o cram/rANS_static - * - * Usage: cram/rANS_static -o0 < file > file.o0 - * cram/rANS_static -d < file.o0 > file2 - * - * cram/rANS_static -o1 < file > file.o1 - * cram/rANS_static -d < file.o1 > file2 - */ -int main(int argc, char **argv) { - int opt, order = 0; - unsigned char in_buf[BLK_SIZE2+257*257*3]; - int decode = 0; - FILE *infp = stdin, *outfp = stdout; - struct timeval tv1, tv2; - size_t bytes = 0; - - extern char *optarg; - extern int optind; - - while ((opt = getopt(argc, argv, "o:d")) != -1) { - switch (opt) { - case 'o': - order = atoi(optarg); - break; - - case 'd': - decode = 1; - break; - } - } - - order = order ? 1 : 0; // Only support O(0) and O(1) - - if (optind < argc) { - if (!(infp = fopen(argv[optind], "rb"))) { - perror(argv[optind]); - return 1; - } - optind++; - } - - if (optind < argc) { - if (!(outfp = fopen(argv[optind], "wb"))) { - perror(argv[optind]); - fclose(infp); - return 1; - } - optind++; - } - - gettimeofday(&tv1, NULL); - - if (decode) { - // Only used in some test implementations of RC_GetFreq() - //RC_init(); - //RC_init2(); - - for (;;) { - uint32_t in_size, out_size; - unsigned char *out; - - if (9 != fread(in_buf, 1, 9, infp)) - break; - in_size = *(int *)&in_buf[1]; - if (in_size != fread(in_buf+9, 1, in_size, infp)) { - fprintf(stderr, "Truncated input\n"); - exit(1); - } - out = rans_uncompress(in_buf, in_size+9, &out_size); - if (!out) - abort(); - - fwrite(out, 1, out_size, outfp); - free(out); - - bytes += out_size; - } - } else { - for (;;) { - uint32_t in_size, out_size; - unsigned char *out; - - in_size = fread(in_buf, 1, BLK_SIZE, infp); - if (in_size <= 0) - break; - - out = rans_compress(in_buf, in_size, &out_size, order); - - fwrite(out, 1, out_size, outfp); - free(out); - - bytes += in_size; - } - } - - gettimeofday(&tv2, NULL); - - fprintf(stderr, "Took %ld microseconds, %5.1f MB/s\n", - (long)(tv2.tv_sec - tv1.tv_sec)*1000000 + - tv2.tv_usec - tv1.tv_usec, - (double)bytes / 
((long)(tv2.tv_sec - tv1.tv_sec)*1000000 + - tv2.tv_usec - tv1.tv_usec)); - - if (infp != stdin) fclose(infp); - if (outfp != stdout) fclose(outfp); - - return 0; -} -#endif diff --git a/cram/rANS_static.h b/cram/rANS_static.h deleted file mode 100644 index 5c3cf2cbf..000000000 --- a/cram/rANS_static.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2014 Genome Research Ltd. - * Author(s): James Bonfield - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials provided - * with the distribution. - * - * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger - * Institute nor the names of its contributors may be used to endorse - * or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS - * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH - * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - - -#ifndef RANS_STATIC_H -#define RANS_STATIC_H - -#ifdef __cplusplus -extern "C" { -#endif - -unsigned char *rans_compress(unsigned char *in, unsigned int in_size, - unsigned int *out_size, int order); -unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size, - unsigned int *out_size); - -#ifdef __cplusplus -} -#endif - -#endif /* RANS_STATIC_H */ diff --git a/faidx.c b/faidx.c index 4b25d3918..089001dd3 100644 --- a/faidx.c +++ b/faidx.c @@ -1,6 +1,6 @@ /* faidx.c -- FASTA and FASTQ random access. - Copyright (C) 2008, 2009, 2013-2020 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013-2020, 2022 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. Author: Heng Li @@ -702,6 +702,12 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, return NULL; } + if (val->line_blen <= 0) { + hts_log_error("Invalid line length in index: %d", val->line_blen); + *len = -1; + return NULL; + } + ret = bgzf_useek(fai->bgzf, offset + beg / val->line_blen * val->line_len @@ -731,7 +737,7 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, } s[l] = '\0'; - *len = l < INT_MAX ? l : INT_MAX; + *len = l; return s; } @@ -766,6 +772,22 @@ static int fai_get_val(const faidx_t *fai, const char *str, return 0; } +/* + * The internal still has line_blen as uint32_t, but our references + * can be longer, so for future proofing we use hts_pos_t. We also needed + * a signed value so we can return negatives as an error. + */ +hts_pos_t fai_line_length(const faidx_t *fai, const char *str) +{ + faidx1_t val; + int64_t beg, end; + hts_pos_t len; + + if (fai_get_val(fai, str, &len, &val, &beg, &end)) + return -1; + else + return val.line_blen; +} char *fai_fetch64(const faidx_t *fai, const char *str, hts_pos_t *len) { @@ -784,7 +806,7 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len) { hts_pos_t len64; char *ret = fai_fetch64(fai, str, &len64); - *len = len64; // trunc + *len = len64 < INT_MAX ? 
len64 : INT_MAX; // trunc return ret; } @@ -803,7 +825,7 @@ char *fai_fetchqual64(const faidx_t *fai, const char *str, hts_pos_t *len) { char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { hts_pos_t len64; char *ret = fai_fetchqual64(fai, str, &len64); - *len = len64; // trunc + *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc return ret; } @@ -822,26 +844,40 @@ const char *faidx_iseq(const faidx_t *fai, int i) return fai->name[i]; } -int faidx_seq_len(const faidx_t *fai, const char *seq) +hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq) { khint_t k = kh_get(s, fai->hash, seq); if ( k == kh_end(fai->hash) ) return -1; return kh_val(fai->hash, k).len; } -static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char *c_name, hts_pos_t *p_beg_i, hts_pos_t *p_end_i, hts_pos_t *len) { +int faidx_seq_len(const faidx_t *fai, const char *seq) +{ + hts_pos_t len = faidx_seq_len64(fai, seq); + return len < INT_MAX ? len : INT_MAX; +} + +static int faidx_adjust_position(const faidx_t *fai, int end_adjust, + faidx1_t *val_out, const char *c_name, + hts_pos_t *p_beg_i, hts_pos_t *p_end_i, + hts_pos_t *len) { khiter_t iter; + faidx1_t *val; // Adjust position iter = kh_get(s, fai->hash, c_name); if (iter == kh_end(fai->hash)) { - *len = -2; + if (len) + *len = -2; hts_log_error("The sequence \"%s\" was not found", c_name); return 1; } - *val = kh_value(fai->hash, iter); + val = &kh_value(fai->hash, iter); + + if (val_out) + *val_out = *val; if(*p_end_i < *p_beg_i) *p_beg_i = *p_end_i; @@ -849,22 +885,100 @@ static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char * if(*p_beg_i < 0) *p_beg_i = 0; else if(val->len <= *p_beg_i) - *p_beg_i = val->len - 1; + *p_beg_i = val->len; if(*p_end_i < 0) *p_end_i = 0; else if(val->len <= *p_end_i) - *p_end_i = val->len - 1; + *p_end_i = val->len - end_adjust; return 0; } +int fai_adjust_region(const faidx_t *fai, int tid, + hts_pos_t *beg, hts_pos_t *end) +{ + hts_pos_t 
orig_beg, orig_end; + + if (!fai || !beg || !end || tid < 0 || tid >= fai->n) + return -1; + + orig_beg = *beg; + orig_end = *end; + if (faidx_adjust_position(fai, 0, NULL, fai->name[tid], beg, end, NULL) != 0) { + hts_log_error("Inconsistent faidx internal state - couldn't find \"%s\"", + fai->name[tid]); + return -1; + } + + return ((orig_beg != *beg ? 1 : 0) | + (orig_end != *end && orig_end < HTS_POS_MAX ? 2 : 0)); +} +static void fai_retrieve_into_buffer(const faidx_t *fai, const faidx1_t *val, + const uint64_t offset, const hts_pos_t beg, const hts_pos_t end, + char* s, hts_pos_t *len) { + size_t l; + int c = 0; + int ret; + + if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) { + hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end); + *len = -1; + return; + } + + if (val->line_blen <= 0) { + hts_log_error("Invalid line length in index: %d", val->line_blen); + *len = -1; + return; + } + + ret = bgzf_useek(fai->bgzf, + offset + + beg / val->line_blen * val->line_len + + beg % val->line_blen, SEEK_SET); + + if (ret < 0) { + *len = -1; + hts_log_error("Failed to retrieve block. (Seeking in a compressed, .gzi unindexed, file?)"); + return; + } + + l = 0; + + while ( l < end - beg && (c=bgzf_getc(fai->bgzf))>=0 ) + if (isgraph(c)) s[l++] = c; + if (c < 0) { + hts_log_error("Failed to retrieve block: %s", + c == -1 ? 
"unexpected end of file" : "error reading file"); + *len = -1; + return; + } + + s[l] = '\0'; + *len = l; +} + +void faidx_fetch_seq_into_buffer(const faidx_t *fai, + const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, char* s, hts_pos_t *len) +{ + faidx1_t val; + + // Adjust position + if (faidx_adjust_position(fai, 1,&val, c_name, &p_beg_i, &p_end_i, len)) { + *len = 0; + return; + } + + fai_retrieve_into_buffer(fai, &val, val.seq_offset, p_beg_i, p_end_i + 1, s, len); +} + char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) { faidx1_t val; // Adjust position - if (faidx_adjust_position(fai, &val, c_name, &p_beg_i, &p_end_i, len)) { + if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) { return NULL; } @@ -876,7 +990,7 @@ char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p { hts_pos_t len64; char *ret = faidx_fetch_seq64(fai, c_name, p_beg_i, p_end_i, &len64); - *len = len64; // trunc + *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc return ret; } @@ -885,7 +999,7 @@ char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg faidx1_t val; // Adjust position - if (faidx_adjust_position(fai, &val, c_name, &p_beg_i, &p_end_i, len)) { + if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) { return NULL; } @@ -897,7 +1011,7 @@ char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int { hts_pos_t len64; char *ret = faidx_fetch_qual64(fai, c_name, p_beg_i, p_end_i, &len64); - *len = len64; // trunc + *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc return ret; } @@ -919,6 +1033,11 @@ void fai_set_cache_size(faidx_t *fai, int cache_size) { bgzf_set_cache_size(fai->bgzf, cache_size); } +// Adds a thread pool to the underlying BGZF layer. 
+int fai_thread_pool(faidx_t *fai, struct hts_tpool *pool, int qsize) { + return bgzf_thread_pool(fai->bgzf, pool, qsize); +} + char *fai_path(const char *fa) { char *fai = NULL; if (!fa) { diff --git a/fuzz_settings.h b/fuzz_settings.h new file mode 100644 index 000000000..821581927 --- /dev/null +++ b/fuzz_settings.h @@ -0,0 +1,35 @@ +/* fuzz_settings.h -- fuzz-tester specific definitions + + Copyright (C) 2023 Genome Research Ltd. + + Author: Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef HTSLIB_FUZZ_SETTINGS_H +#define HTSLIB_FUZZ_SETTINGS_H +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + +#ifndef FUZZ_ALLOC_LIMIT +// By default libfuzzer reports out-of-memory on allocations > 2 Gbytes +#define FUZZ_ALLOC_LIMIT 2000000000ULL +#endif + +#endif +#endif diff --git a/header.c b/header.c index 148268cc6..43fd52c57 100644 --- a/header.c +++ b/header.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2018-2020 Genome Research Ltd. 
+Copyright (c) 2018-2020, 2023 Genome Research Ltd. Authors: James Bonfield , Valeriu Ohan Redistribution and use in source and binary forms, with or without @@ -330,9 +330,14 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs, while (tag) { if (tag->str[0] == 'I' && tag->str[1] == 'D') { - assert(tag->len >= 3); - hrecs->pg[npg].name = tag->str + 3; - hrecs->pg[npg].name_len = tag->len - 3; + /* Avoid duplicate ID tags coming from other applications */ + if (!hrecs->pg[npg].name) { + assert(tag->len >= 3); + hrecs->pg[npg].name = tag->str + 3; + hrecs->pg[npg].name_len = tag->len - 3; + } else { + hts_log_warning("PG line with multiple ID tags. The first encountered was preferred - ID:%s", hrecs->pg[npg].name); + } } else if (tag->str[0] == 'P' && tag->str[1] == 'P') { // Resolve later if needed khint_t k; @@ -682,7 +687,7 @@ static void sam_hrecs_free_tags(sam_hrecs_t *hrecs, sam_hrec_tag_t *tag) { pool_free(hrecs->tag_pool, tag); } -static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_hrec_type_t *type_found) { +static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_hrec_type_t *type_found, int remove_hash) { if (!hrecs || !type_name || !type_found) return -1; @@ -710,7 +715,7 @@ static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_ } } - if (!strncmp(type_name, "SQ", 2) || !strncmp(type_name, "RG", 2)) + if (remove_hash && (!strncmp(type_name, "SQ", 2) || !strncmp(type_name, "RG", 2))) sam_hrecs_remove_hash_entry(hrecs, itype, type_found); sam_hrecs_free_tags(hrecs, type_found->tag); @@ -1424,7 +1429,7 @@ int sam_hdr_remove_line_id(sam_hdr_t *bh, const char *type, const char *ID_key, if (!type_found) return 0; - int ret = sam_hrecs_remove_line(hrecs, type, type_found); + int ret = sam_hrecs_remove_line(hrecs, type, type_found, 1); if (ret == 0) { if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) return -1; @@ -1464,7 +1469,7 @@ int 
sam_hdr_remove_line_pos(sam_hdr_t *bh, const char *type, int position) { if (!type_found) return -1; - int ret = sam_hrecs_remove_line(hrecs, type, type_found); + int ret = sam_hrecs_remove_line(hrecs, type, type_found, 1); if (ret == 0) { if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) return -1; @@ -1604,6 +1609,37 @@ int sam_hdr_update_line(sam_hdr_t *bh, const char *type, return ret; } +static int rebuild_hash(sam_hrecs_t *hrecs, const char *type) { + sam_hrec_type_t *head, *step; + khiter_t k; + + if (strncmp(type, "SQ", 2) == 0) { + hrecs->nref = 0; + kh_clear(m_s2i, hrecs->ref_hash); + } else if (strncmp(type, "RG", 2) == 0) { + hrecs->nrg = 0; + kh_clear(m_s2i, hrecs->rg_hash); + } + + k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type)); + + if (k != kh_end(hrecs->h)) { // something to rebuild + head = kh_val(hrecs->h, k); + step = head; + + do { + if (sam_hrecs_update_hashes(hrecs, TYPEKEY(type), step) == -1) { + hts_log_error("Unable to rebuild hashes"); + return -1; + } + + step = step->next; + } while (step != head); + } + + return 0; +} + int sam_hdr_remove_except(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) { sam_hrecs_t *hrecs; if (!bh || !type) @@ -1638,11 +1674,21 @@ int sam_hdr_remove_except(sam_hdr_t *bh, const char *type, const char *ID_key, c while (step != type_found) { sam_hrec_type_t *to_remove = step; step = step->next; - ret &= sam_hrecs_remove_line(hrecs, type, to_remove); + ret &= sam_hrecs_remove_line(hrecs, type, to_remove, 0); } if (remove_all) - ret &= sam_hrecs_remove_line(hrecs, type, type_found); + ret &= sam_hrecs_remove_line(hrecs, type, type_found, 0); + + /* if RG or SQ, delete then rebuild the hashes (as it is faster + to rebuild than delete one by one). 
+ */ + + if ((strncmp(type, "SQ", 2) == 0) || (strncmp(type, "RG", 2) == 0)) { + if (rebuild_hash(hrecs, type)) { + return -1; + } + } if (!ret && hrecs->dirty) redact_header_text(bh); @@ -1686,7 +1732,7 @@ int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void * if (k == kh_end(rh)) { // value is not in the hash table, so remove sam_hrec_type_t *to_remove = step; step = step->next; - ret |= sam_hrecs_remove_line(hrecs, type, to_remove); + ret |= sam_hrecs_remove_line(hrecs, type, to_remove, 0); } else { step = step->next; } @@ -1702,10 +1748,20 @@ int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void * if (k == kh_end(rh)) { // value is not in the hash table, so remove sam_hrec_type_t *to_remove = head; head = head->next; - ret |= sam_hrecs_remove_line(hrecs, type, to_remove); + ret |= sam_hrecs_remove_line(hrecs, type, to_remove, 0); } } + /* if RG or SQ, delete then rebuild the hashes (as it is faster + to rebuild than delete one by one). + */ + + if ((strncmp(type, "SQ", 2) == 0) || (strncmp(type, "RG", 2) == 0)) { + if (rebuild_hash(hrecs, type)) { + return -1; + } + } + if (!ret && hrecs->dirty) redact_header_text(bh); diff --git a/hfile.c b/hfile.c index 8bbba5cf9..82451fe71 100644 --- a/hfile.c +++ b/hfile.c @@ -1,6 +1,6 @@ /* hfile.c -- buffered low-level input/output streams. - Copyright (C) 2013-2020 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: John Marshall @@ -524,7 +524,7 @@ void hclose_abruptly(hFILE *fp) typedef struct { hFILE base; int fd; - unsigned is_socket:1; + unsigned is_socket:1, is_shared:1; } hFILE_fd; static ssize_t fd_read(hFILE *fpv, void *buffer, size_t nbytes) @@ -564,6 +564,16 @@ static ssize_t fd_write(hFILE *fpv, const void *buffer, size_t nbytes) static off_t fd_seek(hFILE *fpv, off_t offset, int whence) { hFILE_fd *fp = (hFILE_fd *) fpv; +#ifdef _WIN32 + // On windows lseek can return non-zero values even on a pipe. 
Instead + // it's likely to seek somewhere within the pipe memory buffer. + // This breaks bgzf_check_EOF among other things. + if (GetFileType((HANDLE)_get_osfhandle(fp->fd)) == FILE_TYPE_PIPE) { + errno = ESPIPE; + return -1; + } +#endif + return lseek(fp->fd, offset, whence); } @@ -589,6 +599,10 @@ static int fd_close(hFILE *fpv) { hFILE_fd *fp = (hFILE_fd *) fpv; int ret; + + // If we don't own the fd, return successfully without actually closing it + if (fp->is_shared) return 0; + do { #ifdef HAVE_CLOSESOCKET ret = fp->is_socket? closesocket(fp->fd) : close(fp->fd); @@ -626,6 +640,7 @@ static hFILE *hopen_fd(const char *filename, const char *mode) fp->fd = fd; fp->is_socket = 0; + fp->is_shared = 0; fp->base.backend = &fd_backend; return &fp->base; @@ -692,6 +707,7 @@ hFILE *hdopen(int fd, const char *mode) fp->fd = fd; fp->is_socket = (strchr(mode, 's') != NULL); + fp->is_shared = (strchr(mode, 'S') != NULL); fp->base.backend = &fd_backend; return &fp->base; } @@ -713,10 +729,12 @@ static hFILE *hopen_fd_fileuri(const char *url, const char *mode) static hFILE *hopen_fd_stdinout(const char *mode) { int fd = (strchr(mode, 'r') != NULL)? STDIN_FILENO : STDOUT_FILENO; + char mode_shared[101]; + snprintf(mode_shared, sizeof mode_shared, "S%s", mode); #if defined HAVE_SETMODE && defined O_BINARY if (setmode(fd, O_BINARY) < 0) return NULL; #endif - return hdopen(fd, mode); + return hdopen(fd, mode_shared); } HTSLIB_EXPORT @@ -1061,7 +1079,6 @@ static int load_hfile_plugins() hfile_add_scheme_handler("data", &data); hfile_add_scheme_handler("file", &file); hfile_add_scheme_handler("preload", &preload); - init_add_plugin(NULL, hfile_plugin_init_net, "knetfile"); init_add_plugin(NULL, hfile_plugin_init_mem, "mem"); init_add_plugin(NULL, hfile_plugin_init_crypt4gh_needed, "crypt4gh-needed"); @@ -1114,7 +1131,7 @@ static hFILE *hopen_unknown_scheme(const char *fname, const char *mode) } /* Returns the appropriate handler, or NULL if the string isn't an URL. 
*/ -static const struct hFILE_scheme_handler *find_scheme_handler(const char *s) +const struct hFILE_scheme_handler *find_scheme_handler(const char *s) { static const struct hFILE_scheme_handler unknown_scheme = { hopen_unknown_scheme, hfile_always_local, "built-in", 0 }; @@ -1143,6 +1160,118 @@ static const struct hFILE_scheme_handler *find_scheme_handler(const char *s) return (k != kh_end(schemes))? kh_value(schemes, k) : &unknown_scheme; } + +/*************************** + * Library introspection functions + ***************************/ + +/* + * Fills out sc_list[] with the list of known URL schemes. + * This can be restricted to just ones from a specific plugin, + * or all (plugin == NULL). + * + * Returns number of schemes found on success; + * -1 on failure. + */ +HTSLIB_EXPORT +int hfile_list_schemes(const char *plugin, const char *sc_list[], int *nschemes) +{ + pthread_mutex_lock(&plugins_lock); + if (!schemes && load_hfile_plugins() < 0) { + pthread_mutex_unlock(&plugins_lock); + return -1; + } + pthread_mutex_unlock(&plugins_lock); + + khiter_t k; + int ns = 0; + + for (k = kh_begin(schemes); k != kh_end(schemes); k++) { + if (!kh_exist(schemes, k)) + continue; + + const struct hFILE_scheme_handler *s = kh_value(schemes, k); + if (plugin && strcmp(s->provider, plugin) != 0) + continue; + + if (ns < *nschemes) + sc_list[ns] = kh_key(schemes, k); + ns++; + } + + if (*nschemes > ns) + *nschemes = ns; + + return ns; +} + + +/* + * Fills out plist[] with the list of known hFILE plugins. 
+ * + * Returns number of schemes found on success; + * -1 on failure + */ +HTSLIB_EXPORT +int hfile_list_plugins(const char *plist[], int *nplugins) +{ + pthread_mutex_lock(&plugins_lock); + if (!schemes && load_hfile_plugins() < 0) { + pthread_mutex_unlock(&plugins_lock); + return -1; + } + pthread_mutex_unlock(&plugins_lock); + + int np = 0; + if (*nplugins) + plist[np++] = "built-in"; + + struct hFILE_plugin_list *p = plugins; + while (p) { + if (np < *nplugins) + plist[np] = p->plugin.name; + + p = p->next; + np++; + } + + if (*nplugins > np) + *nplugins = np; + + return np; +} + + +/* + * Tests for the presence of a specific hFILE plugin. + * + * Returns 1 if true + * 0 otherwise + */ +HTSLIB_EXPORT +int hfile_has_plugin(const char *name) +{ + pthread_mutex_lock(&plugins_lock); + if (!schemes && load_hfile_plugins() < 0) { + pthread_mutex_unlock(&plugins_lock); + return -1; + } + pthread_mutex_unlock(&plugins_lock); + + struct hFILE_plugin_list *p = plugins; + while (p) { + if (strcmp(p->plugin.name, name) == 0) + return 1; + p = p->next; + } + + return 0; +} + +/*************************** + * hFILE interface proper + ***************************/ + hFILE *hopen(const char *fname, const char *mode, ...) { const struct hFILE_scheme_handler *handler = find_scheme_handler(fname); @@ -1213,3 +1342,82 @@ char *haddextension(struct kstring_t *buffer, const char *filename, kputs(trailing, buffer) >= 0) return buffer->s; else return NULL; } + + +/* + * ---------------------------------------------------------------------- + * Minimal stub functions for knet, added after the removal of + * hfile_net.c and knetfile.c. + * + * They exist purely for ABI compatibility, but are simply wrappers to + * hFILE. API should be compatible except knet_fileno (unused?). + * + * CULL THESE and knetfile.h at the next .so version bump. + */ +typedef struct knetFile_s { + // As per htslib/knetfile.h. 
Duplicated here as we don't wish to + // have any dependence on the deprecated knetfile.h interface, plus + // it's hopefully only temporary. + int type, fd; + int64_t offset; + char *host, *port; + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; + int64_t file_size; + char *path, *http_host; + + // Our local addition + hFILE *hf; +} knetFile; + +HTSLIB_EXPORT +knetFile *knet_open(const char *fn, const char *mode) { + knetFile *fp = calloc(1, sizeof(*fp)); + if (!fp) return NULL; + if (!(fp->hf = hopen(fn, mode))) { + free(fp); + return NULL; + } + + // FD backend is the only one implementing knet_fileno + fp->fd = fp->hf->backend == &fd_backend + ? ((hFILE_fd *)fp->hf)->fd + : -1; + + return fp; +} + +HTSLIB_EXPORT +knetFile *knet_dopen(int fd, const char *mode) { + knetFile *fp = calloc(1, sizeof(*fp)); + if (!fp) return NULL; + if (!(fp->hf = hdopen(fd, mode))) { + free(fp); + return NULL; + } + fp->fd = fd; + return fp; +} + +HTSLIB_EXPORT +ssize_t knet_read(knetFile *fp, void *buf, size_t len) { + ssize_t r = hread(fp->hf, buf, len); + fp->offset += r>0?r:0; + return r; +} + +HTSLIB_EXPORT +off_t knet_seek(knetFile *fp, off_t off, int whence) { + off_t r = hseek(fp->hf, off, whence); + if (r >= 0) + fp->offset = r; + return r; +} + +HTSLIB_EXPORT +int knet_close(knetFile *fp) { + int r = hclose(fp->hf); + free(fp); + return r; +} diff --git a/hfile_gcs.c b/hfile_gcs.c index e6f72ae4c..2f01a20ce 100644 --- a/hfile_gcs.c +++ b/hfile_gcs.c @@ -1,6 +1,6 @@ /* hfile_gcs.c -- Google Cloud Storage backend for low-level file streams. - Copyright (C) 2016 Genome Research Ltd. + Copyright (C) 2016, 2021 Genome Research Ltd. 
Author: John Marshall @@ -42,10 +42,11 @@ static hFILE * gcs_rewrite(const char *gsurl, const char *mode, int mode_has_colon, va_list *argsp) { - const char *bucket, *path, *access_token; + const char *bucket, *path, *access_token, *requester_pays_project; kstring_t mode_colon = { 0, 0, NULL }; kstring_t url = { 0, 0, NULL }; kstring_t auth_hdr = { 0, 0, NULL }; + kstring_t requester_pays_hdr = { 0, 0, NULL }; hFILE *fp = NULL; // GCS URL format is gs[+SCHEME]://BUCKET/PATH @@ -81,15 +82,35 @@ gcs_rewrite(const char *gsurl, const char *mode, int mode_has_colon, kputs(access_token, &auth_hdr); } - if (argsp || auth_hdr.l > 0 || mode_has_colon) { + requester_pays_project = getenv("GCS_REQUESTER_PAYS_PROJECT"); + + if (requester_pays_project) { + kputs("X-Goog-User-Project: ", &requester_pays_hdr); + kputs(requester_pays_project, &requester_pays_hdr); + } + + if (argsp || mode_has_colon || auth_hdr.l > 0 || requester_pays_hdr.l > 0) { if (! mode_has_colon) { kputs(mode, &mode_colon); kputc(':', &mode_colon); mode = mode_colon.s; } - fp = hopen(url.s, mode, "va_list", argsp, - "httphdr", (auth_hdr.l > 0)? auth_hdr.s : NULL, NULL); + if (auth_hdr.l > 0 && requester_pays_hdr.l > 0) { + fp = hopen( + url.s, mode, "va_list", argsp, + "httphdr:l", + auth_hdr.s, + requester_pays_hdr.s, + NULL, + NULL + ); + + } + else { + fp = hopen(url.s, mode, "va_list", argsp, + "httphdr", (auth_hdr.l > 0)? auth_hdr.s : NULL, NULL); + } } else fp = hopen(url.s, mode); @@ -97,6 +118,7 @@ gcs_rewrite(const char *gsurl, const char *mode, int mode_has_colon, free(mode_colon.s); free(url.s); free(auth_hdr.s); + free(requester_pays_hdr.s); return fp; } diff --git a/hfile_internal.h b/hfile_internal.h index 386689626..2e365ae7d 100644 --- a/hfile_internal.h +++ b/hfile_internal.h @@ -90,11 +90,13 @@ struct hFILE_backend { /* May be called by hopen_*() functions to decode a fopen()-style mode into open(2)-style flags. 
*/ +HTSLIB_EXPORT int hfile_oflags(const char *mode); /* Must be called by hopen_*() functions to allocate the hFILE struct and set up its base. Capacity is a suggested buffer size (e.g., via fstat(2)) or 0 for a default-sized buffer. */ +HTSLIB_EXPORT hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity); /* Alternative to hfile_init() for in-memory backends for which the base @@ -107,6 +109,7 @@ hFILE *hfile_init_fixed(size_t struct_size, const char *mode, /* May be called by hopen_*() functions to undo the effects of hfile_init() in the event opening the stream subsequently fails. (This is safe to use even if fp is NULL. This takes care to preserve errno.) */ +HTSLIB_EXPORT void hfile_destroy(hFILE *fp); @@ -138,10 +141,13 @@ struct hFILE_scheme_handler { }; /* May be used as an isremote() function in simple cases. */ +HTSLIB_EXPORT extern int hfile_always_local (const char *fname); +HTSLIB_EXPORT extern int hfile_always_remote(const char *fname); /* Should be called by plugins for each URL scheme they wish to handle. */ +HTSLIB_EXPORT void hfile_add_scheme_handler(const char *scheme, const struct hFILE_scheme_handler *handler); @@ -179,9 +185,6 @@ extern int hfile_plugin_init_s3(struct hFILE_plugin *self); extern int hfile_plugin_init_s3_write(struct hFILE_plugin *self); #endif -/* This one is never built as a separate plugin. */ -extern int hfile_plugin_init_net(struct hFILE_plugin *self); - // Callback to allow headers to be set in http connections. Currently used // to allow s3 to renew tokens when seeking. Kept internal for now, // although we may consider exposing it in the API later. 
diff --git a/hfile_libcurl.c b/hfile_libcurl.c index 090db348a..1e4a4486f 100644 --- a/hfile_libcurl.c +++ b/hfile_libcurl.c @@ -1221,7 +1221,7 @@ libcurl_open(const char *url, const char *modes, http_headers *headers) err = curl_easy_setopt(fp->easy, CURLOPT_PRIVATE, fp); // Avoid many repeated CWD calls with FTP, instead requesting the filename - // by full path (as done in knet, but not strictly compliant with RFC1738). + // by full path (but not strictly compliant with RFC1738). err |= curl_easy_setopt(fp->easy, CURLOPT_FTP_FILEMETHOD, CURLFTPMETHOD_NOCWD); if (mode == 'r') { @@ -1330,13 +1330,20 @@ libcurl_open(const char *url, const char *modes, http_headers *headers) } if (mode == 'r') { +#if LIBCURL_VERSION_NUM >= 0x073700 // 7.55.0 + curl_off_t offset; + + if (curl_easy_getinfo(fp->easy, CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, + &offset) == CURLE_OK && offset > 0) + fp->file_size = (off_t) offset; +#else double dval; if (curl_easy_getinfo(fp->easy, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &dval) == CURLE_OK && dval >= 0.0) fp->file_size = (off_t) (dval + 0.1); +#endif } - fp->base.backend = &libcurl_backend; return &fp->base; diff --git a/hfile_net.c b/hfile_net.c deleted file mode 100644 index 5443b2240..000000000 --- a/hfile_net.c +++ /dev/null @@ -1,112 +0,0 @@ -/* hfile_net.c -- network backend for low-level input/output streams. - - Copyright (C) 2013-2015 Genome Research Ltd. 
- - Author: John Marshall - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#include - -#include -#include - -#include "hfile_internal.h" - -#include "htslib/knetfile.h" - -typedef struct { - hFILE base; - knetFile *netfp; -} hFILE_net; - -static int net_inited = 0; - -#ifdef _WIN32 -static void net_exit(void) -{ - knet_win32_destroy(); -} -#endif - -static int net_init(void) -{ -#ifdef _WIN32 - if (knet_win32_init() != 0) return -1; - - // In the unlikely event atexit() fails, it's better to succeed here and - // carry on and do the I/O; then eventually when the program exits, we'll - // merely have failed to clean up properly, as if we had aborted. 
- (void) atexit(net_exit); -#endif - - net_inited = 1; - return 0; -} - -static ssize_t net_read(hFILE *fpv, void *buffer, size_t nbytes) -{ - hFILE_net *fp = (hFILE_net *) fpv; - return knet_read(fp->netfp, buffer, nbytes); -} - -static off_t net_seek(hFILE *fpv, off_t offset, int whence) -{ - hFILE_net *fp = (hFILE_net *) fpv; - return knet_seek(fp->netfp, offset, whence); -} - -static int net_close(hFILE *fpv) -{ - hFILE_net *fp = (hFILE_net *) fpv; - return knet_close(fp->netfp); -} - -static const struct hFILE_backend net_backend = -{ - net_read, NULL, net_seek, NULL, net_close -}; - -hFILE *hopen_net(const char *filename, const char *mode) -{ - hFILE_net *fp; - - // Do any networking initialisation if this is the first use. - if (! net_inited) { if (net_init() < 0) return NULL; } - - fp = (hFILE_net *) hfile_init(sizeof (hFILE_net), mode, 0); - if (fp == NULL) return NULL; - - fp->netfp = knet_open(filename, mode); - if (fp->netfp == NULL) { hfile_destroy((hFILE *) fp); return NULL; } - - fp->base.backend = &net_backend; - return &fp->base; -} - -int hfile_plugin_init_net(struct hFILE_plugin *self) -{ - static const struct hFILE_scheme_handler handler = - { hopen_net, hfile_always_remote, "knetfile", 0 }; - - self->name = "knetfile"; - hfile_add_scheme_handler("http", &handler); - hfile_add_scheme_handler("ftp", &handler); - return 0; -} diff --git a/hfile_s3.c b/hfile_s3.c index eeb355065..e2718f656 100644 --- a/hfile_s3.c +++ b/hfile_s3.c @@ -1,6 +1,6 @@ /* hfile_s3.c -- Amazon S3 backend for low-level file streams. - Copyright (C) 2015-2017, 2019-2020 Genome Research Ltd. + Copyright (C) 2015-2017, 2019-2023 Genome Research Ltd. Author: John Marshall @@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include @@ -39,6 +40,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #endif #include "htslib/hts.h" // for hts_version() and hts_verbose #include "htslib/kstring.h" +#include "hts_time_funcs.h" typedef struct s3_auth_data { kstring_t id; @@ -48,6 +50,8 @@ typedef struct s3_auth_data { kstring_t canonical_query_string; kstring_t user_query_string; kstring_t host; + kstring_t profile; + time_t creds_expiry_time; char *bucket; kstring_t auth_hdr; time_t auth_time; @@ -56,11 +60,12 @@ typedef struct s3_auth_data { char date_short[9]; kstring_t date_html; char mode; - char *headers[4]; + char *headers[5]; int refcount; } s3_auth_data; -#define AUTH_LIFETIME 60 +#define AUTH_LIFETIME 60 // Regenerate auth headers if older than this +#define CREDENTIAL_LIFETIME 60 // Seconds before expiry to reread credentials #if defined HAVE_COMMONCRYPTO @@ -234,7 +239,10 @@ static void parse_ini(const char *fname, const char *section, ...) va_start(args, section); while ((akey = va_arg(args, const char *)) != NULL) { kstring_t *avar = va_arg(args, kstring_t *); - if (strcmp(key, akey) == 0) { kputs(value, avar); break; } + if (strcmp(key, akey) == 0) { + avar->l = 0; + kputs(value, avar); + break; } } va_end(args); } @@ -269,17 +277,37 @@ static void parse_simple(const char *fname, kstring_t *id, kstring_t *secret) static int copy_auth_headers(s3_auth_data *ad, char ***hdrs) { char **hdr = &ad->headers[0]; + int idx = 0; *hdrs = hdr; - *hdr = strdup(ad->date); - if (!*hdr) return -1; - hdr++; + + hdr[idx] = strdup(ad->date); + if (!hdr[idx]) return -1; + idx++; + + if (ad->token.l) { + kstring_t token_hdr = KS_INITIALIZE; + kputs("X-Amz-Security-Token: ", &token_hdr); + kputs(ad->token.s, &token_hdr); + if (token_hdr.s) { + hdr[idx++] = token_hdr.s; + } else { + goto fail; + } + } + if (ad->auth_hdr.l) { - *hdr = strdup(ad->auth_hdr.s); - if (!*hdr) { free(ad->headers[0]); return -1; } - hdr++; + hdr[idx] = strdup(ad->auth_hdr.s); + if (!hdr[idx]) goto fail; + idx++; } - *hdr = NULL; + + hdr[idx] = NULL; return 0; + + fail: + for (--idx; idx >= 0; 
--idx) + free(hdr[idx]); + return -1; } static void free_auth_data(s3_auth_data *ad) { @@ -287,6 +315,7 @@ static void free_auth_data(s3_auth_data *ad) { --ad->refcount; return; } + free(ad->profile.s); free(ad->id.s); free(ad->token.s); free(ad->secret.s); @@ -300,6 +329,67 @@ static void free_auth_data(s3_auth_data *ad) { free(ad); } +static time_t parse_rfc3339_date(kstring_t *datetime) +{ + int offset = 0; + time_t when; + int num; + char should_be_t = '\0', timezone[10] = { '\0' }; + unsigned int year, mon, day, hour, min, sec; + + if (!datetime->s) + return 0; + + // It should be possible to do this with strptime(), but it seems + // to not get on with our feature definitions. + num = sscanf(datetime->s, "%4u-%2u-%2u%c%2u:%2u:%2u%9s", + &year, &mon, &day, &should_be_t, &hour, &min, &sec, timezone); + if (num < 8) + return 0; + if (should_be_t != 'T' && should_be_t != 't' && should_be_t != ' ') + return 0; + struct tm parsed = { sec, min, hour, day, mon - 1, year - 1900, 0, 0, 0 }; + + switch (timezone[0]) { + case 'Z': + case 'z': + case '\0': + break; + case '+': + case '-': { + unsigned hr_off, min_off; + if (sscanf(timezone + 1, "%2u:%2u", &hr_off, &min_off)) { + if (hr_off < 24 && min_off <= 60) { + offset = ((hr_off * 60 + min_off) + * (timezone[0] == '+' ? -60 : 60)); + } + } + break; + } + default: + return 0; + } + + when = hts_time_gm(&parsed); + return when >= 0 ? when + offset : 0; +} + +static void refresh_auth_data(s3_auth_data *ad) { + // Basically a copy of the AWS_SHARED_CREDENTIALS_FILE part of + // setup_auth_data(), but this only reads the authorisation parts. + const char *v = getenv("AWS_SHARED_CREDENTIALS_FILE"); + kstring_t expiry_time = KS_INITIALIZE; + parse_ini(v? 
v : "~/.aws/credentials", ad->profile.s, + "aws_access_key_id", &ad->id, + "aws_secret_access_key", &ad->secret, + "aws_session_token", &ad->token, + "expiry_time", &expiry_time); + if (expiry_time.l) { + ad->creds_expiry_time = parse_rfc3339_date(&expiry_time); + } + ks_free(&expiry_time); +} + static int auth_header_callback(void *ctx, char ***hdrs) { s3_auth_data *ad = (s3_auth_data *) ctx; @@ -319,7 +409,10 @@ static int auth_header_callback(void *ctx, char ***hdrs) { return 0; } - if (now - ad->auth_time < AUTH_LIFETIME) { + if (ad->creds_expiry_time > 0 + && ad->creds_expiry_time - now < CREDENTIAL_LIFETIME) { + refresh_auth_data(ad); + } else if (now - ad->auth_time < AUTH_LIFETIME) { // Last auth string should still be valid *hdrs = NULL; return 0; @@ -358,12 +451,12 @@ static int auth_header_callback(void *ctx, char ***hdrs) { /* like a escape path but for query strings '=' and '&' are untouched */ static char *escape_query(const char *qs) { - size_t i, j = 0, length; + size_t i, j = 0, length, alloced; char *escaped; length = strlen(qs); - - if ((escaped = malloc(length * 3 + 1)) == NULL) { + alloced = length * 3 + 1; + if ((escaped = malloc(alloced)) == NULL) { return NULL; } @@ -374,29 +467,25 @@ static char *escape_query(const char *qs) { c == '_' || c == '-' || c == '~' || c == '.' || c == '/' || c == '=' || c == '&') { escaped[j++] = c; } else { - sprintf(escaped + j, "%%%02X", c); + snprintf(escaped + j, alloced - j, "%%%02X", c); j += 3; } } - if (i != length) { - // in the case of a '?' 
copy the rest of the qs across unchanged - strcpy(escaped + j, qs + i); - } else { - escaped[j] = '\0'; - } + escaped[j] = '\0'; return escaped; } static char *escape_path(const char *path) { - size_t i, j = 0, length; + size_t i, j = 0, length, alloced; char *escaped; length = strlen(path); + alloced = length * 3 + 1; - if ((escaped = malloc(length * 3 + 1)) == NULL) { + if ((escaped = malloc(alloced)) == NULL) { return NULL; } @@ -409,7 +498,7 @@ static char *escape_path(const char *path) { c == '_' || c == '-' || c == '~' || c == '.' || c == '/') { escaped[j++] = c; } else { - sprintf(escaped + j, "%%%02X", c); + snprintf(escaped + j, alloced - j, "%%%02X", c); j += 3; } } @@ -498,11 +587,11 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, s3_auth_data *ad = calloc(1, sizeof(*ad)); const char *bucket, *path; char *escaped = NULL; - kstring_t profile = { 0, 0, NULL }; size_t url_path_pos; ptrdiff_t bucket_len; int is_https = 1, dns_compliant; char *query_start; + enum {s3_auto, s3_virtual, s3_path} address_style = s3_auto; if (!ad) return NULL; @@ -530,7 +619,7 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, if (*path == '@') { const char *colon = strpbrk(bucket, ":@"); if (*colon != ':') { - urldecode_kput(bucket, colon - bucket, &profile); + urldecode_kput(bucket, colon - bucket, &ad->profile); } else { const char *colon2 = strpbrk(&colon[1], ":@"); @@ -552,32 +641,89 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, if ((v = getenv("AWS_DEFAULT_REGION")) != NULL) kputs(v, &ad->region); if ((v = getenv("HTS_S3_HOST")) != NULL) kputs(v, &ad->host); - if ((v = getenv("AWS_DEFAULT_PROFILE")) != NULL) kputs(v, &profile); - else if ((v = getenv("AWS_PROFILE")) != NULL) kputs(v, &profile); - else kputs("default", &profile); + if ((v = getenv("AWS_DEFAULT_PROFILE")) != NULL) kputs(v, &ad->profile); + else if ((v = getenv("AWS_PROFILE")) != NULL) kputs(v, &ad->profile); + else 
kputs("default", &ad->profile); + + if ((v = getenv("HTS_S3_ADDRESS_STYLE")) != NULL) { + if (strcasecmp(v, "virtual") == 0) { + address_style = s3_virtual; + } else if (strcasecmp(v, "path") == 0) { + address_style = s3_path; + } + } } if (ad->id.l == 0) { + kstring_t url_style = KS_INITIALIZE; + kstring_t expiry_time = KS_INITIALIZE; const char *v = getenv("AWS_SHARED_CREDENTIALS_FILE"); - parse_ini(v? v : "~/.aws/credentials", profile.s, + parse_ini(v? v : "~/.aws/credentials", ad->profile.s, "aws_access_key_id", &ad->id, "aws_secret_access_key", &ad->secret, "aws_session_token", &ad->token, - "region", &ad->region, NULL); + "region", &ad->region, + "addressing_style", &url_style, + "expiry_time", &expiry_time, + NULL); + + if (url_style.l) { + if (strcmp(url_style.s, "virtual") == 0) { + address_style = s3_virtual; + } else if (strcmp(url_style.s, "path") == 0) { + address_style = s3_path; + } else { + address_style = s3_auto; + } + } + if (expiry_time.l) { + // Not a real part of the AWS configuration file, but it allows + // support for short-term credentials like those for the IAM + // service. The botocore library uses the key "expiry_time" + // internally for this purpose. + // See https://github.com/boto/botocore/blob/develop/botocore/credentials.py + ad->creds_expiry_time = parse_rfc3339_date(&expiry_time); + } + + ks_free(&url_style); + ks_free(&expiry_time); } if (ad->id.l == 0) { + kstring_t url_style = KS_INITIALIZE; const char *v = getenv("HTS_S3_S3CFG"); - parse_ini(v? v : "~/.s3cfg", profile.s, "access_key", &ad->id, + parse_ini(v? v : "~/.s3cfg", ad->profile.s, "access_key", &ad->id, "secret_key", &ad->secret, "access_token", &ad->token, "host_base", &ad->host, - "bucket_location", &ad->region, NULL); + "bucket_location", &ad->region, + "host_bucket", &url_style, + NULL); + + if (url_style.l) { + // Conforming to s3cmd's GitHub PR#416, host_bucket without the "%(bucket)s" string + // indicates use of path style adressing. 
+ if (strstr(url_style.s, "%(bucket)s") == NULL) { + address_style = s3_path; + } else { + address_style = s3_auto; + } + } + + ks_free(&url_style); } if (ad->id.l == 0) parse_simple("~/.awssecret", &ad->id, &ad->secret); - dns_compliant = is_dns_compliant(bucket, path, is_https); + + // if address_style is set, force the dns_compliant setting + if (address_style == s3_virtual) { + dns_compliant = 1; + } else if (address_style == s3_path) { + dns_compliant = 0; + } else { + dns_compliant = is_dns_compliant(bucket, path, is_https); + } if (ad->host.l == 0) kputs("s3.amazonaws.com", &ad->host); @@ -651,13 +797,11 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, *query_start = 0; } - free(profile.s); free(escaped); return ad; error: - free(profile.s); free(escaped); free_auth_data(ad); return NULL; @@ -665,23 +809,13 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp) { - char *header_list[4], **header = header_list; - kstring_t url = { 0, 0, NULL }; - kstring_t token_hdr = { 0, 0, NULL }; s3_auth_data *ad = setup_auth_data(s3url, mode, 2, &url); if (!ad) return NULL; - if (ad->token.l > 0) { - kputs("X-Amz-Security-Token: ", &token_hdr); - kputs(ad->token.s, &token_hdr); - *header++ = token_hdr.s; - } - - *header = NULL; - hFILE *fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list, + hFILE *fp = hopen(url.s, mode, "va_list", argsp, "httphdr_callback", auth_header_callback, "httphdr_callback_data", ad, "redirect_callback", redirect_endpoint_callback, @@ -690,12 +824,10 @@ static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp) if (!fp) goto fail; free(url.s); - free(token_hdr.s); return fp; fail: free(url.s); - free(token_hdr.s); free_auth_data(ad); return NULL; } @@ -706,14 +838,14 @@ AWS S3 sig version 4 writing code ****************************************************************/ -static void 
hash_string(char *in, size_t length, char *out) { +static void hash_string(char *in, size_t length, char *out, size_t out_len) { unsigned char hashed[SHA256_DIGEST_BUFSIZE]; int i, j; s3_sha256((const unsigned char *)in, length, hashed); for (i = 0, j = 0; i < SHA256_DIGEST_BUFSIZE; i++, j+= 2) { - sprintf(out + j, "%02x", hashed[i]); + snprintf(out + j, out_len - j, "%02x", hashed[i]); } } @@ -730,7 +862,7 @@ static void ksfree(kstring_t *s) { } -static int make_signature(s3_auth_data *ad, kstring_t *string_to_sign, char *signature_string) { +static int make_signature(s3_auth_data *ad, kstring_t *string_to_sign, char *signature_string, size_t sig_string_len) { unsigned char date_key[SHA256_DIGEST_BUFSIZE]; unsigned char date_region_key[SHA256_DIGEST_BUFSIZE]; unsigned char date_region_service_key[SHA256_DIGEST_BUFSIZE]; @@ -757,7 +889,7 @@ static int make_signature(s3_auth_data *ad, kstring_t *string_to_sign, char *sig s3_sign_sha256(signing_key, len, (const unsigned char *)string_to_sign->s, string_to_sign->l, signature, &len); for (i = 0, j = 0; i < len; i++, j+= 2) { - sprintf(signature_string + j, "%02x", signature[i]); + snprintf(signature_string + j, sig_string_len - j, "%02x", signature[i]); } ksfree(&secret_access_key); @@ -809,7 +941,7 @@ static int make_authorisation(s3_auth_data *ad, char *http_request, char *conten goto cleanup; } - hash_string(canonical_request.s, canonical_request.l, cr_hash); + hash_string(canonical_request.s, canonical_request.l, cr_hash, sizeof(cr_hash)); ksprintf(&scope, "%s/%s/s3/aws4_request", ad->date_short, ad->region.s); @@ -823,7 +955,7 @@ static int make_authorisation(s3_auth_data *ad, char *http_request, char *conten goto cleanup; } - if (make_signature(ad, &string_to_sign, signature_string)) { + if (make_signature(ad, &string_to_sign, signature_string, sizeof(signature_string))) { goto cleanup; } @@ -847,9 +979,8 @@ static int make_authorisation(s3_auth_data *ad, char *http_request, char *conten } -static int 
update_time(s3_auth_data *ad) { +static int update_time(s3_auth_data *ad, time_t now) { int ret = -1; - time_t now = time(NULL); #ifdef HAVE_GMTIME_R struct tm tm_buffer; struct tm *tm = gmtime_r(&now, &tm_buffer); @@ -940,6 +1071,7 @@ static int write_authorisation_callback(void *auth, char *request, kstring_t *co kstring_t *token, int uqs) { s3_auth_data *ad = (s3_auth_data *)auth; char content_hash[HASH_LENGTH_SHA256]; + time_t now; if (request == NULL) { // signal to free auth data @@ -947,15 +1079,21 @@ static int write_authorisation_callback(void *auth, char *request, kstring_t *co return 0; } - if (update_time(ad)) { + now = time(NULL); + + if (update_time(ad, now)) { return -1; } + if (ad->creds_expiry_time > 0 + && ad->creds_expiry_time - now < CREDENTIAL_LIFETIME) { + refresh_auth_data(ad); + } if (content) { - hash_string(content->s, content->l, content_hash); + hash_string(content->s, content->l, content_hash, sizeof(content_hash)); } else { // empty hash - hash_string("", 0, content_hash); + hash_string("", 0, content_hash, sizeof(content_hash)); } ad->canonical_query_string.l = 0; @@ -997,20 +1135,34 @@ static int write_authorisation_callback(void *auth, char *request, kstring_t *co static int v4_auth_header_callback(void *ctx, char ***hdrs) { s3_auth_data *ad = (s3_auth_data *) ctx; char content_hash[HASH_LENGTH_SHA256]; - kstring_t content = {0, 0, NULL}; - kstring_t authorisation = {0, 0, NULL}; + kstring_t content = KS_INITIALIZE; + kstring_t authorisation = KS_INITIALIZE; + kstring_t token_hdr = KS_INITIALIZE; char *date_html = NULL; + time_t now; + int idx; if (!hdrs) { // Closing connection free_auth_data(ad); return 0; } - if (update_time(ad)) { + now = time(NULL); + + if (update_time(ad, now)) { return -1; } - hash_string("", 0, content_hash); // empty hash + if (ad->creds_expiry_time > 0 + && ad->creds_expiry_time - now < CREDENTIAL_LIFETIME) { + refresh_auth_data(ad); + } + + if (!ad->id.l || !ad->secret.l) { + return copy_auth_headers(ad, 
hdrs); + } + + hash_string("", 0, content_hash, sizeof(content_hash)); // empty hash ad->canonical_query_string.l = 0; @@ -1031,18 +1183,27 @@ static int v4_auth_header_callback(void *ctx, char ***hdrs) { ksprintf(&content, "x-amz-content-sha256: %s", content_hash); date_html = strdup(ad->date_html.s); + if (ad->token.l > 0) { + kputs("X-Amz-Security-Token: ", &token_hdr); + kputs(ad->token.s, &token_hdr); + } + if (content.l == 0 || date_html == NULL) { ksfree(&authorisation); ksfree(&content); + ksfree(&token_hdr); free(date_html); return -1; } *hdrs = &ad->headers[0]; - ad->headers[0] = ks_release(&authorisation); - ad->headers[1] = date_html; - ad->headers[2] = ks_release(&content); - ad->headers[3] = NULL; + idx = 0; + ad->headers[idx++] = ks_release(&authorisation); + ad->headers[idx++] = date_html; + ad->headers[idx++] = ks_release(&content); + if (token_hdr.s) + ad->headers[idx++] = ks_release(&token_hdr); + ad->headers[idx++] = NULL; return 0; } @@ -1115,9 +1276,7 @@ static int http_status_errno(int status) static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { kstring_t url = { 0, 0, NULL }; - kstring_t token_hdr = { 0, 0, NULL }; - char *header_list[4], **header = header_list; s3_auth_data *ad = setup_auth_data(s3url, mode, 4, &url); hFILE *fp = NULL; @@ -1128,14 +1287,7 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { if (ad->mode == 'r') { long http_response = 0; - if (ad->token.l > 0) { - kputs("x-amz-security-token: ", &token_hdr); - kputs(ad->token.s, &token_hdr); - *header++ = token_hdr.s; - } - - *header = NULL; - fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list, + fp = hopen(url.s, mode, "va_list", argsp, "httphdr_callback", v4_auth_header_callback, "httphdr_callback_data", ad, "redirect_callback", redirect_endpoint_callback, @@ -1152,7 +1304,7 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { goto error; } hclose_abruptly(fp); - fp = 
hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list, + fp = hopen(url.s, mode, "va_list", argsp, "httphdr_callback", v4_auth_header_callback, "httphdr_callback_data", ad, "redirect_callback", redirect_endpoint_callback, @@ -1185,7 +1337,6 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { } free(url.s); - free(token_hdr.s); return fp; @@ -1193,7 +1344,6 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { if (fp) hclose_abruptly(fp); free(url.s); - free(token_hdr.s); free_auth_data(ad); return NULL; diff --git a/hfile_s3_write.c b/hfile_s3_write.c index eec56696b..d54945839 100644 --- a/hfile_s3_write.c +++ b/hfile_s3_write.c @@ -321,7 +321,7 @@ static int complete_upload(hFILE_s3_write *fp, kstring_t *resp) { curl_easy_reset(fp->curl); curl_easy_setopt(fp->curl, CURLOPT_POST, 1L); curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDS, fp->completion_message.s); - curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDSIZE, fp->completion_message.l); + curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDSIZE, (long) fp->completion_message.l); curl_easy_setopt(fp->curl, CURLOPT_WRITEFUNCTION, response_callback); curl_easy_setopt(fp->curl, CURLOPT_WRITEDATA, (void *)resp); curl_easy_setopt(fp->curl, CURLOPT_URL, url.s); diff --git a/hts.c b/hts.c index 4552b5a80..d90fea61f 100644 --- a/hts.c +++ b/hts.c @@ -1,6 +1,6 @@ /* hts.c -- format-neutral I/O, indexing, and iterator API functions. - Copyright (C) 2008, 2009, 2012-2020 Genome Research Ltd. + Copyright (C) 2008, 2009, 2012-2023 Genome Research Ltd. Copyright (C) 2012, 2013 Broad Institute. Author: Heng Li @@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include #include #include #include @@ -40,21 +41,36 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include +#ifdef HAVE_LIBLZMA +#ifdef HAVE_LZMA_H +#include +#else +#include "os/lzma_stub.h" +#endif +#endif + #include "htslib/hts.h" #include "htslib/bgzf.h" #include "cram/cram.h" #include "htslib/hfile.h" #include "htslib/hts_endian.h" #include "version.h" +#include "config_vars.h" #include "hts_internal.h" #include "hfile_internal.h" #include "sam_internal.h" +#include "htslib/hts_expr.h" #include "htslib/hts_os.h" // drand48 #include "htslib/khash.h" #include "htslib/kseq.h" #include "htslib/ksort.h" #include "htslib/tbx.h" +#if defined(HAVE_EXTERNAL_LIBHTSCODECS) +#include +#else +#include "htscodecs/htscodecs/htscodecs.h" +#endif #ifndef EFTYPE #define EFTYPE ENOEXEC @@ -70,6 +86,152 @@ const char *hts_version() return HTS_VERSION_TEXT; } +unsigned int hts_features(void) { + unsigned int feat = HTS_FEATURE_HTSCODECS; // Always present + +#ifdef PACKAGE_URL + feat |= HTS_FEATURE_CONFIGURE; +#endif + +#ifdef ENABLE_PLUGINS + feat |= HTS_FEATURE_PLUGINS; +#endif + +#ifdef HAVE_LIBCURL + feat |= HTS_FEATURE_LIBCURL; +#endif + +#ifdef ENABLE_S3 + feat |= HTS_FEATURE_S3; +#endif + +#ifdef ENABLE_GCS + feat |= HTS_FEATURE_GCS; +#endif + +#ifdef HAVE_LIBDEFLATE + feat |= HTS_FEATURE_LIBDEFLATE; +#endif + +#ifdef HAVE_LIBLZMA + feat |= HTS_FEATURE_LZMA; +#endif + +#ifdef HAVE_LIBBZ2 + feat |= HTS_FEATURE_BZIP2; +#endif + + return feat; +} + +const char *hts_test_feature(unsigned int id) { + unsigned int feat = hts_features(); + + switch (id) { + case HTS_FEATURE_CONFIGURE: + return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL; + case HTS_FEATURE_PLUGINS: + return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL; + case HTS_FEATURE_LIBCURL: + return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL; + case HTS_FEATURE_S3: + return feat & HTS_FEATURE_S3 ? "yes" : NULL; + case HTS_FEATURE_GCS: + return feat & HTS_FEATURE_GCS ? "yes" : NULL; + case HTS_FEATURE_LIBDEFLATE: + return feat & HTS_FEATURE_LIBDEFLATE ? 
"yes" : NULL; + case HTS_FEATURE_BZIP2: + return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL; + case HTS_FEATURE_LZMA: + return feat & HTS_FEATURE_LZMA ? "yes" : NULL; + + case HTS_FEATURE_HTSCODECS: + return htscodecs_version(); + + case HTS_FEATURE_CC: + return HTS_CC; + case HTS_FEATURE_CFLAGS: + return HTS_CFLAGS; + case HTS_FEATURE_LDFLAGS: + return HTS_LDFLAGS; + case HTS_FEATURE_CPPFLAGS: + return HTS_CPPFLAGS; + + default: + fprintf(stderr, "Unknown feature code: %u\n", id); + } + + return NULL; +} + +// Note this implementation also means we can just "strings" the library +// to find the configuration parameters. +const char *hts_feature_string(void) { + static char config[1200]; + const char *flags= + +#ifdef PACKAGE_URL + "build=configure " +#else + "build=Makefile " +#endif + +#ifdef HAVE_LIBCURL + "libcurl=yes " +#else + "libcurl=no " +#endif + +#ifdef ENABLE_S3 + "S3=yes " +#else + "S3=no " +#endif + +#ifdef ENABLE_GCS + "GCS=yes " +#else + "GCS=no " +#endif + +#ifdef HAVE_LIBDEFLATE + "libdeflate=yes " +#else + "libdeflate=no " +#endif + +#ifdef HAVE_LIBLZMA + "lzma=yes " +#else + "lzma=no " +#endif + +#ifdef HAVE_LIBBZ2 + "bzip2=yes " +#else + "bzip2=no " +#endif + +// "plugins=" must stay at the end as it is followed by "plugin-path=" +#ifdef ENABLE_PLUGINS + "plugins=yes"; +#else + "plugins=no"; +#endif + +#ifdef ENABLE_PLUGINS + snprintf(config, sizeof(config), + "%s plugin-path=%.1000s htscodecs=%.40s", + flags, hts_plugin_path(), htscodecs_version()); +#else + snprintf(config, sizeof(config), + "%s htscodecs=%.40s", + flags, htscodecs_version()); +#endif + return config; +} + + HTSLIB_EXPORT const unsigned char seq_nt16_table[256] = { 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, @@ -108,6 +270,7 @@ static enum htsFormatCategory format_category(enum htsExactFormat fmt) case sam: case cram: case fastq_format: + case fasta_format: return sequence_data; case vcf: @@ -124,9 +287,9 @@ static enum htsFormatCategory format_category(enum 
htsExactFormat fmt) return index_file; case bed: + case d4_format: return region_list; - case fasta_format: case htsget: case hts_crypt4gh_format: return unknown_category; @@ -144,13 +307,14 @@ static enum htsFormatCategory format_category(enum htsExactFormat fmt) // Decompress several hundred bytes by peeking at the file, which must be // positioned at the start of a GZIP block. -static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize) +static ssize_t +decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize) { unsigned char buffer[2048]; z_stream zs; ssize_t npeek = hpeek(fp, buffer, sizeof buffer); - if (npeek < 0) return 0; + if (npeek < 0) return -1; zs.zalloc = NULL; zs.zfree = NULL; @@ -158,17 +322,63 @@ static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize) zs.avail_in = npeek; zs.next_out = dest; zs.avail_out = destsize; - if (inflateInit2(&zs, 31) != Z_OK) return 0; + if (inflateInit2(&zs, 31) != Z_OK) return -1; - while (zs.total_out < destsize) - if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break; + int ret; + const unsigned char *last_in = buffer; + while (zs.avail_out > 0) { + ret = inflate(&zs, Z_SYNC_FLUSH); + if (ret == Z_STREAM_END) { + if (last_in == zs.next_in) + break; // Paranoia to avoid potential looping. Shouldn't happen + else + last_in = zs.next_in; + inflateReset(&zs); + } else if (ret != Z_OK) { + // eg Z_BUF_ERROR due to avail_in/out becoming zero + break; + } + } - destsize = zs.total_out; + // NB: zs.total_out is changed by inflateReset, so use pointer diff instead + destsize = zs.next_out - dest; inflateEnd(&zs); return destsize; } +#ifdef HAVE_LIBLZMA +// Similarly decompress a portion by peeking at the file, which must be +// positioned at the start of the file. 
+static ssize_t +decompress_peek_xz(hFILE *fp, unsigned char *dest, size_t destsize) +{ + unsigned char buffer[2048]; + ssize_t npeek = hpeek(fp, buffer, sizeof buffer); + if (npeek < 0) return -1; + + lzma_stream ls = LZMA_STREAM_INIT; + if (lzma_stream_decoder(&ls, lzma_easy_decoder_memusage(9), 0) != LZMA_OK) + return -1; + + ls.next_in = buffer; + ls.avail_in = npeek; + ls.next_out = dest; + ls.avail_out = destsize; + + int r = lzma_code(&ls, LZMA_RUN); + if (! (r == LZMA_OK || r == LZMA_STREAM_END)) { + lzma_end(&ls); + return -1; + } + + destsize = ls.total_out; + lzma_end(&ls); + + return destsize; +} +#endif + // Parse "x.y" text, taking care because the string is not NUL-terminated // and filling in major/minor only when the digits are followed by a delimiter, // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer. @@ -221,12 +431,17 @@ static int is_text_only(const unsigned char *u, const unsigned char *ulim) return 1; } -static int -secondline_is_bases(const unsigned char *u, const unsigned char *ulim) +static int is_fastaq(const unsigned char *u, const unsigned char *ulim) { - // Skip to second line, returning false if there isn't one - u = memchr(u, '\n', ulim - u); - if (u == NULL || ++u == ulim) return 0; + const unsigned char *eol = memchr(u, '\n', ulim - u); + + // Check that the first line is entirely textual + if (! is_text_only(u, eol? 
eol : ulim)) return 0; + + // If the first line is very long, consider the file to indeed be FASTA/Q + if (eol == NULL) return 1; + + u = eol+1; // Now points to the first character of the second line // Scan over all base-encoding letters (including 'N' but not SEQ's '=') while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) { @@ -311,7 +526,12 @@ static int colmatch(const char *columns, const char *pattern) int hts_detect_format(hFILE *hfile, htsFormat *fmt) { - char columns[24]; + return hts_detect_format2(hfile, NULL, fmt); +} + +int hts_detect_format2(hFILE *hfile, const char *fname, htsFormat *fmt) +{ + char extension[HTS_MAX_EXT_LEN], columns[24]; unsigned char s[1024]; int complete = 0; ssize_t len = hpeek(hfile, s, 18); @@ -327,12 +547,17 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) { // The stream is either gzip-compressed or BGZF-compressed. // Determine which, and decompress the first few records or lines. - fmt->compression = (len >= 18 && (s[3] & 4) && - memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip; + fmt->compression = gzip; + if (len >= 18 && (s[3] & 4)) { + if (memcmp(&s[12], "BC\2\0", 4) == 0) + fmt->compression = bgzf; + else if (memcmp(&s[12], "RAZF", 4) == 0) + fmt->compression = razf_compression; + } if (len >= 9 && s[2] == 8) fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 1 : -1; - len = decompress_peek(hfile, s, sizeof s); + len = decompress_peek_gz(hfile, s, sizeof s); } else if (len >= 10 && memcmp(s, "BZh", 3) == 0 && (memcmp(&s[4], "\x31\x41\x59\x26\x53\x59", 6) == 0 || @@ -346,6 +571,19 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) if (s[4] == '\x31') return 0; else len = 0; } + else if (len >= 6 && memcmp(s, "\xfd""7zXZ\0", 6) == 0) { + fmt->compression = xz_compression; +#ifdef HAVE_LIBLZMA + len = decompress_peek_xz(hfile, s, sizeof s); +#else + // Without liblzma, we can't recognise the decompressed contents. 
+ return 0; +#endif + } + else if (len >= 4 && memcmp(s, "\x28\xb5\x2f\xfd", 4) == 0) { + fmt->compression = zstd_compression; + return 0; + } else { len = hpeek(hfile, s, sizeof s); } @@ -356,6 +594,18 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) return 0; } + // We avoid using filename extensions wherever possible (as filenames are + // not always available), but in a few cases they must be considered: + // - FASTA/Q indexes are simply tab-separated text; files that match these + // patterns but not the fai/fqi extension are usually generic BED files + // - GZI indexes have no magic numbers so can only be detected by filename + if (fname && strcmp(fname, "-") != 0) { + char *s; + if (find_file_extension(fname, extension) < 0) extension[0] = '\0'; + for (s = extension; *s; s++) *s = tolower_c(*s); + } + else extension[0] = '\0'; + if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=7 && s[5]<=7) { fmt->category = sequence_data; fmt->format = cram; @@ -401,6 +651,13 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) fmt->format = tbi; return 0; } + // GZI indexes have no magic numbers, so must be recognised solely by + // filename extension. + else if (strcmp(extension, "gzi") == 0) { + fmt->category = index_file; + fmt->format = gzi; + return 0; + } } else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) { fmt->category = variant_data; @@ -423,6 +680,13 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) fmt->version.major = 1, fmt->version.minor = -1; return 0; } + else if (len >= 8 && memcmp(s, "d4\xdd\xdd", 4) == 0) { + fmt->category = region_list; + fmt->format = d4_format; + // How to decode the D4 Format Version bytes is not yet specified + // so we don't try to set fmt->version.{major,minor}. 
+ return 0; + } else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) { fmt->category = unknown_category; fmt->format = htsget; @@ -433,11 +697,12 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) fmt->format = hts_crypt4gh_format; return 0; } - else if (len >= 1 && s[0] == '>' && secondline_is_bases(s, &s[len])) { + else if (len >= 1 && s[0] == '>' && is_fastaq(s, &s[len])) { + fmt->category = sequence_data; fmt->format = fasta_format; return 0; } - else if (len >= 1 && s[0] == '@' && secondline_is_bases(s, &s[len])) { + else if (len >= 1 && s[0] == '@' && is_fastaq(s, &s[len])) { fmt->category = sequence_data; fmt->format = fastq_format; return 0; @@ -459,12 +724,12 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) fmt->format = crai; return 0; } - else if (colmatch(columns, "Ziiiii") == 6) { + else if (strstr(extension, "fqi") && colmatch(columns, "Ziiiii") == 6) { fmt->category = index_file; fmt->format = fqi_format; return 0; } - else if (colmatch(columns, "Ziiii") == 5) { + else if (strstr(extension, "fai") && colmatch(columns, "Ziiii") == 5) { fmt->category = index_file; fmt->format = fai_format; return 0; @@ -506,6 +771,7 @@ char *hts_format_description(const htsFormat *format) case gzi: kputs("GZI", &str); break; case tbi: kputs("Tabix", &str); break; case bed: kputs("BED", &str); break; + case d4_format: kputs("D4", &str); break; case htsget: kputs("htsget", &str); break; case hts_crypt4gh_format: kputs("crypt4gh", &str); break; case empty_format: kputs("empty", &str); break; @@ -523,8 +789,12 @@ char *hts_format_description(const htsFormat *format) switch (format->compression) { case bzip2_compression: kputs(" bzip2-compressed", &str); break; + case razf_compression: kputs(" legacy-RAZF-compressed", &str); break; + case xz_compression: kputs(" XZ-compressed", &str); break; + case zstd_compression: kputs(" Zstandard-compressed", &str); break; case custom: kputs(" compressed", &str); break; case gzip: kputs(" gzip-compressed", &str); break; + 
case bgzf: switch (format->format) { case bam: @@ -539,6 +809,22 @@ char *hts_format_description(const htsFormat *format) break; } break; + + case no_compression: + switch (format->format) { + case bam: + case bcf: + case cram: + case csi: + case tbi: + // These are normally compressed, so emphasise that this one isn't + kputs(" uncompressed", &str); + break; + default: + break; + } + break; + default: break; } @@ -580,11 +866,12 @@ char *hts_format_description(const htsFormat *format) htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) { - char smode[101], *cp, *cp2, *mode_c; + char smode[101], *cp, *cp2, *mode_c, *uncomp = NULL; htsFile *fp = NULL; hFILE *hfile = NULL; char fmt_code = '\0'; - const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0"; + // see enum htsExactFormat in htslib/hts.h + const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0\0\0\0Ff\0\0"; strncpy(smode, mode, 99); smode[99]=0; @@ -597,8 +884,13 @@ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) fmt_code = 'b'; else if (*cp == 'c') fmt_code = 'c'; - else + else { *cp2++ = *cp; + // Cache the uncompress flag 'u' pos if present + if (!uncomp && (*cp == 'u')) { + uncomp = cp2 - 1; + } + } } mode_c = cp2; *cp2++ = fmt_code; @@ -610,6 +902,11 @@ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) *mode_c = format_to_mode[fmt->format]; } + // Uncompressed bam/bcf is not supported, change 'u' to '0' on write + if (uncomp && *mode_c == 'b' && (strchr(smode, 'w') || strchr(smode, 'a'))) { + *uncomp = '0'; + } + // If we really asked for a compressed text format then mode_c above will // point to nul. We set to 'z' to enable bgzf. 
if (strchr(mode, 'w') && fmt && fmt->compression == bgzf) { @@ -743,6 +1040,10 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { strcmp(o->arg, "NO_REF") == 0) o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val); + else if (strcmp(o->arg, "pos_delta") == 0 || + strcmp(o->arg, "POS_DELTA") == 0) + o->opt = CRAM_OPT_POS_DELTA, o->val.i = atoi(val); + else if (strcmp(o->arg, "ignore_md5") == 0 || strcmp(o->arg, "IGNORE_MD5") == 0) o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val); @@ -759,6 +1060,34 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { strcmp(o->arg, "USE_LZMA") == 0) o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val); + else if (strcmp(o->arg, "use_tok") == 0 || + strcmp(o->arg, "USE_TOK") == 0) + o->opt = CRAM_OPT_USE_TOK, o->val.i = atoi(val); + + else if (strcmp(o->arg, "use_fqz") == 0 || + strcmp(o->arg, "USE_FQZ") == 0) + o->opt = CRAM_OPT_USE_FQZ, o->val.i = atoi(val); + + else if (strcmp(o->arg, "use_arith") == 0 || + strcmp(o->arg, "USE_ARITH") == 0) + o->opt = CRAM_OPT_USE_ARITH, o->val.i = atoi(val); + + else if (strcmp(o->arg, "fast") == 0 || + strcmp(o->arg, "FAST") == 0) + o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_FAST; + + else if (strcmp(o->arg, "normal") == 0 || + strcmp(o->arg, "NORMAL") == 0) + o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_NORMAL; + + else if (strcmp(o->arg, "small") == 0 || + strcmp(o->arg, "SMALL") == 0) + o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_SMALL; + + else if (strcmp(o->arg, "archive") == 0 || + strcmp(o->arg, "ARCHIVE") == 0) + o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_ARCHIVE; + else if (strcmp(o->arg, "reference") == 0 || strcmp(o->arg, "REFERENCE") == 0) o->opt = CRAM_OPT_REFERENCE, o->val.s = val; @@ -783,8 +1112,8 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { // NB: Doesn't support floats, eg 1.5g // TODO: extend hts_parse_decimal? See also samtools sort. 
switch (*endp) { - case 'g': case 'G': o->val.i *= 1024; - case 'm': case 'M': o->val.i *= 1024; + case 'g': case 'G': o->val.i *= 1024; // fall through + case 'm': case 'M': o->val.i *= 1024; // fall through case 'k': case 'K': o->val.i *= 1024; break; case '\0': break; default: @@ -823,6 +1152,30 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { strcmp(o->arg, "LEVEL") == 0) o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0); + else if (strcmp(o->arg, "filter") == 0 || + strcmp(o->arg, "FILTER") == 0) + o->opt = HTS_OPT_FILTER, o->val.s = val; + + else if (strcmp(o->arg, "fastq_aux") == 0 || + strcmp(o->arg, "FASTQ_AUX") == 0) + o->opt = FASTQ_OPT_AUX, o->val.s = val; + + else if (strcmp(o->arg, "fastq_barcode") == 0 || + strcmp(o->arg, "FASTQ_BARCODE") == 0) + o->opt = FASTQ_OPT_BARCODE, o->val.s = val; + + else if (strcmp(o->arg, "fastq_rnum") == 0 || + strcmp(o->arg, "FASTQ_RNUM") == 0) + o->opt = FASTQ_OPT_RNUM, o->val.i = 1; + + else if (strcmp(o->arg, "fastq_casava") == 0 || + strcmp(o->arg, "FASTQ_CASAVA") == 0) + o->opt = FASTQ_OPT_CASAVA, o->val.i = 1; + + else if (strcmp(o->arg, "fastq_name2") == 0 || + strcmp(o->arg, "FASTQ_NAME2") == 0) + o->opt = FASTQ_OPT_NAME2, o->val.i = 1; + else { hts_log_error("Unknown option '%s'", o->arg); free(o->arg); @@ -862,6 +1215,9 @@ int hts_opt_apply(htsFile *fp, hts_opt *opts) { // fall through case CRAM_OPT_VERSION: case CRAM_OPT_PREFIX: + case HTS_OPT_FILTER: + case FASTQ_OPT_AUX: + case FASTQ_OPT_BARCODE: if (hts_set_opt(fp, opts->opt, opts->val.s) != 0) return -1; break; @@ -946,7 +1302,7 @@ int hts_parse_format(htsFormat *format, const char *str) { if (strcmp(fmt, "sam") == 0) { format->category = sequence_data; format->format = sam; - format->compression = no_compression;; + format->compression = no_compression; format->compression_level = 0; } else if (strcmp(fmt, "sam.gz") == 0) { format->category = sequence_data; @@ -966,13 +1322,33 @@ int hts_parse_format(htsFormat *format, const char 
*str) { } else if (strcmp(fmt, "vcf") == 0) { format->category = variant_data; format->format = vcf; - format->compression = no_compression;; + format->compression = no_compression; format->compression_level = 0; } else if (strcmp(fmt, "bcf") == 0) { format->category = variant_data; format->format = bcf; format->compression = bgzf; format->compression_level = -1; + } else if (strcmp(fmt, "fastq") == 0 || strcmp(fmt, "fq") == 0) { + format->category = sequence_data; + format->format = fastq_format; + format->compression = no_compression; + format->compression_level = 0; + } else if (strcmp(fmt, "fastq.gz") == 0 || strcmp(fmt, "fq.gz") == 0) { + format->category = sequence_data; + format->format = fastq_format; + format->compression = bgzf; + format->compression_level = 0; + } else if (strcmp(fmt, "fasta") == 0 || strcmp(fmt, "fa") == 0) { + format->category = sequence_data; + format->format = fasta_format; + format->compression = no_compression; + format->compression_level = 0; + } else if (strcmp(fmt, "fasta.gz") == 0 || strcmp(fmt, "fa.gz") == 0) { + format->category = sequence_data; + format->format = fasta_format; + format->compression = bgzf; + format->compression_level = 0; } else { return -1; } @@ -1014,18 +1390,22 @@ static int hts_crypt4gh_redirect(const char *fn, const char *mode, hFILE *hfile1 = *hfile_ptr; hFILE *hfile2 = NULL; char fn_buf[512], *fn2 = fn_buf; + char mode2[102]; // Size set by sizeof(simple_mode) in hts_hopen() const char *prefix = "crypt4gh:"; size_t fn2_len = strlen(prefix) + strlen(fn) + 1; int ret = -1; if (fn2_len > sizeof(fn_buf)) { + if (fn2_len >= INT_MAX) // Silence gcc format-truncation warning + return -1; fn2 = malloc(fn2_len); if (!fn2) return -1; } // Reopen fn using the crypt4gh plug-in (if available) snprintf(fn2, fn2_len, "%s%s", prefix, fn); - hfile2 = hopen(fn2, mode, "parent", hfile1, NULL); + snprintf(mode2, sizeof(mode2), "%s%s", mode, strchr(mode, ':') ? 
"" : ":"); + hfile2 = hopen(fn2, mode2, "parent", hfile1, NULL); if (hfile2) { // Replace original hfile with the new one. The original is now // enclosed within hfile2 @@ -1063,7 +1443,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) if (strchr(simple_mode, 'r')) { const int max_loops = 5; // Should be plenty int loops = 0; - if (hts_detect_format(hfile, &fp->format) < 0) goto error; + if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error; // Deal with formats that re-direct an underlying file via a plug-in. // Loops as we may have crypt4gh served via htsget, or @@ -1088,7 +1468,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) } // Re-detect format against the result of the redirection - if (hts_detect_format(hfile, &fp->format) < 0) goto error; + if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error; } } else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) { @@ -1097,6 +1477,8 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) if (strchr(simple_mode, 'b')) fmt->format = binary_format; else if (strchr(simple_mode, 'c')) fmt->format = cram; + else if (strchr(simple_mode, 'f')) fmt->format = fastq_format; + else if (strchr(simple_mode, 'F')) fmt->format = fasta_format; else fmt->format = text_format; if (strchr(simple_mode, 'z')) fmt->compression = bgzf; @@ -1107,6 +1489,8 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) switch (fmt->format) { case binary_format: fmt->compression = bgzf; break; case cram: fmt->compression = custom; break; + case fastq_format: fmt->compression = no_compression; break; + case fasta_format: fmt->compression = no_compression; break; case text_format: fmt->compression = no_compression; break; default: abort(); } @@ -1134,7 +1518,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) fp->fp.cram = cram_dopen(hfile, fn, simple_mode); if (fp->fp.cram == NULL) goto error; if (!fp->is_write) - 
cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1); + cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, -1); // auto fp->is_cram = 1; break; @@ -1184,7 +1568,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) int hts_close(htsFile *fp) { - int ret, save; + int ret = 0, save; switch (fp->format.format) { case binary_format: @@ -1214,7 +1598,11 @@ int hts_close(htsFile *fp) case fastq_format: case sam: case vcf: - ret = sam_state_destroy(fp); + if (fp->format.format == sam) + ret = sam_state_destroy(fp); + else if (fp->format.format == fastq_format || + fp->format.format == fasta_format) + fastq_state_destroy(fp); if (fp->format.compression != no_compression) ret |= bgzf_close(fp->fp.bgzf); @@ -1230,6 +1618,7 @@ int hts_close(htsFile *fp) save = errno; sam_hdr_destroy(fp->bam_header); hts_idx_destroy(fp->idx); + hts_filter_free(fp->filter); free(fp->fn); free(fp->fn_aux); free(fp->line.s); @@ -1238,6 +1627,38 @@ int hts_close(htsFile *fp) return ret; } +int hts_flush(htsFile *fp) +{ + if (fp == NULL) return 0; + + switch (fp->format.format) { + case binary_format: + case bam: + case bcf: + return bgzf_flush(fp->fp.bgzf); + + case cram: + return cram_flush(fp->fp.cram); + + case empty_format: + case text_format: + case bed: + case fasta_format: + case fastq_format: + case sam: + case vcf: + if (fp->format.compression != no_compression) + return bgzf_flush(fp->fp.bgzf); + else + return hflush(fp->fp.hfile); + + default: + break; + } + + return 0; +} + const htsFormat *hts_get_format(htsFile *fp) { return fp? 
&fp->format : NULL; @@ -1261,6 +1682,7 @@ const char *hts_format_file_extension(const htsFormat *format) { case gzi: return "gzi"; case tbi: return "tbi"; case bed: return "bed"; + case d4_format: return "d4"; case fasta_format: return "fa"; case fastq_format: return "fq"; default: return "?"; @@ -1275,6 +1697,8 @@ static hFILE *hts_hfile(htsFile *fp) { case cram: return cram_hfile(fp->fp.cram); case text_format: return fp->fp.hfile; case vcf: // fall through + case fastq_format: // fall through + case fasta_format: // fall through case sam: return fp->format.compression != no_compression ? bgzf_hfile(fp->fp.bgzf) : fp->fp.hfile; @@ -1326,12 +1750,73 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { return 0; } + case FASTQ_OPT_CASAVA: + case FASTQ_OPT_RNUM: + case FASTQ_OPT_NAME2: + if (fp->format.format == fastq_format || + fp->format.format == fasta_format) + return fastq_state_set(fp, opt); + return 0; + + case FASTQ_OPT_AUX: + if (fp->format.format == fastq_format || + fp->format.format == fasta_format) { + va_start(args, opt); + char *list = va_arg(args, char *); + va_end(args); + return fastq_state_set(fp, opt, list); + } + return 0; + + case FASTQ_OPT_BARCODE: + if (fp->format.format == fastq_format || + fp->format.format == fasta_format) { + va_start(args, opt); + char *bc = va_arg(args, char *); + va_end(args); + return fastq_state_set(fp, opt, bc); + } + return 0; + + // Options below here flow through to cram_set_voption case HTS_OPT_COMPRESSION_LEVEL: { va_start(args, opt); int level = va_arg(args, int); va_end(args); if (fp->is_bgzf) fp->fp.bgzf->compress_level = level; + else if (fp->format.format == cram) + return cram_set_option(fp->fp.cram, opt, level); + return 0; + } + + case HTS_OPT_FILTER: { + va_start(args, opt); + char *expr = va_arg(args, char *); + va_end(args); + return hts_set_filter_expression(fp, expr); + } + + case HTS_OPT_PROFILE: { + va_start(args, opt); + enum hts_profile_option prof = va_arg(args, int); + 
va_end(args); + if (fp->is_bgzf) { + switch (prof) { +#ifdef HAVE_LIBDEFLATE + case HTS_PROFILE_FAST: fp->fp.bgzf->compress_level = 2; break; + case HTS_PROFILE_NORMAL: fp->fp.bgzf->compress_level = -1; break; + case HTS_PROFILE_SMALL: fp->fp.bgzf->compress_level = 10; break; + case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 12; break; +#else + case HTS_PROFILE_FAST: fp->fp.bgzf->compress_level = 1; break; + case HTS_PROFILE_NORMAL: fp->fp.bgzf->compress_level = -1; break; + case HTS_PROFILE_SMALL: fp->fp.bgzf->compress_level = 8; break; + case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 9; break; +#endif + } + } // else CRAM manages this in its own way + break; } default: @@ -1395,6 +1880,18 @@ int hts_set_fai_filename(htsFile *fp, const char *fn_aux) return 0; } +int hts_set_filter_expression(htsFile *fp, const char *expr) +{ + if (fp->filter) + hts_filter_free(fp->filter); + + if (!expr) + return 0; + + return (fp->filter = hts_filter_init(expr)) + ? 0 : -1; +} + hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname) { int pid = (int) getpid(); @@ -1452,7 +1949,7 @@ int hts_getline(htsFile *fp, int delimiter, kstring_t *str) case no_compression: str->l = 0; ret = kgetline2(str, (kgets_func2 *) hgetln, fp->fp.hfile); - if (ret >= 0) ret = str->l; + if (ret >= 0) ret = (str->l <= INT_MAX)? 
(int) str->l : INT_MAX; else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile); else ret = -1; break; @@ -1480,8 +1977,9 @@ char **hts_readlist(const char *string, int is_file, int *_n) if ( !fp ) return NULL; kstring_t str; + int ret; str.s = 0; str.l = str.m = 0; - while (bgzf_getline(fp, '\n', &str) >= 0) + while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { if (str.l == 0) continue; if (hts_resize(char*, n + 1, &m, &s, 0) < 0) @@ -1491,6 +1989,8 @@ char **hts_readlist(const char *string, int is_file, int *_n) goto err; n++; } + if (ret < -1) // Read error + goto err; bgzf_close(fp); free(str.s); } @@ -1537,8 +2037,9 @@ char **hts_readlines(const char *fn, int *_n) BGZF *fp = bgzf_open(fn, "r"); if ( fp ) { // read from file kstring_t str; + int ret; str.s = 0; str.l = str.m = 0; - while (bgzf_getline(fp, '\n', &str) >= 0) { + while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { if (str.l == 0) continue; if (hts_resize(char *, n + 1, &m, &s, 0) < 0) goto err; @@ -1547,6 +2048,8 @@ char **hts_readlines(const char *fn, int *_n) goto err; n++; } + if (ret < -1) // Read error + goto err; bgzf_close(fp); free(str.s); } else if (*fn == ':') { // read from string @@ -1593,7 +2096,7 @@ int hts_file_type(const char *fname) if (f == NULL) return 0; htsFormat fmt; - if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; } + if (hts_detect_format2(f, fname, &fmt) < 0) { hclose_abruptly(f); return 0; } if (hclose(f) < 0) return 0; switch (fmt.format) { @@ -1674,6 +2177,46 @@ static char * idx_format_name(int fmt) { } } +#ifdef DEBUG_INDEX +static void idx_dump(const hts_idx_t *idx) { + int i; + int64_t j; + + if (!idx) fprintf(stderr, "Null index\n"); + + fprintf(stderr, "format='%s', min_shift=%d, n_lvls=%d, n_bins=%d, l_meta=%u ", + idx_format_name(idx->fmt), idx->min_shift, idx->n_lvls, idx->n_bins, idx->l_meta); + fprintf(stderr, "n=%d, m=%d, n_no_coor=%"PRIu64"\n", idx->n, idx->m, idx->n_no_coor); + for (i = 0; i < idx->n; i++) { + bidx_t 
*bidx = idx->bidx[i]; + lidx_t *lidx = &idx->lidx[i]; + if (bidx) { + fprintf(stderr, "======== BIN Index - tid=%d, n_buckets=%d, size=%d\n", i, bidx->n_buckets, bidx->size); + int b; + for (b = 0; b < META_BIN(idx); b++) { + khint_t k; + if ((k = kh_get(bin, bidx, b)) != kh_end(bidx)) { + bins_t *entries = &kh_value(bidx, k); + int l = hts_bin_level(b); + int64_t bin_width = 1LL << ((idx->n_lvls - l) * 3 + idx->min_shift); + fprintf(stderr, "\tbin=%d, level=%d, parent=%d, n_chunks=%d, loff=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n", + b, l, hts_bin_parent(b), entries->n, entries->loff, (b-hts_bin_first(l))*bin_width+1, (b+1-hts_bin_first(l))*bin_width); + for (j = 0; j < entries->n; j++) + fprintf(stderr, "\t\tchunk=%"PRId64", u=%"PRIu64", v=%"PRIu64"\n", j, entries->list[j].u, entries->list[j].v); + } + } + } + if (lidx) { + fprintf(stderr, "======== LINEAR Index - tid=%d, n_values=%"PRId64"\n", i, lidx->n); + for (j = 0; j < lidx->n; j++) { + fprintf(stderr, "\t\tentry=%"PRId64", offset=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n", + j, lidx->offset[j], j*(1<min_shift)+1, (j+1)*(1<min_shift)); + } + } + } +} +#endif + static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end) { khint_t k; @@ -1757,17 +2300,11 @@ static void update_loff(hts_idx_t *idx, int i, int free_lidx) lidx_t *lidx = &idx->lidx[i]; khint_t k; int l; - uint64_t offset0 = 0; - if (bidx) { - k = kh_get(bin, bidx, META_BIN(idx)); - if (k != kh_end(bidx)) - offset0 = kh_val(bidx, k).list[0].u; - for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l) - lidx->offset[l] = offset0; - } else l = 1; - for (; l < lidx->n; ++l) // fill missing values + // the last entry is always valid + for (l=lidx->n-2; l >= 0; l--) { if (lidx->offset[l] == (uint64_t)-1) - lidx->offset[l] = lidx->offset[l-1]; + lidx->offset[l] = lidx->offset[l+1]; + } if (bidx == 0) return; for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff if (kh_exist(bidx, k)) @@ -1864,26 +2401,16 @@ int 
hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3); if (tid < 0 || (beg <= maxpos && end <= maxpos)) return 0; - int64_t max = end > beg ? end : beg, s = 1 << 14; - int n_lvls = 0; - while (max > s) { - n_lvls++; - s <<= 3; - } if (idx->fmt == HTS_FMT_CSI) { - hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" cannot be stored in a csi index " - "with min_shift = %d, n_lvls = %d. Try using " - "min_shift = 14, n_lvls >= %d", - beg, end, - idx->min_shift, idx->n_lvls, - n_lvls); + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" " + "cannot be stored in a csi index with these parameters. " + "Please use a larger min_shift or depth", + beg, end); } else { - hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" cannot be stored in a %s index. " - "Try using a csi index with min_shift = 14, " - "n_lvls >= %d", - beg, end, idx_format_name(idx->fmt), - n_lvls); + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos + " cannot be stored in a %s index. 
Try using a csi index", + beg, end, idx_format_name(idx->fmt)); } errno = ERANGE; return -1; @@ -1939,14 +2466,12 @@ int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t if ( tid>=0 ) { if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin); - if (is_mapped) { - // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin - if (beg < 0) beg = 0; - if (end <= 0) end = 1; - // idx->z.last_off points to the start of the current record - if (insert_to_l(&idx->lidx[tid], beg, end, - idx->z.last_off, idx->min_shift) < 0) return -1; - } + // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin + if (beg < 0) beg = 0; + if (end <= 0) end = 1; + // idx->z.last_off points to the start of the current record + if (insert_to_l(&idx->lidx[tid], beg, end, + idx->z.last_off, idx->min_shift) < 0) return -1; } else idx->n_no_coor++; bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls); @@ -2081,7 +2606,7 @@ static inline void swap_bins(bins_t *p) } } -static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt) +static int idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt) { int32_t i, j; @@ -2134,6 +2659,10 @@ static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt) } check(idx_write_uint64(fp, idx->n_no_coor)); +#ifdef DEBUG_INDEX + idx_dump(idx); +#endif + return 0; #undef check } @@ -2183,7 +2712,7 @@ int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int check(bgzf_write(fp, "BAI\1", 4)); } else abort(); - check(hts_idx_save_core(idx, fp, fmt)); + check(idx_save_core(idx, fp, fmt)); return bgzf_close(fp); #undef check @@ -2231,7 +2760,7 @@ static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt) if (is_be) swap_bins(p); } if (fmt != HTS_FMT_CSI) { // load linear index - int j; + int j, k; uint32_t x; if (bgzf_read(fp, &x, 4) != 4) return -1; if (is_be) ed_swap_4p(&x); @@ -2243,13 +2772,18 @@ static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt) if (l->offset == NULL) 
return -2; if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1; if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]); - for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix - if (l->offset[j] == 0) l->offset[j] = l->offset[j-1]; + for (k = j = 0; j < l->n && l->offset[j] == 0; k = ++j); // stop at the first non-zero entry + for (j = l->n-1; j > k; j--) // fill missing values; may happen given older samtools and tabix + if (l->offset[j-1] == 0) l->offset[j-1] = l->offset[j]; update_loff(idx, i, 0); } } if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0; if (is_be) ed_swap_8p(&idx->n_no_coor); +#ifdef DEBUG_INDEX + idx_dump(idx); +#endif + return 0; } @@ -2355,7 +2889,7 @@ uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta) const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) { - if ( !idx->n ) + if ( !idx || !idx->n ) { *n = 0; return NULL; @@ -2373,14 +2907,21 @@ const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, return names; } +int hts_idx_nseq(const hts_idx_t *idx) { + if (!idx) return -1; + return idx->n; +} + int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped) { + if (!idx) return -1; if ( idx->fmt == HTS_FMT_CRAI ) { *mapped = 0; *unmapped = 0; return -1; } bidx_t *h = idx->bidx[tid]; + if (!h) return -1; khint_t k = kh_get(bin, h, META_BIN(idx)); if (k != kh_end(h)) { *mapped = kh_val(h, k).list[1].u; @@ -2394,6 +2935,7 @@ int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) { + if (idx->fmt == HTS_FMT_CRAI) return 0; return idx->n_no_coor; } @@ -2402,73 +2944,201 @@ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) ****************/ // Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<= end) return 0; - if (end >= 1LL<>s); e = t + (end>>s); n = e - b + 1; - if 
(itr->bins.n + n > itr->bins.m) { - itr->bins.m = itr->bins.n + n; - kroundup32(itr->bins.m); - itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m); + int i; + b = t + (beg>>s); e = t + (end>>s); + for (i = b; i <= e; ++i) { + if (kh_get(bin, bidx, i) != kh_end(bidx)) { + assert(itr->bins.n < itr->bins.m); + itr->bins.a[itr->bins.n++] = i; + } + } + } + return itr->bins.n; +} + +static inline int reg2bins_wide(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx) +{ + khint_t i; + hts_pos_t max_shift = 3 * n_lvls + min_shift; + --end; + if (beg < 0) beg = 0; + for (i = kh_begin(bidx); i != kh_end(bidx); i++) { + if (!kh_exist(bidx, i)) continue; + hts_pos_t bin = (hts_pos_t) kh_key(bidx, i); + int level = hts_bin_level(bin); + if (level > n_lvls) continue; // Dodgy index? + hts_pos_t first = hts_bin_first(level); + hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level)); + hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level)); + if (beg_at_level <= bin && bin <= end_at_level) { + assert(itr->bins.n < itr->bins.m); + itr->bins.a[itr->bins.n++] = bin; } - for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i; } return itr->bins.n; } +static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx) +{ + int l, t, s = min_shift + (n_lvls<<1) + n_lvls; + size_t reg_bin_count = 0, hash_bin_count = kh_n_buckets(bidx), max_bins; + hts_pos_t end1; + if (end >= 1LL<= end) return 0; + end1 = end - 1; + + // Count bins to see if it's faster to iterate through the hash table + // or the set of bins covering the region + for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { + reg_bin_count += (end1 >> s) - (beg >> s) + 1; + } + max_bins = reg_bin_count < kh_size(bidx) ? reg_bin_count : kh_size(bidx); + if (itr->bins.m - itr->bins.n < max_bins) { + // Worst-case memory usage. 
May be wasteful on very sparse + // data, but the bin list usually won't be too big anyway. + size_t new_m = max_bins + itr->bins.n; + if (new_m > INT_MAX || new_m > SIZE_MAX / sizeof(int)) { + errno = ENOMEM; + return -1; + } + int *new_a = realloc(itr->bins.a, new_m * sizeof(*new_a)); + if (!new_a) return -1; + itr->bins.a = new_a; + itr->bins.m = new_m; + } + if (reg_bin_count < hash_bin_count) { + return reg2bins_narrow(beg, end, itr, min_shift, n_lvls, bidx); + } else { + return reg2bins_wide(beg, end, itr, min_shift, n_lvls, bidx); + } +} + +static inline int add_to_interval(hts_itr_t *iter, bins_t *bin, + int tid, uint32_t interval, + uint64_t min_off, uint64_t max_off) +{ + hts_pair64_max_t *off; + int j; + + if (!bin->n) + return 0; + off = realloc(iter->off, (iter->n_off + bin->n) * sizeof(*off)); + if (!off) + return -2; + + iter->off = off; + for (j = 0; j < bin->n; ++j) { + if (bin->list[j].v > min_off && bin->list[j].u < max_off) { + iter->off[iter->n_off].u = min_off > bin->list[j].u + ? min_off : bin->list[j].u; + iter->off[iter->n_off].v = max_off < bin->list[j].v + ? max_off : bin->list[j].v; + // hts_pair64_max_t::max is now used to link + // file offsets to region list entries. + // The iterator can use this to decide if it + // can skip some file regions. 
+ iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval; + iter->n_off++; + } + } + return 0; +} + +static inline int reg2intervals_narrow(hts_itr_t *iter, const bidx_t *bidx, + int tid, int64_t beg, int64_t end, + uint32_t interval, + uint64_t min_off, uint64_t max_off, + int min_shift, int n_lvls) +{ + int l, t, s = min_shift + n_lvls * 3; + hts_pos_t b, e, i; + + for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { + b = t + (beg>>s); e = t + (end>>s); + for (i = b; i <= e; ++i) { + khint_t k = kh_get(bin, bidx, i); + if (k != kh_end(bidx)) { + bins_t *bin = &kh_value(bidx, k); + int res = add_to_interval(iter, bin, tid, interval, min_off, max_off); + if (res < 0) + return res; + } + } + } + return 0; +} + +static inline int reg2intervals_wide(hts_itr_t *iter, const bidx_t *bidx, + int tid, int64_t beg, int64_t end, + uint32_t interval, + uint64_t min_off, uint64_t max_off, + int min_shift, int n_lvls) +{ + khint_t i; + hts_pos_t max_shift = 3 * n_lvls + min_shift; + --end; + if (beg < 0) beg = 0; + for (i = kh_begin(bidx); i != kh_end(bidx); i++) { + if (!kh_exist(bidx, i)) continue; + hts_pos_t bin = (hts_pos_t) kh_key(bidx, i); + int level = hts_bin_level(bin); + if (level > n_lvls) continue; // Dodgy index? 
+ hts_pos_t first = hts_bin_first(level); + hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level)); + hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level)); + if (beg_at_level <= bin && bin <= end_at_level) { + bins_t *bin = &kh_value(bidx, i); + int res = add_to_interval(iter, bin, tid, interval, min_off, max_off); + if (res < 0) + return res; + } + } + return 0; +} + static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls) { int l, t, s; int i, j; - hts_pos_t b, e; - hts_pair64_max_t *off; + hts_pos_t end1; bidx_t *bidx; - khint_t k; - int start_n_off = iter->n_off; + int start_n_off; + size_t reg_bin_count = 0, hash_bin_count; + int res; if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end) return -1; + hash_bin_count = kh_n_buckets(bidx); + s = min_shift + (n_lvls<<1) + n_lvls; if (end >= 1LL<>s); e = t + (end>>s); - - for (i = b; i <= e; ++i) { - if ((k = kh_get(bin, bidx, i)) != kh_end(bidx)) { - bins_t *p = &kh_value(bidx, k); + end1 = end - 1; + // Count bins to see if it's faster to iterate through the hash table + // or the set of bins covering the region + for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { + reg_bin_count += (end1 >> s) - (beg >> s) + 1; + } - if (p->n) { - off = realloc(iter->off, (iter->n_off + p->n) * sizeof(*off)); - if (!off) - return -2; + start_n_off = iter->n_off; - iter->off = off; - for (j = 0; j < p->n; ++j) { - if (p->list[j].v > min_off && p->list[j].u < max_off) { - iter->off[iter->n_off].u = min_off > p->list[j].u - ? min_off : p->list[j].u; - iter->off[iter->n_off].v = max_off < p->list[j].v - ? max_off : p->list[j].v; - // hts_pair64_max_t::max is now used to link - // file offsets to region list entries. - // The iterator can use this to decide if it - // can skip some file regions. 
- iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval; - iter->n_off++; - } - } - } - } - } + // Populate iter->off with the intervals for this region + if (reg_bin_count < hash_bin_count) { + res = reg2intervals_narrow(iter, bidx, tid, beg, end, interval, + min_off, max_off, min_shift, n_lvls); + } else { + res = reg2intervals_wide(iter, bidx, tid, beg, end, interval, + min_off, max_off, min_shift, n_lvls); } + if (res < 0) + return res; if (iter->n_off - start_n_off > 1) { ks_introsort(_off_max, iter->n_off - start_n_off, iter->off + start_n_off); @@ -2561,6 +3231,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t bidx_t *bidx; uint64_t min_off, max_off; hts_itr_t *iter; + uint32_t unmapped = 0, rel_off; // It's possible to call this function with NULL idx iff // tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE @@ -2583,24 +3254,29 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t free(iter); iter = NULL; } + } else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) { + iter->finished = 1; } else { if (beg < 0) beg = 0; if (end < beg) { free(iter); return NULL; } - if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) { - free(iter); - return NULL; - } + + k = kh_get(bin, bidx, META_BIN(idx)); + if (k != kh_end(bidx)) + unmapped = kh_val(bidx, k).list[1].v; + else + unmapped = 1; iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; iter->readrec = readrec; if ( !kh_size(bidx) ) { iter->finished = 1; return iter; } + rel_off = beg>>idx->min_shift; // compute min_off - bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift); + bin = hts_bin_first(idx->n_lvls) + rel_off; do { int first; k = kh_get(bin, bidx, bin); @@ -2611,27 +3287,58 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t } while (bin); if (bin == 0) k = kh_get(bin, bidx, bin); min_off = k != kh_end(bidx)? 
kh_val(bidx, k).loff : 0; + // min_off can be calculated more accurately if the + // linear index is available if (idx->lidx[tid].offset - && beg>>idx->min_shift < idx->lidx[tid].n - && min_off < idx->lidx[tid].offset[beg>>idx->min_shift]) - min_off = idx->lidx[tid].offset[beg>>idx->min_shift]; + && rel_off < idx->lidx[tid].n) { + if (min_off < idx->lidx[tid].offset[rel_off]) + min_off = idx->lidx[tid].offset[rel_off]; + if (unmapped) { + // unmapped reads are not covered by the linear index, + // so search backwards for a smaller offset + int tmp_off; + for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) { + if (idx->lidx[tid].offset[tmp_off] < min_off) { + min_off = idx->lidx[tid].offset[tmp_off]; + break; + } + } + // if the search went too far back or no satisfactory entry + // was found, revert to the bin index loff value + if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0)) + min_off = kh_val(bidx, k).loff; + } + } else if (unmapped) { //CSI index + if (k != kh_end(bidx)) + min_off = kh_val(bidx, k).loff; + } // compute max_off: a virtual offset from a bin to the right of end - bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; - if (bin >= idx->n_bins) bin = 0; - while (1) { - // search for an extant bin by moving right, but moving up to the - // parent whenever we get to a first child (which also covers falling - // off the RHS, which wraps around and immediately goes up to bin 0) - while (bin % 8 == 1) bin = hts_bin_parent(bin); - if (bin == 0) { max_off = (uint64_t)-1; break; } - k = kh_get(bin, bidx, bin); - if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; } - bin++; + // First check if end lies within the range of the index (it won't + // if it's HTS_POS_MAX) + if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) { + bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; + if (bin >= idx->n_bins) bin = 0; + while (1) { + // search for an extant bin by moving 
right, but moving up to the + // parent whenever we get to a first child (which also covers falling + // off the RHS, which wraps around and immediately goes up to bin 0) + while (bin % 8 == 1) bin = hts_bin_parent(bin); + if (bin == 0) { max_off = UINT64_MAX; break; } + k = kh_get(bin, bidx, bin); + if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; } + bin++; + } + } else { + // Searching to end of reference + max_off = UINT64_MAX; } // retrieve bins - reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls); + if (reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls, bidx) < 0) { + hts_itr_destroy(iter); + return NULL; + } for (i = n_off = 0; i < iter->bins.n; ++i) if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) @@ -2697,6 +3404,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) int tid; hts_pos_t beg, end; hts_reglist_t *curr_reg; + uint32_t unmapped = 0, rel_off; if (!idx || !iter || !iter->multi) return -1; @@ -2713,6 +3421,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) switch (tid) { case HTS_IDX_NONE: iter->finished = 1; + // fall through case HTS_IDX_START: case HTS_IDX_REST: iter->curr_off = t_off; @@ -2729,6 +3438,12 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx)) continue; + k = kh_get(bin, bidx, META_BIN(idx)); + if (k != kh_end(bidx)) + unmapped = kh_val(bidx, k).list[1].v; + else + unmapped = 1; + for(j=0; jcount; j++) { hts_pair32_t *curr_intv = &curr_reg->intervals[j]; if (curr_intv->end < curr_intv->beg) @@ -2736,12 +3451,13 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) beg = curr_intv->beg; end = curr_intv->end; + rel_off = beg>>idx->min_shift; /* Compute 'min_off' by searching the lowest level bin containing 'beg'. If the computed bin is not in the index, try the next bin to the left, belonging to the same parent. 
If it is the first sibling bin, try the parent bin. */ - bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift); + bin = hts_bin_first(idx->n_lvls) + rel_off; do { int first; k = kh_get(bin, bidx, bin); @@ -2756,25 +3472,48 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) // min_off can be calculated more accurately if the // linear index is available if (idx->lidx[tid].offset - && beg>>idx->min_shift < idx->lidx[tid].n - && min_off < idx->lidx[tid].offset[beg>>idx->min_shift]) - min_off = idx->lidx[tid].offset[beg>>idx->min_shift]; + && rel_off < idx->lidx[tid].n) { + if (min_off < idx->lidx[tid].offset[rel_off]) + min_off = idx->lidx[tid].offset[rel_off]; + if (unmapped) { + int tmp_off; + for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) { + if (idx->lidx[tid].offset[tmp_off] < min_off) { + min_off = idx->lidx[tid].offset[tmp_off]; + break; + } + } + + if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0)) + min_off = kh_val(bidx, k).loff; + } + } else if (unmapped) { //CSI index + if (k != kh_end(bidx)) + min_off = kh_val(bidx, k).loff; + } // compute max_off: a virtual offset from a bin to the right of end - bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; - if (bin >= idx->n_bins) bin = 0; - while (1) { - // search for an extant bin by moving right, but moving up to the - // parent whenever we get to a first child (which also covers falling - // off the RHS, which wraps around and immediately goes up to bin 0) - while (bin % 8 == 1) bin = hts_bin_parent(bin); - if (bin == 0) { max_off = (uint64_t)-1; break; } - k = kh_get(bin, bidx, bin); - if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { - max_off = kh_val(bidx, k).list[0].u; - break; + // First check if end lies within the range of the index (it + // won't if it's HTS_POS_MAX) + if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) { + bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; + if (bin >= idx->n_bins) bin = 0; + while (1) { + 
// search for an extant bin by moving right, but moving up to the + // parent whenever we get to a first child (which also covers falling + // off the RHS, which wraps around and immediately goes up to bin 0) + while (bin % 8 == 1) bin = hts_bin_parent(bin); + if (bin == 0) { max_off = UINT64_MAX; break; } + k = kh_get(bin, bidx, bin); + if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { + max_off = kh_val(bidx, k).list[0].u; + break; + } + bin++; } - bin++; + } else { + // Searching to end of reference + max_off = UINT64_MAX; } //convert coordinates to file offsets @@ -2857,14 +3596,12 @@ int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) } if (e) { - off[n_off++].v = e->next - ? e->next + off[n_off++].v = e->e_next + ? e->e_next->offset : e->offset + e->slice + e->len; } else { hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end); } - } else { - hts_log_warning("No index entry for region %d:%"PRIhts_pos"-%"PRIhts_pos"", tid, beg, end); } } } else { @@ -2944,32 +3681,32 @@ static inline long long push_digit(long long i, char c) long long hts_parse_decimal(const char *str, char **strend, int flags) { long long n = 0; - int decimals = 0, e = 0, lost = 0; + int digits = 0, decimals = 0, e = 0, lost = 0; char sign = '+', esign = '+'; - const char *s; + const char *s, *str_orig = str; while (isspace_c(*str)) str++; s = str; if (*s == '+' || *s == '-') sign = *s++; while (*s) - if (isdigit_c(*s)) n = push_digit(n, *s++); + if (isdigit_c(*s)) digits++, n = push_digit(n, *s++); else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++; else break; if (*s == '.') { s++; - while (isdigit_c(*s)) decimals++, n = push_digit(n, *s++); + while (isdigit_c(*s)) decimals++, digits++, n = push_digit(n, *s++); } - if (*s == 'E' || *s == 'e') { + switch (*s) { + case 'e': case 'E': s++; if (*s == '+' || *s == '-') esign = *s++; while (isdigit_c(*s)) e = push_digit(e, *s++); if (esign == '-') e = -e; - } + break; - 
switch (*s) { case 'k': case 'K': e += 3; s++; break; case 'm': case 'M': e += 6; s++; break; case 'g': case 'G': e += 9; s++; break; @@ -2984,7 +3721,10 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) } if (strend) { - *strend = (char *)s; + // Set to the original input str pointer if not valid number syntax + *strend = (digits > 0)? (char *)s : (char *)str_orig; + } else if (digits == 0) { + hts_log_warning("Invalid numeric value %.8s[truncated]", str); } else if (*s) { if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ',')) hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s); @@ -3170,14 +3910,17 @@ const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg, char *hyphen; *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1; if (*beg < 0) { + if (*beg != -1 && *hyphen == '-' && colon[1] != '\0') { + // User specified zero, but we're 1-based. + hts_log_error("Coordinates must be > 0"); + return NULL; + } if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') { // interpret chr:-100 as chr:1-100 *end = *beg==-1 ? 
HTS_POS_MAX : -(*beg+1); *beg = 0; return s_end; - } else if (*hyphen == '-') { - *beg = 0; - } else { + } else if (*beg < -1) { hts_log_error("Unexpected string \"%s\" after region", hyphen); return NULL; } @@ -3704,7 +4447,7 @@ static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_ free(s.s); return -1; } - if (hts_detect_format(remote_hfp, &fmt)) { + if (hts_detect_format2(remote_hfp, fn, &fmt)) { hts_log_error("Failed to detect format of index file '%s'", fn); goto fail; } @@ -3968,11 +4711,11 @@ static hts_idx_t *idx_find_and_load(const char *fn, int fmt, int flags) if (hts_idx_check_local(fn, fmt, &fnidx) == 0 && hisremote(fn)) { if (flags & HTS_IDX_SAVE_REMOTE) { - fnidx = hts_idx_getfn(fn, ".csi"); + fnidx = idx_filename(fn, ".csi", HTS_IDX_SAVE_REMOTE); if (!fnidx) { switch (fmt) { - case HTS_FMT_BAI: fnidx = hts_idx_getfn(fn, ".bai"); break; - case HTS_FMT_TBI: fnidx = hts_idx_getfn(fn, ".tbi"); break; + case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", HTS_IDX_SAVE_REMOTE); break; + case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", HTS_IDX_SAVE_REMOTE); break; default: break; } } @@ -4042,7 +4785,9 @@ hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags) hts_idx_t *idx = idx_read(fnidx); if (!idx && !(flags & HTS_IDX_SILENT_FAIL)) - hts_log_error("Could not load local index file '%s'", fnidx); + hts_log_error("Could not load local index file '%s'%s%s", fnidx, + errno ? " : " : "", errno ? strerror(errno) : ""); + free(local_fnidx); diff --git a/hts_expr.c b/hts_expr.c new file mode 100644 index 000000000..5e5a132ea --- /dev/null +++ b/hts_expr.c @@ -0,0 +1,921 @@ +/* hts_expr.c -- filter expression parsing and processing. + + Copyright (C) 2020-2022 Genome Research Ltd. 
+ + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +// TODO: +// - ?: operator for conditionals? + +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "htslib/hts_expr.h" +#include "htslib/hts_log.h" +#include "textutils_internal.h" + +// Could also cache hts_expr_val_t stack here for kstring reuse? +#define MAX_REGEX 10 +struct hts_filter_t { + char *str; + int parsed; + int curr_regex, max_regex; + regex_t preg[MAX_REGEX]; +}; + +/* + * This is designed to be mostly C like with mostly same the precedence rules, + * with the exception of bit operators (widely considered as a mistake in C). + * It's not full C (eg no bit-shifting), but good enough for our purposes. 
+ * + * Supported syntax, in order of precedence: + * + * Grouping: (, ), eg "(1+2)*3" + * Values: integers, floats, strings or variables + * Unary ops: +, -, !, ~ eg -10 +10, !10 (0), ~5 (bitwise not) + * Math ops: *, /, % [TODO: add // for floor division?] + * Math ops: +, - + * Bit-wise: &, ^, | [NB as 3 precedence levels, in that order] + * Conditionals: >, >=, <, <=, + * Equality: ==, !=, =~, !~ + * Boolean: &&, || + */ + +// Skip to start of term +static char *ws(char *str) { + while (*str && (*str == ' ' || *str == '\t')) + str++; + return str; +} + +static int expression(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res); + +/* + * Simple functions operating on strings only. + * length, min, max, avg. + * + * All return 0 on success, + * -1 on failure + */ +static int expr_func_length(hts_expr_val_t *res) { + if (!res->is_str) + return -1; + + res->is_str = 0; + res->d = res->s.l; + return 0; +} + +static int expr_func_min(hts_expr_val_t *res) { + if (!res->is_str) + return -1; + + size_t l = res->s.l; + int v = INT_MAX; + const uint8_t *x = (uint8_t *)res->s.s; + for (l = 0; l < res->s.l; l++) + if (v > x[l]) + v = x[l]; + + res->is_str = 0; + res->d = v == INT_MAX ? NAN : v; + + return 0; +} + +static int expr_func_max(hts_expr_val_t *res) { + if (!res->is_str) + return -1; + + size_t l = res->s.l; + int v = INT_MIN; + const uint8_t *x = (uint8_t *)res->s.s; + for (l = 0; l < res->s.l; l++) + if (v < x[l]) + v = x[l]; + + res->is_str = 0; + res->d = v == INT_MIN ? NAN : v; + + return 0; +} + +static int expr_func_avg(hts_expr_val_t *res) { + if (!res->is_str) + return -1; + + size_t l = res->s.l; + double v = 0; + const uint8_t *x = (uint8_t *)res->s.s; + for (l = 0; l < res->s.l; l++) + v += x[l]; + if (l) + v /= l; + + res->is_str = 0; + res->d = v; + + return 0; +} + +/* + * functions: FUNC(expr). + * Note for simplicity of parsing, the "(" must immediately follow FUNC, + * so "FUNC (x)" is invalid. 
+ */ +static int func_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { + int func_ok = -1; + switch (*str) { + case 'a': + if (strncmp(str, "avg(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = expr_func_avg(res); + } + break; + + case 'd': + if (strncmp(str, "default(", 8) == 0) { + if (expression(filt, data, fn, str+8, end, res)) return -1; + if (**end != ',') + return -1; + (*end)++; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; + if (expression(filt, data, fn, ws(*end), end, &val)) return -1; + func_ok = 1; + if (!hts_expr_val_existsT(res)) { + kstring_t swap = res->s; + *res = val; + val.s = swap; + hts_expr_val_free(&val); + } + } + break; + + case 'e': + if (strncmp(str, "exists(", 7) == 0) { + if (expression(filt, data, fn, str+7, end, res)) return -1; + func_ok = 1; + res->is_true = res->d = hts_expr_val_existsT(res); + res->is_str = 0; + } else if (strncmp(str, "exp(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = 1; + res->d = exp(res->d); + res->is_str = 0; + if (isnan(res->d)) + hts_expr_val_undef(res); + } + + break; + + case 'l': + if (strncmp(str, "length(", 7) == 0) { + if (expression(filt, data, fn, str+7, end, res)) return -1; + func_ok = expr_func_length(res); + } else if (strncmp(str, "log(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = 1; + res->d = log(res->d); + res->is_str = 0; + if (isnan(res->d)) + hts_expr_val_undef(res); + } + break; + + case 'm': + if (strncmp(str, "min(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = expr_func_min(res); + } else if (strncmp(str, "max(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = expr_func_max(res); + } + break; + + case 'p': + if (strncmp(str, "pow(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = 1; + + if (**end 
!= ',') + return -1; + (*end)++; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; + if (expression(filt, data, fn, ws(*end), end, &val)) return -1; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else if (res->is_str || val.is_str) { + hts_expr_val_free(&val); // arith on strings + return -1; + } else { + func_ok = 1; + res->d = pow(res->d, val.d); + hts_expr_val_free(&val); + res->is_str = 0; + } + + if (isnan(res->d)) + hts_expr_val_undef(res); + } + break; + + case 's': + if (strncmp(str, "sqrt(", 5) == 0) { + if (expression(filt, data, fn, str+5, end, res)) return -1; + func_ok = 1; + res->d = sqrt(res->d); + res->is_str = 0; + if (isnan(res->d)) + hts_expr_val_undef(res); + } + break; + } + + if (func_ok < 0) + return -1; + + str = ws(*end); + if (*str != ')') { + fprintf(stderr, "Missing ')'\n"); + return -1; + } + *end = str+1; + + return 0; +} + +/* + * simple_expr + * : identifier + * | constant + * | string + * | func_expr + * | '(' expression ')' +*/ +static int simple_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { + // Main recursion step + str = ws(str); + if (*str == '(') { + if (expression(filt, data, fn, str+1, end, res)) return -1; + str = ws(*end); + if (*str != ')') { + fprintf(stderr, "Missing ')'\n"); + return -1; + } + *end = str+1; + + return 0; + } + + // Otherwise a basic element. + int fail = 0; + double d = hts_str2dbl(str, end, &fail); + if (str != *end) { + res->is_str = 0; + res->d = d; + } else { + // Not valid floating point syntax. 
+ // TODO: add function call names in here; len(), sqrt(), pow(), etc + if (*str == '"') { + res->is_str = 1; + char *e = str+1; + int backslash = 0; + while (*e && *e != '"') { + if (*e == '\\') + backslash=1, e+=1+(e[1]!='\0'); + else + e++; + } + + kputsn(str+1, e-(str+1), ks_clear(&res->s)); + if (backslash) { + size_t i, j; + for (i = j = 0; i < res->s.l; i++) { + res->s.s[j++] = res->s.s[i]; + if (res->s.s[i] == '\\') { + switch (res->s.s[++i]) { + case '"': res->s.s[j-1] = '"'; break; + case '\\':res->s.s[j-1] = '\\'; break; + case 't': res->s.s[j-1] = '\t'; break; + case 'n': res->s.s[j-1] = '\n'; break; + case 'r': res->s.s[j-1] = '\r'; break; + default: res->s.s[j++] = res->s.s[i]; + } + } + } + res->s.s[j] = 0; + res->s.l = j; + } + if (*e != '"') + return -1; + *end = e+1; + } else if (fn) { + // Try lookup as variable, if not as function + if (fn(data, str, end, res) == 0) + return 0; + else + return func_expr(filt, data, fn, str, end, res); + } else { + return -1; + } + } + + return 0; +} + +/* + * unary_expr + * : simple_expr + * | '+' simple_expr + * | '-' simple_expr + * | '!' unary_expr // higher precedence + * | '~' unary_expr // higher precedence + */ +static int unary_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { + int err; + str = ws(str); + if (*str == '+' || *str == '-') { + err = simple_expr(filt, data, fn, str+1, end, res); + if (!hts_expr_val_exists(res)) { + hts_expr_val_undef(res); + } else { + err |= res->is_str; + if (*str == '-') + res->d = -res->d; + res->is_true = res->d != 0; + } + } else if (*str == '!') { + err = unary_expr(filt, data, fn, str+1, end, res); + if (res->is_true) { + // Any explicitly true value becomes false + res->d = res->is_true = 0; + } else if (!hts_expr_val_exists(res)) { + // We can also still negate undef values by toggling the + // is_true override value. 
+ res->d = res->is_true = !res->is_true; + } else if (res->is_str) { + // !null = true, !"foo" = false, NOTE: !"" = false also + res->d = res->is_true = (res->s.s == NULL); + } else { + res->d = !(int64_t)res->d; + res->is_true = res->d != 0; + } + res->is_str = 0; + } else if (*str == '~') { + err = unary_expr(filt, data, fn, str+1, end, res); + if (!hts_expr_val_exists(res)) { + hts_expr_val_undef(res); + } else { + err |= res->is_str; + if (!hts_expr_val_exists(res)) { + hts_expr_val_undef(res); + } else { + res->d = ~(int64_t)res->d; + res->is_true = res->d != 0; + } + } + } else { + err = simple_expr(filt, data, fn, str, end, res); + } + return err ? -1 : 0; +} + + +/* + * mul_expr + * : unary_expr ( + * '*' unary_expr + * | '/' unary_expr + * | '%' unary_expr + * )* + */ +static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { + if (unary_expr(filt, data, fn, str, end, res)) + return -1; + + str = *end; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; + while (*str) { + str = ws(str); + if (*str == '*' || *str == '/' || *str == '%') { + if (unary_expr(filt, data, fn, str+1, end, &val)) return -1; + if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) { + hts_expr_val_undef(res); + } else if (val.is_str || res->is_str) { + hts_expr_val_free(&val); + return -1; // arith on strings + } + } + + if (*str == '*') + res->d *= val.d; + else if (*str == '/') + res->d /= val.d; + else if (*str == '%') { + if (val.d) + res->d = (int64_t)res->d % (int64_t)val.d; + else + hts_expr_val_undef(res); + } else + break; + + res->is_true = hts_expr_val_exists(res) && (res->d != 0); + str = *end; + } + + hts_expr_val_free(&val); + + return 0; +} + +/* + * add_expr + * : mul_expr ( + * '+' mul_expr + * | '-' mul_expr + * )* + */ +static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { + if (mul_expr(filt, data, fn, str, end, res)) + return -1; + + 
str = *end; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; + while (*str) { + str = ws(str); + int undef = 0; + if (*str == '+' || *str == '-') { + if (mul_expr(filt, data, fn, str+1, end, &val)) return -1; + if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) { + undef = 1; + } else if (val.is_str || res->is_str) { + hts_expr_val_free(&val); + return -1; // arith on strings + } + } + + if (*str == '+') + res->d += val.d; + else if (*str == '-') + res->d -= val.d; + else + break; + + if (undef) + hts_expr_val_undef(res); + else + res->is_true = res->d != 0; + + str = *end; + } + + hts_expr_val_free(&val); + + return 0; +} + +/* + * bitand_expr + * : add_expr + * | bitand_expr '&' add_expr + */ +static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { + if (add_expr(filt, data, fn, str, end, res)) return -1; + + hts_expr_val_t val = HTS_EXPR_VAL_INIT; + int undef = 0; + for (;;) { + str = ws(*end); + if (*str == '&' && str[1] != '&') { + if (add_expr(filt, data, fn, str+1, end, &val)) return -1; + if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) { + undef = 1; + } else if (res->is_str || val.is_str) { + hts_expr_val_free(&val); + return -1; + } + res->is_true = (res->d = ((int64_t)res->d & (int64_t)val.d)) != 0; + } else { + break; + } + } + hts_expr_val_free(&val); + if (undef) + hts_expr_val_undef(res); + + return 0; +} + +/* + * bitxor_expr + * : bitand_expr + * | bitxor_expr '^' bitand_expr + */ +static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { + if (bitand_expr(filt, data, fn, str, end, res)) return -1; + + hts_expr_val_t val = HTS_EXPR_VAL_INIT; + int undef = 0; + for (;;) { + str = ws(*end); + if (*str == '^') { + if (bitand_expr(filt, data, fn, str+1, end, &val)) return -1; + if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) { + undef = 1; + } else if (res->is_str || val.is_str) { + 
hts_expr_val_free(&val); + return -1; + } + res->is_true = (res->d = ((int64_t)res->d ^ (int64_t)val.d)) != 0; + } else { + break; + } + } + hts_expr_val_free(&val); + if (undef) + hts_expr_val_undef(res); + + return 0; +} + +/* + * bitor_expr + * : bitxor_expr + * | bitor_expr '|' bitxor_expr + */ +static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { + if (bitxor_expr(filt, data, fn, str, end, res)) return -1; + + hts_expr_val_t val = HTS_EXPR_VAL_INIT; + int undef = 0; + for (;;) { + str = ws(*end); + if (*str == '|' && str[1] != '|') { + if (bitxor_expr(filt, data, fn, str+1, end, &val)) return -1; + if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) { + undef = 1; + } else if (res->is_str || val.is_str) { + hts_expr_val_free(&val); + return -1; + } + res->is_true = (res->d = ((int64_t)res->d | (int64_t)val.d)) != 0; + } else { + break; + } + } + hts_expr_val_free(&val); + if (undef) + hts_expr_val_undef(res); + + return 0; +} + +/* + * cmp_expr + * : bitor_expr + * | cmp_expr '<=' bitor_expr + * | cmp_expr '<' bitor_expr + * | cmp_expr '>=' bitor_expr + * | cmp_expr '>' bitor_expr + */ +static int cmp_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { + if (bitor_expr(filt, data, fn, str, end, res)) return -1; + + str = ws(*end); + hts_expr_val_t val = HTS_EXPR_VAL_INIT; + int err = 0, cmp_done = 0; + + if (*str == '>' && str[1] == '=') { + cmp_done = 1; + err = cmp_expr(filt, data, fn, str+2, end, &val); + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true=res->d + = res->is_str && res->s.s && val.is_str && val.s.s + ? 
strcmp(res->s.s, val.s.s) >= 0 + : !res->is_str && !val.is_str && res->d >= val.d; + res->is_str = 0; + } + } else if (*str == '>') { + cmp_done = 1; + err = cmp_expr(filt, data, fn, str+1, end, &val); + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true=res->d + = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) > 0 + : !res->is_str && !val.is_str && res->d > val.d; + res->is_str = 0; + } + } else if (*str == '<' && str[1] == '=') { + cmp_done = 1; + err = cmp_expr(filt, data, fn, str+2, end, &val); + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true=res->d + = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) <= 0 + : !res->is_str && !val.is_str && res->d <= val.d; + res->is_str = 0; + } + } else if (*str == '<') { + cmp_done = 1; + err = cmp_expr(filt, data, fn, str+1, end, &val); + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true=res->d + = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) < 0 + : !res->is_str && !val.is_str && res->d < val.d; + res->is_str = 0; + } + } + + if (cmp_done && (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res))) + hts_expr_val_undef(res); + hts_expr_val_free(&val); + + return err ? 
-1 : 0; +} + +/* + * eq_expr + * : cmp_expr + * | eq_expr '==' cmp_expr + * | eq_expr '!=' cmp_expr + * | eq_expr '=~' cmp_expr + * | eq_expr '!~' cmp_expr + */ +static int eq_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { + if (cmp_expr(filt, data, fn, str, end, res)) return -1; + + str = ws(*end); + + int err = 0, eq_done = 0; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; + + // numeric vs numeric comparison is as expected + // string vs string comparison is as expected + // numeric vs string is false + if (str[0] == '=' && str[1] == '=') { + eq_done = 1; + if ((err = eq_expr(filt, data, fn, str+2, end, &val))) { + res->is_true = res->d = 0; + } else { + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true = res->d = res->is_str + ? (res->s.s && val.s.s ?strcmp(res->s.s, val.s.s)==0 :0) + : !res->is_str && !val.is_str && res->d == val.d; + } + } + res->is_str = 0; + + } else if (str[0] == '!' && str[1] == '=') { + eq_done = 1; + if ((err = eq_expr(filt, data, fn, str+2, end, &val))) { + res->is_true = res->d = 0; + } else { + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true = res->d = res->is_str + ? (res->s.s && val.s.s ?strcmp(res->s.s, val.s.s) != 0 :1) + : res->is_str != val.is_str || res->d != val.d; + } + } + res->is_str = 0; + + } else if ((str[0] == '=' && str[1] == '~') || + (str[0] == '!' 
&& str[1] == '~')) { + eq_done = 1; + err = eq_expr(filt, data, fn, str+2, end, &val); + if (!val.is_str || !res->is_str) { + hts_expr_val_free(&val); + return -1; + } + if (val.s.s && res->s.s && val.is_true >= 0 && res->is_true >= 0) { + regex_t preg_, *preg; + if (filt->curr_regex >= filt->max_regex) { + // Compile regex if not seen before + if (filt->curr_regex >= MAX_REGEX) { + preg = &preg_; + } else { + preg = &filt->preg[filt->curr_regex]; + filt->max_regex++; + } + + int ec = regcomp(preg, val.s.s, REG_EXTENDED | REG_NOSUB); + if (ec != 0) { + char errbuf[1024]; + regerror(ec, preg, errbuf, 1024); + fprintf(stderr, "Failed regex: %.1024s\n", errbuf); + hts_expr_val_free(&val); + return -1; + } + } else { + preg = &filt->preg[filt->curr_regex]; + } + res->is_true = res->d = regexec(preg, res->s.s, 0, NULL, 0) == 0 + ? *str == '=' // matcn + : *str == '!'; // no-match + if (preg == &preg_) + regfree(preg); + + filt->curr_regex++; + } else { + // nul regexp or input is considered false + res->is_true = 0; + } + res->is_str = 0; + } + + if (eq_done && ((!hts_expr_val_exists(&val)) || !hts_expr_val_exists(res))) + hts_expr_val_undef(res); + hts_expr_val_free(&val); + + return err ? 
-1 : 0; +} + +/* + * and_expr + * : eq_expr + * | and_expr 'and' eq_expr + * | and_expr 'or' eq_expr + */ +static int and_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { + if (eq_expr(filt, data, fn, str, end, res)) return -1; + + for (;;) { + hts_expr_val_t val = HTS_EXPR_VAL_INIT; + str = ws(*end); + if (str[0] == '&' && str[1] == '&') { + if (eq_expr(filt, data, fn, str+2, end, &val)) return -1; + if (!hts_expr_val_existsT(res) || !hts_expr_val_existsT(&val)) { + hts_expr_val_undef(res); + res->d = 0; + } else { + res->is_true = res->d = + (res->is_true || (res->is_str && res->s.s) || res->d) && + (val.is_true || (val.is_str && val.s.s) || val.d); + res->is_str = 0; + } + } else if (str[0] == '|' && str[1] == '|') { + if (eq_expr(filt, data, fn, str+2, end, &val)) return -1; + if (!hts_expr_val_existsT(res) && !hts_expr_val_existsT(&val)) { + // neither defined + hts_expr_val_undef(res); + res->d = 0; + } else if (!hts_expr_val_existsT(res) && + !(val.is_true || (val.is_str && val.s.s ) || val.d)) { + // LHS undef and RHS false + hts_expr_val_undef(res); + res->d = 0; + } else if (!hts_expr_val_existsT(&val) && + !(res->is_true || (res->is_str && res->s.s) || res->d)){ + // RHS undef and LHS false + hts_expr_val_undef(res); + res->d = 0; + } else { + res->is_true = res->d = + res->is_true || (res->is_str && res->s.s) || res->d || + val.is_true || (val.is_str && val.s.s ) || val.d; + res->is_str = 0; + } + } else { + break; + } + hts_expr_val_free(&val); + } + + return 0; +} + +static int expression(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { + return and_expr(filt, data, fn, str, end, res); +} + +hts_filter_t *hts_filter_init(const char *str) { + hts_filter_t *f = calloc(1, sizeof(*f)); + if (!f) return NULL; + + // Oversize to permit faster comparisons with memcmp over strcmp + size_t len = strlen(str)+100; + if (!(f->str = malloc(len))) { + 
free(f); + return NULL; + } + strcpy(f->str, str); + return f; +} + +void hts_filter_free(hts_filter_t *filt) { + if (!filt) + return; + + int i; + for (i = 0; i < filt->max_regex; i++) + regfree(&filt->preg[i]); + + free(filt->str); + free(filt); +} + +static int hts_filter_eval_(hts_filter_t *filt, + void *data, hts_expr_sym_func *fn, + hts_expr_val_t *res) { + char *end = NULL; + + filt->curr_regex = 0; + if (expression(filt, data, fn, filt->str, &end, res)) + return -1; + + if (end && *ws(end)) { + fprintf(stderr, "Unable to parse expression at %s\n", filt->str); + return -1; + } + + // Strings evaluate to true. An empty string is also true, but an + // absent (null) string is false, unless overriden by is_true. An + // empty string has kstring length of zero, but a pointer as it's + // nul-terminated. + if (res->is_str) { + res->is_true |= res->s.s != NULL; + res->d = res->is_true; + } else if (hts_expr_val_exists(res)) { + res->is_true |= res->d != 0; + } + + return 0; +} + +int hts_filter_eval(hts_filter_t *filt, + void *data, hts_expr_sym_func *fn, + hts_expr_val_t *res) { + if (res->s.l != 0 || res->s.m != 0 || res->s.s != NULL) { + // As *res is cleared below, it's not safe to call this function + // with res->s.s set, as memory would be leaked. It's also not + // possible to know is res was initialised correctly, so in + // either case we fail. + hts_log_error("Results structure must be cleared before calling this function"); + return -1; + } + + memset(res, 0, sizeof(*res)); + + return hts_filter_eval_(filt, data, fn, res); +} + +int hts_filter_eval2(hts_filter_t *filt, + void *data, hts_expr_sym_func *fn, + hts_expr_val_t *res) { + ks_free(&res->s); + memset(res, 0, sizeof(*res)); + + return hts_filter_eval_(filt, data, fn, res); +} diff --git a/hts_internal.h b/hts_internal.h index 4e5e20151..61956da21 100644 --- a/hts_internal.h +++ b/hts_internal.h @@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/hts.h" #include "textutils_internal.h" -#define HTS_MAX_EXT_LEN 8 +#define HTS_MAX_EXT_LEN 9 #ifdef __cplusplus extern "C" { @@ -104,6 +104,7 @@ plugin_void_func *load_plugin(void **pluginp, const char *filename, const char * void *plugin_sym(void *plugin, const char *name, const char **errmsg); plugin_void_func *plugin_func(void *plugin, const char *name, const char **errmsg); void close_plugin(void *plugin); +const char *hts_plugin_path(void); /* * Buffers up arguments to hts_idx_push for later use, once we've written all bar @@ -138,7 +139,8 @@ static inline int find_file_extension(const char *fn, char ext_out[static HTS_MA { for (ext--; ext > fn && *ext != '.' && *ext != '/'; --ext) {} } - if (*ext != '.' || delim - ext > HTS_MAX_EXT_LEN || delim - ext < 4) return -1; + if (*ext != '.' || delim - ext > HTS_MAX_EXT_LEN || delim - ext < 3) + return -1; memcpy(ext_out, ext + 1, delim - ext - 1); ext_out[delim - ext - 1] = '\0'; return 0; diff --git a/hts_os.c b/hts_os.c index 359392173..b391a41e0 100644 --- a/hts_os.c +++ b/hts_os.c @@ -1,7 +1,7 @@ /// @file hts_os.c /// Operating System specific tweaks, for compatibility with POSIX. /* - Copyright (C) 2017, 2019 Genome Research Ltd. + Copyright (C) 2017, 2019-2020 Genome Research Ltd. Author: James Bonfield @@ -49,7 +49,7 @@ HTSLIB_EXPORT double hts_drand48(void) { return drand48(); } HTSLIB_EXPORT -double hts_lrand48(void) { return lrand48(); } +long hts_lrand48(void) { return lrand48(); } #endif // // On Windows when using the MSYS or Cygwin terminals, isatty fails diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh new file mode 100755 index 000000000..71c6f5d01 --- /dev/null +++ b/hts_probe_cc.sh @@ -0,0 +1,143 @@ +#!/bin/sh + +# Check compiler options for non-configure builds and create Makefile fragment +# +# Copyright (C) 2022-2023 Genome Research Ltd. 
+# +# Author: Rob Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Arguments are: +# 1. C compiler command +# 2. Initial CFLAGS +# 3. LDFLAGS + +CC=$1 +CFLAGS=$2 +LDFLAGS=$3 + +# Try running the compiler. Uses the same conftest.* names as +# configure for temporary files. +run_compiler () +{ + "$CC" $CFLAGS $1 $LDFLAGS -o conftest conftest.c 2> conftest.err + retval=$? + rm -f conftest.err conftest + return $retval +} + +# Run a test. $1 is the flag to try, $2 is the Makefile variable to set +# with the flag probe result, $3 is a Makefile variable which will be +# set to 1 if the code was built successfully. The code to test should +# be passed in via fd 0. +# First try compiling conftest.c without the flag. If that fails, try +# again with it to see if the flag is needed.
+run_test () +{ + rm -f conftest conftest.err conftest.c + cat - > conftest.c + if run_compiler ; then + echo "$2 =" + echo "$3 = 1" + elif run_compiler "$1" ; then + echo "$2 = $1" + echo "$3 = 1" + else + echo "$3 =" + fi +} + +echo "# Compiler probe results, generated by $0" + +# Check for ssse3 +run_test "-mssse3" HTS_CFLAGS_SSSE3 HTS_BUILD_SSSE3 <<'EOF' +#ifdef __x86_64__ +#include "x86intrin.h" +int main(int argc, char **argv) { + __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1); + __m128i c = _mm_shuffle_epi8(a, b); + return *((char *) &c); +} +#else +int main(int argc, char **argv) { return 0; } +#endif +EOF + +# Check for popcnt +run_test "-mpopcnt" HTS_CFLAGS_POPCNT HTS_BUILD_POPCNT <<'EOF' +#ifdef __x86_64__ +#include "x86intrin.h" +int main(int argc, char **argv) { + unsigned int i = _mm_popcnt_u32(1); + return i != 1; +} +#else +int main(int argc, char **argv) { return 0; } +#endif +EOF + +# Check for sse4.1 etc. support +run_test "-msse4.1" HTS_CFLAGS_SSE4_1 HTS_BUILD_SSE4_1 <<'EOF' +#ifdef __x86_64__ +#include "x86intrin.h" +int main(int argc, char **argv) { + __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1); + __m128i c = _mm_max_epu32(a, b); + return *((char *) &c); +} +#else +int main(int argc, char **argv) { return 0; } +#endif +EOF + +echo 'HTS_CFLAGS_SSE4 = $(HTS_CFLAGS_SSSE3) $(HTS_CFLAGS_POPCNT) $(HTS_CFLAGS_SSE4_1)' + +# Check for avx2 + +run_test -mavx2 HTS_CFLAGS_AVX2 HTS_BUILD_AVX2 <<'EOF' +#ifdef __x86_64__ +#include "x86intrin.h" +int main(int argc, char **argv) { + __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + __m256i b = _mm256_add_epi32(a, a); + long long c = _mm256_extract_epi64(b, 0); + return (int) c; +} +#else +int main(int argc, char **argv) { return 0; } +#endif +EOF + +# Check for avx512 + +run_test -mavx512f HTS_CFLAGS_AVX512 HTS_BUILD_AVX512 <<'EOF' +#ifdef __x86_64__ +#include "x86intrin.h" +int main(int argc, char **argv) { + __m512i a = _mm512_set1_epi32(1); + __m512i b = 
_mm512_add_epi32(a, a); + return *((char *) &b); +} +#else +int main(int argc, char **argv) { return 0; } +#endif +EOF + +rm -f conftest.c diff --git a/hts_time_funcs.h b/hts_time_funcs.h new file mode 100644 index 000000000..2a0508412 --- /dev/null +++ b/hts_time_funcs.h @@ -0,0 +1,170 @@ +/* hts_time_funcs.h -- Implementations of non-standard time functions + + Copyright (C) 2022 Genome Research Ltd. + + Author: Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +/* + This mainly exists because timegm() is not a standard function, and so + cannot be used in portable code. Unfortunately the standard one (mktime) + always takes the local timezone into account so doing a UTC conversion + with it involves changing the TZ environment variable, which is rather + messy and not likely to go well with threaded code. + + The code here is a much simplified version of the BSD timegm() implementation. + It currently rejects dates before 1970, avoiding problems with -ve time_t.
+ It also works strictly in UTC, so doesn't have to worry about tm_isdst + which makes the calculation much easier. + + Some of this is derived from BSD sources, for example + https://github.com/NetBSD/src/blob/trunk/lib/libc/time/localtime.c + which state: + + ** This file is in the public domain, so clarified as of + ** 1996-06-05 by Arthur David Olson. + + Non-derived code is copyright as above. +*/ + +#include +#include +#include +#include + +static inline int hts_time_normalise(int *tens, int *units, int base) { + if (*units < 0 || *units >= base) { + int delta = *units >= 0 ? *units / base : (-1 - (-1 - *units) / base); + int64_t tmp = (int64_t) (*tens) + delta; + if (tmp < INT_MIN || tmp > INT_MAX) return 1; + *tens = tmp; + *units -= delta * base; + } + return 0; +} + +static inline int hts_year_is_leap(int64_t year) { + return ((year % 4 == 0) && (year % 100 != 0)) || (year % 400 == 0); +} + +// Number of leap years to start of year +// Only works for year >= 1. +static inline int64_t hts_leaps_to_year_start(int64_t year) { + --year; + return year / 4 - year / 100 + year / 400; +} + +static inline int hts_time_normalise_tm(struct tm *t) +{ + const int days_per_mon[2][12] = { + { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }, + { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 } + }; + const int year_days[2] = { 365, 366 }; + int overflow = 0; + int64_t year; + + if (t->tm_sec > 62) { + overflow |= hts_time_normalise(&t->tm_min, &t->tm_sec, 60); + } + overflow |= hts_time_normalise(&t->tm_hour, &t->tm_min, 60); + overflow |= hts_time_normalise(&t->tm_mday, &t->tm_hour, 24); + overflow |= hts_time_normalise(&t->tm_year, &t->tm_mon, 12); + if (overflow) + return 1; + + year = (int64_t) t->tm_year + 1900LL; + while (t->tm_mday <= 0) { + --year; + t->tm_mday += year_days[hts_year_is_leap(year + (1 < t->tm_mon))]; + } + while (t->tm_mday > 366) { + t->tm_mday -= year_days[hts_year_is_leap(year + (1 < t->tm_mon))]; + ++year; + } + for (;;) { + int mdays = 
days_per_mon[hts_year_is_leap(year)][t->tm_mon]; + if (t->tm_mday <= mdays) + break; + t->tm_mday -= mdays; + t->tm_mon++; + if (t->tm_mon >= 12) { + year++; + t->tm_mon = 0; + } + } + year -= 1900; + if (year != t->tm_year) { + if (year < INT_MIN || year > INT_MAX) + return 1; + t->tm_year = year; + } + return 0; +} + +/** + * Convert broken-down time to an equivalent time_t value + * @param target Target broken-down time structure + * @return Equivalent time_t value on success; -1 on failure + * + * This function first normalises the time in @p target so that the + * structure members are in the valid range. It then calculates the + * number of seconds (ignoring leap seconds) between midnight Jan 1st 1970 + * and the target date. + * + * If @p target is outside the range that can be represented in a time_t, + * or tm_year is less than 70 (which would return a negative value) then + * it returns -1 and sets errno to EOVERFLOW. + */ + +static inline time_t hts_time_gm(struct tm *target) +{ + int month_start[2][12] = { + { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 }, + { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335 } + }; + int years_from_epoch, leaps, days; + int64_t secs; + + if (hts_time_normalise_tm(target) != 0) + goto overflow; + + if (target->tm_year < 70) + goto overflow; + + years_from_epoch = target->tm_year - 70; + leaps = (hts_leaps_to_year_start(target->tm_year + 1900) + - hts_leaps_to_year_start(1970)); + days = ((365 * (years_from_epoch - leaps) + 366 * leaps) + + month_start[hts_year_is_leap(target->tm_year + 1900)][target->tm_mon] + + target->tm_mday - 1); + secs = ((int64_t) days * 86400LL + + target->tm_hour * 3600 + + target->tm_min * 60 + + target->tm_sec); + if (sizeof(time_t) < 8 && secs > INT_MAX) + goto overflow; + + return (time_t) secs; + + overflow: + errno = EOVERFLOW; + return (time_t) -1; +} diff --git a/htscodecs b/htscodecs new file mode 160000 index 000000000..dcb331678 --- /dev/null +++ b/htscodecs @@ -0,0 +1 
@@ +Subproject commit dcb33167839622903897fc985a8cccf89b3358e2 diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk new file mode 100644 index 000000000..6274350f5 --- /dev/null +++ b/htscodecs_bundled.mk @@ -0,0 +1,72 @@ +# Makefile fragment to add settings needed when bundling htscodecs functions +# +# Copyright (C) 2021-2022 Genome Research Ltd. +# +# Author: Rob Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+ + +HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ + $(HTSPREFIX)htscodecs/htscodecs/fqzcomp_qual.c \ + $(HTSPREFIX)htscodecs/htscodecs/htscodecs.c \ + $(HTSPREFIX)htscodecs/htscodecs/pack.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx2.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx512.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_sse4.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_neon.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \ + $(HTSPREFIX)htscodecs/htscodecs/rle.c \ + $(HTSPREFIX)htscodecs/htscodecs/tokenise_name3.c \ + $(HTSPREFIX)htscodecs/htscodecs/utils.c + + +HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) + +# htscodecs public headers +htscodecs_arith_dynamic_h = htscodecs/htscodecs/arith_dynamic.h +htscodecs_fqzcomp_qual_h = htscodecs/htscodecs/fqzcomp_qual.h +htscodecs_htscodecs_h = htscodecs/htscodecs/htscodecs.h $(htscodecs_version_h) +htscodecs_pack_h = htscodecs/htscodecs/pack.h +htscodecs_rANS_static_h = htscodecs/htscodecs/rANS_static.h +htscodecs_rANS_static4x16_h = htscodecs/htscodecs/rANS_static4x16.h +htscodecs_rle_h = htscodecs/htscodecs/rle.h +htscodecs_tokenise_name3_h = htscodecs/htscodecs/tokenise_name3.h +htscodecs_varint_h = htscodecs/htscodecs/varint.h + +# htscodecs internal headers +htscodecs_htscodecs_endian_h = htscodecs/htscodecs/htscodecs_endian.h +htscodecs_c_range_coder_h = htscodecs/htscodecs/c_range_coder.h +htscodecs_c_simple_model_h = htscodecs/htscodecs/c_simple_model.h $(htscodecs_c_range_coder_h) +htscodecs_permute_h = htscodecs/htscodecs/permute.h +htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h +htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h +htscodecs_rANS_static16_int_h = htscodecs/htscodecs/rANS_static16_int.h $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs_rANS_static32x16pr_h = 
htscodecs/htscodecs/rANS_static32x16pr.h +htscodecs_rANS_word_h = htscodecs/htscodecs/rANS_word.h $(htscodecs_htscodecs_endian_h) +htscodecs_utils_h = htscodecs/htscodecs/utils.h +htscodecs_version_h = htscodecs/htscodecs/version.h + +# Add htscodecs tests into the HTSlib test framework + +HTSCODECS_TEST_TARGETS = test_htscodecs_rans4x8 \ + test_htscodecs_rans4x16 test_htscodecs_arith test_htscodecs_tok3 \ + test_htscodecs_fqzcomp test_htscodecs_varint diff --git a/htscodecs_external.mk b/htscodecs_external.mk new file mode 100644 index 000000000..3f86811f5 --- /dev/null +++ b/htscodecs_external.mk @@ -0,0 +1,46 @@ +# Makefile fragment for use when linking to an external libhtscodecs +# +# Copyright (C) 2021 Genome Research Ltd. +# +# Author: Rob Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. 
+ +HTSCODECS_SOURCES = +HTSCODECS_OBJS = +HTSCODECS_TEST_TARGETS = + +htscodecs_arith_dynamic_h = +htscodecs_fqzcomp_qual_h = +htscodecs_htscodecs_h = +htscodecs_pack_h = +htscodecs_rANS_static_h = +htscodecs_rANS_static4x16_h = +htscodecs_rle_h = +htscodecs_tokenise_name3_h = +htscodecs_varint_h = + +htscodecs_htscodecs_endian_h = +htscodecs_c_range_coder_h = +htscodecs_c_simple_model_h = +htscodecs_pooled_alloc_h = +htscodecs_rANS_byte_h = +htscodecs_rANS_word_h = +htscodecs_utils_h = +htscodecs_version_h = diff --git a/htsfile.1 b/htsfile.1 index 9d3606715..eeb2947d8 100644 --- a/htsfile.1 +++ b/htsfile.1 @@ -1,4 +1,4 @@ -.TH htsfile 1 "19 December 2019" "htslib-1.10.2" "Bioinformatics tools" +.TH htsfile 1 "25 July 2023" "htslib-1.18" "Bioinformatics tools" .SH NAME htsfile \- identify high-throughput sequencing data files .\" diff --git a/htsfile.c b/htsfile.c index e37eb7215..229af6656 100644 --- a/htsfile.c +++ b/htsfile.c @@ -31,7 +31,6 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include #include "htslib/hfile.h" #include "htslib/hts.h" @@ -62,13 +61,6 @@ void error(const char *format, ...) status = EXIT_FAILURE; } -static htsFile *dup_stdout(const char *mode) -{ - int fd = dup(STDOUT_FILENO); - hFILE *hfp = (fd >= 0)? hdopen(fd, mode) : NULL; - return hfp? 
hts_hopen(hfp, "-", mode) : NULL; -} - static void view_sam(samFile *in, const char *filename) { bam1_t *b = NULL; @@ -81,7 +73,7 @@ static void view_sam(samFile *in, const char *filename) goto clean; } - out = dup_stdout("w"); + out = hts_open("-", "w"); if (out == NULL) { error("reopening standard output failed"); goto clean; } if (show_headers) { @@ -125,7 +117,7 @@ static void view_vcf(vcfFile *in, const char *filename) goto clean; } - out = dup_stdout("w"); + out = hts_open("-", "w"); if (out == NULL) { error("reopening standard output failed"); goto clean; } if (show_headers) { @@ -258,7 +250,7 @@ int main(int argc, char **argv) case 1: printf( "htsfile (htslib) %s\n" -"Copyright (C) 2020 Genome Research Ltd.\n", +"Copyright (C) 2023 Genome Research Ltd.\n", hts_version()); exit(EXIT_SUCCESS); break; @@ -283,7 +275,7 @@ int main(int argc, char **argv) if (mode == identify) { htsFormat fmt; - if (hts_detect_format(fp, &fmt) < 0) { + if (hts_detect_format2(fp, argv[i], &fmt) < 0) { error("detecting \"%s\" format failed", argv[i]); hclose_abruptly(fp); continue; @@ -325,5 +317,8 @@ int main(int argc, char **argv) if (fp && hclose(fp) < 0) error("closing \"%s\" failed", argv[i]); } + if (fclose(stdout) != 0 && errno != EBADF) + error("closing standard output failed"); + return status; } diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7 index 676223758..019e8d0b7 100644 --- a/htslib-s3-plugin.7 +++ b/htslib-s3-plugin.7 @@ -1,8 +1,8 @@ -.TH htslib-s3-plugin 7 "19 December 2019" "htslib-1.10.2" "Bioinformatics tools" +.TH htslib-s3-plugin 7 "25 July 2023" "htslib-1.18" "Bioinformatics tools" .SH NAME -s3 plugin \- htslib AWS S3 plugin +htslib-s3-plugin \- htslib AWS S3 plugin .\" -.\" Copyright (C) 2019 Genome Research Ltd. +.\" Copyright (C) 2021-2022 Genome Research Ltd. .\" .\" Author: Andrew Whitwham .\" @@ -24,6 +24,21 @@ s3 plugin \- htslib AWS S3 plugin .\" FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER .\" DEALINGS IN THE SOFTWARE. 
.\" +. +.\" For code blocks and examples (cf groff's Ultrix-specific man macros) +.de EX + +. in +\\$1 +. nf +. ft CR +.. +.de EE +. ft +. fi +. in + +.. + .SH DESCRIPTION The S3 plugin allows htslib file functions to communicate with servers that use the AWS S3 protocol. Files are identified by their bucket and object key in a @@ -105,18 +120,96 @@ Sets the upload part size in Mb, the minimum being 5Mb. By default the part size starts at 5Mb and expands at regular intervals to accommodate bigger files (up to 2.5 Tbytes with the current rate). Using this setting disables the automatic part size expansion. +.TP +.B HTS_S3_ADDRESS_STYLE +Sets the URL style. Options are auto (default), virtual or path. .LP In the absence of an ID from the previous two methods the credential/config files will be used. The default file locations are either \fI~/.aws/credentials\fR or \fI~/.s3cfg\fR (in that order). + +Entries used in aws style credentials file are aws_access_key_id, +aws_secret_access_key, aws_session_token, region, addressing_style and +expiry_time (unofficial, see SHORT-LIVED CREDENTIALS below). +Only the first two are usually needed. + +Entries used in s3cmd style config files are access_key, secret_key, +access_token, host_base, bucket_location and host_bucket. Again only the first +two are usually needed. The host_bucket option is only used to set a path-style +URL, see below. + +.SH SHORT-LIVED CREDENTIALS + +Some cloud identity and access management (IAM) systems can make short-lived +credentials that allow access to resources. +These credentials will expire after a time and need to be renewed to +give continued access. +To enable this, the S3 plugin allows an \fIexpiry_time\fR entry to be set in the +\fI.aws/credentials\fR file. +The value for this entry should be the time when the token expires, +following the format in RFC3339 section 5.6, which takes the form: + + 2012-04-29T05:20:48Z + +That is, year - month - day, the letter "T", hour : minute : second. 
+The time can be followed by the letter "Z", indicating the UTC timezone, +or an offset from UTC which is a "+" or "-" sign followed by two digits for +the hours offset, ":", and two digits for the minutes. + +The S3 plugin will attempt to re-read the credentials file up to 1 minute +before the given expiry time, which means the file needs to be updated with +new credentials before then. +As the exact way of doing this can vary between services and IAM providers, +the S3 plugin expects this to be done by an external user-supplied process. +This may be achieved by running a program that replaces the file as new +credentials become available. +The following script shows how it might be done for AWS instance credentials: +.EX 2 +#!/bin/sh +instance='http://169.254.169.254' +tok_url="$instance/latest/api/token" +ttl_hdr='X-aws-ec2-metadata-token-ttl-seconds: 10' +creds_url="$instance/latest/meta-data/iam/security-credentials" +key1='aws_access_key_id = \(rs(.AccessKeyId)\(rsn' +key2='aws_secret_access_key = \(rs(.SecretAccessKey)\(rsn' +key3='aws_session_token = \(rs(.Token)\(rsn' +key4='expiry_time = \(rs(.Expiration)\(rsn' +while true; do + token=`curl -X PUT -H "$ttl_hdr" "$tok_url"` + tok_hdr="X-aws-ec2-metadata-token: $token" + role=`curl -H "$tok_hdr" "$creds_url/"` + expires='now' + ( curl -H "$tok_hdr" "$creds_url/$role" \(rs + | jq -r "\(rs"${key1}${key2}${key3}${key4}\(rs"" > credentials.new ) \(rs + && mv -f credentials.new credentials \(rs + && expires=`grep expiry_time credentials | cut -d ' ' -f 3-` + if test $? -ne 0 ; then break ; fi + expiry=`date -d "$expires - 3 minutes" '+%s'` + now=`date '+%s'` + test "$expiry" -gt "$now" && sleep $((($expiry - $now) / 2)) + sleep 30 +done +.EE + +Note that the \fIexpiry_time\fR key is currently only supported for the +\fI.aws/credentials\fR file (or the file referred to in the +.B AWS_SHARED_CREDENTIALS_FILE +environment variable). 
+ .SH NOTES In most cases this plugin transforms the given URL into a virtual host-style format e.g. \fIhttps://bucket.host/path/to/file\fR. A path-style format is used where the URL is not DNS compliant or the bucket name contains a dot e.g. \fIhttps://host/bu.cket/path/to/file\fR. +Path-style can be forced by setting one of HTS_S3_ADDRESS_STYLE, +addressing_style or host_bucket. The first two can be set to \fBpath\fR while +host_bucket must \fBnot\fR include the \fB%(bucket).s\fR string. + .SH "SEE ALSO" -.BR htsfile (1) -.BR samtools (1) +.IR htsfile (1) +.IR samtools (1) +.PP +RFC 3339: .PP htslib website: diff --git a/htslib.map b/htslib.map new file mode 100644 index 000000000..9542861bd --- /dev/null +++ b/htslib.map @@ -0,0 +1,638 @@ +HTSLIB_1.0 { + bam_aux2A; + bam_aux2Z; + bam_aux2f; + bam_aux2i; + bam_aux_append; + bam_aux_del; + bam_aux_get; + bam_cigar2qlen; + bam_cigar2rlen; + bam_copy1; + bam_destroy1; + bam_dup1; + bam_endpos; + bam_flag2str; + bam_hdr_read; + bam_hdr_write; + bam_init1; + bam_mplp_auto; + bam_mplp_destroy; + bam_mplp_init; + bam_mplp_init_overlaps; + bam_mplp_set_maxcnt; + bam_plp_auto; + bam_plp_destroy; + bam_plp_init; + bam_plp_next; + bam_plp_push; + bam_plp_reset; + bam_plp_set_maxcnt; + bam_read1; + bam_str2flag; + bam_write1; + bcf_add_filter; + bcf_calc_ac; + bcf_clear; + bcf_destroy; + bcf_dup; + bcf_enc_vchar; + bcf_enc_vfloat; + bcf_enc_vint; + bcf_float_missing; + bcf_float_vector_end; + bcf_fmt_array; + bcf_fmt_sized_array; + bcf_get_fmt; + bcf_get_format_string; + bcf_get_format_values; + bcf_get_info; + bcf_get_info_values; + bcf_get_variant_type; + bcf_get_variant_types; + bcf_gt_type; + bcf_has_filter; + bcf_hdr_add_hrec; + bcf_hdr_add_sample; + bcf_hdr_append; + bcf_hdr_combine; + bcf_hdr_destroy; + bcf_hdr_dup; + bcf_hdr_fmt_text; + bcf_hdr_get_hrec; + bcf_hdr_get_version; + bcf_hdr_id2int; + bcf_hdr_init; + bcf_hdr_parse; + bcf_hdr_parse_line; + bcf_hdr_printf; + bcf_hdr_read; + bcf_hdr_remove; +
bcf_hdr_seqnames; + bcf_hdr_set; + bcf_hdr_set_samples; + bcf_hdr_set_version; + bcf_hdr_subset; + bcf_hdr_sync; + bcf_hdr_write; + bcf_hrec_add_key; + bcf_hrec_destroy; + bcf_hrec_dup; + bcf_hrec_find_key; + bcf_hrec_format; + bcf_hrec_set_val; + bcf_index_build; + bcf_init; + bcf_is_snp; + bcf_read; + bcf_readrec; + bcf_remove_alleles; + bcf_remove_filter; + bcf_sr_add_reader; + bcf_sr_destroy; + bcf_sr_init; + bcf_sr_next_line; + bcf_sr_regions_destroy; + bcf_sr_regions_flush; + bcf_sr_regions_init; + bcf_sr_regions_next; + bcf_sr_regions_overlap; + bcf_sr_regions_seek; + bcf_sr_remove_reader; + bcf_sr_seek; + bcf_sr_set_regions; + bcf_sr_set_samples; + bcf_sr_set_targets; + bcf_subset; + bcf_subset_format; + bcf_sweep_bwd; + bcf_sweep_destroy; + bcf_sweep_fwd; + bcf_sweep_hdr; + bcf_sweep_init; + bcf_translate; + bcf_trim_alleles; + bcf_type_shift; + bcf_unpack; + bcf_update_alleles; + bcf_update_alleles_str; + bcf_update_filter; + bcf_update_format; + bcf_update_format_string; + bcf_update_id; + bcf_update_info; + bcf_write; + bgzf_check_EOF; + bgzf_close; + bgzf_dopen; + bgzf_flush; + bgzf_flush_try; + bgzf_getc; + bgzf_getline; + bgzf_hopen; + bgzf_index_build_init; + bgzf_index_dump; + bgzf_index_load; + bgzf_is_bgzf; + bgzf_mt; + bgzf_open; + bgzf_raw_read; + bgzf_raw_write; + bgzf_read; + bgzf_read_block; + bgzf_seek; + bgzf_set_cache_size; + bgzf_useek; + bgzf_utell; + bgzf_write; + cram_close; + cram_compress_block; + cram_dopen; + cram_eof; + cram_flush; + cram_free_block; + cram_free_container; + cram_new_block; + cram_new_container; + cram_open; + cram_read_block; + cram_read_container; + cram_seek; + cram_set_header; + cram_set_option; + cram_set_voption; + cram_uncompress_block; + cram_write_block; + cram_write_container; + fai_build; + fai_destroy; + fai_fetch; + fai_load; + faidx_fetch_nseq; + faidx_fetch_seq; + faidx_has_seq; + hclose; + hclose_abruptly; + hdopen; + hfile_destroy; + hfile_init; + hfile_oflags; + hflush; + hgetc2; + hopen; + 
hpeek; + hputc2; + hputs2; + hread2; + hrec_add_idx; + hseek; + hts_close; + hts_file_type; + hts_get_bgzfp; + hts_getline; + hts_idx_destroy; + hts_idx_finish; + hts_idx_get_meta; + hts_idx_get_n_no_coor; + hts_idx_get_stat; + hts_idx_init; + hts_idx_load; + hts_idx_push; + hts_idx_save; + hts_idx_seqnames; + hts_idx_set_meta; + hts_itr_destroy; + hts_itr_next; + hts_itr_query; + hts_itr_querys; + hts_open; + hts_parse_reg; + hts_readlines; + hts_readlist; + hts_set_fai_filename; + hts_set_threads; + hts_verbose; + hts_version; + hwrite2; + kf_betai; + kf_erfc; + kf_gammap; + kf_gammaq; + kf_lgamma; + kmemmem; + knet_close; + knet_dopen; + knet_open; + knet_read; + knet_seek; + ksplit_core; + ksprintf; + kstrnstr; + kstrstr; + kstrtok; + kt_fisher_exact; + kvsprintf; + sam_format1; + sam_hdr_add_lines; + sam_hdr_dup; + sam_hdr_incr_ref; + sam_hdr_length; + sam_hdr_parse; + sam_hdr_read; + sam_hdr_str; + sam_hdr_write; + sam_index_load; + sam_itr_queryi; + sam_itr_querys; + sam_open_mode; + sam_parse1; + sam_read1; + sam_write1; + seq_nt16_str; + seq_nt16_table; + stringify_argv; + tbx_conf_bed; + tbx_conf_gff; + tbx_conf_psltbl; + tbx_conf_sam; + tbx_conf_vcf; + tbx_destroy; + tbx_index; + tbx_index_build; + tbx_index_load; + tbx_name2id; + tbx_readrec; + tbx_seqnames; + vcf_format; + vcf_hdr_read; + vcf_hdr_write; + vcf_parse; + vcf_read; + vcf_write; + vcf_write_line; +}; + +HTSLIB_1.1 { + bcf_get_fmt_id; + bcf_get_info_id; + faidx_iseq; + faidx_nseq; + faidx_seq_len; +} HTSLIB_1.0; + + +HTSLIB_1.2.1 { + bcf_copy; + bcf_sr_strerror; + hisremote; + hts_detect_format; + hts_format_description; + hts_get_format; + hts_hopen; + hts_set_opt; + regidx_destroy; + regidx_init; + regidx_insert; + regidx_nregs; + regidx_overlap; + regidx_parse_bed; + regidx_parse_tab; + regidx_seq_names; + regidx_seq_nregs; + seq_nt16_int; +} HTSLIB_1.1; + +HTSLIB_1.3 { + bcf_add_id; + bcf_empty; + bcf_hdr_merge; + bcf_index_build2; + bcf_index_load2; + bcf_remove_allele_set; + 
bgzf_compress; + cram_block_append; + cram_block_get_comp_size; + cram_block_get_content_id; + cram_block_get_content_type; + cram_block_get_crc32; + cram_block_get_data; + cram_block_get_offset; + cram_block_get_uncomp_size; + cram_block_set_comp_size; + cram_block_set_content_id; + cram_block_set_crc32; + cram_block_set_data; + cram_block_set_offset; + cram_block_set_uncomp_size; + cram_block_size; + cram_block_update_size; + cram_container_get_landmarks; + cram_container_get_length; + cram_container_get_num_blocks; + cram_container_is_empty; + cram_container_set_landmarks; + cram_container_set_length; + cram_container_set_num_blocks; + cram_container_size; + cram_copy_slice; + cram_fd_get_fp; + cram_fd_get_header; + cram_fd_get_version; + cram_fd_set_fp; + cram_fd_set_header; + cram_fd_set_version; + cram_major_vers; + cram_minor_vers; + cram_store_container; + cram_transcode_rg; + hfile_add_scheme_handler; + hfile_always_local; + hfile_always_remote; + hts_format_file_extension; + hts_idx_load2; + hts_idx_save_as; + hts_md5_destroy; + hts_md5_final; + hts_md5_hex; + hts_md5_init; + hts_md5_reset; + hts_md5_update; + hts_open_format; + hts_opt_add; + hts_opt_apply; + hts_opt_free; + hts_parse_decimal; + hts_parse_format; + hts_parse_opt_list; + int32_put_blk; + kgetline; + sam_index_build; + sam_index_build2; + sam_index_load2; + sam_open_mode_opts; + tbx_index_build2; + tbx_index_load2; +} HTSLIB_1.2.1; + +HTSLIB_1.4 { + bam_auxB2f; + bam_auxB2i; + bam_auxB_len; + bam_aux_update_str; + bam_mplp_constructor; + bam_mplp_destructor; + bam_mplp_reset; + bam_plp_constructor; + bam_plp_destructor; + bcf_hdr_format; + bcf_index_build3; + bcf_sr_destroy_threads; + bcf_sr_set_opt; + bcf_sr_set_threads; + bgzf_block_write; + bgzf_compression; + bgzf_index_dump_hfile; + bgzf_index_load_hfile; + bgzf_thread_pool; + cram_check_EOF; + cram_get_refs; + errmod_cal; + errmod_destroy; + errmod_init; + fai_build3; + fai_load3; + hgetdelim; + hgets; + hts_check_EOF; + 
hts_json_fnext; + hts_json_fskip_value; + hts_json_snext; + hts_json_sskip_value; + hts_realloc_or_die; + hts_set_cache_size; + hts_set_thread_pool; + hts_tpool_delete_result; + hts_tpool_destroy; + hts_tpool_dispatch; + hts_tpool_dispatch2; + hts_tpool_init; + hts_tpool_kill; + hts_tpool_next_result; + hts_tpool_next_result_wait; + hts_tpool_process_attach; + hts_tpool_process_destroy; + hts_tpool_process_detach; + hts_tpool_process_empty; + hts_tpool_process_flush; + hts_tpool_process_init; + hts_tpool_process_len; + hts_tpool_process_qsize; + hts_tpool_process_ref_decr; + hts_tpool_process_ref_incr; + hts_tpool_process_reset; + hts_tpool_process_shutdown; + hts_tpool_process_sz; + hts_tpool_result_data; + hts_tpool_size; + hts_tpool_wake_dispatch; + kputd; + probaln_glocal; + sam_cap_mapq; + sam_index_build3; + sam_prob_realn; + tbx_index_build3; +} HTSLIB_1.3; + +HTSLIB_1.5 { + hfile_set_blksize; + hts_get_log_level; + hts_log; + hts_set_log_level; +} HTSLIB_1.4; + +HTSLIB_1.6 { + hts_drand48; + hts_erand48; + hts_lrand48; + hts_srand48; +} HTSLIB_1.5; + +HTSLIB_1.7 { + hfile_mem_get_buffer; + hfile_mem_steal_buffer; + hts_itr_multi_bam; + hts_itr_multi_cram; + hts_itr_multi_next; + hts_itr_regions; + hts_json_alloc_token; + hts_json_free_token; + hts_json_token_str; + hts_json_token_type; + hts_reglist_free; + sam_hdr_change_HD; + sam_itr_regions; +} HTSLIB_1.6; + +HTSLIB_1.9 { + bam_aux_update_array; + bam_aux_update_float; + bam_aux_update_int; + fai_fetchqual; + fai_load3_format; + fai_load_format; + faidx_fetch_qual; +} HTSLIB_1.7; + +HTSLIB_1.10 { + bam_cigar_table; + bam_mplp64_auto; + bam_plp64_auto; + bam_plp64_next; + bam_plp_insertion; + bam_set_qname; + bcf_idx_init; + bcf_idx_save; + bcf_index_load3; + bgzf_peek; + fai_fetch64; + fai_fetchqual64; + fai_parse_region; + fai_set_cache_size; + faidx_fetch_qual64; + faidx_fetch_seq64; + haddextension; + hts_free; + hts_idx_fmt; + hts_idx_load3; + hts_idx_tbi_name; + hts_parse_reg64; + hts_parse_region; 
+ hts_reglist_create; + hts_resize_array_; + hts_tpool_dispatch3; + kgetline2; + regidx_init_string; + regidx_insert_list; + regidx_parse_reg; + regidx_parse_vcf; + regidx_push; + regitr_copy; + regitr_destroy; + regitr_init; + regitr_loop; + regitr_overlap; + regitr_reset; + sam_hdr_add_line; + sam_hdr_add_pg; + sam_hdr_count_lines; + sam_hdr_destroy; + sam_hdr_find_line_id; + sam_hdr_find_line_pos; + sam_hdr_find_tag_id; + sam_hdr_find_tag_pos; + sam_hdr_init; + sam_hdr_line_index; + sam_hdr_line_name; + sam_hdr_name2tid; + sam_hdr_nref; + sam_hdr_pg_id; + sam_hdr_remove_except; + sam_hdr_remove_line_id; + sam_hdr_remove_line_pos; + sam_hdr_remove_lines; + sam_hdr_remove_tag_id; + sam_hdr_tid2len; + sam_hdr_tid2name; + sam_hdr_update_line; + sam_idx_init; + sam_idx_save; + sam_index_load3; + sam_itr_regarray; + sam_parse_region; + tbx_index_load3; +} HTSLIB_1.9; + +HTSLIB_1.11 { + fai_path; + hts_lib_shutdown; + hts_tpool_process_is_shutdown; + vcf_open_mode; +} HTSLIB_1.10; + +HTSLIB_1.12 { + bam_parse_cigar; + bam_set1; + hfile_has_plugin; + hfile_list_plugins; + hfile_list_schemes; + hts_feature_string; + hts_features; + hts_filter_eval; + hts_filter_free; + hts_filter_init; + hts_set_filter_expression; + hts_test_feature; + sam_parse_cigar; + sam_passes_filter; +} HTSLIB_1.11; + +HTSLIB_1.13 { + hts_idx_nseq; +} HTSLIB_1.12; + +HTSLIB_1.14 { + bam_mods_at_next_pos; + bam_mods_at_qpos; + bam_next_basemod; + bam_parse_basemod; + bam_plp_insertion_mod; + hts_base_mod_state_alloc; + hts_base_mod_state_free; + hts_flush; +} HTSLIB_1.13; + +HTSLIB_1.15 { + hts_detect_format2; +} HTSLIB_1.14; + +HTSLIB_1.16 { + bam_mods_query_type; + bam_mods_recorded; + bcf_has_variant_type; + bcf_has_variant_types; + bcf_variant_length; + cram_decode_slice_header; + cram_free_slice_header; + cram_slice_hdr_get_coords; + cram_slice_hdr_get_embed_ref_id; + cram_slice_hdr_get_num_blocks; + hts_filter_eval2; +} HTSLIB_1.15; + +HTSLIB_1.17 { + bam_aux_first; + bam_aux_next; + 
bam_aux_remove; + bcf_strerror; + cram_block_get_method; + cram_cid2ds_free; + cram_cid2ds_query; + cram_codec_describe; + cram_codec_get_content_ids; + cram_container_get_num_bases; + cram_container_get_num_records; + cram_decode_compression_header; + cram_describe_encodings; + cram_expand_method; + cram_free_compression_header; + cram_update_cid2ds_map; + fai_adjust_region; + fai_line_length; + faidx_seq_len64; +} HTSLIB_1.16; + +HTSLIB_1.18 { + bam_mods_queryi; + bam_parse_basemod2; + fai_thread_pool; +} HTSLIB_1.17; diff --git a/htslib.mk b/htslib.mk index b750869c8..9c60ffc2b 100644 --- a/htslib.mk +++ b/htslib.mk @@ -1,6 +1,6 @@ # Makefile rules useful for third-party code using htslib's public API. # -# Copyright (C) 2013-2017, 2019 Genome Research Ltd. +# Copyright (C) 2013-2017, 2019, 2021 Genome Research Ltd. # # Author: John Marshall # @@ -27,14 +27,22 @@ # If your source file foo.c #includes and , # you can write the correct prerequisites for foo.o as: # -# HTSDIR = +# HTSDIR = # include $(HTSDIR)/htslib.mk # # foo.o: foo.c $(htslib_hts_h) $(htslib_kstring_h) -HTSPREFIX = $(HTSDIR)/ +HTSSRCDIR = $(HTSDIR) +HTSPREFIX = $(HTSSRCDIR)/ include $(HTSDIR)/htslib_vars.mk +# This file provides the HTSCODECS_SOURCES variable. It may not be present +# in a freshly checked-out htslib, so is only included if available. The +# absence is unlikely to cause a problem as there will be plenty of other +# missing files that will trigger a build in htslib, and when that happens +# htslib's makefile will create it. +-include $(HTSDIR)/htscodecs.mk + # Rules for rebuilding an in-development htslib's static and shared libraries. 
# If your program foo links with libhts, adding the appropriate prerequisite # will cause the library to be rebuilt as necessary: @@ -47,106 +55,102 @@ include $(HTSDIR)/htslib_vars.mk # $(HTSDIR)/tabix -p bed bar.bed.bgz HTSLIB_PUBLIC_HEADERS = \ - $(HTSDIR)/htslib/bgzf.h \ - $(HTSDIR)/htslib/cram.h \ - $(HTSDIR)/htslib/faidx.h \ - $(HTSDIR)/htslib/hfile.h \ - $(HTSDIR)/htslib/hts.h \ - $(HTSDIR)/htslib/hts_defs.h \ - $(HTSDIR)/htslib/hts_endian.h \ - $(HTSDIR)/htslib/hts_log.h \ - $(HTSDIR)/htslib/hts_os.h \ - $(HTSDIR)/htslib/kbitset.h \ - $(HTSDIR)/htslib/kfunc.h \ - $(HTSDIR)/htslib/khash.h \ - $(HTSDIR)/htslib/khash_str2int.h \ - $(HTSDIR)/htslib/klist.h \ - $(HTSDIR)/htslib/knetfile.h \ - $(HTSDIR)/htslib/kseq.h \ - $(HTSDIR)/htslib/ksort.h \ - $(HTSDIR)/htslib/kstring.h \ - $(HTSDIR)/htslib/regidx.h \ - $(HTSDIR)/htslib/sam.h \ - $(HTSDIR)/htslib/synced_bcf_reader.h \ - $(HTSDIR)/htslib/tbx.h \ - $(HTSDIR)/htslib/thread_pool.h \ - $(HTSDIR)/htslib/vcf.h \ - $(HTSDIR)/htslib/vcf_sweep.h \ - $(HTSDIR)/htslib/vcfutils.h + $(HTSSRCDIR)/htslib/bgzf.h \ + $(HTSSRCDIR)/htslib/cram.h \ + $(HTSSRCDIR)/htslib/faidx.h \ + $(HTSSRCDIR)/htslib/hfile.h \ + $(HTSSRCDIR)/htslib/hts.h \ + $(HTSSRCDIR)/htslib/hts_defs.h \ + $(HTSSRCDIR)/htslib/hts_endian.h \ + $(HTSSRCDIR)/htslib/hts_expr.h \ + $(HTSSRCDIR)/htslib/hts_log.h \ + $(HTSSRCDIR)/htslib/hts_os.h \ + $(HTSSRCDIR)/htslib/kbitset.h \ + $(HTSSRCDIR)/htslib/kfunc.h \ + $(HTSSRCDIR)/htslib/khash.h \ + $(HTSSRCDIR)/htslib/khash_str2int.h \ + $(HTSSRCDIR)/htslib/klist.h \ + $(HTSSRCDIR)/htslib/kseq.h \ + $(HTSSRCDIR)/htslib/ksort.h \ + $(HTSSRCDIR)/htslib/kstring.h \ + $(HTSSRCDIR)/htslib/regidx.h \ + $(HTSSRCDIR)/htslib/sam.h \ + $(HTSSRCDIR)/htslib/synced_bcf_reader.h \ + $(HTSSRCDIR)/htslib/tbx.h \ + $(HTSSRCDIR)/htslib/thread_pool.h \ + $(HTSSRCDIR)/htslib/vcf.h \ + $(HTSSRCDIR)/htslib/vcf_sweep.h \ + $(HTSSRCDIR)/htslib/vcfutils.h HTSLIB_ALL = \ $(HTSLIB_PUBLIC_HEADERS) \ - $(HTSDIR)/bcf_sr_sort.c \ - 
$(HTSDIR)/bcf_sr_sort.h \ - $(HTSDIR)/bgzf.c \ + $(HTSSRCDIR)/bcf_sr_sort.c \ + $(HTSSRCDIR)/bcf_sr_sort.h \ + $(HTSSRCDIR)/bgzf.c \ $(HTSDIR)/config.h \ - $(HTSDIR)/errmod.c \ - $(HTSDIR)/faidx.c \ - $(HTSDIR)/header.c \ - $(HTSDIR)/header.h \ - $(HTSDIR)/hfile_internal.h \ - $(HTSDIR)/hfile.c \ - $(HTSDIR)/hfile_gcs.c \ - $(HTSDIR)/hfile_libcurl.c \ - $(HTSDIR)/hfile_net.c \ - $(HTSDIR)/hfile_s3.c \ - $(HTSDIR)/hfile_s3_write.c \ - $(HTSDIR)/hts.c \ - $(HTSDIR)/hts_internal.h \ - $(HTSDIR)/hts_os.c \ - $(HTSDIR)/kfunc.c \ - $(HTSDIR)/knetfile.c \ - $(HTSDIR)/kstring.c \ - $(HTSDIR)/md5.c \ - $(HTSDIR)/multipart.c \ - $(HTSDIR)/plugin.c \ - $(HTSDIR)/probaln.c \ - $(HTSDIR)/realn.c \ - $(HTSDIR)/regidx.c \ - $(HTSDIR)/region.c \ - $(HTSDIR)/sam.c \ - $(HTSDIR)/sam_internal.h \ - $(HTSDIR)/synced_bcf_reader.c \ - $(HTSDIR)/tbx.c \ - $(HTSDIR)/textutils.c \ - $(HTSDIR)/textutils_internal.h \ - $(HTSDIR)/thread_pool.c \ - $(HTSDIR)/thread_pool_internal.h \ - $(HTSDIR)/vcf.c \ - $(HTSDIR)/vcf_sweep.c \ - $(HTSDIR)/vcfutils.c \ - $(HTSDIR)/cram/cram.h \ - $(HTSDIR)/cram/cram_codecs.c \ - $(HTSDIR)/cram/cram_codecs.h \ - $(HTSDIR)/cram/cram_decode.c \ - $(HTSDIR)/cram/cram_decode.h \ - $(HTSDIR)/cram/cram_encode.c \ - $(HTSDIR)/cram/cram_encode.h \ - $(HTSDIR)/cram/cram_external.c \ - $(HTSDIR)/cram/cram_index.c \ - $(HTSDIR)/cram/cram_index.h \ - $(HTSDIR)/cram/cram_io.c \ - $(HTSDIR)/cram/cram_io.h \ - $(HTSDIR)/cram/cram_samtools.c \ - $(HTSDIR)/cram/cram_samtools.h \ - $(HTSDIR)/cram/cram_stats.c \ - $(HTSDIR)/cram/cram_stats.h \ - $(HTSDIR)/cram/cram_structs.h \ - $(HTSDIR)/cram/mFILE.c \ - $(HTSDIR)/cram/mFILE.h \ - $(HTSDIR)/cram/misc.h \ - $(HTSDIR)/cram/open_trace_file.c \ - $(HTSDIR)/cram/open_trace_file.h \ - $(HTSDIR)/cram/os.h \ - $(HTSDIR)/cram/pooled_alloc.c \ - $(HTSDIR)/cram/pooled_alloc.h \ - $(HTSDIR)/cram/rANS_byte.h \ - $(HTSDIR)/cram/rANS_static.c \ - $(HTSDIR)/cram/rANS_static.h \ - $(HTSDIR)/cram/string_alloc.c \ - $(HTSDIR)/cram/string_alloc.h \ 
- $(HTSDIR)/os/lzma_stub.h \ - $(HTSDIR)/os/rand.c + $(HTSSRCDIR)/errmod.c \ + $(HTSSRCDIR)/faidx.c \ + $(HTSSRCDIR)/header.c \ + $(HTSSRCDIR)/header.h \ + $(HTSSRCDIR)/hfile_internal.h \ + $(HTSSRCDIR)/hfile.c \ + $(HTSSRCDIR)/hfile_gcs.c \ + $(HTSSRCDIR)/hfile_libcurl.c \ + $(HTSSRCDIR)/hfile_s3.c \ + $(HTSSRCDIR)/hfile_s3_write.c \ + $(HTSSRCDIR)/hts.c \ + $(HTSSRCDIR)/hts_expr.c \ + $(HTSSRCDIR)/hts_internal.h \ + $(HTSSRCDIR)/hts_os.c \ + $(HTSSRCDIR)/kfunc.c \ + $(HTSSRCDIR)/kstring.c \ + $(HTSSRCDIR)/md5.c \ + $(HTSSRCDIR)/multipart.c \ + $(HTSSRCDIR)/plugin.c \ + $(HTSSRCDIR)/probaln.c \ + $(HTSSRCDIR)/realn.c \ + $(HTSSRCDIR)/regidx.c \ + $(HTSSRCDIR)/region.c \ + $(HTSSRCDIR)/sam.c \ + $(HTSSRCDIR)/sam_internal.h \ + $(HTSSRCDIR)/synced_bcf_reader.c \ + $(HTSSRCDIR)/tbx.c \ + $(HTSSRCDIR)/textutils.c \ + $(HTSSRCDIR)/textutils_internal.h \ + $(HTSSRCDIR)/thread_pool.c \ + $(HTSSRCDIR)/thread_pool_internal.h \ + $(HTSSRCDIR)/vcf.c \ + $(HTSSRCDIR)/vcf_sweep.c \ + $(HTSSRCDIR)/vcfutils.c \ + $(HTSSRCDIR)/cram/cram.h \ + $(HTSSRCDIR)/cram/cram_codecs.c \ + $(HTSSRCDIR)/cram/cram_codecs.h \ + $(HTSSRCDIR)/cram/cram_decode.c \ + $(HTSSRCDIR)/cram/cram_decode.h \ + $(HTSSRCDIR)/cram/cram_encode.c \ + $(HTSSRCDIR)/cram/cram_encode.h \ + $(HTSSRCDIR)/cram/cram_external.c \ + $(HTSSRCDIR)/cram/cram_index.c \ + $(HTSSRCDIR)/cram/cram_index.h \ + $(HTSSRCDIR)/cram/cram_io.c \ + $(HTSSRCDIR)/cram/cram_io.h \ + $(HTSSRCDIR)/cram/cram_samtools.h \ + $(HTSSRCDIR)/cram/cram_stats.c \ + $(HTSSRCDIR)/cram/cram_stats.h \ + $(HTSSRCDIR)/cram/cram_structs.h \ + $(HTSSRCDIR)/cram/mFILE.c \ + $(HTSSRCDIR)/cram/mFILE.h \ + $(HTSSRCDIR)/cram/misc.h \ + $(HTSSRCDIR)/cram/open_trace_file.c \ + $(HTSSRCDIR)/cram/open_trace_file.h \ + $(HTSSRCDIR)/cram/os.h \ + $(HTSSRCDIR)/cram/pooled_alloc.c \ + $(HTSSRCDIR)/cram/pooled_alloc.h \ + $(HTSSRCDIR)/cram/string_alloc.c \ + $(HTSSRCDIR)/cram/string_alloc.h \ + $(HTSSRCDIR)/os/lzma_stub.h \ + $(HTSSRCDIR)/os/rand.c \ + 
$(HTSCODECS_SOURCES) $(HTSDIR)/config.h: +cd $(HTSDIR) && $(MAKE) config.h @@ -163,15 +167,13 @@ $(HTSDIR)/libhts.so: $(HTSLIB_ALL) $(HTSDIR)/libhts.dylib $(HTSDIR)/libhts.dll.a $(HTSDIR)/hts.dll.a: $(HTSDIR)/hts-object-files +cd $(HTSDIR) && $(MAKE) lib-shared -$(HTSDIR)/bgzip: $(HTSDIR)/bgzip.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a +$(HTSDIR)/bgzip: $(HTSSRCDIR)/bgzip.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a +cd $(HTSDIR) && $(MAKE) bgzip -$(HTSDIR)/htsfile: $(HTSDIR)/htsfile.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a - +$(HTSDIR)/htsfile: $(HTSSRCDIR)/htsfile.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a +cd $(HTSDIR) && $(MAKE) htsfile -$(HTSDIR)/tabix: $(HTSDIR)/tabix.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a - +$(HTSDIR)/tabix: $(HTSSRCDIR)/tabix.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a +cd $(HTSDIR) && $(MAKE) tabix $(HTSDIR)/htslib_static.mk: $(HTSDIR)/htslib.pc.tmp @@ -185,7 +187,8 @@ $(HTSDIR)/htslib.pc.tmp: # # clean: clean-htslib -all-htslib clean-htslib install-htslib plugins-htslib: +all-htslib check-htslib clean-htslib distclean-htslib install-htslib mostlyclean-htslib plugins-htslib test-htslib testclean-htslib: +cd $(HTSDIR) && $(MAKE) $(@:-htslib=) -.PHONY: all-htslib clean-htslib install-htslib plugins-htslib +.PHONY: all-htslib check-htslib clean-htslib distclean-htslib install-htslib +.PHONY: mostlyclean-htslib plugins-htslib test-htslib testclean-htslib diff --git a/htslib/bgzf.h b/htslib/bgzf.h index 8e6b9b17e..ea4ec3ece 100644 --- a/htslib/bgzf.h +++ b/htslib/bgzf.h @@ -3,7 +3,7 @@ /* Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos - Copyright (C) 2009, 2013, 2014, 2017, 2018-2019 Genome Research Ltd + Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022-2023 Genome Research Ltd Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -35,6 +35,13 @@ #include 
"hts_defs.h" +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header. +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + #ifdef __cplusplus extern "C" { #endif @@ -296,7 +303,8 @@ typedef struct BGZF BGZF; * @param fp BGZF file handler * @param delim delimiter * @param str string to write to; must be initialized - * @return length of the string; -1 on end-of-file; <= -2 on error + * @return length of the string (capped at INT_MAX); + * -1 on end-of-file; <= -2 on error */ HTSLIB_EXPORT int bgzf_getline(BGZF *fp, int delim, struct kstring_t *str); @@ -308,23 +316,24 @@ typedef struct BGZF BGZF; int bgzf_read_block(BGZF *fp) HTS_RESULT_USED; /** - * Enable multi-threading (when compiled with -DBGZF_MT) via a shared - * thread pool. This means both encoder and decoder can balance - * usage across a single pool of worker jobs. + * Enable multi-threading via a shared thread pool. This means + * both encoder and decoder can balance usage across a single pool + * of worker jobs. * - * @param fp BGZF file handler; must be opened for writing + * @param fp BGZF file handler * @param pool The thread pool (see hts_create_threads) + * @param qsize The size of the job queue. If 0 this is twice the + * number of threads in the pool. 
*/ HTSLIB_EXPORT int bgzf_thread_pool(BGZF *fp, struct hts_tpool *pool, int qsize); /** - * Enable multi-threading (only effective when the library was compiled - * with -DBGZF_MT) + * Enable multi-threading * - * @param fp BGZF file handler; must be opened for writing - * @param n_threads #threads used for writing - * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended + * @param fp BGZF file handler + * @param n_threads #threads used for reading / writing + * @param n_sub_blks Unused (was #blocks processed by each thread) */ HTSLIB_EXPORT int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks); @@ -450,4 +459,9 @@ typedef struct BGZF BGZF; } #endif +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif diff --git a/htslib/cram.h b/htslib/cram.h index bbbabe82e..e0b51839c 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -1,7 +1,7 @@ /// @file htslib/cram.h /// CRAM format-specific API functions. /* - Copyright (C) 2015, 2016, 2018-2019 Genome Research Ltd. + Copyright (C) 2015, 2016, 2018-2020, 2022-2023 Genome Research Ltd. Author: James Bonfield @@ -47,17 +47,87 @@ DEALINGS IN THE SOFTWARE. */ extern "C" { #endif +// see cram/cram_structs.h for an internal more complete copy of this enum + +// Htslib 1.11 had these listed without any hts prefix, and included +// some internal values such as RANS1 and GZIP_RLE (which shouldn't have ever +// been public). +// +// We can't find evidence of these being used and the data type occurs +// nowhere in functions or structures meaning using it would be pointless. +// However for safety, if you absolutely need the API to not change then +// define HTS_COMPAT to 101100 (XYYYZZ for X.Y[.Z], meaning 1.11). +#if defined(HTS_COMPAT) && HTS_COMPAT <= 101100 enum cram_block_method { + // Public methods as defined in the CRAM spec. 
BM_ERROR = -1, + + // CRAM 2.x and 3.0 RAW = 0, GZIP = 1, BZIP2 = 2, LZMA = 3, - RANS = 4, // Generic; either order + RANS = 4, + + // NB: the subsequent numbers may change. They're simply here for + // compatibility with the old API, but may have no bearing on the + // internal way htslib works. DO NOT USE RANS0 = 4, - RANS1 = 10, // Not externalised; stored as RANS (generic) - GZIP_RLE = 11, // NB: not externalised in CRAM + RANS1 = 10, + GZIP_RLE = 11, +}; +#else + +// Values as defined in the CRAM specifications. +// See cram/cram_structs.h cram_block_method_int for an expanded version of +// this with local specialisations assigned to codes. +enum cram_block_method { + CRAM_COMP_UNKNOWN = -1, + + // CRAM 2.x and 3.0 + CRAM_COMP_RAW = 0, + CRAM_COMP_GZIP = 1, + CRAM_COMP_BZIP2 = 2, + + // CRAM 3.0 + CRAM_COMP_LZMA = 3, + CRAM_COMP_RANS4x8 = 4, // 4-way interleaving, 8-bit renormalisation + + // CRAM 3.1 + CRAM_COMP_RANSNx16 = 5, // both 4x16 and 32x16 variants, plus transforms + CRAM_COMP_ARITH = 6, // aka Range coding + CRAM_COMP_FQZ = 7, // FQZComp + CRAM_COMP_TOK3 = 8, // Name tokeniser }; +#endif + +/* NOTE this structure may be expanded in future releases by appending + * additional fields. + * + * Do not assume the size is fixed and avoid using arrays of this struct. + */ +typedef struct { + enum cram_block_method method; + + // Generic compression level if known (0 if not). + // 1 or 9 for gzip min/max flag (else 5). 1-9 for bzip2 + // 1 or 11 for tok3 (rans/arith encoder). 
+ int level; + + // For rans* and arith codecs + int order; + + // ransNx16/arith specific + int rle; + int pack; + int stripe; + int cat; + int nosz; + int Nway; + + // Arithmetic coder only + int ext; // external: use gz, xz or bzip2 +} cram_method_details; enum cram_content_type { CT_ERROR = -1, @@ -78,6 +148,7 @@ typedef struct cram_slice cram_slice; typedef struct cram_metrics cram_metrics; typedef struct cram_block_slice_hdr cram_block_slice_hdr; typedef struct cram_block_compression_hdr cram_block_compression_hdr; +typedef struct cram_codec cram_codec; typedef struct refs_t refs_t; struct hFILE; @@ -128,6 +199,10 @@ int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks) HTSLIB_EXPORT void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks, int32_t *landmarks); +HTSLIB_EXPORT +int32_t cram_container_get_num_records(cram_container *c); +HTSLIB_EXPORT +int64_t cram_container_get_num_bases(cram_container *c); /* Returns true if the container is empty (EOF marker) */ HTSLIB_EXPORT @@ -148,9 +223,14 @@ HTSLIB_EXPORT int32_t cram_block_get_crc32(cram_block *b); HTSLIB_EXPORT void * cram_block_get_data(cram_block *b); - HTSLIB_EXPORT enum cram_content_type cram_block_get_content_type(cram_block *b); +HTSLIB_EXPORT +enum cram_block_method cram_block_get_method(cram_block *b); + +HTSLIB_EXPORT +cram_method_details *cram_expand_method(uint8_t *data, int32_t size, + enum cram_block_method comp); HTSLIB_EXPORT void cram_block_set_content_id(cram_block *b, int32_t id); @@ -181,6 +261,27 @@ void cram_block_set_offset(cram_block *b, size_t offset); HTSLIB_EXPORT uint32_t cram_block_size(cram_block *b); +/* + * Returns the Block Content ID values referred to by a cram_codec in + * ids[2]. + * + * -2 is unused. + * -1 is CORE + * >= 0 is the block with that Content ID + */ +HTSLIB_EXPORT +void cram_codec_get_content_ids(cram_codec *c, int ids[2]); + +/* + * Produces a human readable description of the codec parameters. 
+ * This is appended to an existing kstring 'ks'. + * + * Returns 0 on success, + * <0 on failure + */ +HTSLIB_EXPORT +int cram_codec_describe(cram_codec *c, kstring_t *ks); + /* * Renumbers RG numbers in a cram compression header. * @@ -228,6 +329,106 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out, HTSLIB_EXPORT int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice); +/* + * Decodes a CRAM block compression header. + * Returns header ptr on success + * NULL on failure + */ +HTSLIB_EXPORT +cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, + cram_block *b); +/* + * Frees a cram_block_compression_hdr structure. + */ +HTSLIB_EXPORT +void cram_free_compression_header(cram_block_compression_hdr *hdr); + +typedef struct cram_cid2ds_t cram_cid2ds_t; + +/* + * Map cram block numbers to data-series. It's normally a 1:1 mapping, + * but in rare cases it can be 1:many (or even many:many). + * The key is the block number and the value is an index into the data-series + * array, which we iterate over until reaching a negative value. + * + * Provide cid2ds as NULL to allocate a new map or pass in an existing one + * to append to this map. The new (or existing) map is returned. + * + * Returns the cid2ds (newly allocated or as provided) on success, + * NULL on failure. + */ +HTSLIB_EXPORT +cram_cid2ds_t *cram_update_cid2ds_map(cram_block_compression_hdr *hdr, + cram_cid2ds_t *cid2ds); + +/* + * Return a list of data series observed as belonging to a block with + * the specified content_id. *n is the number of data series + * returned, or 0 if block is unused. + * Block content_id of -1 is used to indicate the CORE block. + * + * The pointer returned is owned by the cram_cid2ds state and should + * not be freed by the caller. 
+ */ +HTSLIB_EXPORT +int *cram_cid2ds_query(cram_cid2ds_t *c2d, int content_id, int *n); + +/* + * Frees a cram_cid2ds_t allocated by cram_update_cid2ds_map + */ +HTSLIB_EXPORT +void cram_cid2ds_free(cram_cid2ds_t *cid2ds); + +/* + * Produces a description of the record and tag encodings held within + * a compression header and appends to 'ks'. + * + * Returns 0 on success, + * <0 on failure. + */ +HTSLIB_EXPORT +int cram_describe_encodings(cram_block_compression_hdr *hdr, kstring_t *ks); + +/* + *----------------------------------------------------------------------------- + * cram slice interrogation + */ + +/* + * Returns the number of cram blocks within this slice. + */ +HTSLIB_EXPORT +int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr); + +/* + * Returns the block content_id for the block containing an embedded reference + * sequence. If none is present, -1 is returned. + */ +HTSLIB_EXPORT +int cram_slice_hdr_get_embed_ref_id(cram_block_slice_hdr *h); + +/* + * Returns slice reference ID, start and span (length) coordinates. + * Return parameters may be NULL in which case they are ignored. + */ +HTSLIB_EXPORT +void cram_slice_hdr_get_coords(cram_block_slice_hdr *h, + int *refid, hts_pos_t *start, hts_pos_t *span); + +/* + * Decodes a slice header from a cram block. + * Returns the opaque cram_block_slice_hdr pointer on success, + * NULL on failure. + */ +HTSLIB_EXPORT +cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b); + +/* + * Frees a cram_block_slice_hdr structure. 
+ */ +HTSLIB_EXPORT +void cram_free_slice_header(cram_block_slice_hdr *hdr); + /* *----------------------------------------------------------------------------- * cram_io basics @@ -306,6 +507,9 @@ int cram_uncompress_block(cram_block *b); HTSLIB_EXPORT int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, int method, int level); +int cram_compress_block2(cram_fd *fd, cram_slice *s, + cram_block *b, cram_metrics *metrics, + int method, int level); /**@}*/ /**@{ ---------------------------------------------------------------------- diff --git a/htslib/faidx.h b/htslib/faidx.h index 149cebd2e..12f3f4b40 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -1,7 +1,7 @@ /// @file htslib/faidx.h /// FASTA random access. /* - Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2020 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2020, 2022-2023 Genome Research Ltd. Author: Heng Li @@ -70,6 +70,9 @@ struct faidx_t; /// Opaque structure representing FASTA index typedef struct faidx_t faidx_t; +/// Opaque structure; sole item needed from htslib/thread_pool.h +struct hts_tpool; + /// File format to be dealing with. enum fai_format_options { FAI_NONE, @@ -188,6 +191,15 @@ char *fai_fetch(const faidx_t *fai, const char *reg, int *len); HTSLIB_EXPORT char *fai_fetch64(const faidx_t *fai, const char *reg, hts_pos_t *len); +/// Query the line-wrap length for a chromosome specified as part of a region +/** @param fai Pointer to the faidx_t struct + @param reg Region in the format "chr2:20,000-30,000" + @return The line length (excluding newline), + negative on error. +*/ +HTSLIB_EXPORT +hts_pos_t fai_line_length(const faidx_t *fai, const char *reg); + /// Fetch the quality string for a region for FASTQ files /** @param fai Pointer to the faidx_t struct @param reg Region in the format "chr2:20,000-30,000" @@ -225,6 +237,10 @@ by end users by calling `free()` on it. 
HTSLIB_EXPORT char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len); +void faidx_fetch_seq_into_buffer(const faidx_t *fai, + const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, + char* s, hts_pos_t *len); + /// Fetch the sequence in a region /** @param fai Pointer to the faidx_t struct @param c_name Region name @@ -283,7 +299,22 @@ int faidx_nseq(const faidx_t *fai); HTSLIB_EXPORT const char *faidx_iseq(const faidx_t *fai, int i); -/// Return sequence length, -1 if not present +/// Return sequence length +/** @param fai Pointer to the faidx_t struct + @param seq Name of the sequence + @return Sequence length, or -1 if not present +*/ +HTSLIB_EXPORT +hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq); + +/// Return sequence length +/** @param fai Pointer to the faidx_t struct + @param seq Name of the sequence + @return Sequence length, or -1 if not present + + @deprecated This function cannot handle very long sequences. + Use faidx_seq_len64() instead. +*/ HTSLIB_EXPORT int faidx_seq_len(const faidx_t *fai, const char *seq); @@ -305,6 +336,27 @@ const char *fai_parse_region(const faidx_t *fai, const char *s, int *tid, hts_pos_t *beg, hts_pos_t *end, int flags); +/// Adjust region to the actual sequence length +/** @param fai Pointer to the faidx_t struct + @param tid Sequence index, as returned by fai_parse_region() + @param beg[in,out] The start of the region (0 based) + @param end[in,out] One past end of the region (0 based) + @return 1, 2, or 3 if @p beg, @p end, or both are adjusted, + 0 if @p beg and @p end are unchanged + -1 on error + + Looks up the length of @p tid, and then adjusts the values of @p beg + and @p end if they fall outside the boundaries of the sequence. + + If @p beg > @p end, it will be set to @p end. + + The return value indicates which, if any, of the inputs have been + adjusted. -1 will be returned if @p tid is not a valid sequence index. 
+*/ +HTSLIB_EXPORT +int fai_adjust_region(const faidx_t *fai, int tid, + hts_pos_t *beg, hts_pos_t *end); + /// Sets the cache size of the underlying BGZF compressed file /** @param fai Pointer to the faidx_t struct * @param cache_size Selected cache size in bytes @@ -312,6 +364,15 @@ const char *fai_parse_region(const faidx_t *fai, const char *s, HTSLIB_EXPORT void fai_set_cache_size(faidx_t *fai, int cache_size); +/// Adds a thread pool to the underlying BGZF layer. +/** @param fai FAI file handler + * @param pool The thread pool (see hts_create_threads) + * @param qsize The size of the job queue. If 0 this is twice the + * number of threads in the pool. + */ +HTSLIB_EXPORT +int fai_thread_pool(faidx_t *fai, struct hts_tpool *pool, int qsize); + /// Determines the path to the reference index file /** @param fa String with the path to the reference file * @return String with the path to the reference index file, or NULL on failure diff --git a/htslib/hfile.h b/htslib/hfile.h index 08d3edf7e..bc86757a2 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -1,7 +1,7 @@ /// @file htslib/hfile.h /// Buffered low-level input/output streams. /* - Copyright (C) 2013-2019 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: John Marshall @@ -32,6 +32,13 @@ DEALINGS IN THE SOFTWARE. */ #include "hts_defs.h" +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header. +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + #ifdef __cplusplus extern "C" { #endif @@ -73,6 +80,10 @@ Note that the file must be opened in binary mode, or else there will be problems on platforms that make a difference between text and binary mode. +By default, the returned hFILE "takes ownership" of the file descriptor +and _fd_ will be closed by hclose(). 
When _mode_ contains `S` (shared fd), +hclose() will destroy the hFILE but not close the underlying _fd_. + For socket descriptors (on Windows), _mode_ should contain `s`. */ HTSLIB_EXPORT @@ -151,6 +162,7 @@ static inline off_t htell(hFILE *fp) */ static inline int hgetc(hFILE *fp) { + HTSLIB_EXPORT extern int hgetc2(hFILE *); return (fp->end > fp->begin)? (unsigned char) *(fp->begin++) : hgetc2(fp); } @@ -222,6 +234,7 @@ or I/O errors. static inline ssize_t HTS_RESULT_USED hread(hFILE *fp, void *buffer, size_t nbytes) { + HTSLIB_EXPORT extern ssize_t hread2(hFILE *, void *, size_t, size_t); size_t n = fp->end - fp->begin; @@ -236,6 +249,7 @@ hread(hFILE *fp, void *buffer, size_t nbytes) */ static inline int hputc(int c, hFILE *fp) { + HTSLIB_EXPORT extern int hputc2(int, hFILE *); if (fp->begin < fp->limit) *(fp->begin++) = c; else c = hputc2(c, fp); @@ -247,6 +261,7 @@ static inline int hputc(int c, hFILE *fp) */ static inline int hputs(const char *text, hFILE *fp) { + HTSLIB_EXPORT extern int hputs2(const char *, size_t, size_t, hFILE *); size_t nbytes = strlen(text), n = fp->limit - fp->begin; @@ -264,7 +279,9 @@ In the absence of I/O errors, the full _nbytes_ will be written. static inline ssize_t HTS_RESULT_USED hwrite(hFILE *fp, const void *buffer, size_t nbytes) { + HTSLIB_EXPORT extern ssize_t hwrite2(hFILE *, const void *, size_t, size_t); + HTSLIB_EXPORT extern int hfile_set_blksize(hFILE *fp, size_t bufsiz); if (!fp->mobile) { @@ -316,8 +333,61 @@ purpose other than closing. HTSLIB_EXPORT char *hfile_mem_steal_buffer(hFILE *file, size_t *length); +/// Fills out sc_list[] with the list of known URL schemes. +/** + * @param plugin [in] Restricts schemes to only those from 'plugin. + * @param sc_list [out] Filled out with the scheme names + * @param nschemes [in/out] Size of sc_list (in) and number returned (out) + * + * Plugin may be passed in as NULL in which case all schemes are returned. + * Use plugin "built-in" to list the built in schemes. 
+ * The size of sc_list is determined by the input value of *nschemes. + * This is updated to return the output size. It is up to the caller to + * determine whether to call again with a larger number if this is too small. + * + * The return value represents the total number found matching plugin, which + * may be larger than *nschemes if too small a value was specified. + * + * @return the number of schemes found on success. + * -1 on failure + */ +HTSLIB_EXPORT +int hfile_list_schemes(const char *plugin, const char *sc_list[], int *nschemes); + +/// Fills out plist[] with the list of known hFILE plugins. +/* + * @param plist [out] Filled out with the plugin names + * @param nplugins [in/out] Size of plist (in) and number returned (out) + * + * The size of plist is determined by the input value of *nplugins. + * This is updated to return the output size. It is up to the caller to + * determine whether to call again with a larger number if this is too small. + * + * The return value represents the total number found, which may be + * larger than *nplugins if too small a value was specified. + * + * @return the number of plugins found on success. + * -1 on failure + */ +HTSLIB_EXPORT +int hfile_list_plugins(const char *plist[], int *nplugins); + +/// Tests for the presence of a specific hFILE plugin. +/* + * @param name The name of the plugin to query. + * + * @return 1 if found, 0 otherwise. + */ +HTSLIB_EXPORT +int hfile_has_plugin(const char *name); + #ifdef __cplusplus } #endif +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif diff --git a/htslib/hts.h b/htslib/hts.h index 373062f81..37dae5d96 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -1,7 +1,7 @@ /// @file htslib/hts.h /// Format-neutral I/O, indexing, and iterator API functions. /* - Copyright (C) 2012-2020 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. Copyright (C) 2010, 2012 Broad Institute. 
Portions copyright (C) 2003-2006, 2008-2010 by Heng Li @@ -205,11 +205,13 @@ enum htsExactFormat { empty_format, // File is empty (or empty after decompression) fasta_format, fastq_format, fai_format, fqi_format, hts_crypt4gh_format, + d4_format, format_maximum = 32767 }; enum htsCompression { - no_compression, gzip, bgzf, custom, bzip2_compression, + no_compression, gzip, bgzf, custom, bzip2_compression, razf_compression, + xz_compression, zstd_compression, compression_maximum = 32767 }; @@ -224,6 +226,7 @@ typedef struct htsFormat { struct hts_idx_t; typedef struct hts_idx_t hts_idx_t; +struct hts_filter_t; /** * @brief File handle returned by hts_open() etc. @@ -256,6 +259,7 @@ typedef struct htsFile { hts_idx_t *idx; const char *fnidx; struct sam_hdr_t *bam_header; + struct hts_filter_t *filter; } htsFile; // A combined thread pool and queue allocation size. @@ -314,6 +318,10 @@ enum hts_fmt_option { CRAM_OPT_STORE_MD, CRAM_OPT_STORE_NM, CRAM_OPT_RANGE_NOSEEK, // CRAM_OPT_RANGE minus the seek + CRAM_OPT_USE_TOK, + CRAM_OPT_USE_FQZ, + CRAM_OPT_USE_ARITH, + CRAM_OPT_POS_DELTA, // force delta for AP, even on non-pos sorted data // General purpose HTS_OPT_COMPRESSION_LEVEL = 100, @@ -321,6 +329,52 @@ enum hts_fmt_option { HTS_OPT_THREAD_POOL, HTS_OPT_CACHE_SIZE, HTS_OPT_BLOCK_SIZE, + HTS_OPT_FILTER, + HTS_OPT_PROFILE, + + // Fastq + + // Boolean. + // Read / Write CASAVA 1.8 format. + // See https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl2fastq/bcl2fastq_letterbooklet_15038058brpmi.pdf + // + // The CASAVA tag matches \d:[YN]:\d+:[ACGTN]+ + // The first \d is read 1/2 (1 or 2), [YN] is QC-PASS/FAIL flag, + // \d+ is a control number, and the sequence at the end is + // for barcode sequence. Barcodes are read into the aux tag defined + // by FASTQ_OPT_BARCODE ("BC" by default). + FASTQ_OPT_CASAVA = 1000, + + // String. 
+ // Whether to read / write extra SAM format aux tags from the fastq + // identifier line. For reading this can simply be "1" to request + // decoding aux tags. For writing it is a comma separated list of aux + // tag types to be written out. + FASTQ_OPT_AUX, + + // Boolean. + // Whether to add /1 and /2 to read identifiers when writing FASTQ. + // These come from the BAM_FREAD1 or BAM_FREAD2 flags. + // (Detecting the /1 and /2 is automatic when reading fastq.) + FASTQ_OPT_RNUM, + + // Two character string. + // Barcode aux tag for CASAVA; defaults to "BC". + FASTQ_OPT_BARCODE, + + // Process SRA and ENA read names which pointlessly move the original + // name to the second field and insert a constructed <run>.<number> + // name in its place. + FASTQ_OPT_NAME2, +}; + +// Profile options for encoding; primarily used at present in CRAM +// but also usable in BAM as a synonym for deflate compression levels. +enum hts_profile_option { + HTS_PROFILE_FAST, + HTS_PROFILE_NORMAL, + HTS_PROFILE_SMALL, + HTS_PROFILE_ARCHIVE, }; // For backwards compatibility @@ -402,16 +456,19 @@ The input character may be either an IUPAC ambiguity code, '=' for 0, or '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 for A/C/G/T or combinations of these bits for ambiguous bases. */ +HTSLIB_EXPORT extern const unsigned char seq_nt16_table[256]; /*! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC ambiguity code letter (or '=' when given 0). */ +HTSLIB_EXPORT extern const char seq_nt16_str[]; /*! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits. Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous). */ +HTSLIB_EXPORT extern const int seq_nt16_int[]; /*! @@ -432,17 +489,77 @@ const char *hts_version(void); // Immediately after release, bump ZZ to 90 to distinguish in-development // Git repository builds from the release; you may wish to increment this // further when significant features are merged. 
-#define HTS_VERSION 101090 +#define HTS_VERSION 101890 + +/*! @abstract Introspection on the features enabled in htslib + * + * @return a bitfield of HTS_FEATURE_* macros. + */ +HTSLIB_EXPORT +unsigned int hts_features(void); + +HTSLIB_EXPORT +const char *hts_test_feature(unsigned int id); + +/*! @abstract Introspection on the features enabled in htslib, string form + * + * @return a string describing htslib build features + */ +HTSLIB_EXPORT +const char *hts_feature_string(void); + +// Whether ./configure was used or vanilla Makefile +#define HTS_FEATURE_CONFIGURE 1 + +// Whether --enable-plugins was used +#define HTS_FEATURE_PLUGINS 2 + +// Transport specific +#define HTS_FEATURE_LIBCURL (1u<<10) +#define HTS_FEATURE_S3 (1u<<11) +#define HTS_FEATURE_GCS (1u<<12) + +// Compression options +#define HTS_FEATURE_LIBDEFLATE (1u<<20) +#define HTS_FEATURE_LZMA (1u<<21) +#define HTS_FEATURE_BZIP2 (1u<<22) +#define HTS_FEATURE_HTSCODECS (1u<<23) // htscodecs library version + +// Build params +#define HTS_FEATURE_CC (1u<<27) +#define HTS_FEATURE_CFLAGS (1u<<28) +#define HTS_FEATURE_CPPFLAGS (1u<<29) +#define HTS_FEATURE_LDFLAGS (1u<<30) + /*! @abstract Determine format by peeking at the start of a file @param fp File opened for reading, positioned at the beginning @param fmt Format structure that will be filled out on return @return 0 for success, or negative if an error occurred. + + Equivalent to hts_detect_format2(fp, NULL, fmt). */ HTSLIB_EXPORT int hts_detect_format(struct hFILE *fp, htsFormat *fmt); +/*! + @abstract Determine format primarily by peeking at the start of a file + @param fp File opened for reading, positioned at the beginning + @param fname Name of the file, or NULL if not available + @param fmt Format structure that will be filled out on return + @return 0 for success, or negative if an error occurred. 
+ @since 1.15 + +Some formats are only recognised if the filename is available and has the +expected extension, as otherwise more generic files may be misrecognised. +In particular: + - FASTA/Q indexes must have .fai/.fqi extensions; without this requirement, + some similar BED files would be misrecognised as indexes. +*/ +HTSLIB_EXPORT +int hts_detect_format2(struct hFILE *fp, const char *fname, htsFormat *fmt); + /*! @abstract Get a human-readable description of the file format @param fmt Format structure holding type, version, compression, etc. @@ -457,7 +574,7 @@ char *hts_format_description(const htsFormat *format); @param fn The file name or "-" for stdin/stdout. For indexed files with a non-standard naming, the file name can include the name of the index file delimited with HTS_IDX_DELIM - @param mode Mode matching / [rwa][bceguxz0-9]* / + @param mode Mode matching / [rwa][bcefFguxz0-9]* / @discussion With 'r' opens for reading; any further format mode letters are ignored as the format is detected by checking the first few bytes or BGZF blocks @@ -465,6 +582,8 @@ char *hts_format_description(const htsFormat *format); specifier letters: b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc) c CRAM format + f FASTQ format + F FASTA format g gzip compressed u uncompressed z bgzf compressed @@ -509,6 +628,15 @@ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) HTSLIB_EXPORT htsFile *hts_hopen(struct hFILE *fp, const char *fn, const char *mode); +/*! + @abstract For output streams, flush any buffered data + @param fp The file handle to be flushed + @return 0 for success, or negative if an error occurred. + @since 1.14 +*/ +HTSLIB_EXPORT +int hts_flush(htsFile *fp); + /*! 
@abstract Close a file handle, flushing buffered data for output streams @param fp The file handle to be closed @@ -548,7 +676,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...); @param fp The file handle @param delimiter Unused, but must be '\n' (or KS_SEP_LINE) @param str The line (not including the terminator) is written here - @return Length of the string read; + @return Length of the string read (capped at INT_MAX); -1 on end-of-file; <= -2 on error */ HTSLIB_EXPORT @@ -607,6 +735,15 @@ HTSLIB_EXPORT int hts_set_fai_filename(htsFile *fp, const char *fn_aux); +/*! + @abstract Sets a filter expression + @return 0 for success, negative on failure + @discussion + To clear an existing filter, specify expr as NULL. +*/ +HTSLIB_EXPORT +int hts_set_filter_expression(htsFile *fp, const char *expr); + /*! @abstract Determine whether a given htsFile contains a valid EOF block @return 3 for a non-EOF checkable filetype; @@ -746,8 +883,10 @@ typedef struct hts_itr_t { typedef hts_itr_t hts_itr_multi_t; - #define hts_bin_first(l) (((1<<(((l)<<1) + (l))) - 1) / 7) - #define hts_bin_parent(l) (((l) - 1) >> 3) +/// Compute the first bin on a given level +#define hts_bin_first(l) (((1<<(((l)<<1) + (l))) - 1) / 7) +/// Compute the parent bin of a given bin +#define hts_bin_parent(b) (((b) - 1) >> 3) /////////////////////////////////////////////////////////// // Low-level API for building indexes. 
@@ -909,6 +1048,8 @@ hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags); /////////////////////////////////////////////////////////// // Functions for accessing meta-data stored in indexes +typedef const char *(*hts_id2name_f)(void*, int); + /// Get extra index meta-data /** @param idx The index @param l_meta Pointer to where the length of the extra data is stored @@ -965,6 +1106,26 @@ int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* HTSLIB_EXPORT uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx); +/// Return a list of target names from an index +/** @param idx Index + @param[out] n Location to store the number of targets + @param getid Callback function to get the name for a target ID + @param hdr Header from indexed file + @return An array of pointers to the names on success; NULL on failure + + @note The names are pointers into the header data structure. When cleaning + up, only the array should be freed, not the names. + */ +HTSLIB_EXPORT +const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values + +/// Return the number of targets from an index +/** @param idx Index + @return The number of targets + */ +HTSLIB_EXPORT +int hts_idx_nseq(const hts_idx_t *idx); + /////////////////////////////////////////////////////////// // Region parsing @@ -979,16 +1140,31 @@ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx); @param strend If non-NULL, set on return to point to the first character in @a str after those forming the parsed number @param flags Or'ed-together combination of HTS_PARSE_* flags - @return Converted value of the parsed number. - - When @a strend is NULL, a warning will be printed (if hts_verbose is HTS_LOG_WARNING - or more) if there are any trailing characters after the number. 
+ @return Integer value of the parsed number, or 0 if no valid number + + The input string is parsed as: optional whitespace; an optional '+' or + '-' sign; decimal digits possibly including ',' characters (if @a flags + includes HTS_PARSE_THOUSANDS_SEP) and a '.' decimal point; and an optional + case-insensitive suffix, which may be either 'k', 'M', 'G', or scientific + notation consisting of 'e'/'E' followed by an optional '+' or '-' sign and + decimal digits. To be considered a valid numeric value, the main part (not + including any suffix or scientific notation) must contain at least one + digit (either before or after the decimal point). + + When @a strend is NULL, @a str is expected to contain only (optional + whitespace followed by) the numeric value. A warning will be printed + (if hts_verbose is HTS_LOG_WARNING or more) if no valid parsable number + is found or if there are any unused characters after the number. + + When @a strend is non-NULL, @a str starts with (optional whitespace + followed by) the numeric value. On return, @a strend is set to point + to the first unused character after the numeric value, or to @a str + if no valid parsable number is found. 
*/ HTSLIB_EXPORT long long hts_parse_decimal(const char *str, char **strend, int flags); typedef int (*hts_name2id_f)(void*, const char*); -typedef const char *(*hts_id2name_f)(void*, int); /// Parse a "CHR:START-END"-style region string /** @param str String to be parsed @@ -1138,19 +1314,6 @@ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f g HTSLIB_EXPORT int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) HTS_RESULT_USED; -/// Return a list of target names from an index -/** @param idx Index - @param[out] n Location to store the number of targets - @param getid Callback function to get the name for a target ID - @param hdr Header from indexed file - @return An array of pointers to the names on success; NULL on failure - - @note The names are pointers into the header data structure. When cleaning - up, only the array should be freed, not the names. - */ -HTSLIB_EXPORT -const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values - /********************************** * Iterator with multiple regions * **********************************/ @@ -1347,10 +1510,27 @@ static inline int hts_reg2bin(hts_pos_t beg, hts_pos_t end, int min_shift, int n return 0; } +/// Compute the level of a bin in a binning index +static inline int hts_bin_level(int bin) { + int l, b; + for (l = 0, b = bin; b; ++l, b = hts_bin_parent(b)); + return l; +} + +//! Compute the corresponding entry into the linear index of a given bin from +//! a binning index +/*! + * @param bin The bin number + * @param n_lvls The index depth (number of levels - 0 based) + * @return The integer offset into the linear index + * + * Explanation of the return value formula: + * Each bin on level l covers exp(2, (n_lvls - l)*3 + min_shift) base pairs. + * A linear index entry covers exp(2, min_shift) base pairs. 
+ */ static inline int hts_bin_bot(int bin, int n_lvls) { - int l, b; - for (l = 0, b = bin; b; ++l, b = hts_bin_parent(b)); // compute the level of bin + int l = hts_bin_level(bin); return (bin - hts_bin_first(l)) << (n_lvls - l) * 3; } diff --git a/htslib/hts_defs.h b/htslib/hts_defs.h index 8b24167e9..7719215c1 100644 --- a/htslib/hts_defs.h +++ b/htslib/hts_defs.h @@ -79,7 +79,7 @@ DEALINGS IN THE SOFTWARE. */ #define HTS_DEPRECATED(message) #endif -#if HTS_COMPILER_HAS(__deprecated__) || HTS_GCC_AT_LEAST(6,4) +#if (HTS_COMPILER_HAS(__deprecated__) || HTS_GCC_AT_LEAST(6,4)) && !defined(__ICC) #define HTS_DEPRECATED_ENUM(message) __attribute__ ((__deprecated__ (message))) #else #define HTS_DEPRECATED_ENUM(message) diff --git a/htslib/hts_endian.h b/htslib/hts_endian.h index 790d2d5c6..12effab7b 100644 --- a/htslib/hts_endian.h +++ b/htslib/hts_endian.h @@ -100,7 +100,7 @@ DEALINGS IN THE SOFTWARE. */ #endif #if HTS_ALLOW_UNALIGNED != 0 -# if defined (__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) +# if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) || defined(__clang__) // This prevents problems with gcc's vectoriser generating the wrong // instructions for unaligned data. typedef uint16_t uint16_u __attribute__ ((__aligned__ (1))); @@ -113,6 +113,14 @@ typedef uint64_t uint64_u; # endif #endif +/// Get a uint8_t value from an unsigned byte array +/** @param buf Pointer to source byte, may be unaligned + * @return An 8-bit unsigned integer + */ +static inline uint8_t le_to_u8(const uint8_t *buf) { + return *buf; +} + /// Get a uint16_t value from an unsigned byte array /** @param buf Pointer to source byte, may be unaligned * @return A 16 bit unsigned integer diff --git a/htslib/hts_expr.h b/htslib/hts_expr.h new file mode 100644 index 000000000..43da89d6a --- /dev/null +++ b/htslib/hts_expr.h @@ -0,0 +1,152 @@ +/* expr.c -- filter expression parsing and processing. 
+ + Copyright (C) 2020, 2022 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef HTS_EXPR_H +#define HTS_EXPR_H + +#include <math.h> +#include "kstring.h" +#include "hts_defs.h" + +/// Holds a filter variable. This is also used to return the results. +/** + * The expression language has 3-states of string, numeric, and unknown. + * The unknown state is either a NaN numeric or a null string, with both + * internally considered to have the same "unknown" meaning. + * + * These largely match the IEEE 754 semantics for NaN comparisons: <, >, ==, + * != all fail, (even NaN == NaN). Similarly arithmetic (+,-,/,*,%) with + * unknown values are still unknown (and false). + * + * The departure from NaN semantics though is that our unknown/null state is + * considered to be false while NaN in C is true. 
Similarly the false nature + * of our unknown state means !val becomes true, !!val is once again false, + * val && 1 is false, val || 0 is false, and val || 1 is true along with + * !val || 0 and !val && 1. + * + * Note it is possible for empty strings and zero numbers to also be true. + * An example of this is the aux string '[NM]' which returns true if the + * NM tag is found, regardless of whether it is also zero. However the + * better approach added in 1.16 is 'exists([NM])'. + */ +typedef struct hts_expr_val_t { + char is_str; // Use .s vs .d + char is_true; // Force true if even zero + kstring_t s; // is_str and empty s permitted (eval as false) + double d; // otherwise this +} hts_expr_val_t; + +/// Returns true if an hts_expr_val_t is defined. +/* An example usage of this is in the SAM expression filter where an + * [X0] aux tag will be the value of X0 (string or numeric) if set, or + * a false nul-string (not the same as an empty one) when not set. + */ +static inline int hts_expr_val_exists(hts_expr_val_t *v) { + return v && !(v->is_str == 1 && v->s.s == NULL) + && !(v->is_str == 0 && isnan(v->d)); +} + +/// Returns true if an hts_expr_val_t is defined or is undef-but-true +static inline int hts_expr_val_existsT(hts_expr_val_t *v) { + return (v && v->is_true) || hts_expr_val_exists(v); +} + +/// Set a value to be undefined (nan). +static inline void hts_expr_val_undef(hts_expr_val_t *v) { + ks_clear(&v->s); + v->is_true = 0; + v->is_str = 0; + v->d = NAN; +} + +/// Frees a hts_expr_val_t type. +static inline void hts_expr_val_free(hts_expr_val_t *f) { + ks_free(&f->s); +} + +/// Opaque hts_filter_t type. Definition in hts_expr.c +typedef struct hts_filter_t hts_filter_t; + +/// For static initialisation of hts_expr_val_t values +#define HTS_EXPR_VAL_INIT {0, 0, KS_INITIALIZE, 0} + +/// Creates a filter for expression "str". 
+/** @param str The filter expression + * @return A pointer on success, NULL on failure + */ +HTSLIB_EXPORT +hts_filter_t *hts_filter_init(const char *str); + +/// Frees an hts_filter_t created via hts_filter_init +/** @param filt The filter pointer. + */ +HTSLIB_EXPORT +void hts_filter_free(hts_filter_t *filt); + +/// Type for expression symbol lookups; name -> value. +typedef int (hts_expr_sym_func)(void *data, char *str, char **end, + hts_expr_val_t *res); + +/// Evaluates a filter expression and returns the value +/** @param filt The filter, produced by hts_filter_init + * @param data Arbitrary caller data, passed into sym_func + * @param sym_func Callback function to lookup variables. + * @param res Filled out with the result of the filter evaluation + * @return Returns 0 on success, -1 on failure + * + * sym_func and data may be NULL if the caller does not need its own data + * pointer or if it has no variables to lookup. + * + * The type of the returned result may be numeric or string, as defined by + * the is_str member. It can also be explicitly defined to be true even + * for a null value. This may be used to check for the existence of + * something, irrespective of whether that something evaluates to zero. + * + * @p res must be initialized using HTS_EXPR_VAL_INIT before passing it + * to this function for the first time. + */ +HTSLIB_EXPORT +int hts_filter_eval2(hts_filter_t *filt, + void *data, hts_expr_sym_func *sym_func, + hts_expr_val_t *res); + +/// Evaluate a filter expression (deprecated API) +/** + * @copydetails hts_filter_eval2() + * + * If calling this function more than once with the same @p res + * parameter, hts_expr_val_free(res) must be used between invocations + * to clear any allocated memory prior to reuse. + * + * @deprecated This function has been replaced by hts_filter_eval2(), + * which clears @p res properly itself. 
+ */ +HTSLIB_EXPORT +int hts_filter_eval(hts_filter_t *filt, + void *data, hts_expr_sym_func *sym_func, + hts_expr_val_t *res) + HTS_DEPRECATED("Please use hts_filter_eval2 instead"); + + +#endif /* HTS_EXPR_H */ diff --git a/htslib/hts_log.h b/htslib/hts_log.h index b2336a4df..f6a50b333 100644 --- a/htslib/hts_log.h +++ b/htslib/hts_log.h @@ -58,6 +58,7 @@ enum htsLogLevel hts_get_log_level(void); * One of the HTS_LOG_* values. The default is HTS_LOG_WARNING. * \note Avoid direct use of this variable. Use hts_set_log_level and hts_get_log_level instead. */ +HTSLIB_EXPORT extern int hts_verbose; /*! Logs an event. diff --git a/htslib/hts_os.h b/htslib/hts_os.h index 2f988c3c9..c715b0612 100644 --- a/htslib/hts_os.h +++ b/htslib/hts_os.h @@ -1,7 +1,7 @@ /// @file hts_os.h /// Operating System specific tweaks, for compatibility with POSIX. /* - Copyright (C) 2017, 2019 Genome Research Ltd. + Copyright (C) 2017, 2019-2020 Genome Research Ltd. Author: James Bonfield @@ -77,4 +77,10 @@ extern int is_cygpty(int fd); #define random rand #endif +/* MSVC does not provide ssize_t in its <sys/types.h>. This ensures the type + is available (unless suppressed by defining HTS_NO_SSIZE_T first). */ +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined HTS_NO_SSIZE_T && !defined ssize_t +#define ssize_t intptr_t #endif + +#endif // HTSLIB_HTS_OS_H diff --git a/htslib/khash.h b/htslib/khash.h index 7cd0c9b62..4cea91020 100644 --- a/htslib/khash.h +++ b/htslib/khash.h @@ -447,7 +447,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) key ^= (key >> 16); return key; } -#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) +#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)(key)) /* --- END OF HASH FUNCTIONS --- */ diff --git a/htslib/knetfile.h b/htslib/knetfile.h index 87fba4adc..0f2adec83 100644 --- a/htslib/knetfile.h +++ b/htslib/knetfile.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, 2012, 2014 Genome Research Ltd (GRL). 
+ Copyright (c) 2008, 2012, 2014, 2021-2022 Genome Research Ltd (GRL). 2010 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining @@ -44,25 +44,33 @@ #define netclose(fd) closesocket(fd) #endif +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header. +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + // FIXME: currently I/O is unbuffered #define KNF_TYPE_LOCAL 1 #define KNF_TYPE_FTP 2 #define KNF_TYPE_HTTP 3 +// Kept for API/ABI compatibility only. Do not use directly! typedef struct knetFile_s { - int type, fd; - int64_t offset; - char *host, *port; - - // the following are for FTP only - int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; - char *response, *retr, *size_cmd; - int64_t seek_offset; // for lazy seek - int64_t file_size; - - // the following are for HTTP only - char *path, *http_host; + int type, fd; + int64_t offset; + char *host, *port; + + // the following are for FTP only + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; // for lazy seek + int64_t file_size; + + // the following are for HTTP only + char *path, *http_host; } knetFile; #define knet_tell(fp) ((fp)->offset) @@ -72,38 +80,38 @@ typedef struct knetFile_s { extern "C" { #endif -#ifdef _WIN32 - int knet_win32_init(); - void knet_win32_destroy(); -#endif - HTSLIB_EXPORT - knetFile *knet_open(const char *fn, const char *mode); + knetFile *knet_open(const char *fn, const char *mode) HTS_DEPRECATED("Please use hopen instead"); - /* - This only works with local files. + /* + This only works with local files. 
+ */ HTSLIB_EXPORT - knetFile *knet_dopen(int fd, const char *mode); + knetFile *knet_dopen(int fd, const char *mode) HTS_DEPRECATED("Please use hdopen instead"); - /* - If ->is_ready==0, this routine updates ->fd; otherwise, it simply - reads from ->fd. - */ + /* + If ->is_ready==0, this routine updates ->fd; otherwise, it simply + reads from ->fd. + */ HTSLIB_EXPORT - ssize_t knet_read(knetFile *fp, void *buf, size_t len); + ssize_t knet_read(knetFile *fp, void *buf, size_t len) HTS_DEPRECATED("Please use hread instead"); - /* - This routine only sets ->offset and ->is_ready=0. It does not - communicate with the FTP server. - */ + /* + This routine only sets ->offset and ->is_ready=0. It does not + communicate with the FTP server. + */ HTSLIB_EXPORT - off_t knet_seek(knetFile *fp, off_t off, int whence); + off_t knet_seek(knetFile *fp, off_t off, int whence) HTS_DEPRECATED("Please use hseek instead"); HTSLIB_EXPORT - int knet_close(knetFile *fp); + int knet_close(knetFile *fp) HTS_DEPRECATED("Please use hclose instead"); #ifdef __cplusplus } #endif +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif diff --git a/htslib/kseq.h b/htslib/kseq.h index ee13dd0d8..ea887f14c 100644 --- a/htslib/kseq.h +++ b/htslib/kseq.h @@ -1,7 +1,7 @@ /* The MIT License Copyright (c) 2008, 2009, 2011 Attractive Chaos - Copyright (C) 2013, 2018, 2020 Genome Research Ltd. + Copyright (C) 2013, 2018, 2020, 2023 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -24,8 +24,6 @@ SOFTWARE. 
*/ -/* Last Modified: 05MAR2012 */ - #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -57,6 +55,7 @@ unsigned char *buf; \ } kstream_t; +#define ks_err(ks) ((ks)->end == -1) #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) @@ -78,11 +77,13 @@ #define __KS_INLINED(__read) \ static inline klib_unused int ks_getc(kstream_t *ks) \ { \ + if (ks_err(ks)) return -3; \ if (ks->is_eof && ks->begin >= ks->end) return -1; \ if (ks->begin >= ks->end) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, ks->bufsize); \ if (ks->end == 0) { ks->is_eof = 1; return -1; } \ + if (ks->end == -1) { ks->is_eof = 1; return -3; } \ } \ ks->seek_pos++; \ return (int)ks->buf[ks->begin++]; \ @@ -99,11 +100,13 @@ uint64_t seek_pos = str->l; \ for (;;) { \ int i; \ + if (ks_err(ks)) return -3; \ if (ks->begin >= ks->end) { \ if (!ks->is_eof) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, ks->bufsize); \ if (ks->end == 0) { ks->is_eof = 1; break; } \ + if (ks->end == -1) { ks->is_eof = 1; return -3; } \ } else break; \ } \ if (delimiter == KS_SEP_LINE) { \ @@ -180,25 +183,27 @@ >=0 length of the sequence (normal) -1 end-of-file -2 truncated quality string + -3 error reading stream + -4 overflow error */ #define __KSEQ_READ(SCOPE) \ SCOPE int kseq_read(kseq_t *seq) \ { \ - int c; \ + int c,r; \ kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \ + if (c < 0) return c; /* end of file or error */ \ seq->last_char = c; \ } /* else: the first header char has been read in the previous call */ \ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal 
exit: EOF or error */ \ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ seq->seq.m = 256; \ seq->seq.s = (char*)malloc(seq->seq.m); \ } \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \ if (c == '\n') continue; /* skip empty lines */ \ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ @@ -207,6 +212,7 @@ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ seq->seq.m = seq->seq.l + 2; \ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + if (seq->seq.l + 1 >= seq->seq.m) return -4; /* error: adjusting m overflowed */ \ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ } \ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ @@ -215,9 +221,10 @@ seq->qual.m = seq->seq.m; \ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ } \ - while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \ if (c == -1) return -2; /* error: no quality string */ \ - while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ + while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1)) >= 0 && seq->qual.l < seq->seq.l); \ + if (c == -3) return -3; /* stream error */ \ seq->last_char = 0; /* we have not come to the next header line */ \ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ return seq->seq.l; \ diff --git a/htslib/ksort.h b/htslib/ksort.h index 755010951..7857d4c77 100644 --- a/htslib/ksort.h +++ b/htslib/ksort.h @@ -64,6 +64,7 @@ #include #include +#include "hts_defs.h" #ifndef 
klib_unused #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) @@ -81,6 +82,7 @@ extern "C" { // problems on Windows. Don't include htslib/hts_os.h for this as it // may not get on with older attempts to fix this in code that includes // this file. +HTSLIB_EXPORT extern double hts_drand48(void); typedef struct { @@ -88,7 +90,7 @@ typedef struct { int depth; } ks_isort_stack_t; -#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } +#define KSORT_SWAP(type_t, a, b) { type_t t=(a); (a)=(b); (b)=t; } #define KSORT_INIT(name, type_t, __sort_lt) KSORT_INIT_(_ ## name, , type_t, __sort_lt) #define KSORT_INIT_STATIC(name, type_t, __sort_lt) KSORT_INIT_(_ ## name, static klib_unused, type_t, __sort_lt) diff --git a/htslib/kstring.h b/htslib/kstring.h index ed9bdeeaa..53a19806d 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -1,7 +1,7 @@ /* The MIT License Copyright (C) 2011 by Attractive Chaos - Copyright (C) 2013-2014, 2016, 2018-2020 Genome Research Ltd. + Copyright (C) 2013-2014, 2016, 2018-2020, 2022 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -55,6 +55,13 @@ #endif #endif +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header. +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + /* kstring_t is a simple non-opaque type whose fields are likely to be * used directly by user code (but see also ks_str() and ks_len() below). * A kstring_t object is initialised by either of @@ -109,18 +116,21 @@ extern "C" { HTSLIB_EXPORT char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); - /* kgetline() uses the supplied fgets()-like function to read a "\n"- - * or "\r\n"-terminated line from fp. 
The line read is appended to the - * kstring without its terminator and 0 is returned; EOF is returned at - * EOF or on error (determined by querying fp, as per fgets()). */ - typedef char *kgets_func(char *, int, void *); + /* kgetline() uses the supplied fgets()-like function to read a "\n"- + * or "\r\n"-terminated line from fp. The line read is appended to the + * kstring without its terminator and 0 is returned; EOF is returned at + * EOF or on error (determined by querying fp, as per fgets()). */ + typedef char *kgets_func(char *, int, void *); HTSLIB_EXPORT - int kgetline(kstring_t *s, kgets_func *fgets, void *fp); + int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp); - // This matches the signature of hgetln(), apart from the last pointer - typedef ssize_t kgets_func2(char *, size_t, void *); + /* kgetline2() uses the supplied hgetln()-like function to read a "\n"- + * or "\r\n"-terminated line from fp. The line read is appended to the + * kstring without its terminator and 0 is returned; EOF is returned at + * EOF or on error (determined by querying fp, as per fgets()). */ + typedef ssize_t kgets_func2(char *, size_t, void *); HTSLIB_EXPORT - int kgetline2(kstring_t *s, kgets_func2 *fgets, void *fp); + int kgetline2(kstring_t *s, kgets_func2 *fgets_fn, void *fp); #ifdef __cplusplus } @@ -393,4 +403,9 @@ static inline int *ksplit(kstring_t *s, int delimiter, int *n) return offsets; } +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif diff --git a/htslib/sam.h b/htslib/sam.h index f3b684291..512db5420 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1,7 +1,7 @@ /// @file htslib/sam.h /// High-level SAM/BAM/CRAM sequence file operations. /* - Copyright (C) 2008, 2009, 2013-2020 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013-2023 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. Author: Heng Li @@ -27,10 +27,19 @@ DEALINGS IN THE SOFTWARE.
*/ #ifndef HTSLIB_SAM_H #define HTSLIB_SAM_H +#include #include +#include #include "hts.h" #include "hts_endian.h" +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header. +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + #ifdef __cplusplus extern "C" { #endif @@ -109,6 +118,7 @@ typedef sam_hdr_t bam_hdr_t; Result is operator code or -1. Be sure to cast the index if it is a plain char: int op = bam_cigar_table[(unsigned char) ch]; */ +HTSLIB_EXPORT extern const int8_t bam_cigar_table[256]; #define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK) @@ -179,7 +189,7 @@ extern const int8_t bam_cigar_table[256]; * Mate position and insert size also need to be 64-bit, but * we won't accept more than 32-bit for tid. * - * The bam_core_t structure is the *in memory* layout and not + * The bam1_core_t structure is the *in memory* layout and not * the same as the on-disk format. 64-bit changes here permit * SAM to work with very long chromosomes and permit BAM and CRAM * to seamlessly update in the future without further API/ABI @@ -237,7 +247,7 @@ typedef struct bam1_core_t { See the bam_cigar_* macros for manipulation. 4. seq is nibble-encoded according to bam_nt16_table. See the bam_seqi macro for retrieving individual bases. - 5. Per base qualilties are stored in the Phred scale with no +33 offset. + 5. Per base qualities are stored in the Phred scale with no +33 offset. Ie as per the BAM specification and not the SAM ASCII printable method. */ typedef struct bam1_t { @@ -475,7 +485,7 @@ int sam_hdr_add_lines(sam_hdr_t *h, const char *lines, size_t len); /// Adds a single line to an existing header. /*! * Specify type and one or more key,value pairs, ending with the NULL key. - * Eg. sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL). + * Eg. 
sam_hdr_add_line(h, "SQ", "SN", "foo", "LN", "100", NULL). * * @param type Type of the added line. Eg. "SQ" * @return 0 on success, -1 on failure @@ -902,7 +912,7 @@ void bam_destroy1(bam1_t *b); if (!recs || !buffer) goto cleanup; for (nrecs = 0; nrecs < MAX_RECS; nrecs++) { - bam_set_mempolicy(BAM_USER_OWNS_STRUCT|BAM_USER_OWNS_DATA); + bam_set_mempolicy(&recs[nrecs], BAM_USER_OWNS_STRUCT|BAM_USER_OWNS_DATA); // Set data pointer to unused part of buffer recs[nrecs].data = &buffer[buff_used]; @@ -935,10 +945,12 @@ void bam_destroy1(bam1_t *b); // ... use data ... cleanup: - for (size_t i = 0; i < nrecs; i++) - bam_destroy1(i); + if (recs) { + for (size_t i = 0; i < nrecs; i++) + bam_destroy1(&recs[i]); + free(recs); + } free(buffer); - free(recs); \endcode */ @@ -1002,6 +1014,37 @@ bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) HTS_RESULT_USED; HTSLIB_EXPORT bam1_t *bam_dup1(const bam1_t *bsrc); +/// Sets all components of an alignment structure +/** + @param bam Target alignment structure. Must be initialized by a call to bam_init1(). + The data field will be reallocated automatically as needed. + @param l_qname Length of the query name. If set to 0, the placeholder query name "*" will be used. + @param qname Query name, may be NULL if l_qname = 0 + @param flag Bitwise flag, a combination of the BAM_F* constants. + @param tid Chromosome ID, defined by sam_hdr_t (a.k.a. RNAME). + @param pos 0-based leftmost coordinate. + @param mapq Mapping quality. + @param n_cigar Number of CIGAR operations. + @param cigar CIGAR data, may be NULL if n_cigar = 0. + @param mtid Chromosome ID of next read in template, defined by sam_hdr_t (a.k.a. RNEXT). + @param mpos 0-based leftmost coordinate of next read in template (a.k.a. PNEXT). + @param isize Observed template length ("insert size") (a.k.a. TLEN). + @param l_seq Length of the query sequence (read) and sequence quality string. + @param seq Sequence, may be NULL if l_seq = 0. + @param qual Sequence quality, may be NULL. 
+ @param l_aux Length to be reserved for auxiliary field data, may be 0. + + @return >= 0 on success (number of bytes written to bam->data), negative (with errno set) on failure. +*/ +HTSLIB_EXPORT +int bam_set1(bam1_t *bam, + size_t l_qname, const char *qname, + uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq, + size_t n_cigar, const uint32_t *cigar, + int32_t mtid, hts_pos_t mpos, hts_pos_t isize, + size_t l_seq, const char *seq, const char *qual, + size_t l_aux); + /// Calculate query length from CIGAR data /** @param n_cigar Number of items in @p cigar @@ -1071,6 +1114,35 @@ char *bam_flag2str(int flag); /** The string must be freed by the user */ HTSLIB_EXPORT int bam_set_qname(bam1_t *b, const char *qname); +/*! @function + @abstract Parse a CIGAR string into a uint32_t array + @param in [in] pointer to the source string + @param end [out] address of the pointer to the new end of the input string + can be NULL + @param a_cigar [in/out] address of the destination uint32_t buffer + @param a_mem [in/out] address of the allocated number of buffer elements + @return number of processed CIGAR operators; -1 on error + */ +HTSLIB_EXPORT +ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem); + +/*! @function + @abstract Parse a CIGAR string into a bam1_t struct + @param in [in] pointer to the source string + @param end [out] address of the pointer to the new end of the input string + can be NULL + @param b [in/out] address of the destination bam1_t struct + @return number of processed CIGAR operators; -1 on error + + @discussion The BAM record may be partial and empty of existing cigar, seq + and quality, as is the case during SAM parsing, or it may be an existing + BAM record in which case this function replaces the existing CIGAR field + and shuffles data accordingly. A CIGAR of "*" will remove the CIGAR, + returning zero. 
+ */ +HTSLIB_EXPORT +ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b); + /************************* *** BAM/CRAM indexing *** *************************/ @@ -1080,7 +1152,7 @@ int bam_set_qname(bam1_t *b, const char *qname); #define bam_itr_destroy(iter) hts_itr_destroy(iter) #define bam_itr_queryi(idx, tid, beg, end) sam_itr_queryi(idx, tid, beg, end) #define bam_itr_querys(idx, hdr, region) sam_itr_querys(idx, hdr, region) -#define bam_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), 0) +#define bam_itr_next(htsfp, itr, r) sam_itr_next((htsfp), (itr), (r)) // Load/build .csi or .bai BAM index file. Does not work with CRAM. // It is recommended to use the sam_index_* functions below instead. @@ -1103,7 +1175,7 @@ int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx); /// Writes the index initialised with sam_idx_init to disk. /** @param fp File handle for the data file being written. - @return 0 on success, <0 on filaure. + @return 0 on success, <0 on failure. */ HTSLIB_EXPORT int sam_idx_save(htsFile *fp) HTS_RESULT_USED; @@ -1314,6 +1386,7 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, #define sam_open(fn, mode) (hts_open((fn), (mode))) #define sam_open_format(fn, mode, fmt) (hts_open_format((fn), (mode), (fmt))) + #define sam_flush(fp) hts_flush((fp)) #define sam_close(fp) hts_close(fp) HTSLIB_EXPORT @@ -1347,23 +1420,35 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, /** @param fp Pointer to the destination file * @param h Pointer to the header structure previously read * @param b Pointer to the record to be written - * @return >= 0 on successfully writing the record, -1 on error + * @return >= 0 on successfully writing the record, -ve on error */ HTSLIB_EXPORT int sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED; +// Forward declaration, see hts_expr.h for full. 
+struct hts_filter_t; + +/// sam_passes_filter - Checks whether a record passes an hts_filter. +/** @param h Pointer to the header structure previously read + * @param b Pointer to the BAM record to be checked + * @param filt Pointer to the filter, created from hts_filter_init. + * @return 1 if passes, 0 if not, and <0 on error. + */ +HTSLIB_EXPORT +int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, + struct hts_filter_t *filt); + /************************************* *** Manipulating auxiliary fields *** *************************************/ /// Converts a BAM aux tag to SAM format /* - * @param b Pointer to the bam record * @param key Two letter tag key * @param type Single letter type code: ACcSsIifHZB. * @param tag Tag data pointer, in BAM format * @param end Pointer to end of bam record (largest extent of tag) - * @param ks Kstring to write the formatted tag to + * @param ks kstring to write the formatted tag to * * @return pointer to end of tag on success, * NULL on failure. @@ -1422,7 +1507,8 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, ++s; } else if (type == 'f') { if (end - s >= 4) { - ksprintf(ks, "f:%g", le_to_float(s)); + // cast to avoid triggering -Wdouble-promotion + ksprintf(ks, "f:%g", (double)le_to_float(s)); s += 4; } else goto bad_aux; @@ -1439,12 +1525,13 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, r |= kputc_(type, ks) < 0; r |= kputc_(':', ks) < 0; while (s < end && *s) r |= kputc_(*s++, ks) < 0; + r |= kputsn("", 0, ks) < 0; //ensures NUL termination if (s >= end) goto bad_aux; ++s; } else if (type == 'B') { uint8_t sub_type = *(s++); - int sub_type_size; + unsigned sub_type_size; // or externalise sam.c's aux_type2size function? 
switch (sub_type) { @@ -1467,7 +1554,7 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, goto bad_aux; n = le_to_u32(s); s += 4; // now points to the start of the array - if ((end - s) / sub_type_size < n) + if ((size_t)(end - s) / sub_type_size < n) goto bad_aux; r |= kputsn_("B:", 2, ks) < 0; r |= kputc(sub_type, ks) < 0; // write the type @@ -1524,7 +1611,8 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, if (ks_expand(ks, n*8) < 0) goto mem_err; for (i = 0; i < n; ++i) { ks->s[ks->l++] = ','; - r |= kputd(le_to_float(s), ks) < 0; + // cast to avoid triggering -Wdouble-promotion + r |= kputd((double)le_to_float(s), ks) < 0; s += 4; } break; @@ -1546,6 +1634,29 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, return NULL; } +/// Return a pointer to a BAM record's first aux field +/** @param b Pointer to the BAM record + @return Aux field pointer, or NULL if the record has none + +When NULL is returned, errno will also be set to ENOENT. ("Aux field pointers" +point to the TYPE byte within the auxiliary data for that field; but in general +it is unnecessary for user code to be aware of this.) + */ +HTSLIB_EXPORT +uint8_t *bam_aux_first(const bam1_t *b); + +/// Return a pointer to a BAM record's next aux field +/** @param b Pointer to the BAM record + @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get() + @return Pointer to the next aux field, or NULL if no next field or error + +Whenever NULL is returned, errno will also be set: ENOENT if @p s was the +record's last aux field; otherwise EINVAL, indicating that the BAM record's +aux data is corrupt. 
+ */ +HTSLIB_EXPORT +uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s); + /// Return a pointer to an aux record /** @param b Pointer to the bam record @param tag Desired aux tag @@ -1558,6 +1669,19 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, HTSLIB_EXPORT uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); +/// Return the aux field's 2-character tag +/** @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get() + @return Pointer to the tag characters, NOT NUL-terminated + */ +static inline +const char *bam_aux_tag(const uint8_t *s) { return (const char *) (s-2); } + +/// Return the aux field's type character +/** @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get() + @return The type character: one of cCsSiI/fd/A/Z/H/B + */ +static inline char bam_aux_type(const uint8_t *s) { return *s; } + /// Return a SAM formatting string containing a BAM tag /** @param b Pointer to the bam record @param tag Desired aux tag @@ -1589,11 +1713,11 @@ static inline int bam_aux_get_str(const bam1_t *b, HTSLIB_EXPORT int64_t bam_aux2i(const uint8_t *s); -/// Get an integer aux value +/// Get a float aux value /** @param s Pointer to the tag data, as returned by bam_aux_get() - @return The value, or 0 if the tag was not an integer type + @return The value, or 0 if the tag was not a float type If the tag is not an numeric type, errno is set to EINVAL. The value of - integer flags will be returned cast to a double. + the float will be returned cast to a double. */ HTSLIB_EXPORT double bam_aux2f(const uint8_t *s); @@ -1660,15 +1784,33 @@ HTSLIB_EXPORT int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data); /// Delete tag data from a bam record -/* @param b The bam record to update - @param s Pointer to the tag to delete, as returned by bam_aux_get(). 
- @return 0 on success; -1 on failure - If the bam record's aux data is corrupt, errno is set to EINVAL and this - function returns -1; +/** @param b The BAM record to update + @param s Pointer to the aux field to delete, as returned by bam_aux_get() + Must not be NULL + @return 0 on success; -1 on failure + +If the BAM record's aux data is corrupt, errno is set to EINVAL and this +function returns -1. */ HTSLIB_EXPORT int bam_aux_del(bam1_t *b, uint8_t *s); +/// Delete an aux field from a BAM record +/** @param b The BAM record to update + @param s Pointer to the aux field to delete, as returned by + bam_aux_first()/_next()/_get(); must not be NULL + @return Pointer to the following aux field, or NULL if none or on error + +Identical to @c bam_aux_del() apart from the return value, which is an +aux iterator suitable for use with @c bam_aux_next()/etc. + +Whenever NULL is returned, errno will also be set: ENOENT if the aux field +deleted was the record's last one; otherwise EINVAL, indicating that the +BAM record's aux data is corrupt. + */ +HTSLIB_EXPORT +uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s); + /// Update or add a string-type tag /* @param b The bam record to update @param tag Tag identifier @@ -1879,10 +2021,13 @@ typedef struct bam_mplp_s *bam_mplp_t; /** * bam_plp_constructor() - sets a callback to initialise any per-pileup1_t fields. * @plp: The bam_plp_t initialised using bam_plp_init. - * @func: The callback function itself. When called, it is given the - * data argument (specified in bam_plp_init), the bam structure and - * a pointer to a locally allocated bam_pileup_cd union. This union - * will also be present in each bam_pileup1_t created. + * @func: The callback function itself. When called, it is given + * the data argument (specified in bam_plp_init), the bam + * structure and a pointer to a locally allocated + * bam_pileup_cd union. This union will also be present in + * each bam_pileup1_t created. 
+ * The callback function should have a negative return + * value to indicate an error. (Similarly for destructor.) */ HTSLIB_EXPORT void bam_plp_constructor(bam_plp_t plp, @@ -1907,6 +2052,36 @@ typedef struct bam_mplp_s *bam_mplp_t; HTSLIB_EXPORT int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) HTS_RESULT_USED; + + /*! @typedef + @abstract An opaque type used for caching base modification state between + successive calls to bam_mods_* functions. + */ + typedef struct hts_base_mod_state hts_base_mod_state; + + /// Get pileup padded insertion sequence, including base modifications + /** + * @param p pileup data + * @param m state data for the base modification finder + * @param ins the kstring where the insertion sequence will be written + * @param del_len location for deletion length + * @return the number of insertion string on success, with string length + * being accessible via ins->l; -1 on failure. + * + * Fills out the kstring with the padded insertion sequence for the current + * location in 'p'. If this is not an insertion site, the string is blank. + * + * The modification state needs to have been previously initialised using + * bam_parse_basemod. It is permitted to be passed in as NULL, in which + * case this function outputs identically to bam_plp_insertion. + * + * If del_len is not NULL, the location pointed to is set to the length of + * any deletion immediately following the insertion, or zero if none. + */ + HTSLIB_EXPORT + int bam_plp_insertion_mod(const bam_pileup1_t *p, hts_base_mod_state *m, + kstring_t *ins, int *del_len) HTS_RESULT_USED; + /// Create a new bam_mplp_t structure /** The struct returned by a successful call should be freed * via bam_mplp_destroy() when it is no longer needed. @@ -1962,6 +2137,21 @@ typedef struct bam_mplp_s *bam_mplp_t; HTSLIB_EXPORT int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres); +// Used as flag parameter in sam_prob_realn.
+enum htsRealnFlags { + BAQ_APPLY = 1, + BAQ_EXTEND = 2, + BAQ_REDO = 4, + + // Platform subfield, in bit position 3 onwards + BAQ_AUTO = 0<<3, + BAQ_ILLUMINA = 1<<3, + BAQ_PACBIOCCS = 2<<3, + BAQ_PACBIO = 3<<3, + BAQ_ONT = 4<<3, + BAQ_GENAPSYS = 5<<3 +}; + /// Calculate BAQ scores /** @param b BAM record @param ref Reference sequence @@ -1976,9 +2166,15 @@ This function calculates base alignment quality (BAQ) values using the method described in "Improving SNP discovery by base alignment quality", Heng Li, Bioinformatics, Volume 27, Issue 8 (https://doi.org/10.1093/bioinformatics/btr076). +The @param flag value can be generated using the htsRealnFlags enum, but for +backwards compatibility reasons is retained as an "int". An example usage +of the enum could be this, equivalent to flag 19: + + sam_prob_realn(b, ref, len, BAQ_APPLY | BAQ_EXTEND | BAQ_PACBIOCCS); + The following @param flag bits can be used: -Bit 0: Adjust the quality values using the BAQ values +Bit 0 (BAQ_APPLY): Adjust the quality values using the BAQ values If set, the data in the BQ:Z tag is used to adjust the quality values, and the BQ:Z tag is renamed to ZQ:Z. @@ -1986,27 +2182,244 @@ Bit 0: Adjust the quality values using the BAQ values If clear, and a ZQ:Z tag is present, the quality values are reverted using the data in the tag, and the tag is renamed to BQ:Z. -Bit 1: Use "extended" BAQ. +Bit 1 (BAQ_EXTEND): Use "extended" BAQ. Changes the BAQ calculation to increase sensitivity at the expense of reduced specificity. -Bit 2: Recalculate BAQ, even if a BQ tag is present. +Bit 2 (BAQ_REDO): Recalculate BAQ, even if a BQ tag is present. Force BAQ to be recalculated. Note that a ZQ:Z tag will always disable recalculation. +Bits 3-10: Choose parameters tailored to a specific instrument type. + + One of BAQ_AUTO, BAQ_ILLUMINA, BAQ_PACBIOCCS, BAQ_PACBIO, BAQ_ONT and + BAQ_GENAPSYS.
The BAQ parameter tuning is still a work in progress and + at the time of writing mainly consists of Illumina vs long-read technology + adjustments. + @bug If the input read has both BQ:Z and ZQ:Z tags, the ZQ:Z one will be removed. Depending on what previous processing happened, this may or may not be the correct thing to do. It would be wise to avoid this situation if possible. */ - HTSLIB_EXPORT int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag); +// --------------------------- +// Base modification retrieval + +/*! @typedef + @abstract Holds a single base modification. + @field modified_base The short base code (m, h, etc) or -ChEBI (negative) + @field canonical_base The canonical base referred to in the MM tag. + One of A, C, G, T or N. Note this may not be the + explicit base recorded in the SEQ column (esp. if N). + @field strand 0 or 1, indicating + or - strand from MM tag. + @field qual Quality code (256*probability), or -1 if unknown + + @discussion + Note this doesn't hold any location data or information on which other + modifications may be possible at this site. +*/ +typedef struct hts_base_mod { + int modified_base; + int canonical_base; + int strand; + int qual; +} hts_base_mod; + +#define HTS_MOD_UNKNOWN -1 // In MM but no ML +#define HTS_MOD_UNCHECKED -2 // Not in MM and in explicit mode + +// Flags for bam_parse_basemod2 +#define HTS_MOD_REPORT_UNCHECKED 1 + +/// Allocates an hts_base_mod_state. +/** + * @return An hts_base_mod_state pointer on success, + * NULL on failure. + * + * This just allocates the memory. The initialisation of the contents is + * done using bam_parse_basemod. Successive calls may be made to that + * without the need to free and allocate a new state. + * + * The state should be destroyed using the hts_base_mod_state_free function. + */ +HTSLIB_EXPORT +hts_base_mod_state *hts_base_mod_state_alloc(void); + +/// Destroys an hts_base_mod_state. +/** + * @param state The base modification state pointer.
+ * + * The state should have previously been created by hts_base_mod_state_alloc. + */ +HTSLIB_EXPORT +void hts_base_mod_state_free(hts_base_mod_state *state); + +/// Parses the Mm and Ml tags out of a bam record. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @return 0 on success, + * -1 on failure. + * + * This fills out the contents of the modification state, resetting the + * iterator location to the first sequence base. + */ +HTSLIB_EXPORT +int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state); + +/// Parses the Mm and Ml tags out of a bam record. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param flags A bit-field controlling base modification processing + * + * @return 0 on success, + * -1 on failure. + * + * This fills out the contents of the modification state, resetting the + * iterator location to the first sequence base. + */ +HTSLIB_EXPORT +int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state, + uint32_t flags); + +/// Returns modification status for the next base position in the query seq. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param mods A supplied array for returning base modifications + * @param n_mods The size of the mods array + * @return The number of modifications found on success, + * -1 on failure. + * + * This is intended to be used as an iterator, with one call per location + * along the query sequence. + * + * If no modifications are found, the returned value is zero. + * If more than n_mods modifications are found, the total found is returned. + * Note this means the caller needs to check whether this is higher than + * n_mods.
+ */ +HTSLIB_EXPORT +int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods); + +/// Finds the next location containing base modifications and returns them +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param mods A supplied array for returning base modifications + * @param n_mods The size of the mods array + * @param pos Pointer holding position of modification in sequence + * @return The number of modifications found on success, + * 0 if no more modifications are present, + * -1 on failure. + * + * Unlike bam_mods_at_next_pos this skips ahead to the next site + * with modifications. + * + * If more than n_mods modifications are found, the total found is returned. + * Note this means the caller needs to check whether this is higher than + * n_mods. + */ +HTSLIB_EXPORT +int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods, int *pos); + +/// Returns modification status for a specific query position. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param mods A supplied array for returning base modifications + * @param n_mods The size of the mods array + * @return The number of modifications found on success, + * -1 on failure. + * + * Note if called multiple times, qpos must be higher than the previous call. + * Hence this is suitable for use from a pileup iterator. If more random + * access is required, bam_parse_basemod must be called each time to reset + * the state although this has an efficiency cost. + * + * If no modifications are found, the returned value is zero. + * If more than n_mods modifications are found, the total found is returned. + * Note this means the caller needs to check whether this is higher than + * n_mods.
+ */ +HTSLIB_EXPORT +int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods); + + +/// Returns data about a specific modification type for the alignment record. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param code Modification code. If positive this is a character code, + * if negative it is a -ChEBI code. + * + * @param strand Boolean for top (0) or bottom (1) strand + * @param implicit Boolean for whether unlisted positions should be + * implicitly assumed to be unmodified, or require an + * explicit score and should be considered as unknown. + * Returned. + * @param canonical Canonical base type associated with this modification + * Returned. + * + * @return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. + */ +HTSLIB_EXPORT +int bam_mods_query_type(hts_base_mod_state *state, int code, + int *strand, int *implicit, char *canonical); + +/// Returns data about the i^th modification type for the alignment record. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param i Modification index, from 0 to ntype-1 + * @param strand Boolean for top (0) or bottom (1) strand + * @param implicit Boolean for whether unlisted positions should be + * implicitly assumed to be unmodified, or require an + * explicit score and should be considered as unknown. + * Returned. + * @param canonical Canonical base type associated with this modification + * Returned. + * + * @return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. 
+ */ +HTSLIB_EXPORT +int bam_mods_queryi(hts_base_mod_state *state, int i, + int *strand, int *implicit, char *canonical); + +/// Returns the list of base modification codes provided for this +/// alignment record as an array of character codes (+ve) or ChEBI numbers +/// (negative). +/* + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param ntype Filled out with the number of array elements returned + * + * @return the type array, with *ntype filled out with the size. + * The array returned should not be freed. + * It is a valid pointer until the state is freed using + * hts_base_mod_state_free(). + */ +HTSLIB_EXPORT +int *bam_mods_recorded(hts_base_mod_state *state, int *ntype); + #ifdef __cplusplus } #endif +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index 8d3554445..58d3d9389 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -1,7 +1,7 @@ /// @file htslib/synced_bcf_reader.h /// Stream through multiple VCF files. /* - Copyright (C) 2012-2017, 2019 Genome Research Ltd. + Copyright (C) 2012-2017, 2019-2023 Genome Research Ltd.
Author: Petr Danecek @@ -96,7 +96,9 @@ typedef enum { BCF_SR_REQUIRE_IDX, BCF_SR_PAIR_LOGIC, // combination of the PAIR_* values above - BCF_SR_ALLOW_NO_IDX // allow to proceed even if required index is not present (at the user's risk) + BCF_SR_ALLOW_NO_IDX, // allow to proceed even if required index is not present (at the user's risk) + BCF_SR_REGIONS_OVERLAP, // include overlapping records with POS outside the regions: 0=no, 1=VCF line overlap, 2=true variant overlap [1] + BCF_SR_TARGETS_OVERLAP // include overlapping records with POS outside the targets: 0=no, 1=VCF line overlap, 2=true variant overlap [0] } bcf_sr_opt_t; @@ -131,6 +133,7 @@ typedef struct bcf_sr_regions_t hts_pos_t start, end; // current position: start, end of the region (0-based) int prev_seq; hts_pos_t prev_start, prev_end; + int overlap; // see BCF_SR_REGIONS_OVERLAP/BCF_SR_TARGETS_OVERLAP } bcf_sr_regions_t; @@ -138,6 +141,7 @@ typedef struct bcf_sr_t { htsFile *file; tbx_t *tbx_idx; + unsigned char read_one_record_only; hts_idx_t *bcf_idx; bcf_hdr_t *header; hts_itr_t *itr; @@ -302,6 +306,11 @@ int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file); * file are currently not supported. * Targets (but not regions) can be prefixed with "^" to request logical complement, * for example "^X,Y,MT" indicates that sequences X, Y and MT should be skipped. + * + * API notes: + * - bcf_sr_set_targets MUST be called before the first call to bcf_sr_add_reader() + * - calling bcf_sr_set_regions AFTER readers have been initialized will + * reposition the readers and discard all previous regions. */ HTSLIB_EXPORT int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles); @@ -330,6 +339,8 @@ int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file); * supply 'from' in place of 'to'. When 'to' is negative, first * abs(to) will be attempted and if that fails, 'from' will be used * instead. 
+ * If chromosome name contains the characters ':' or '-', it should + * be put in curly brackets, for example as "{weird-chr-name:1-2}:1000-2000" * * The bcf_sr_regions_t struct returned by a successful call should be freed * via bcf_sr_regions_destroy() when it is no longer needed. diff --git a/htslib/tbx.h b/htslib/tbx.h index 9b9e111b9..3d2037cbb 100644 --- a/htslib/tbx.h +++ b/htslib/tbx.h @@ -52,6 +52,7 @@ typedef struct tbx_t { void *dict; } tbx_t; +HTSLIB_EXPORT extern const tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sam, tbx_conf_vcf; #define tbx_itr_destroy(iter) hts_itr_destroy(iter) diff --git a/htslib/vcf.h b/htslib/vcf.h index fb5106e17..908f15b65 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -2,7 +2,7 @@ /// High-level VCF/BCF variant calling file operations. /* Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2020 Genome Research Ltd. + Copyright (C) 2012-2020, 2022-2023 Genome Research Ltd. Author: Heng Li @@ -41,6 +41,9 @@ DEALINGS IN THE SOFTWARE. */ #include "hts_defs.h" #include "hts_endian.h" +/* Included only for backwards compatibility with e.g. bcftools 1.10 */ +#include + #ifdef __cplusplus extern "C" { #endif @@ -60,14 +63,22 @@ extern "C" { #define BCF_HT_INT 1 #define BCF_HT_REAL 2 #define BCF_HT_STR 3 -#define BCF_HT_LONG (BCF_HT_INT | 0x100) // BCF_HT_INT, but for int64_t values; VCF only! +#define BCF_HT_UINT 4 +#define BCF_HT_CHAR 5 +#define BCF_HT_INT64 6 +#define BCF_HT_LONG BCF_HT_INT64 // BCF_HT_INT, but for int64_t values; VCF only! +#define BCF_HT_UINT64 7 +#define BCF_HT_VOID 8 +#define BCF_HT_DOUBLE 9 +#define BCF_NUM_HT_TYPES 10 #define BCF_VL_FIXED 0 // variable length #define BCF_VL_VAR 1 #define BCF_VL_A 2 #define BCF_VL_G 3 #define BCF_VL_R 4 - +#define BCF_VL_P 5 //ploidy +#define BCF_VL_Phased_Ploidy 6 //ploidy with phase /* === Dictionary === The header keeps three dictionaries. 
The first keeps IDs in the @@ -84,6 +95,10 @@ extern "C" { #define BCF_DT_CTG 1 #define BCF_DT_SAMPLE 2 +#define BCF_V_2_1_HEADER_MAGIC_STRING "BCF\2\1" +#define BCF_V_2_2_HEADER_MAGIC_STRING "BCF\2\2" +#define BCF_HEADER_MAGIC_STRING_LENGTH 5 + // Complete textual representation of a header line typedef struct bcf_hrec_t { int type; // One of the BCF_HL_* type @@ -120,6 +135,7 @@ typedef struct bcf_hdr_t { int32_t m[3]; // m: allocated size of the dictionary block in use (see n above) } bcf_hdr_t; +HTSLIB_EXPORT extern uint8_t bcf_type_shift[]; /************** @@ -134,13 +150,18 @@ extern uint8_t bcf_type_shift[]; #define BCF_BT_FLOAT 5 #define BCF_BT_CHAR 7 -#define VCF_REF 0 -#define VCF_SNP 1 -#define VCF_MNP 2 -#define VCF_INDEL 4 -#define VCF_OTHER 8 -#define VCF_BND 16 // breakend -#define VCF_OVERLAP 32 // overlapping deletion, ALT=* +#define VCF_REF 0 +#define VCF_SNP (1<<0) +#define VCF_MNP (1<<1) +#define VCF_INDEL (1<<2) +#define VCF_OTHER (1<<3) +#define VCF_BND (1<<4) // breakend +#define VCF_OVERLAP (1<<5) // overlapping deletion, ALT=* +#define VCF_SPANNING_DELETION VCF_OVERLAP +#define VCF_INS (1<<6) // implies VCF_INDEL +#define VCF_DEL (1<<7) // implies VCF_INDEL +#define VCF_ANY (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_INS|VCF_DEL) // any variant type (but not VCF_REF) +#define VCF_NON_REF (1<<8) typedef struct bcf_variant_t { int type, n; // variant type and the number of bases affected, negative for deletions @@ -179,7 +200,7 @@ typedef struct bcf_dec_t { int m_fmt, m_info, m_id, m_als, m_allele, m_flt; // allocated size (high-water mark); do not change int n_flt; // Number of FILTER fields int *flt; // FILTER keys in the dictionary - char *id, *als; // ID and REF+ALT block (\0-seperated) + char *id, *als; // ID and REF+ALT block (\0-separated) char **allele; // allele[0] is the REF (allele[] pointers to the als block); all null terminated bcf_info_t *info; // INFO bcf_fmt_t *fmt; // FORMAT and individual sample @@ -198,6 
+219,22 @@ typedef struct bcf_dec_t { #define BCF_ERR_CTG_INVALID 32 #define BCF_ERR_TAG_INVALID 64 +/// Get error description for bcf error code +/** @param errorcode The error code which is to be described + @param buffer The buffer in which description to be added + @param maxbuffer The size of buffer passed + @return NULL on invalid buffer; buffer on other cases + +The buffer will be an empty string when @p errorcode is 0. +Description of errors present in code will be appended to @p buffer with ',' separation. +The buffer has to be at least 4 characters long. NULL will be returned if it is smaller or when buffer is NULL. + +'...' will be appended if the description doesn't fit in the given buffer. + */ + +HTSLIB_EXPORT +const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer); + /* The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file is slower because the string is first to be parsed, packed into BCF line @@ -214,6 +251,7 @@ typedef struct bcf1_t { hts_pos_t rlen; // length of REF int32_t rid; // CHROM float qual; // QUAL + hts_pos_t m_end_point; //END - must be after QUAL due to a memcpy() in vcf.c uint32_t n_info:16, n_allele:16; uint32_t n_fmt:8, n_sample:24; kstring_t shared, indiv; @@ -301,6 +339,7 @@ typedef struct bcf1_t { typedef htsFile vcfFile; #define bcf_open(fn, mode) hts_open((fn), (mode)) #define vcf_open(fn, mode) hts_open((fn), (mode)) + #define bcf_flush(fp) hts_flush((fp)) #define bcf_close(fp) hts_close(fp) #define vcf_close(fp) hts_close(fp) @@ -314,6 +353,7 @@ typedef struct bcf1_t { */ HTSLIB_EXPORT bcf_hdr_t *bcf_hdr_read(htsFile *fp) HTS_RESULT_USED; + bcf_hdr_t *bcf_hdr_read_required_sample_line(htsFile *hfp, const uint8_t is_sample_line_required); /** * bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed @@ -351,6 +391,16 @@ typedef struct bcf1_t { HTSLIB_EXPORT int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h) HTS_RESULT_USED; + + /* + * Serialize BCF header 
into buffer + * + * Returns new offset value in buffer if the new data fits within the buffer capacity, + * else returns the same offset value without modifying the buffer + */ + size_t bcf_hdr_serialize(bcf_hdr_t* h, uint8_t* buffer, size_t offset, const size_t capacity, const uint8_t is_bcf, const uint8_t keep_idx_fields); + + size_t bcf_hdr_deserialize(bcf_hdr_t* h, const uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf); /** * Parse VCF line contained in kstring and populate the bcf1_t struct * The line must not end with \n or \r characters. @@ -372,6 +422,27 @@ typedef struct bcf1_t { HTSLIB_EXPORT int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s); + /* + * Same as vcf_format, but for bcfs + * + * Returns new offset value in buffer if the new data fits within the buffer capacity, + * else returns the same offset value without modifying the buffer + * + * If vcf, then the hdr and tmp pointers must be valid. For bcfs, they might be null + */ + size_t bcf_serialize(bcf1_t* v, uint8_t* buffer, size_t offset, const size_t capacity, const uint8_t is_bcf, const bcf_hdr_t* hdr, kstring_t* tmp); + /* + * Same as vcf_parse, but for bcfs + * + * Returns new offset value in buffer if a full vcf record is read, + * else returns the same offset value + * + * If vcf, then the hdr and tmp pointers must be valid. For bcfs, they might be null + * + * Note that vcf parsing modifies the buffer (tokenize function) + */ + size_t bcf_deserialize(bcf1_t* v, uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf, const bcf_hdr_t* hdr); + /// Read next VCF or BCF record /** @param fp The file to read the record from @param h The header for the vcf/bcf file @@ -444,7 +515,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). 
*/ HTSLIB_EXPORT bcf_hdr_t *vcf_hdr_read(htsFile *fp) HTS_RESULT_USED; - + bcf_hdr_t *vcf_hdr_read_required_sample_line(htsFile *fp, const uint8_t is_sample_line_required); /// Write a VCF format header /** @param fp Output file @param h The header to write @@ -612,7 +683,11 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). HTSLIB_EXPORT bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap); - /** Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names) */ + /** + * Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names). + * NB: sequence name indexes returned by bcf_hdr_seqnames() may not correspond to bcf1_t.rid, use + * bcf_hdr_id2name() or bcf_seqname() instead. + */ HTSLIB_EXPORT const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs); @@ -623,6 +698,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). /** The following functions are for internal use and should rarely be called directly */ HTSLIB_EXPORT int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt); + int bcf_hdr_parse_required_sample_line(bcf_hdr_t *hdr, char *htxt, size_t* hdr_length, + const uint8_t is_sample_line_required); /// Synchronize internal header structures /** @param h Header @@ -663,6 +740,22 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). HTSLIB_EXPORT int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str); + /// Add a header record into a header + /** + * @param hdr Destination header + * @param hrec Header record + * @return 0 on success, -1 on failure + * + * If this function returns success, ownership of @p hrec will have + * been transferred to the header structure. It may also have been + * freed if it was a duplicate of a record already in the header. + * Therefore the @p hrec pointer should not be used after a successful + * return from this function. 
+ * + * If this function returns failure, ownership will not have been taken + * and the caller is responsible for cleaning up @p hrec. + */ + HTSLIB_EXPORT int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec); @@ -745,15 +838,102 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). HTSLIB_EXPORT int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line); + /// Get variant types in a BCF record /** - * bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc + * @param rec BCF/VCF record + * @return Types of variant present + * + * The return value will be a bitwise-or of VCF_SNP, VCF_MNP, + * VCF_INDEL, VCF_OTHER, VCF_BND or VCF_OVERLAP. If will return + * VCF_REF (i.e. 0) if none of the other types is present. + * @deprecated Please use bcf_has_variant_types() instead */ HTSLIB_EXPORT int bcf_get_variant_types(bcf1_t *rec); + /// Get variant type in a BCF record, for a given allele + /** + * @param rec BCF/VCF record + * @param ith_allele Allele to check + * @return Type of variant present + * + * The return value will be one of VCF_REF, VCF_SNP, VCF_MNP, + * VCF_INDEL, VCF_OTHER, VCF_BND or VCF_OVERLAP. + * @deprecated Please use bcf_has_variant_type() instead + */ HTSLIB_EXPORT int bcf_get_variant_type(bcf1_t *rec, int ith_allele); + /// Match mode for bcf_has_variant_types() + enum bcf_variant_match { + bcf_match_exact, ///< Types present exactly match tested for + bcf_match_overlap, ///< At least one variant type in common + bcf_match_subset, ///< Test set is a subset of types present + }; + + /// Check for presence of variant types in a BCF record + /** + * @param rec BCF/VCF record + * @param bitmask Set of variant types to test for + * @param mode Match mode + * @return >0 if the variant types are present, + * 0 if not present, + * -1 on error + * + * @p bitmask should be the bitwise-or of the variant types (VCF_SNP, + * VCF_MNP, etc.) to test for. 
+ * + * The return value is the bitwise-and of the set of types present + * and @p bitmask. Callers that want to check for the presence of more + * than one type can avoid function call overhead by passing all the + * types to be checked for in a single call to this function, in + * bcf_match_overlap mode, and then check for them individually in the + * returned value. + * + * As VCF_REF is represented by 0 (i.e. the absence of other variants) + * it should be tested for using + * bcf_has_variant_types(rec, VCF_REF, bcf_match_exact) + * which will return 1 if no other variant type is present, otherwise 0. + */ + HTSLIB_EXPORT + int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask, enum bcf_variant_match mode); + + /// Check for presence of variant types in a BCF record, for a given allele + /** + * @param rec BCF/VCF record + * @param ith_allele Allele to check + * @param bitmask Set of variant types to test for + * @return >0 if one of the variant types is present, + * 0 if not present, + * -1 on error + * + * @p bitmask should be the bitwise-or of the variant types (VCF_SNP, + * VCF_MNP, etc.) to test for, or VCF_REF on its own. + * + * The return value is the bitwise-and of the set of types present + * and @p bitmask. Callers that want to check for the presence of more + * than one type can avoid function call overhead by passing all the + * types to be checked for in a single call to this function, and then + * check for them individually in the returned value. + * + * As a special case, if @p bitmask is VCF_REF (i.e. 0), the function + * tests for an exact match. The return value will be 1 if the + * variant type calculated for the allele is VCF_REF, otherwise if + * any other type is present it will be 0. 
+ */ + HTSLIB_EXPORT + int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask); + + /// Return the number of bases affected by a variant, for a given allele + /** + * @param rec BCF/VCF record + * @param ith_allele Allele index + * @return The number of bases affected (negative for deletions), + * or bcf_int32_missing on error. + */ + HTSLIB_EXPORT + int bcf_variant_length(bcf1_t *rec, int ith_allele); + HTSLIB_EXPORT int bcf_is_snp(bcf1_t *v); @@ -857,6 +1037,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). return bcf_update_info(hdr, line, key, values, n, BCF_HT_LONG); } + void bcf_set_end_point_from_info(const bcf_hdr_t* hdr, bcf1_t* line); + /* * bcf_update_format_*() - functions for updating FORMAT fields * @values: pointer to the array of values, the same number of elements @@ -1020,7 +1202,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). * int max_ploidy = ngt/nsmpl; * for (i=0; iid[BCF_DT_ID][int_id].val->info[type]>>12) #define bcf_hdr_id2type(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf) #define bcf_hdr_id2coltype(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf) - #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id)>=0 && bcf_hdr_id2coltype((hdr),(type),(int_id))!=0xf) + #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id)>=0 && (int_id)<(hdr)->n[BCF_DT_ID] && (hdr)->id[BCF_DT_ID][int_id].val && bcf_hdr_id2coltype((hdr),(type),(int_id))!=0xf) #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id) ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)]) + uint64_t bcf_hdr_id2contig_length(const bcf_hdr_t* hdr, const int id); /// Convert BCF FORMAT data to string form /** * @param s kstring to write into @@ -1337,7 +1520,9 @@ which works for both BCF and VCF. 
#define BCF_MIN_BT_INT16 (-32760) /* INT16_MIN + 8 */ #define BCF_MIN_BT_INT32 (-2147483640) /* INT32_MIN + 8 */ +HTSLIB_EXPORT extern uint32_t bcf_float_vector_end; +HTSLIB_EXPORT extern uint32_t bcf_float_missing; static inline void bcf_float_set(float *ptr, uint32_t value) { @@ -1363,21 +1548,23 @@ static inline int bcf_float_is_vector_end(float f) static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) { uint32_t e = 0; - #define BRANCH(type_t, missing, vector_end) { \ - type_t *ptr = (type_t*) (fmt->p + isample*fmt->size); \ + #define BRANCH(type_t, convert, missing, vector_end) { \ + uint8_t *ptr = fmt->p + isample*fmt->size; \ int i; \ - for (i=0; in && ptr[i]!=vector_end; i++) \ + for (i=0; in; i++, ptr += sizeof(type_t)) \ { \ - if ( i ) e |= kputc("/|"[ptr[i]&1], str) < 0; \ - if ( !(ptr[i]>>1) ) e |= kputc('.', str) < 0; \ - else e |= kputw((ptr[i]>>1) - 1, str) < 0; \ + type_t val = convert(ptr); \ + if ( val == vector_end ) break; \ + if ( i ) e |= kputc("/|"[val&1], str) < 0; \ + if ( !(val>>1) ) e |= kputc('.', str) < 0; \ + else e |= kputw((val>>1) - 1, str) < 0; \ } \ if (i == 0) e |= kputc('.', str) < 0; \ } switch (fmt->type) { - case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break; case BCF_BT_NULL: e |= kputc('.', str) < 0; break; default: hts_log_error("Unexpected type %d", fmt->type); return -2; } @@ -1385,60 +1572,87 @@ static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) return e == 0 ? 
0 : -1; } -static inline int bcf_enc_size(kstring_t *s, int size, int type) +static inline int bcf_enc_size(kstring_t *s, size_t size, int type) { - uint32_t e = 0; - uint8_t x[4]; - if (size >= 15) { - e |= kputc(15<<4|type, s) < 0; - if (size >= 128) { - if (size >= 32768) { - i32_to_le(size, x); - e |= kputc(1<<4|BCF_BT_INT32, s) < 0; - e |= kputsn((char*)&x, 4, s) < 0; - } else { - i16_to_le(size, x); - e |= kputc(1<<4|BCF_BT_INT16, s) < 0; - e |= kputsn((char*)&x, 2, s) < 0; - } - } else { - e |= kputc(1<<4|BCF_BT_INT8, s) < 0; - e |= kputc(size, s) < 0; + // Most common case is first + if (size < 15) { + if (ks_resize(s, s->l + 1) < 0) + return -1; + uint8_t *p = (uint8_t *)s->s + s->l; + *p++ = (size<<4) | type; + s->l++; + return 0; + } + + if (ks_resize(s, s->l + 6) < 0) + return -1; + uint8_t *p = (uint8_t *)s->s + s->l; + *p++ = 15<<4|type; + + if (size < 128) { + *p++ = 1<<4|BCF_BT_INT8; + *p++ = size; + s->l += 3; + } else { + if (size < 32768) { + *p++ = 1<<4|BCF_BT_INT16; + i16_to_le(size, p); + s->l += 4; + } + else if(size <= INT32_MAX){ + *p++ = 1<<4|BCF_BT_INT32; + i32_to_le(size, p); + s->l += 6; } - } else e |= kputc(size<<4|type, s) < 0; - return e == 0 ? 
0 : -1; + else{ + *p++ = 1<<4|BCF_BT_INT64; + i64_to_le(size,p); + s->l += 10; + return -1; + } + } + return 0; } static inline int bcf_enc_inttype(long x) { if (x <= BCF_MAX_BT_INT8 && x >= BCF_MIN_BT_INT8) return BCF_BT_INT8; if (x <= BCF_MAX_BT_INT16 && x >= BCF_MIN_BT_INT16) return BCF_BT_INT16; - return BCF_BT_INT32; + if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32) return BCF_BT_INT32; + return BCF_BT_INT64; } static inline int bcf_enc_int1(kstring_t *s, int32_t x) { - uint32_t e = 0; - uint8_t z[4]; + if (ks_resize(s, s->l + 5) < 0) + return -1; + uint8_t *p = (uint8_t *)s->s + s->l; + if (x == bcf_int32_vector_end) { - e |= bcf_enc_size(s, 1, BCF_BT_INT8); - e |= kputc(bcf_int8_vector_end, s) < 0; + // An inline implementation of bcf_enc_size with size==1 and + // memory allocation already accounted for. + *p = (1<<4) | BCF_BT_INT8; + p[1] = bcf_int8_vector_end; + s->l+=2; } else if (x == bcf_int32_missing) { - e |= bcf_enc_size(s, 1, BCF_BT_INT8); - e |= kputc(bcf_int8_missing, s) < 0; + *p = (1<<4) | BCF_BT_INT8; + p[1] = bcf_int8_missing; + s->l+=2; } else if (x <= BCF_MAX_BT_INT8 && x >= BCF_MIN_BT_INT8) { - e |= bcf_enc_size(s, 1, BCF_BT_INT8); - e |= kputc(x, s) < 0; + *p = (1<<4) | BCF_BT_INT8; + p[1] = x; + s->l+=2; } else if (x <= BCF_MAX_BT_INT16 && x >= BCF_MIN_BT_INT16) { - i16_to_le(x, z); - e |= bcf_enc_size(s, 1, BCF_BT_INT16); - e |= kputsn((char*)&z, 2, s) < 0; + *p = (1<<4) | BCF_BT_INT16; + i16_to_le(x, p+1); + s->l+=3; } else { - i32_to_le(x, z); - e |= bcf_enc_size(s, 1, BCF_BT_INT32); - e |= kputsn((char*)&z, 4, s) < 0; + *p = (1<<4) | BCF_BT_INT32; + i32_to_le(x, p+1); + s->l+=5; } - return e == 0 ? 0 : -1; + + return 0; } /// Return the value of a single typed integer. diff --git a/htslib_vars.mk b/htslib_vars.mk index f22c7c22b..6af71863c 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -1,6 +1,6 @@ # Makefile variables useful for third-party code using htslib's public API. 
# -# Copyright (C) 2013-2017, 2019 Genome Research Ltd. +# Copyright (C) 2013-2017, 2019-2020 Genome Research Ltd. # # Author: John Marshall # @@ -32,6 +32,7 @@ htslib_hfile_h = $(HTSPREFIX)htslib/hfile.h $(htslib_hts_defs_h) htslib_hts_h = $(HTSPREFIX)htslib/hts.h $(htslib_hts_defs_h) $(htslib_hts_log_h) $(htslib_kstring_h) $(htslib_kroundup_h) htslib_hts_defs_h = $(HTSPREFIX)htslib/hts_defs.h htslib_hts_endian_h = $(HTSPREFIX)htslib/hts_endian.h +htslib_hts_expr_h = $(HTSPREFIX)htslib/hts_expr.h $(htslib_kstring_h) $(htslib_hts_defs_h) htslib_hts_log_h = $(HTSPREFIX)htslib/hts_log.h $(htslib_hts_defs_h) htslib_hts_os_h = $(HTSPREFIX)htslib/hts_os.h $(htslib_hts_defs_h) htslib_kbitset_h = $(HTSPREFIX)htslib/kbitset.h @@ -39,10 +40,9 @@ htslib_kfunc_h = $(HTSPREFIX)htslib/kfunc.h $(htslib_hts_defs_h) htslib_khash_h = $(HTSPREFIX)htslib/khash.h $(htslib_kstring_h) $(htslib_kroundup_h) htslib_khash_str2int_h = $(HTSPREFIX)htslib/khash_str2int.h $(htslib_khash_h) htslib_klist_h = $(HTSPREFIX)htslib/klist.h -htslib_knetfile_h = $(HTSPREFIX)htslib/knetfile.h $(htslib_hts_defs_h) htslib_kroundup_h = $(HTSPREFIX)htslib/kroundup.h htslib_kseq_h = $(HTSPREFIX)htslib/kseq.h -htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h +htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h $(htslib_hts_defs_h) htslib_kstring_h = $(HTSPREFIX)htslib/kstring.h $(htslib_hts_defs_h) $(htslib_kroundup_h) htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h $(htslib_hts_h) htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h) $(htslib_hts_endian_h) diff --git a/kfunc.c b/kfunc.c index b7d7d521a..bf15cdf33 100644 --- a/kfunc.c +++ b/kfunc.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (C) 2010, 2013-2014 Genome Research Ltd. + Copyright (C) 2010, 2013-2014, 2020 Genome Research Ltd. 
Copyright (C) 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining diff --git a/knetfile.c b/knetfile.c deleted file mode 100644 index f0a608ea4..000000000 --- a/knetfile.c +++ /dev/null @@ -1,645 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, 2012-2014, 2017 Genome Research Ltd (GRL). - 2010 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Probably I will not do socket programming in the next few years and - therefore I decide to heavily annotate this file, for Linux and - Windows as well. -ac */ - -#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef _WIN32 -#include -#include -#include -#include -#endif - -#include "htslib/knetfile.h" -#include "htslib/hts_log.h" - -/* In winsock.h, the type of a socket is SOCKET, which is: "typedef - * u_int SOCKET". 
An invalid SOCKET is: "(SOCKET)(~0)", or signed - * integer -1. In knetfile.c, I use "int" for socket type - * throughout. This should be improved to avoid confusion. - * - * In Linux/Mac, recv() and read() do almost the same thing. You can see - * in the header file that netread() is simply an alias of read(). In - * Windows, however, they are different and using recv() is mandatory. - */ - -/* This function tests if the file handler is ready for reading (or - * writing if is_read==0). */ -static int socket_wait(int fd, int is_read) -{ - fd_set fds, *fdr = 0, *fdw = 0; - struct timeval tv; - int ret; - tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out - FD_ZERO(&fds); - FD_SET(fd, &fds); - if (is_read) fdr = &fds; - else fdw = &fds; - ret = select(fd+1, fdr, fdw, 0, &tv); -#ifndef _WIN32 - if (ret == -1) perror("select"); -#else - if (ret == 0) - hts_log_warning("Select timed out"); - else if (ret == SOCKET_ERROR) - hts_log_error("Select returned error %d", WSAGetLastError()); -#endif - return ret; -} - -#ifndef _WIN32 -/* This function does not work with Windows due to the lack of - * getaddrinfo() in winsock. It is addapted from an example in "Beej's - * Guide to Network Programming" (http://beej.us/guide/bgnet/). */ -# ifdef __SUNPRO_C -# pragma error_messages(off, E_END_OF_LOOP_CODE_NOT_REACHED) -# endif -static int socket_connect(const char *host, const char *port) -{ -#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) - - int ai_err, on = 1, fd; - struct linger lng = { 0, 0 }; - struct addrinfo hints, *res = 0; - memset(&hints, 0, sizeof(struct addrinfo)); - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - /* In Unix/Mac, getaddrinfo() is the most convenient way to get - * server information. 
*/ - if ((ai_err = getaddrinfo(host, port, &hints, &res)) != 0) { hts_log_error("Can't resolve %s:%s: %s", host, port, gai_strerror(ai_err)); return -1; } - if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); - /* The following two setsockopt() are used by ftplib - * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they - * necessary. */ - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); - if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); - if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); - freeaddrinfo(res); - return fd; -} -# ifdef __SUNPRO_C -# pragma error_messages(off, E_END_OF_LOOP_CODE_NOT_REACHED) -# endif -#else -/* MinGW's printf has problem with "%lld" */ -char *int64tostr(char *buf, int64_t x) -{ - int cnt; - int i = 0; - do { - buf[i++] = '0' + x % 10; - x /= 10; - } while (x); - buf[i] = 0; - for (cnt = i, i = 0; i < cnt/2; ++i) { - int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c; - } - return buf; -} - -int64_t strtoint64(const char *buf) -{ - int64_t x; - for (x = 0; *buf != '\0'; ++buf) - x = x * 10 + ((int64_t) *buf - 48); - return x; -} -/* In windows, the first thing is to establish the TCP connection. */ -int knet_win32_init() -{ - WSADATA wsaData; - return WSAStartup(MAKEWORD(2, 2), &wsaData); -} -void knet_win32_destroy() -{ - WSACleanup(); -} -/* A slightly modfied version of the following function also works on - * Mac (and presummably Linux). However, this function is not stable on - * my Mac. It sometimes works fine but sometimes does not. Therefore for - * non-Windows OS, I do not use this one. 
*/ -static SOCKET socket_connect(const char *host, const char *port) -{ -#define __err_connect(func) \ - do { \ - hts_log_error("The %s operation returned error %d", func, WSAGetLastError()); \ - return -1; \ - } while (0) - - int on = 1; - SOCKET fd; - struct linger lng = { 0, 0 }; - struct sockaddr_in server; - struct hostent *hp = 0; - // open socket - if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); - if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); - // get host info - if (isalpha(host[0])) hp = gethostbyname(host); - else { - struct in_addr addr; - addr.s_addr = inet_addr(host); - hp = gethostbyaddr((char*)&addr, 4, AF_INET); - } - if (hp == 0) __err_connect("gethost"); - // connect - server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); - server.sin_family= AF_INET; - server.sin_port = htons(atoi(port)); - if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); - // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) - return fd; -} -#endif - -static off_t my_netread(int fd, void *buf, off_t len) -{ - off_t rest = len, curr, l = 0; - /* recv() and read() may not read the required length of data with - * one call. They have to be called repeatedly. */ - while (rest) { - if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading - curr = netread(fd, (void*)((char*)buf + l), rest); - /* According to the glibc manual, section 13.2, a zero returned - * value indicates end-of-file (EOF), which should mean that - * read() will not return zero if EOF has not been met but data - * are not immediately available. 
*/ - if (curr == 0) break; - l += curr; rest -= curr; - } - return l; -} - -/************************* - * FTP specific routines * - *************************/ - -static int kftp_get_response(knetFile *ftp) -{ -#ifndef _WIN32 - unsigned char c; -#else - char c; -#endif - int n = 0; - char *p; - if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; - while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O - //fputc(c, stderr); - if (n >= ftp->max_response) { - ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; - ftp->response = (char*)realloc(ftp->response, ftp->max_response); - } - ftp->response[n++] = c; - if (c == '\n') { - if (n >= 4 && isdigit((int)((unsigned char) ftp->response[0])) && isdigit((int)((unsigned char) ftp->response[1])) && isdigit((int)((unsigned char) ftp->response[2])) - && ftp->response[3] != '-') break; - n = 0; - continue; - } - } - if (n < 2) return -1; - ftp->response[n-2] = 0; - return strtol(ftp->response, &p, 0); -} - -static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) -{ - if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing - int len = strlen(cmd); - if ( netwrite(ftp->ctrl_fd, cmd, len) != len ) return -1; - return is_get? 
kftp_get_response(ftp) : 0; -} - -static int kftp_pasv_prep(knetFile *ftp) -{ - char *p; - int v[6]; - kftp_send_cmd(ftp, "PASV\r\n", 1); - for (p = ftp->response; *p && *p != '('; ++p); - if (*p != '(') return -1; - ++p; - sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); - memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); - ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; - return 0; -} - - -static int kftp_pasv_connect(knetFile *ftp) -{ - char host[80], port[10]; - if (ftp->pasv_port == 0) { - hts_log_error("Must call kftp_pasv_prep() first"); - return -1; - } - sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); - sprintf(port, "%d", ftp->pasv_port); - ftp->fd = socket_connect(host, port); - if (ftp->fd == -1) return -1; - return 0; -} - -int kftp_connect(knetFile *ftp) -{ - ftp->ctrl_fd = socket_connect(ftp->host, ftp->port); - if (ftp->ctrl_fd == -1) return -1; - kftp_get_response(ftp); - kftp_send_cmd(ftp, "USER anonymous\r\n", 1); - kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); - kftp_send_cmd(ftp, "TYPE I\r\n", 1); - return 0; -} - -int kftp_reconnect(knetFile *ftp) -{ - if (ftp->ctrl_fd != -1) { - netclose(ftp->ctrl_fd); - ftp->ctrl_fd = -1; - } - netclose(ftp->fd); - ftp->fd = -1; - return kftp_connect(ftp); -} - -// initialize ->type, ->host, ->retr and ->size -knetFile *kftp_parse_url(const char *fn, const char *mode) -{ - knetFile *fp; - char *p; - int l; - if (strstr(fn, "ftp://") != fn) return 0; - for (p = (char*)fn + 6; *p && *p != '/'; ++p); - if (*p != '/') return 0; - l = p - fn - 6; - fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_FTP; - fp->fd = -1; - /* the Linux/Mac version of socket_connect() also recognizes a port - * like "ftp", but the Windows version does not. 
*/ - fp->port = strdup("21"); - fp->host = (char*)calloc(l + 1, 1); - if (strchr(mode, 'c')) fp->no_reconnect = 1; - strncpy(fp->host, fn + 6, l); - fp->retr = (char*)calloc(strlen(p) + 8, 1); - sprintf(fp->retr, "RETR %s\r\n", p); - fp->size_cmd = (char*)calloc(strlen(p) + 8, 1); - sprintf(fp->size_cmd, "SIZE %s\r\n", p); - fp->seek_offset = 0; - return fp; -} -// place ->fd at offset off -int kftp_connect_file(knetFile *fp) -{ - int ret; - long long file_size; - if (fp->fd != -1) { - netclose(fp->fd); - if (fp->no_reconnect) kftp_get_response(fp); - } - kftp_pasv_prep(fp); - kftp_send_cmd(fp, fp->size_cmd, 1); -#ifndef _WIN32 - // If the file does not exist, the response will be "550 Could not get file - // size". Be silent on failure, hts_idx_load can be trying the existence of .csi or .tbi. - if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) return -1; -#else - const char *p = fp->response; - while (*p != ' ') ++p; - while (*p < '0' || *p > '9') ++p; - file_size = strtoint64(p); -#endif - fp->file_size = file_size; - if (fp->offset>=0) { - char tmp[32]; -#ifndef _WIN32 - sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); -#else - strcpy(tmp, "REST "); - int64tostr(tmp + 5, fp->offset); - strcat(tmp, "\r\n"); -#endif - kftp_send_cmd(fp, tmp, 1); - } - kftp_send_cmd(fp, fp->retr, 0); - kftp_pasv_connect(fp); - ret = kftp_get_response(fp); - if (ret != 150) { - hts_log_error("%s", fp->response); - netclose(fp->fd); - fp->fd = -1; - return -1; - } - fp->is_ready = 1; - return 0; -} - - -/************************** - * HTTP specific routines * - **************************/ - -knetFile *khttp_parse_url(const char *fn, const char *mode) -{ - knetFile *fp; - char *p, *proxy, *q; - int l; - if (strstr(fn, "http://") != fn) return 0; - // set ->http_host - for (p = (char*)fn + 7; *p && *p != '/'; ++p); - l = p - fn - 7; - fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->http_host = (char*)calloc(l + 1, 1); - strncpy(fp->http_host, fn + 7, l); - 
fp->http_host[l] = 0; - for (q = fp->http_host; *q && *q != ':'; ++q); - if (*q == ':') *q++ = 0; - // get http_proxy - proxy = getenv("http_proxy"); - // set ->host, ->port and ->path - if (proxy == 0) { - fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name. - fp->port = strdup(*q? q : "80"); - fp->path = strdup(*p? p : "/"); - } else { - fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); - for (q = fp->host; *q && *q != ':'; ++q); - if (*q == ':') *q++ = 0; - fp->port = strdup(*q? q : "80"); - fp->path = strdup(fn); - } - fp->type = KNF_TYPE_HTTP; - fp->ctrl_fd = fp->fd = -1; - fp->seek_offset = 0; - return fp; -} - -int khttp_connect_file(knetFile *fp) -{ - int ret, l = 0; - char *buf, *p; - if (fp->fd != -1) netclose(fp->fd); - fp->fd = socket_connect(fp->host, fp->port); - buf = (char*)calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. - l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); - if (fp->offset != 0) l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); - l += sprintf(buf + l, "\r\n"); - if ( netwrite(fp->fd, buf, l) != l ) { free(buf); return -1; } - l = 0; - while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency - if (buf[l] == '\n' && l >= 3) - if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; - ++l; - } - buf[l] = 0; - if (l < 14) { // prematured header - free(buf); - netclose(fp->fd); - fp->fd = -1; - return -1; - } - ret = strtol(buf + 8, &p, 0); // HTTP return code - if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file - off_t rest = fp->offset; - while (rest) { - off_t l = rest < 0x10000? 
rest : 0x10000; - rest -= my_netread(fp->fd, buf, l); - } - } else if (ret != 206 && ret != 200) { - // failed to open file - free(buf); - netclose(fp->fd); - switch (ret) { - case 401: errno = EPERM; break; - case 403: errno = EACCES; break; - case 404: errno = ENOENT; break; - case 407: errno = EPERM; break; - case 408: errno = ETIMEDOUT; break; - case 410: errno = ENOENT; break; - case 503: errno = EAGAIN; break; - case 504: errno = ETIMEDOUT; break; - default: errno = (ret >= 400 && ret < 500)? EINVAL : EIO; break; - } - fp->fd = -1; - return -1; - } - free(buf); - fp->is_ready = 1; - return 0; -} - -/******************** - * Generic routines * - ********************/ - -knetFile *knet_open(const char *fn, const char *mode) -{ - knetFile *fp = 0; - if (mode[0] != 'r') { - hts_log_error("Only mode \"r\" is supported"); - errno = ENOTSUP; - return 0; - } - if (strstr(fn, "ftp://") == fn) { - fp = kftp_parse_url(fn, mode); - if (fp == 0) return 0; - if (kftp_connect(fp) == -1) { - knet_close(fp); - return 0; - } - kftp_connect_file(fp); - } else if (strstr(fn, "http://") == fn) { - fp = khttp_parse_url(fn, mode); - if (fp == 0) return 0; - khttp_connect_file(fp); - } else { // local file -#ifdef _WIN32 - /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may - * be undefined on some systems, although it is defined on my - * Mac and the Linux I have tested on. 
*/ - int fd = open(fn, O_RDONLY | O_BINARY); -#else - int fd = open(fn, O_RDONLY); -#endif - if (fd == -1) { - perror("open"); - return 0; - } - fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_LOCAL; - fp->fd = fd; - fp->ctrl_fd = -1; - } - if (fp && fp->fd == -1) { - knet_close(fp); - return 0; - } - return fp; -} - -knetFile *knet_dopen(int fd, const char *mode) -{ - knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_LOCAL; - fp->fd = fd; - return fp; -} - -ssize_t knet_read(knetFile *fp, void *buf, size_t len) -{ - off_t l = 0; - if (fp->fd == -1) return 0; - if (fp->type == KNF_TYPE_FTP) { - if (fp->is_ready == 0) { - if (!fp->no_reconnect) kftp_reconnect(fp); - kftp_connect_file(fp); - } - } else if (fp->type == KNF_TYPE_HTTP) { - if (fp->is_ready == 0) - khttp_connect_file(fp); - } - if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX - size_t rest = len; - ssize_t curr; - while (rest) { - do { - curr = read(fp->fd, (void*)((char*)buf + l), rest); - } while (curr < 0 && EINTR == errno); - if (curr < 0) return -1; - if (curr == 0) break; - l += curr; rest -= curr; - } - } else l = my_netread(fp->fd, buf, len); - fp->offset += l; - return l; -} - -off_t knet_seek(knetFile *fp, off_t off, int whence) -{ - if (whence == SEEK_SET && off == fp->offset) return 0; - if (fp->type == KNF_TYPE_LOCAL) { - /* Be aware that lseek() returns the offset after seeking, while fseek() returns zero on success. 
*/ - off_t offset = lseek(fp->fd, off, whence); - if (offset == -1) return -1; - fp->offset = offset; - return fp->offset; - } else if (fp->type == KNF_TYPE_FTP) { - if (whence == SEEK_CUR) fp->offset += off; - else if (whence == SEEK_SET) fp->offset = off; - else if (whence == SEEK_END) fp->offset = fp->file_size + off; - else return -1; - fp->is_ready = 0; - return fp->offset; - } else if (fp->type == KNF_TYPE_HTTP) { - if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? - hts_log_error("SEEK_END is not supported for HTTP. Offset is unchanged"); - errno = ESPIPE; - return -1; - } - if (whence == SEEK_CUR) fp->offset += off; - else if (whence == SEEK_SET) fp->offset = off; - else return -1; - fp->is_ready = 0; - return fp->offset; - } - errno = EINVAL; - hts_log_error("%s", strerror(errno)); - return -1; -} - -int knet_close(knetFile *fp) -{ - if (fp == 0) return 0; - if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific - if (fp->fd != -1) { - /* On Linux/Mac, netclose() is an alias of close(), but on - * Windows, it is an alias of closesocket(). 
*/ - if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); - else netclose(fp->fd); - } - free(fp->host); free(fp->port); - free(fp->response); free(fp->retr); // FTP specific - free(fp->path); free(fp->http_host); // HTTP specific - free(fp->size_cmd); - free(fp); - return 0; -} - -#ifdef KNETFILE_MAIN -int main(void) -{ - char *buf; - knetFile *fp; - int type = 4, l; -#ifdef _WIN32 - knet_win32_init(); -#endif - buf = calloc(0x100000, 1); - if (type == 0) { - fp = knet_open("knetfile.c", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 1) { // NCBI FTP, large file - fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); - knet_seek(fp, 2500000000ll, SEEK_SET); - l = knet_read(fp, buf, 255); - } else if (type == 2) { - fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 3) { - fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 4) { - fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); - knet_read(fp, buf, 10000); - knet_seek(fp, 20000, SEEK_SET); - knet_seek(fp, 10000, SEEK_SET); - l = knet_read(fp, buf+10000, 10000000) + 10000; - } - if (type != 4 && type != 1) { - knet_read(fp, buf, 255); - buf[255] = 0; - printf("%s\n", buf); - } else write(fileno(stdout), buf, l); - knet_close(fp); - free(buf); - return 0; -} -#endif diff --git a/kstring.c b/kstring.c index 55626745d..f8e0f9f3d 100644 --- a/kstring.c +++ b/kstring.c @@ -1,7 +1,7 @@ /* The MIT License Copyright (C) 2011 by Attractive Chaos - Copyright (C) 2013-2018, 2020 Genome Research Ltd. + Copyright (C) 2013-2018, 2020-2021 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -57,7 +57,7 @@ int kputd(double d, kstring_t *s) { if (ks_resize(s, s->l + 50) < 0) return EOF; // We let stdio handle the exponent cases - int s2 = sprintf(s->s + s->l, "%g", d); + int s2 = snprintf(s->s + s->l, s->m - s->l, "%g", d); len += s2; s->l += s2; return len; @@ -153,6 +153,15 @@ int kvsprintf(kstring_t *s, const char *fmt, va_list ap) return l; } + if (!s->s) { + const size_t sz = 64; + s->s = malloc(sz); + if (!s->s) + return -1; + s->m = sz; + s->l = 0; + } + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'. va_end(args); if (l + 1 > s->m - s->l) { @@ -195,8 +204,17 @@ char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux) for (p = start; *p; ++p) if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; } else { - for (p = start; *p; ++p) - if (*p == aux->sep) break; + // Using strchr is fast for next token, but slower for + // last token due to extra pass from strlen. Overall + // on a VCF parse this func was 146% faster with // strchr. + // Equiv to: + // for (p = start; *p; ++p) if (*p == aux->sep) break; + + // NB: We could use strchrnul() here from glibc if detected, + // which is ~40% faster again, but it's not so portable. + // i.e. p = (uint8_t *)strchrnul((char *)start, aux->sep); + uint8_t *p2 = (uint8_t *)strchr((char *)start, aux->sep); + p = p2 ? 
p2 : start + strlen((char *)start); } aux->p = (const char *) p; // end of token if (*p == 0) aux->finished = 1; // no more tokens diff --git a/m4/hts_check_compile_flags_needed.m4 b/m4/hts_check_compile_flags_needed.m4 new file mode 100644 index 000000000..fb668e86f --- /dev/null +++ b/m4/hts_check_compile_flags_needed.m4 @@ -0,0 +1,63 @@ +# hts_check_compile_flags_needed.m4 +# +# SYNOPSIS +# +# HTS_CHECK_COMPILE_FLAGS_NEEDED(FEATURE, FLAGS, [INPUT], [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS]) +# +# DESCRIPTION +# +# Check whether the given FLAGS are required to build and link INPUT with +# the current language's compiler. Compilation and linking are first +# tried without FLAGS. If that fails it then tries to compile and +# link again with FLAGS. +# +# FEATURE describes the feature being tested, and is used when printing +# messages and to name the cache entry (along with the tested flags). +# +# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on +# success/failure. In ACTION-SUCCESS, $flags_needed will be set to +# either an empty string or FLAGS depending on the test results. +# +# If EXTRA-FLAGS is defined, it is added to the current language's default +# flags (e.g. CFLAGS) when the check is done. The check is thus made with +# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to +# force the compiler to issue an error when a bad flag is given. +# +# If omitted, INPUT defaults to AC_LANG_PROGRAM(), although that probably +# isn't very useful. +# +# NOTE: Implementation based on AX_CHECK_COMPILE_FLAG. +# +# LICENSE +# +# Copyright (c) 2008 Guido U. Draheim +# Copyright (c) 2011 Maarten Bosmans +# Copyright (c) 2023 Robert Davies +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. 
+ +# AX_CHECK_COMPILE_FLAGS_NEEDED(FEATURE, FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT]) + +AC_DEFUN([HTS_CHECK_COMPILE_FLAGS_NEEDED], +[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF +AS_VAR_PUSHDEF([CACHEVAR],[hts_cv_check_[]_AC_LANG_ABBREV[]flags_needed_$1_$6_$2])dnl +AC_CACHE_CHECK([_AC_LANG compiler flags needed for $1], CACHEVAR, [ + AC_LINK_IFELSE([m4_default([$3],[AC_LANG_PROGRAM()])], + [AS_VAR_SET(CACHEVAR,[none])], + [ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS + _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $6 $2" + AC_LINK_IFELSE([m4_default([$3],[AC_LANG_PROGRAM()])], + [AS_VAR_SET(CACHEVAR,[$2])], + [AS_VAR_SET(CACHEVAR,[unsupported])]) + _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])]) +AS_VAR_IF(CACHEVAR,unsupported, [ + m4_default([$5], :) +], [ + AS_VAR_IF(CACHEVAR,none,[flags_needed=""], [flags_needed="$CACHEVAR"]) + m4_default([$4], :) +]) +AS_VAR_POPDEF([CACHEVAR])dnl +])dnl HTS_CHECK_COMPILE_FLAGS_NEEDED diff --git a/plugin.c b/plugin.c index dbbe03eab..670081f84 100644 --- a/plugin.c +++ b/plugin.c @@ -153,7 +153,7 @@ plugin_void_func *load_plugin(void **pluginp, const char *filename, const char * const char *basename = slash? slash+1 : filename; kputsn(basename, strcspn(basename, ".-+"), &symbolg); - *(void **) &sym = dlsym(lib, symbol); + *(void **) &sym = dlsym(lib, symbolg.s); free(symbolg.s); if (sym == NULL) goto error; } @@ -191,3 +191,30 @@ void close_plugin(void *plugin) __func__, dlerror()); } } + +const char *hts_plugin_path(void) { +#ifdef ENABLE_PLUGINS + char *path = getenv("HTS_PATH"); + if (!path) path = ""; + + kstring_t ks = {0}; + while(1) { + size_t len = strcspn(path, HTS_PATH_SEPARATOR_STR); + if (len == 0) kputs(PLUGINPATH, &ks); + else kputsn(path, len, &ks); + kputc(HTS_PATH_SEPARATOR_CHAR, &ks); + + path += len; + if (*path == HTS_PATH_SEPARATOR_CHAR) path++; + else break; + } + + static char s_path[1024]; + snprintf(s_path, sizeof(s_path), "%s", ks.s ? 
ks.s : ""); + free(ks.s); + + return s_path; +#else + return NULL; +#endif +} diff --git a/probaln.c b/probaln.c index 9b9442c5b..b42f85685 100644 --- a/probaln.c +++ b/probaln.c @@ -1,7 +1,7 @@ /* The MIT License Copyright (C) 2003-2006, 2008-2010 by Heng Li - Copyright (C) 2016-2017 Genome Research Ltd. + Copyright (C) 2016-2017, 2020, 2023 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -74,8 +74,9 @@ static float g_qual2prob[256]; Returns phred-scaled likelihood score, or INT_MIN on failure. */ -int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_query, - const uint8_t *iqual, const probaln_par_t *c, int *state, uint8_t *q) +int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, + int l_query, const uint8_t *iqual, const probaln_par_t *c, + int *state, uint8_t *q) { double *f = NULL, *b = NULL, *s = NULL, m[9], sI, sM, bI, bM; float *qual = NULL; @@ -96,8 +97,10 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu bw2 = bw * 2 + 1; size_t i_dim = bw2 < l_ref ? (size_t) bw2*3+6 : (size_t) l_ref*3+6; - // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[] - // Ideally these callocs would be mallocs + initialisation of the few bits needed. + // allocate the forward and backward matrices f[][] and b[][] + // and the scaling array s[] + // Ideally these callocs would be mallocs + initialisation of + // the few bits needed. 
if (SIZE_MAX / (l_query+1) / i_dim < sizeof(double)) { errno = ENOMEM; // Allocation would fail return INT_MIN; @@ -108,7 +111,9 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu b = calloc((l_query+1)*i_dim, sizeof(double)); if (!b) goto fail; } - s = malloc((l_query+2) * sizeof(double)); // s[] is the scaling factor to avoid underflow + + // s[] is the scaling factor to avoid underflow + s = malloc((l_query+2) * sizeof(double)); if (!s) goto fail; // initialize qual @@ -122,11 +127,26 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu qual[i] = g_qual2prob[iqual? iqual[i] : 30]; // initialize transition probability - sM = sI = 1. / (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof - m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM); - m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.; - m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e; - bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1 + // the value here seems not to affect results; FIXME: need proof + sM = sI = 1. / (2 * l_query + 2); + m[0*3+0] = (1 - c->d - c->d) * (1 - sM); + m[0*3+1] = m[0*3+2] = c->d * (1 - sM); + m[1*3+0] = (1 - c->e) * (1 - sI); + m[1*3+1] = c->e * (1 - sI); + m[1*3+2] = 0.; + m[2*3+0] = 1 - c->e; + m[2*3+1] = 0.; + m[2*3+2] = c->e; + bM = (1 - c->d) / l_ref; // (bM+bI)*l_ref==1 + bI = c->d / l_ref; + + // f[] and b[] are 2-d arrays of three scores, with rows along the + // query and columns across the band. The first query base and + // first band position appear at index 1 allowing edge conditions + // to be stored in index 0. Hence the loops below appear to use + // 1-based indexing instead of 0-based as you'd normally expect in C, + // and the sequences are accessed using query[i - 1] and ref[k - 1]. 
+ /*** forward ***/ // f[0] set_u(k, bw, 0, 0); @@ -150,6 +170,15 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu uint8_t qyi = query[i - 1]; x = i - bw; beg = beg > x? beg : x; // band start x = i + bw; end = end < x? end : x; // band end + + // NB end-beg is almost always 14 (99.9% of the time) + // Hence not a large volume to parallelise. + // + // Maybe stripe in diagonal doing 14 lines together? + // + // Consider rotation? 150x14 vs 14x150 so inner loop + // takes longer. + double E[] = { qli * EM, // 00 1. - qli, // 01 @@ -157,26 +186,89 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu 1., // 11 }; double M = 1./s[i-1]; + + // Note this code has the original version listed here (albeit + // with improved formatting), but we do not compile using + // -DPROBALN_ORIG. The purpose of this code is to act as an + // easier(?) to understand version of the heavily optimised + // version following it and as an easy validation path in case + // of any differences in results. +#ifdef PROBALN_ORIG for (k = beg, sum = 0.; k <= end; ++k) { int u, v11, v01, v10; double e; e = E[(ref[k - 1] > 3 || qyi > 3)*2 + (ref[k - 1] == qyi)]; - set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1); + set_u(u, bw, i, k); + set_u(v11, bw, i-1, k-1); + set_u(v10, bw, i-1, k); + set_u(v01, bw, i, k-1); fi[u+0] = e * (m[0] * M*fi1[v11+0] + m[3] * M*fi1[v11+1] + m[6] * M*fi1[v11+2]); fi[u+1] = EI * (m[1] * M*fi1[v10+0] + m[4] * M*fi1[v10+1]); fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2]; sum += fi[u] + fi[u+1] + fi[u+2]; -// fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG } +#else + // We use EI*(M*m[1]*? + M*m[4]*?) a lot. So factor it out here. 
+ double xm[5]; + xm[0] = M*m[0]; + xm[1] = M*m[3]; + xm[2] = M*m[6]; + xm[3] = EI*M*m[1]; + xm[4] = EI*M*m[4]; + + { + int u, v11; + set_u(u, bw, i, beg); + set_u(v11, bw, i-1, beg-1); + // Rather than recompute k->{u,v01,v10,v11} each loop + // we just increment the pointers. + double *xi = &fi[u]; + double *yi = &fi1[v11]; + // Derived from xi[0,2] in previous loop iter. + double l_x0 = m[2]*xi[0]; + double l_x2 = m[8]*xi[2]; + for (k = beg, sum = 0.; k <= end; ++k, xi+=3, yi+=3) { + int cond = (ref[k-1] > 3 || qyi > 3)*2 + (ref[k-1] == qyi); + + double z0 = xm[0]*yi[0]; + double z1 = xm[1]*yi[1]; + double z2 = xm[2]*yi[2]; + double z3 = xm[3]*yi[3]; + double z4 = xm[4]*yi[4]; + + xi[0] = E[cond] * (z0+z1+z2); + xi[1] = z3 + z4; + xi[2] = l_x0 + l_x2; + sum += xi[0] + xi[1] + xi[2]; + + l_x0 = m[2]*xi[0]; + l_x2 = m[8]*xi[2]; + } + } +#endif s[i] = sum; } + { // f[l_query+1] double sum; double M = 1./s[l_query]; + // Note that this goes from 1 to l_ref inclusive, but as the + // alignment is banded not all of the values will have been + // calculated (the rest are taken as 0), so the summation + // actually goes over the values set in the last iteration of + // the previous loop (when i = l_query). For some reason lost to + // time this is done by looking for valid values of 'u' instead of + // working out 'beg' and 'end'. + + // From HTSlib 1.8 to 1.17, the endpoint was incorrectly set + // to i_dim - 3. When l_query <= bandwidth, this caused the last + // column to be missed, and if l_ref == l_query then a match at the end + // could incorrectly be reported as an insertion. See #1605. 
+ for (k = 1, sum = 0.; k <= l_ref; ++k) { int u; set_u(u, bw, l_query, k); - if (u < 3 || u >= i_dim - 3) continue; + if (u < 3 || u >= i_dim) continue; sum += M*f[l_query*i_dim + u+0] * sM + M*f[l_query*i_dim + u+1] * sI; } s[l_query+1] = sum; // the last scaling factor @@ -200,12 +292,12 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu int u; double *bi = &b[l_query*i_dim]; set_u(u, bw, l_query, k); - if (u < 3 || u >= i_dim - 3) continue; + if (u < 3 || u >= i_dim) continue; bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1]; } // b[l_query-1..1] for (i = l_query - 1; i >= 1; --i) { - int beg = 1, end = l_ref, x, _beg, _end; + int beg = 1, end = l_ref, x; double *bi = &b[i*i_dim], *bi1 = &b[(i+1)*i_dim], y = (i > 1), qli1 = qual[i]; uint8_t qyi1 = query[i]; x = i - bw; beg = beg > x? beg : x; @@ -217,10 +309,15 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu 1., //011 //0,0,0,0 //1xx }; + +#ifdef PROBALN_ORIG for (k = end; k >= beg; --k) { int u, v11, v01, v10; double e; - set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1); + set_u(u, bw, i, k); + set_u(v11, bw, i+1, k+1); + set_u(v10, bw, i+1, k); + set_u(v01, bw, i, k+1); e = (k>=l_ref)?0 :E[(ref[k] > 3 || qyi1 > 3)*2 + (ref[k] == qyi1)] * bi1[v11]; bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e. 
bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1]; @@ -228,8 +325,43 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu // fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG } // rescale + int _beg, _end; set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2; for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y; +#else + { + int u, v10; + set_u(u, bw, i, end); + set_u(v10, bw, i+1, end); + // Rather than recompute k->{u,v01,v10,v11} each loop + // we just increment the pointers. + double *xi = &bi[u]; + double *yi = &bi1[v10]; + // NB xi[5] is equiv to v01+2. + double xi_5 = xi[5]; + // Manual loop invariant removal + double e1 = EI*m[1]; + double e4 = EI*m[4]; + // Do renorm too in the same pass. + double n = 1./s[i]; + for (k = end; k >= beg; --k, xi -= 3, yi -= 3) { + double e = (k>=l_ref) + ? 0 + : E[(ref[k]>3 || qyi1>3)*2 + (ref[k] == qyi1)] * yi[3]; + + xi[1] = e * m[3] + e4 * yi[1]; + xi[0] = e * m[0] + e1 * yi[1] + m[2] * xi_5; + xi[2] = (e * m[6] + m[8] * xi_5) * y; + // bi[u+2] from this iter becomes bi[v01+2] in next iter + xi_5 = xi[2]; + + // rescale + xi[1] *= n; + xi[0] *= n; + xi[2] *= n; + } + } +#endif } { // b[0] int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1; @@ -238,7 +370,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu int u; double e = (ref[k - 1] > 3 || query[0] > 3)? 1. : ref[k - 1] == query[0]? 1. - qual[0] : qual[0] * EM; set_u(u, bw, 1, k); - if (u < 3 || u >= i_dim - 3) continue; + if (u < 3 || u >= i_dim) continue; sum += e * b[1*i_dim + u+0] * bM + EI * b[1*i_dim + u+1] * bI; } set_u(k, bw, 0, 0); @@ -251,13 +383,36 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu x = i - bw; beg = beg > x? beg : x; x = i + bw; end = end < x? 
end : x; double M = 1./s[i]; +#ifdef PROBALN_ORIG for (k = beg; k <= end; ++k) { int u; double z; set_u(u, bw, i, k); - z = M*fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z; - z = M*fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z; + z = M*fi[u+0] * bi[u+0]; + if (z > max) max = z, max_k = (k-1)<<2 | 0; + sum += z; + z = M*fi[u+1] * bi[u+1]; + if (z > max) max = z, max_k = (k-1)<<2 | 1; + sum += z; + } +#else + { + int u; + set_u(u, bw, i, beg); + for (k = beg; k <= end; ++k, u+=3) { + double z1, z2; + z1 = M*fi[u+0] * bi[u+0]; + z2 = M*fi[u+1] * bi[u+1]; + int which = z2 > z1; // strictly z2 >= z1 matches old code + double zm = which ? z2 : z1; + if (zm > max) { + max = zm; + max_k = (k-1)<<2 | which; + } + sum += z1 + z2; + } } +#endif max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0 if (state) state[i-1] = max_k; if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k; @@ -268,6 +423,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu "ACGT"[query[i - 1]], "ACGT"[ref[(max_k>>2)]], max_k&3, max); // DEBUG #endif } + /*** free ***/ free(f); free(b); free(s); free(qual); return Pr; diff --git a/realn.c b/realn.c index 40b796b46..d7e8255f8 100644 --- a/realn.c +++ b/realn.c @@ -1,6 +1,6 @@ /* realn.c -- BAQ calculation and realignment. - Copyright (C) 2009-2011, 2014-2016, 2018 Genome Research Ltd. + Copyright (C) 2009-2011, 2014-2016, 2018, 2021, 2023 Genome Research Ltd. Portions copyright (C) 2009-2011 Broad Institute. 
Author: Heng Li @@ -91,25 +91,44 @@ int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres) static int realn_check_tag(const uint8_t *tg, enum htsLogLevel severity, const char *type, const bam1_t *b) { if (*tg != 'Z') { - hts_log(severity, "Incorrect %s tag type (%c) for read %s", + hts_log(severity, __func__, "Incorrect %s tag type (%c) for read %s", type, *tg, bam_get_qname(b)); return -1; } if (b->core.l_qseq != strlen((const char *) tg + 1)) { - hts_log(severity, "Read %s %s tag is wrong length", + hts_log(severity, __func__, "Read %s %s tag is wrong length", bam_get_qname(b), type); return -1; } return 0; } -int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) -{ - int k, bw, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4, fix_bq = 0; +int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) { + int k, bw, y, yb, ye, xb, xe, fix_bq = 0, apply_baq = flag & BAQ_APPLY, + extend_baq = flag & BAQ_EXTEND, redo_baq = flag & BAQ_REDO; + enum htsRealnFlags system = flag & (0xff << 3); hts_pos_t i, x; uint32_t *cigar = bam_get_cigar(b); bam1_core_t *c = &b->core; - probaln_par_t conf = { 0.001, 0.1, 10 }; + + // d(I) e(M) band + probaln_par_t conf = { 0.001, 0.1, 10 }; // Illumina + + if (b->core.l_qseq > 1000 || system > BAQ_ILLUMINA) { + // Params that work well on PacBio CCS 15k. Unknown if they + // help other long-read platforms yet, but likely better than + // the short-read tuned ones. + // + // This function has no access to the SAM header. + // Ideally the calling function would check for e.g. + // @RG PL = "PACBIO" and DS contains "READTYPE=CCS". + // + // In the absence of this, we simply auto-detect via a crude + // short vs long strategy. 
+ conf.d = 1e-7; + conf.e = 1e-1; + } + uint8_t *bq = NULL, *zq = NULL, *qual = bam_get_qual(b); int *state = NULL; if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0 || qual[0] == (uint8_t)-1) @@ -177,6 +196,7 @@ int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) if (abs((xe - xb) - (ye - yb)) > bw) bw = abs((xe - xb) - (ye - yb)) + 3; conf.bw = bw; + xb -= yb + bw/2; if (xb < 0) xb = 0; xe += c->l_qseq - ye + bw/2; if (xe - xb - c->l_qseq > bw) @@ -248,8 +268,28 @@ int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) // tseq,tref are no longer needed, so we can steal them to avoid mallocs uint8_t *left = tseq; uint8_t *rght = tref; + int len = 0; + for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { int op = cigar[k]&0xf, l = cigar[k]>>4; + + // concatenate alignment matches (including sequence (mis)matches) + // otherwise 50M50M gives a different result to 100M + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + if ((k + 1) < c->n_cigar) { + int next_op = bam_cigar_op(cigar[k + 1]); + + if (next_op == BAM_CMATCH || next_op == BAM_CEQUAL || next_op == BAM_CDIFF) { + len += l; + continue; + } + } + + // last of M/X/= ops + l += len; + len = 0; + } + if (l == 0) continue; if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { // Sanity check running off the end of the sequence @@ -282,6 +322,7 @@ int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq); free(bq); free(state); } + return 0; fail: diff --git a/sam.c b/sam.c index 02cd6092e..de603cf16 100644 --- a/sam.c +++ b/sam.c @@ -1,6 +1,6 @@ /* sam.c -- SAM and BAM file I/O and manipulation. - Copyright (C) 2008-2010, 2012-2020 Genome Research Ltd. + Copyright (C) 2008-2010, 2012-2023 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. Author: Heng Li @@ -35,6 +35,11 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +#include "fuzz_settings.h" +#endif // Suppress deprecation message for cigar_tab, which we initialise #include "htslib/hts_defs.h" @@ -48,10 +53,12 @@ DEALINGS IN THE SOFTWARE. */ #include "sam_internal.h" #include "htslib/hfile.h" #include "htslib/hts_endian.h" +#include "htslib/hts_expr.h" #include "header.h" #include "htslib/khash.h" KHASH_DECLARE(s2i, kh_cstr_t, int64_t) +KHASH_SET_INIT_INT(tag) #ifndef EFTYPE #define EFTYPE ENOEXEC @@ -248,6 +255,9 @@ sam_hdr_t *bam_hdr_read(BGZF *fp) bufsize = h->l_text + 1; if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem; +#endif h->text = (char*)malloc(bufsize); if (!h->text) goto nomem; h->text[h->l_text] = 0; // make sure it is NULL terminated @@ -261,6 +271,10 @@ sam_hdr_t *bam_hdr_read(BGZF *fp) if (h->n_targets < 0) goto invalid; // read reference sequence names and lengths +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t))) + goto nomem; +#endif if (h->n_targets > 0) { h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); if (!h->target_name) goto nomem; @@ -336,17 +350,23 @@ int bam_hdr_write(BGZF *fp, const sam_hdr_t *h) if (h->hrecs) { if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1; - if (hdr_ks.l > INT32_MAX) { + if (hdr_ks.l > UINT32_MAX) { hts_log_error("Header too long for BAM format"); free(hdr_ks.s); return -1; + } else if (hdr_ks.l > INT32_MAX) { + hts_log_warning("Header too long for BAM specification (>2GB)"); + hts_log_warning("Output file may not be portable"); } text = hdr_ks.s; l_text = hdr_ks.l; } else { - if (h->l_text > INT32_MAX) { + if (h->l_text > UINT32_MAX) { hts_log_error("Header too long for BAM format"); return -1; + } else if (h->l_text > INT32_MAX) { + hts_log_warning("Header too 
long for BAM specification (>2GB)"); + hts_log_warning("Output file may not be portable"); } text = h->text; l_text = h->l_text; @@ -416,6 +436,12 @@ int sam_realloc_bam_data(bam1_t *b, size_t desired) errno = ENOMEM; // Not strictly true but we can't store the size return -1; } +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (new_m_data > FUZZ_ALLOC_LIMIT) { + errno = ENOMEM; + return -1; + } +#endif if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { new_data = realloc(b->data, new_m_data); } else { @@ -484,6 +510,138 @@ static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, } } +static int subtract_check_underflow(size_t length, size_t *limit) +{ + if (length <= *limit) { + *limit -= length; + return 0; + } + + return -1; +} + +int bam_set1(bam1_t *bam, + size_t l_qname, const char *qname, + uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq, + size_t n_cigar, const uint32_t *cigar, + int32_t mtid, hts_pos_t mpos, hts_pos_t isize, + size_t l_seq, const char *seq, const char *qual, + size_t l_aux) +{ + // use a default qname "*" if none is provided + if (l_qname == 0) { + l_qname = 1; + qname = "*"; + } + + // note: the qname is stored nul terminated and padded as described in the + // documentation for the bam1_t struct. + size_t qname_nuls = 4 - l_qname % 4; + + // the alignment length, needed for bam_reg2bin(), is calculated as in bam_endpos(). + // can't use bam_endpos() directly as some fields not yet set up. 
+ hts_pos_t rlen = 0, qlen = 0; + if (!(flag & BAM_FUNMAP)) { + bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen); + } + if (rlen == 0) { + rlen = 1; + } + + // validate parameters + if (l_qname > 254) { + hts_log_error("Query name too long"); + errno = EINVAL; + return -1; + } + if (HTS_POS_MAX - rlen <= pos) { + hts_log_error("Read ends beyond highest supported position"); + errno = EINVAL; + return -1; + } + if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) { + hts_log_error("Mapped query must have a CIGAR"); + errno = EINVAL; + return -1; + } + if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) { + hts_log_error("CIGAR and query sequence are of different length"); + errno = EINVAL; + return -1; + } + + size_t limit = INT32_MAX; + int u = subtract_check_underflow(l_qname + qname_nuls, &limit); + u += subtract_check_underflow(n_cigar * 4, &limit); + u += subtract_check_underflow((l_seq + 1) / 2, &limit); + u += subtract_check_underflow(l_seq, &limit); + u += subtract_check_underflow(l_aux, &limit); + if (u != 0) { + hts_log_error("Size overflow"); + errno = EINVAL; + return -1; + } + + // re-allocate the data buffer as needed. 
+ size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq; + if (realloc_bam_data(bam, data_len + l_aux) < 0) { + return -1; + } + + bam->l_data = (int)data_len; + bam->core.pos = pos; + bam->core.tid = tid; + bam->core.bin = bam_reg2bin(pos, pos + rlen); + bam->core.qual = mapq; + bam->core.l_extranul = (uint8_t)(qname_nuls - 1); + bam->core.flag = flag; + bam->core.l_qname = (uint16_t)(l_qname + qname_nuls); + bam->core.n_cigar = (uint32_t)n_cigar; + bam->core.l_qseq = (int32_t)l_seq; + bam->core.mtid = mtid; + bam->core.mpos = mpos; + bam->core.isize = isize; + + uint8_t *cp = bam->data; + strncpy((char *)cp, qname, l_qname); + int i; + for (i = 0; i < qname_nuls; i++) { + cp[l_qname + i] = '\0'; + } + cp += l_qname + qname_nuls; + + if (n_cigar > 0) { + memcpy(cp, cigar, n_cigar * 4); + } + cp += n_cigar * 4; + +#define NN 16 + const uint8_t *useq = (uint8_t *)seq; + for (i = 0; i + NN < l_seq; i += NN) { + int j; + const uint8_t *u2 = useq+i; + for (j = 0; j < NN/2; j++) + cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]]; + cp += NN/2; + } + for (; i + 1 < l_seq; i += 2) { + *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]]; + } + + for (; i < l_seq; i++) { + *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4; + } + + if (qual) { + memcpy(cp, qual, l_seq); + } + else { + memset(cp, '\xff', l_seq); + } + + return (int)data_len; +} + hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar) { int k; @@ -529,7 +687,8 @@ static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 errno = saved_errno; // restore errno on expected no-CG-tag case return 0; } - if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I + if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i')) + return 0; // not of type B,I CG_len = le_to_u32(CG + 2); if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length @@ -935,7 +1094,7 @@ 
int sam_idx_save(htsFile *fp) { errno = -ret; return -1; } - if (bgzf_flush(fp->fp.bgzf) < 0) + if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0) return -1; hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf)); @@ -975,20 +1134,412 @@ static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_po return ret; } +// Internal (for now) func used by bam_sym_lookup. This is copied from +// samtools/bam.c. +static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b) +{ + const char *rg; + kstring_t lib = { 0, 0, NULL }; + rg = (char *)bam_aux_get(b, "RG"); + + if (!rg) + return NULL; + else + rg++; + + if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib) < 0) + return NULL; + + static char LB_text[1024]; + int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1; + + memcpy(LB_text, lib.s, len); + LB_text[len] = 0; + + free(lib.s); + + return LB_text; +} + + +// Bam record pointer and SAM header combined +typedef struct { + const sam_hdr_t *h; + const bam1_t *b; +} hb_pair; + +// Looks up variable names in str and replaces them with their value. +// Also supports aux tags. +// +// Note the expression parser deliberately overallocates str size so it +// is safe to use memcmp over strcmp. +static int bam_sym_lookup(void *data, char *str, char **end, + hts_expr_val_t *res) { + hb_pair *hb = (hb_pair *)data; + const bam1_t *b = hb->b; + + res->is_str = 0; + switch(*str) { + case 'c': + if (memcmp(str, "cigar", 5) == 0) { + *end = str+5; + res->is_str = 1; + ks_clear(&res->s); + uint32_t *cigar = bam_get_cigar(b); + int i, n = b->core.n_cigar, r = 0; + if (n) { + for (i = 0; i < n; i++) { + r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0; + r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0; + } + r |= kputs("", &res->s) < 0; + } else { + r |= kputs("*", &res->s) < 0; + } + return r ? 
-1 : 0; + } + break; + + case 'e': + if (memcmp(str, "endpos", 6) == 0) { + *end = str+6; + res->d = bam_endpos(b); + return 0; + } + break; + + case 'f': + if (memcmp(str, "flag", 4) == 0) { + str = *end = str+4; + if (*str != '.') { + res->d = b->core.flag; + return 0; + } else { + str++; + if (!memcmp(str, "paired", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FPAIRED; + return 0; + } else if (!memcmp(str, "proper_pair", 11)) { + *end = str+11; + res->d = b->core.flag & BAM_FPROPER_PAIR; + return 0; + } else if (!memcmp(str, "unmap", 5)) { + *end = str+5; + res->d = b->core.flag & BAM_FUNMAP; + return 0; + } else if (!memcmp(str, "munmap", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FMUNMAP; + return 0; + } else if (!memcmp(str, "reverse", 7)) { + *end = str+7; + res->d = b->core.flag & BAM_FREVERSE; + return 0; + } else if (!memcmp(str, "mreverse", 8)) { + *end = str+8; + res->d = b->core.flag & BAM_FMREVERSE; + return 0; + } else if (!memcmp(str, "read1", 5)) { + *end = str+5; + res->d = b->core.flag & BAM_FREAD1; + return 0; + } else if (!memcmp(str, "read2", 5)) { + *end = str+5; + res->d = b->core.flag & BAM_FREAD2; + return 0; + } else if (!memcmp(str, "secondary", 9)) { + *end = str+9; + res->d = b->core.flag & BAM_FSECONDARY; + return 0; + } else if (!memcmp(str, "qcfail", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FQCFAIL; + return 0; + } else if (!memcmp(str, "dup", 3)) { + *end = str+3; + res->d = b->core.flag & BAM_FDUP; + return 0; + } else if (!memcmp(str, "supplementary", 13)) { + *end = str+13; + res->d = b->core.flag & BAM_FSUPPLEMENTARY; + return 0; + } else { + hts_log_error("Unrecognised flag string"); + return -1; + } + } + } + break; + + case 'h': + if (memcmp(str, "hclen", 5) == 0) { + int hclen = 0; + uint32_t *cigar = bam_get_cigar(b); + uint32_t ncigar = b->core.n_cigar; + + // left + if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) + hclen = bam_cigar_oplen(cigar[0]); + + // right + if (ncigar > 1 && 
bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP) + hclen += bam_cigar_oplen(cigar[ncigar-1]); + + *end = str+5; + res->d = hclen; + return 0; + } + break; + + case 'l': + if (memcmp(str, "library", 7) == 0) { + *end = str+7; + res->is_str = 1; + const char *lib = bam_get_library(hb->h, b); + kputs(lib ? lib : "", ks_clear(&res->s)); + return 0; + } + break; + + case 'm': + if (memcmp(str, "mapq", 4) == 0) { + *end = str+4; + res->d = b->core.qual; + return 0; + } else if (memcmp(str, "mpos", 4) == 0) { + *end = str+4; + res->d = b->core.mpos+1; + return 0; + } else if (memcmp(str, "mrname", 6) == 0) { + *end = str+6; + res->is_str = 1; + const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); + kputs(rn ? rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "mrefid", 6) == 0) { + *end = str+6; + res->d = b->core.mtid; + return 0; + } + break; + + case 'n': + if (memcmp(str, "ncigar", 6) == 0) { + *end = str+6; + res->d = b->core.n_cigar; + return 0; + } + break; + + case 'p': + if (memcmp(str, "pos", 3) == 0) { + *end = str+3; + res->d = b->core.pos+1; + return 0; + } else if (memcmp(str, "pnext", 5) == 0) { + *end = str+5; + res->d = b->core.mpos+1; + return 0; + } + break; + + case 'q': + if (memcmp(str, "qlen", 4) == 0) { + *end = str+4; + res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)); + return 0; + } else if (memcmp(str, "qname", 5) == 0) { + *end = str+5; + res->is_str = 1; + kputs(bam_get_qname(b), ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "qual", 4) == 0) { + *end = str+4; + ks_clear(&res->s); + if (ks_resize(&res->s, b->core.l_qseq+1) < 0) + return -1; + memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq); + res->s.l = b->core.l_qseq; + res->is_str = 1; + return 0; + } + break; + + case 'r': + if (memcmp(str, "rlen", 4) == 0) { + *end = str+4; + res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); + return 0; + } else if (memcmp(str, "rname", 5) == 0) { + *end = str+5; + res->is_str = 1; + const char *rn = 
sam_hdr_tid2name(hb->h, b->core.tid); + kputs(rn ? rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "rnext", 5) == 0) { + *end = str+5; + res->is_str = 1; + const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); + kputs(rn ? rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "refid", 5) == 0) { + *end = str+5; + res->d = b->core.tid; + return 0; + } + break; + + case 's': + if (memcmp(str, "seq", 3) == 0) { + *end = str+3; + ks_clear(&res->s); + if (ks_resize(&res->s, b->core.l_qseq+1) < 0) + return -1; + nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq); + res->s.s[b->core.l_qseq] = 0; + res->s.l = b->core.l_qseq; + res->is_str = 1; + return 0; + } else if (memcmp(str, "sclen", 5) == 0) { + int sclen = 0; + uint32_t *cigar = bam_get_cigar(b); + int ncigar = b->core.n_cigar; + int left = 0; + + // left + if (ncigar > 0 + && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) + left = 0, sclen += bam_cigar_oplen(cigar[0]); + else if (ncigar > 1 + && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP + && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) + left = 1, sclen += bam_cigar_oplen(cigar[1]); + + // right + if (ncigar-1 > left + && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP) + sclen += bam_cigar_oplen(cigar[ncigar-1]); + else if (ncigar-2 > left + && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP + && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP) + sclen += bam_cigar_oplen(cigar[ncigar-2]); + + *end = str+5; + res->d = sclen; + return 0; + } + break; + + case 't': + if (memcmp(str, "tlen", 4) == 0) { + *end = str+4; + res->d = b->core.isize; + return 0; + } + break; + + case '[': + if (*str == '[' && str[1] && str[2] && str[3] == ']') { + /* aux tags */ + *end = str+4; + + uint8_t *aux = bam_aux_get(b, str+1); + if (aux) { + // we define the truth of a tag to be its presence, even if 0. 
+ res->is_true = 1; + switch (*aux) { + case 'Z': + case 'H': + res->is_str = 1; + kputs((char *)aux+1, ks_clear(&res->s)); + break; + + case 'A': + res->is_str = 1; + kputsn((char *)aux+1, 1, ks_clear(&res->s)); + break; + + case 'i': case 'I': + case 's': case 'S': + case 'c': case 'C': + res->is_str = 0; + res->d = bam_aux2i(aux); + break; + + case 'f': + case 'd': + res->is_str = 0; + res->d = bam_aux2f(aux); + break; + + default: + hts_log_error("Aux type '%c not yet supported by filters", + *aux); + return -1; + } + return 0; + + } else { + // hence absent tags are always false (and strings) + res->is_str = 1; + res->s.l = 0; + res->d = 0; + res->is_true = 0; + return 0; + } + } + break; + } + + // All successful matches in switch should return 0. + // So if we didn't match, it's a parse error. + return -1; +} + +// Returns 1 when accepted by the filter, 0 if not, -1 on error. +int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt) +{ + hb_pair hb = {h, b}; + hts_expr_val_t res = HTS_EXPR_VAL_INIT; + if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) { + hts_log_error("Couldn't process filter expression"); + hts_expr_val_free(&res); + return -1; + } + + int t = res.is_true; + hts_expr_val_free(&res); + + return t; +} + static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) { htsFile *fp = fpv; bam1_t *b = bv; - int ret = cram_get_bam_seq(fp->fp.cram, &b); - if (ret < 0) - return cram_eof(fp->fp.cram) ? -1 : -2; + int pass_filter, ret; - if (bam_tag2cigar(b, 1, 1) < 0) - return -2; + do { + ret = cram_get_bam_seq(fp->fp.cram, &b); + if (ret < 0) + return cram_eof(fp->fp.cram) ? 
-1 : -2; + + if (bam_tag2cigar(b, 1, 1) < 0) + return -2; + + *tid = b->core.tid; + *beg = b->core.pos; + *end = bam_endpos(b); - *tid = b->core.tid; - *beg = b->core.pos; - *end = bam_endpos(b); + if (fp->filter) { + pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter); + if (pass_filter < 0) + return -2; + } else { + pass_filter = 1; + } + } while (pass_filter == 0); return ret; } @@ -1245,6 +1796,22 @@ sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text) return bh; } +static int valid_sam_header_type(const char *s) { + if (s[0] != '@') return 0; + switch (s[1]) { + case 'H': + return s[2] == 'D' && s[3] == '\t'; + case 'S': + return s[2] == 'Q' && s[3] == '\t'; + case 'R': + case 'P': + return s[2] == 'G' && s[3] == '\t'; + case 'C': + return s[2] == 'O'; + } + return 0; +} + // Minimal sanitisation of a header to ensure. // - null terminated string. // - all lines start with @ (also implies no blank lines). @@ -1319,6 +1886,20 @@ static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) { return h; } +static void known_stderr(const char *tool, const char *advice) { + hts_log_warning("SAM file corrupted by embedded %s error/log message", tool); + hts_log_warning("%s", advice); +} + +static void warn_if_known_stderr(const char *line) { + if (strstr(line, "M::bwa_idx_load_from_disk") != NULL) + known_stderr("bwa", "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`"); + else if (strstr(line, "M::mem_pestat") != NULL) + known_stderr("bwa", "Use `bwa mem -o file.sam ...` instead of `bwa mem ... > file.sam`"); + else if (strstr(line, "loaded/built the index") != NULL) + known_stderr("minimap2", "Use `minimap2 -o file.sam ...` instead of `minimap2 ... 
> file.sam`"); +} + static sam_hdr_t *sam_hdr_create(htsFile* fp) { kstring_t str = { 0, 0, NULL }; khint_t k; @@ -1372,9 +1953,10 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { goto error; if (!absent) { - hts_log_warning("Duplicated sequence '%s'", sn); + hts_log_warning("Duplicated sequence \"%s\" in file \"%s\"", sn, fp->fn); free(sn); } else { + sn = NULL; if (ln >= UINT32_MAX) { // Stash away ref length that // doesn't fit in target_len array @@ -1384,7 +1966,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { if (!long_refs) goto error; } - k2 = kh_put(s2i, long_refs, sn, &absent); + k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent); if (absent < 0) goto error; kh_val(long_refs, k2) = ln; @@ -1396,13 +1978,21 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { } } else { hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn); + warn_if_known_stderr(fp->line.s); free(sn); } } else { hts_log_warning("Ignored @SQ line with missing SN: tag"); + warn_if_known_stderr(fp->line.s); } sn = NULL; } + else if (!valid_sam_header_type(fp->line.s)) { + hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO"); + warn_if_known_stderr(fp->line.s); + goto error; + } + if (kputsn(fp->line.s, fp->line.l, &str) < 0) goto error; @@ -1447,19 +2037,25 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { continue; sn = (char*)calloc(tab-line.s+1, 1); - if (!sn) + if (!sn) { + e = 1; break; + } memcpy(sn, line.s, tab-line.s); k = kh_put(s2i, d, sn, &absent); - if (absent < 0) + if (absent < 0) { + e = 1; break; + } ln = strtoll(tab, NULL, 10); if (!absent) { - hts_log_warning("Duplicated sequence '%s'", sn); + hts_log_warning("Duplicated sequence \"%s\" in the file \"%s\"", sn, fai_fn); free(sn); + sn = NULL; } else { + sn = NULL; if (ln >= UINT32_MAX) { // Stash away ref length that // doesn't fit in target_len array @@ -1467,12 +2063,16 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { int absent = -1; if (!long_refs) { long_refs = kh_init(s2i); - if 
(!long_refs) - goto error; + if (!long_refs) { + e = 1; + break; + } + } + k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent); + if (absent < 0) { + e = 1; + break; } - k2 = kh_put(s2i, long_refs, sn, &absent); - if (absent < 0) - goto error; kh_val(long_refs, k2) = ln; kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32 | UINT32_MAX); @@ -1570,6 +2170,10 @@ sam_hdr_t *sam_hdr_read(htsFile *fp) case sam: return sam_hdr_create(fp); + case fastq_format: + case fasta_format: + return sam_hdr_init(); + case empty_format: errno = EPIPE; return NULL; @@ -1587,9 +2191,6 @@ int sam_hdr_write(htsFile *fp, const sam_hdr_t *h) return -1; } - if (!h->hrecs && !h->text) - return 0; - switch (fp->format.format) { case binary_format: fp->format.category = sequence_data; @@ -1613,6 +2214,8 @@ int sam_hdr_write(htsFile *fp, const sam_hdr_t *h) fp->format.format = sam; /* fall-through */ case sam: { + if (!h->hrecs && !h->text) + return 0; char *text; kstring_t hdr_ks = { 0, 0, NULL }; size_t l_text; @@ -1673,6 +2276,11 @@ int sam_hdr_write(htsFile *fp, const sam_hdr_t *h) } break; + case fastq_format: + case fasta_format: + // Nothing to output; FASTQ has no file headers. + break; + default: errno = EBADF; return -1; @@ -1930,41 +2538,190 @@ static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) { } } -int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) -{ -#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0) - -#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff - -// Macro that operates on 64-bits at a time. 
-#define COPY_MINUS_N(to,from,n,l,failed) \ - do { \ - uint64_u *from8 = (uint64_u *)(from); \ - uint64_u *to8 = (uint64_u *)(to); \ - uint64_t uflow = 0; \ - size_t l8 = (l)>>3, i; \ - for (i = 0; i < l8; i++) { \ - to8[i] = from8[i] - (n)*0x0101010101010101UL; \ - uflow |= to8[i]; \ - } \ - for (i<<=3; i < (l); ++i) { \ - to[i] = from[i] - (n); \ - uflow |= to[i]; \ - } \ - failed = (uflow & 0x8080808080808080UL) > 0; \ +// Parse tag line and append to bam object b. +// Shared by both SAM and FASTQ parsers. +// +// The difference between the two is how lenient we are to recognising +// non-compliant strings. The FASTQ parser glosses over arbitrary +// non-SAM looking strings. +static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient, + khash_t(tag) *tag_whitelist) { + int overflow = 0; + int checkpoint; + char logbuf[40]; + char *q = start, *p = end; + +#define _parse_err(cond, ...) \ + do { \ + if (cond) { \ + if (lenient) { \ + while (q < p && !isspace_c(*q)) \ + q++; \ + while (q < p && isspace_c(*q)) \ + q++; \ + b->l_data = checkpoint; \ + goto loop; \ + } else { \ + hts_log_error(__VA_ARGS__); \ + goto err_ret; \ + } \ + } \ } while (0) -#else + while (q < p) loop: { + char type; + checkpoint = b->l_data; + if (p - q < 5) { + if (lenient) { + break; + } else { + hts_log_error("Incomplete aux field"); + goto err_ret; + } + } + _parse_err(q[0] < '!' 
|| q[1] < '!', "invalid aux tag id"); -// Basic version which operates a byte at a time -#define COPY_MINUS_N(to,from,n,l,failed) do { \ - uint8_t uflow = 0; \ - for (i = 0; i < (l); ++i) { \ - (to)[i] = (from)[i] - (n); \ - uflow |= (uint8_t) (to)[i]; \ - } \ - failed = (uflow & 0x80) > 0; \ - } while (0) + if (lenient && (q[2] | q[4]) != ':') { + while (q < p && !isspace_c(*q)) + q++; + while (q < p && isspace_c(*q)) + q++; + continue; + } + + if (tag_whitelist) { + int tt = q[0]*256 + q[1]; + if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) { + while (q < p && *q != '\t') + q++; + continue; + } + } + + // Copy over id + if (possibly_expand_bam_data(b, 2) < 0) goto err_ret; + memcpy(b->data + b->l_data, q, 2); b->l_data += 2; + q += 3; type = *q++; ++q; // q points to value + if (type != 'Z' && type != 'H') // the only zero length acceptable fields + _parse_err(*q <= '\t', "incomplete aux field"); + + // Ensure enough space for a double + type allocated. + if (possibly_expand_bam_data(b, 16) < 0) goto err_ret; + + if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { + b->data[b->l_data++] = 'A'; + b->data[b->l_data++] = *q++; + } else if (type == 'i' || type == 'I') { + if (*q == '-') { + int32_t x = hts_str2int(q, &q, 32, &overflow); + if (x >= INT8_MIN) { + b->data[b->l_data++] = 'c'; + b->data[b->l_data++] = x; + } else if (x >= INT16_MIN) { + b->data[b->l_data++] = 's'; + i16_to_le(x, b->data + b->l_data); + b->l_data += 2; + } else { + b->data[b->l_data++] = 'i'; + i32_to_le(x, b->data + b->l_data); + b->l_data += 4; + } + } else { + uint32_t x = hts_str2uint(q, &q, 32, &overflow); + if (x <= UINT8_MAX) { + b->data[b->l_data++] = 'C'; + b->data[b->l_data++] = x; + } else if (x <= UINT16_MAX) { + b->data[b->l_data++] = 'S'; + u16_to_le(x, b->data + b->l_data); + b->l_data += 2; + } else { + b->data[b->l_data++] = 'I'; + u32_to_le(x, b->data + b->l_data); + b->l_data += 4; + } + } + } else if (type == 'f') { + b->data[b->l_data++] = 'f'; 
+ float_to_le(strtod(q, &q), b->data + b->l_data); + b->l_data += sizeof(float); + } else if (type == 'd') { + b->data[b->l_data++] = 'd'; + double_to_le(strtod(q, &q), b->data + b->l_data); + b->l_data += sizeof(double); + } else if (type == 'Z' || type == 'H') { + char *end = strchr(q, '\t'); + if (!end) end = q + strlen(q); + _parse_err(type == 'H' && ((end-q)&1) != 0, + "hex field does not have an even number of digits"); + b->data[b->l_data++] = type; + if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret; + memcpy(b->data + b->l_data, q, end - q); + b->l_data += end - q; + b->data[b->l_data++] = '\0'; + q = end; + } else if (type == 'B') { + uint32_t n; + char *r; + type = *q++; // q points to the first ',' following the typing byte + _parse_err(*q && *q != ',' && *q != '\t', + "B aux field type not followed by ','"); + + for (r = q, n = 0; *r > '\t'; ++r) + if (*r == ',') ++n; + + if (sam_parse_B_vals(type, n, q, &q, r, b) < 0) + goto err_ret; + } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1)); + + while (*q > '\t') { q++; } // Skip any junk to next tab + q++; + } + + _parse_err(!lenient && overflow != 0, "numeric value out of allowed range"); +#undef _parse_err + + return 0; + +err_ret: + return -2; +} + +int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) +{ +#define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0) + +#if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff + +// Macro that operates on 64-bits at a time. 
+#define COPY_MINUS_N(to,from,n,l,failed) \ + do { \ + uint64_u *from8 = (uint64_u *)(from); \ + uint64_u *to8 = (uint64_u *)(to); \ + uint64_t uflow = 0; \ + size_t l8 = (l)>>3, i; \ + for (i = 0; i < l8; i++) { \ + to8[i] = from8[i] - (n)*0x0101010101010101UL; \ + uflow |= to8[i]; \ + } \ + for (i<<=3; i < (l); ++i) { \ + to[i] = from[i] - (n); \ + uflow |= to[i]; \ + } \ + failed = (uflow & 0x8080808080808080UL) > 0; \ + } while (0) + +#else + +// Basic version which operates a byte at a time +#define COPY_MINUS_N(to,from,n,l,failed) do { \ + uint8_t uflow = 0; \ + for (i = 0; i < (l); ++i) { \ + (to)[i] = (from)[i] - (n); \ + uflow |= (uint8_t) (to)[i]; \ + } \ + failed = (uflow & 0x80) > 0; \ + } while (0) #endif @@ -2025,22 +2782,12 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) if (*p++ != '\t') goto err_ret; // cigar if (*p != '*') { - uint32_t *cigar; - size_t n_cigar = 0; - for (q = p; *p && *p != '\t'; ++p) - if (!isdigit_c(*p)) ++n_cigar; - if (*p++ != '\t') goto err_ret; - _parse_err(n_cigar == 0, "no CIGAR operations"); - _parse_err(n_cigar >= 2147483647, "too many CIGAR operations"); - c->n_cigar = n_cigar; - _get_mem(uint32_t, &cigar, b, c->n_cigar * sizeof(uint32_t)); - for (i = 0; i < c->n_cigar; ++i) { - int op; - cigar[i] = hts_str2uint(q, &q, 28, &overflow)<l_data; + int n_cigar = bam_parse_cigar(p, &p, b); + if (n_cigar < 1 || *p++ != '\t') goto err_ret; + cigar = (uint32_t *)(b->data + old_l_data); + // can't use bam_endpos() directly as some fields not yet set up cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1; if (cigreflen == 0) cigreflen = 1; @@ -2104,94 +2851,10 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) _parse_err(failed, "invalid QUAL character"); p += c->l_qseq + 1; } - // aux - q = p; - p = s->s + s->l; - while (q < p) { - char type; - _parse_err(p - q < 5, "incomplete aux field"); - _parse_err(q[0] < '!' 
|| q[1] < '!', "invalid aux tag id"); - // Copy over id - if (possibly_expand_bam_data(b, 2) < 0) goto err_ret; - memcpy(b->data + b->l_data, q, 2); b->l_data += 2; - q += 3; type = *q++; ++q; // q points to value - if (type != 'Z' && type != 'H') // the only zero length acceptable fields - _parse_err(*q <= '\t', "incomplete aux field"); - - // Ensure enough space for a double + type allocated. - if (possibly_expand_bam_data(b, 16) < 0) goto err_ret; - - if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { - b->data[b->l_data++] = 'A'; - b->data[b->l_data++] = *q++; - } else if (type == 'i' || type == 'I') { - if (*q == '-') { - int32_t x = hts_str2int(q, &q, 32, &overflow); - if (x >= INT8_MIN) { - b->data[b->l_data++] = 'c'; - b->data[b->l_data++] = x; - } else if (x >= INT16_MIN) { - b->data[b->l_data++] = 's'; - i16_to_le(x, b->data + b->l_data); - b->l_data += 2; - } else { - b->data[b->l_data++] = 'i'; - i32_to_le(x, b->data + b->l_data); - b->l_data += 4; - } - } else { - uint32_t x = hts_str2uint(q, &q, 32, &overflow); - if (x <= UINT8_MAX) { - b->data[b->l_data++] = 'C'; - b->data[b->l_data++] = x; - } else if (x <= UINT16_MAX) { - b->data[b->l_data++] = 'S'; - u16_to_le(x, b->data + b->l_data); - b->l_data += 2; - } else { - b->data[b->l_data++] = 'I'; - u32_to_le(x, b->data + b->l_data); - b->l_data += 4; - } - } - } else if (type == 'f') { - b->data[b->l_data++] = 'f'; - float_to_le(strtod(q, &q), b->data + b->l_data); - b->l_data += sizeof(float); - } else if (type == 'd') { - b->data[b->l_data++] = 'd'; - double_to_le(strtod(q, &q), b->data + b->l_data); - b->l_data += sizeof(double); - } else if (type == 'Z' || type == 'H') { - char *end = strchr(q, '\t'); - if (!end) end = q + strlen(q); - _parse_err(type == 'H' && ((end-q)&1) != 0, - "hex field does not have an even number of digits"); - b->data[b->l_data++] = type; - if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret; - memcpy(b->data + b->l_data, q, end - q); - b->l_data += 
end - q; - b->data[b->l_data++] = '\0'; - q = end; - } else if (type == 'B') { - uint32_t n; - char *r; - type = *q++; // q points to the first ',' following the typing byte - _parse_err(*q && *q != ',' && *q != '\t', - "B aux field type not followed by ','"); - - for (r = q, n = 0; *r > '\t'; ++r) - if (*r == ',') ++n; - - if (sam_parse_B_vals(type, n, q, &q, r, b) < 0) - goto err_ret; - } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1)); - - while (*q > '\t') { q++; } // Skip any junk to next tab - q++; - } - _parse_err(overflow != 0, "numeric value out of allowed range"); + // aux + if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0) + goto err_ret; if (bam_tag2cigar(b, 1, 1) < 0) return -2; @@ -2205,14 +2868,143 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) return -2; } +static uint32_t read_ncigar(const char *q) { + uint32_t n_cigar = 0; + for (; *q && *q != '\t'; ++q) + if (!isdigit_c(*q)) ++n_cigar; + if (!n_cigar) { + hts_log_error("No CIGAR operations"); + return 0; + } + if (n_cigar >= 2147483647) { + hts_log_error("Too many CIGAR operations"); + return 0; + } + + return n_cigar; +} + +/*! 
@function + @abstract Parse a CIGAR string into a preallocated uint32_t array + @param in [in] pointer to the source string + @param a_cigar [out] address of the destination uint32_t buffer + @return number of processed input characters; 0 on error + */ +static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { + int i, overflow = 0; + const char *p = in; + for (i = 0; i < n_cigar; i++) { + uint32_t len; + int op; + char *q; + len = hts_str2uint(p, &q, 28, &overflow)< *a_mem) { + uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar)); + if (a_tmp) { + *a_cigar = a_tmp; + *a_mem = n_cigar; + } else { + hts_log_error("Memory allocation error"); + return -1; + } + } + + if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1; + if (end) *end = (char *)in+diff; + + return n_cigar; +} + +ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { + size_t n_cigar = 0; + int diff; + + if (!in || !b) { + hts_log_error("NULL pointer arguments"); + return -1; + } + if (end) *end = (char *)in; + + n_cigar = (*in == '*') ? 
0 : read_ncigar(in); + if (!n_cigar && b->core.n_cigar == 0) { + if (end) *end = (char *)in+1; + return 0; + } + + ssize_t cig_diff = n_cigar - b->core.n_cigar; + if (cig_diff > 0 && + possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) { + hts_log_error("Memory allocation error"); + return -1; + } + + uint32_t *cig = bam_get_cigar(b); + if ((uint8_t *)cig != b->data + b->l_data) { + // Modifying an existing BAM record + uint8_t *seq = bam_get_seq(b); + memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq); + } + + if (n_cigar) { + if (!(diff = parse_cigar(in, cig, n_cigar))) + return -1; + } else { + diff = 1; // handle "*" + } + + b->l_data += cig_diff * sizeof(uint32_t); + b->core.n_cigar = n_cigar; + if (end) *end = (char *)in + diff; + + return n_cigar; +} + /* * ----------------------------------------------------------------------------- * SAM threading */ // Size of SAM text block (reading) -#define NM 240000 -// Number of BAM records (writing) -#define NB 1000 +#define SAM_NBYTES 240000 + +// Number of BAM records (writing, up to NB_mem in size) +#define SAM_NBAM 1000 struct SAM_state; @@ -2222,7 +3014,8 @@ typedef struct sp_bams { int serial; bam1_t *bams; - int nbams, abams; // used and alloc + int nbams, abams; // used and alloc for bams[] array + size_t bam_mem; // very approximate total size struct SAM_state *fd; } sp_bams; @@ -2254,6 +3047,7 @@ typedef struct SAM_state { pthread_mutex_t lines_m; hts_tpool_process *q; pthread_t dispatcher; + int dispatcher_set; sp_lines *lines; sp_bams *bams; @@ -2339,7 +3133,7 @@ int sam_state_destroy(htsFile *fp) { if (fd->q) hts_tpool_wake_dispatch(fd->q); // unstick the reader - if (!fp->is_write && fd->q && fd->dispatcher) { + if (!fp->is_write && fd->q && fd->dispatcher_set) { for (;;) { // Avoid deadlocks with dispatcher if (fd->command == SAM_CLOSE_DONE) @@ -2379,7 +3173,8 @@ int sam_state_destroy(htsFile *fp) { } // Wait for it to acknowledge - pthread_join(fd->dispatcher, NULL); + if 
(fd->dispatcher_set) + pthread_join(fd->dispatcher, NULL); if (!ret) ret = -fd->errcode; } @@ -2471,6 +3266,7 @@ static void *sam_parse_worker(void *arg) { goto err; } gb->nbams = 0; + gb->bam_mem = 0; } gb->serial = gl->serial; gb->next = NULL; @@ -2507,14 +3303,23 @@ static void *sam_parse_worker(void *arg) { // However this is an API change so for now we copy. char *nl = strchr(cp, '\n'); - if (!nl) nl = cp_end; - if (*nl) *nl++ = '\0'; - kstring_t ks = {nl-cp, gl->alloc, cp}; + char *line_end; + if (nl) { + line_end = nl; + if (line_end > cp && *(line_end - 1) == '\r') + line_end--; + nl++; + } else { + nl = line_end = cp_end; + } + *line_end = '\0'; + kstring_t ks = { line_end - cp, gl->alloc, cp }; if (sam_parse1(&ks, fd->h, &b[i]) < 0) { sam_state_err(fd, errno ? errno : EIO); cleanup_sp_lines(gl); goto err; } + cp = nl; i++; } @@ -2584,7 +3389,7 @@ static void *sam_dispatcher_read(void *vp) { l = calloc(1, sizeof(*l)); if (!l) goto err; - l->alloc = NM; + l->alloc = SAM_NBYTES; l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1 if (!l->data) { free(l); @@ -2595,11 +3400,11 @@ static void *sam_dispatcher_read(void *vp) { } l->next = NULL; - if (l->alloc < line_frag+NM/2) { - char *rp = realloc(l->data, line_frag+NM/2 +8); + if (l->alloc < line_frag+SAM_NBYTES/2) { + char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8); if (!rp) goto err; - l->alloc = line_frag+NM/2; + l->alloc = line_frag+SAM_NBYTES/2; l->data = rp; } memcpy(l->data, line.s, line_frag); @@ -2738,6 +3543,8 @@ static void *sam_dispatcher_write(void *vp) { i++; if (fp->is_bgzf) { + if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0) + goto err; if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j) goto err; } else { @@ -2777,8 +3584,69 @@ static void *sam_dispatcher_write(void *vp) { pthread_mutex_unlock(&fd->lines_m); } else { if (fp->is_bgzf) { - if (bgzf_write(fp->fp.bgzf, gl->data, gl->data_size) != gl->data_size) - goto err; + // We keep track of how much in the current block we 
have + // remaining => R. We look for the last newline in input + // [i] to [i+R], backwards => position N. + // + // If we find a newline, we write out bytes i to N. + // We know we cannot fit the next record in this bgzf block, + // so we flush what we have and copy input N to i+R into + // the start of a new block, and recompute a new R for that. + // + // If we don't find a newline (i==N) then we cannot extend + // the current block at all, so flush whatever is in it now + // if it ends on a newline. + // We still copy i(==N) to i+R to the next block and + // continue as before with a new R. + // + // The only exception on the flush is when we run out of + // data in the input. In that case we skip it as we don't + // yet know if the next record will fit. + // + // Both conditions share the same code here: + // - Look for newline (pos N) + // - Write i to N (which maybe 0) + // - Flush if block ends on newline and not end of input + // - write N to i+R + + int i = 0; + BGZF *fb = fp->fp.bgzf; + while (i < gl->data_size) { + // remaining space in block + int R = BGZF_BLOCK_SIZE - fb->block_offset; + int eod = 0; + if (R > gl->data_size-i) + R = gl->data_size-i, eod = 1; + + // Find last newline in input data + int N = i + R; + while (--N > i) { + if (gl->data[N] == '\n') + break; + } + + if (N != i) { + // Found a newline + N++; + if (bgzf_write(fb, &gl->data[i], N-i) != N-i) + goto err; + } + + // Flush bgzf block + int b_off = fb->block_offset; + if (!eod && b_off && + ((char *)fb->uncompressed_block)[b_off-1] == '\n') + if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0) + goto err; + + // Copy from N onwards into next block + if (i+R > N) + if (bgzf_write(fb, &gl->data[N], i+R - N) + != i+R - N) + goto err; + + i = i+R; + } } else { if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size) goto err; @@ -2873,177 +3741,482 @@ static void *sam_format_worker(void *arg) { return NULL; } -int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) { - if 
(fp->state) - return 0; +int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) { + if (fp->state) + return 0; + + if (!(fp->state = sam_state_create(fp))) + return -1; + SAM_state *fd = (SAM_state *)fp->state; + + pthread_mutex_init(&fd->lines_m, NULL); + pthread_mutex_init(&fd->command_m, NULL); + pthread_cond_init(&fd->command_c, NULL); + fd->p = p->pool; + int qsize = p->qsize; + if (!qsize) + qsize = 2*hts_tpool_size(fd->p); + fd->q = hts_tpool_process_init(fd->p, qsize, 0); + if (!fd->q) { + sam_state_destroy(fp); + return -1; + } + + if (fp->format.compression == bgzf) + return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize); + + return 0; +} + +int sam_set_threads(htsFile *fp, int nthreads) { + if (nthreads <= 0) + return 0; + + htsThreadPool p; + p.pool = hts_tpool_init(nthreads); + p.qsize = nthreads*2; + + int ret = sam_set_thread_pool(fp, &p); + if (ret < 0) + return ret; + + SAM_state *fd = (SAM_state *)fp->state; + fd->own_pool = 1; + + return 0; +} + +typedef struct { + kstring_t name; + kstring_t comment; // NB: pointer into name, do not free + kstring_t seq; + kstring_t qual; + int casava; + int aux; + int rnum; + char BC[3]; // aux tag ID for barcode + khash_t(tag) *tags; // which aux tags to use (if empty, use all). + char nprefix; + int sra_names; +} fastq_state; + +// Initialise fastq state. +// Name char of '@' or '>' distinguishes fastq vs fasta variant +static fastq_state *fastq_state_init(int name_char) { + fastq_state *x = (fastq_state *)calloc(1, sizeof(*x)); + if (!x) + return NULL; + strcpy(x->BC, "BC"); + x->nprefix = name_char; + + return x; +} + +void fastq_state_destroy(htsFile *fp) { + if (fp->state) { + fastq_state *x = (fastq_state *)fp->state; + if (x->tags) + kh_destroy(tag, x->tags); + ks_free(&x->name); + ks_free(&x->seq); + ks_free(&x->qual); + free(fp->state); + } +} + +int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) 
{ + va_list args; + + if (!fp) + return -1; + if (!fp->state) + if (!(fp->state = fastq_state_init(fp->format.format == fastq_format + ? '@' : '>'))) + return -1; + + fastq_state *x = (fastq_state *)fp->state; + + switch (opt) { + case FASTQ_OPT_CASAVA: + x->casava = 1; + break; + + case FASTQ_OPT_NAME2: + x->sra_names = 1; + break; + + case FASTQ_OPT_AUX: { + va_start(args, opt); + x->aux = 1; + char *tag = va_arg(args, char *); + va_end(args); + if (tag && strcmp(tag, "1") != 0) { + if (!x->tags) + if (!(x->tags = kh_init(tag))) + return -1; + + size_t i, tlen = strlen(tag); + for (i = 0; i+3 <= tlen+1; i += 3) { + if (tag[i+0] == ',' || tag[i+1] == ',' || + !(tag[i+2] == ',' || tag[i+2] == '\0')) { + hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i); + break; + } + int ret, tcode = tag[i+0]*256 + tag[i+1]; + kh_put(tag, x->tags, tcode, &ret); + if (ret < 0) + return -1; + } + } + break; + } + + case FASTQ_OPT_BARCODE: { + va_start(args, opt); + char *bc = va_arg(args, char *); + va_end(args); + strncpy(x->BC, bc, 2); + x->BC[2] = 0; + break; + } + + case FASTQ_OPT_RNUM: + x->rnum = 1; + break; + + default: + break; + } + return 0; +} + +static int fastq_parse1(htsFile *fp, bam1_t *b) { + fastq_state *x = (fastq_state *)fp->state; + size_t i, l; + int ret = 0; + + if (fp->format.format == fasta_format && fp->line.s) { + // For FASTA we've already read the >name line; steal it + // Not the most efficient, but we don't optimise for fasta reading. + if (fp->line.l == 0) + return -1; // EOF + + free(x->name.s); + x->name = fp->line; + fp->line.l = fp->line.m = 0; + fp->line.s = NULL; + } else { + // Read a FASTQ format entry. + ret = hts_getline(fp, KS_SEP_LINE, &x->name); + if (ret == -1) + return -1; // EOF + else if (ret < -1) + return ret; // ERR + } + + // Name + if (*x->name.s != x->nprefix) + return -2; + + // Reverse the SRA strangeness of putting the run_name.number before + // the read name. 
+ i = 0; + char *name = x->name.s+1; + if (x->sra_names) { + char *cp = strpbrk(x->name.s, " \t"); + if (cp) { + while (*cp == ' ' || *cp == '\t') + cp++; + *--cp = '@'; + i = cp - x->name.s; + name = cp+1; + } + } + + l = x->name.l; + char *s = x->name.s; + while (i < l && !isspace_c(s[i])) + i++; + if (i < l) { + s[i] = 0; + x->name.l = i++; + } + + // Comment; a kstring struct, but pointer into name line. (Do not free) + while (i < l && isspace_c(s[i])) + i++; + x->comment.s = s+i; + x->comment.l = l - i; + + // Seq + x->seq.l = 0; + for (;;) { + if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0) + if (fp->format.format == fastq_format || ret < -1) + return -2; + if (ret == -1 || + *fp->line.s == (fp->format.format == fastq_format ? '+' : '>')) + break; + if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0) + return -2; + } + + // Qual + if (fp->format.format == fastq_format) { + size_t remainder = x->seq.l; + x->qual.l = 0; + do { + if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0) + return -2; + if (fp->line.l > remainder) + return -2; + if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0) + return -2; + remainder -= fp->line.l; + } while (remainder > 0); + + // Decr qual + for (i = 0; i < x->qual.l; i++) + x->qual.s[i] -= '!'; + } + + int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED; + if (x->name.l > 2 && + x->name.s[x->name.l-2] == '/' && + isdigit_c(x->name.s[x->name.l-1])) { + switch(x->name.s[x->name.l-1]) { + case '1': flag |= BAM_FREAD1 | pflag; break; + case '2': flag |= BAM_FREAD2 | pflag; break; + default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break; + } + x->name.s[x->name.l-=2] = 0; + } + + // Convert to BAM + ret = bam_set1(b, + x->name.s + x->name.l - name, name, + flag, + -1, -1, 0, // ref '*', pos, mapq, + 0, NULL, // no cigar, + -1, -1, 0, // mate + x->seq.l, x->seq.s, x->qual.s, + 0); + + // Identify Illumina CASAVA strings. 
+ // ::: + char *barcode = NULL; + int barcode_len = 0; + kstring_t *kc = &x->comment; + char *endptr; + if (x->casava && + // \d:[YN]:\d+:[ACGTN]+ + kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) && + strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4 + && *endptr == ':') { + + // read num + switch(kc->s[0]) { + case '1': b->core.flag |= BAM_FREAD1 | pflag; break; + case '2': b->core.flag |= BAM_FREAD2 | pflag; break; + default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break; + } + + if (kc->s[2] == 'Y') + b->core.flag |= BAM_FQCFAIL; - if (!(fp->state = sam_state_create(fp))) - return -1; - SAM_state *fd = (SAM_state *)fp->state; + // Barcode, maybe numeric in which case we skip it + if (!isdigit_c(endptr[1])) { + barcode = endptr+1; + for (i = barcode - kc->s; i < kc->l; i++) + if (isspace_c(kc->s[i])) + break; - pthread_mutex_init(&fd->lines_m, NULL); - pthread_mutex_init(&fd->command_m, NULL); - pthread_cond_init(&fd->command_c, NULL); - fd->p = p->pool; - int qsize = p->qsize; - if (!qsize) - qsize = 2*hts_tpool_size(fd->p); - fd->q = hts_tpool_process_init(fd->p, qsize, 0); - if (!fd->q) { - sam_state_destroy(fp); - return -1; + kc->s[i] = 0; + barcode_len = i+1-(barcode - kc->s); + } } - if (fp->format.compression == bgzf) - return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize); + if (ret >= 0 && barcode_len) + if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0) + ret = -2; - return 0; -} + if (!x->aux) + return ret; -int sam_set_threads(htsFile *fp, int nthreads) { - if (nthreads <= 0) - return 0; + // Identify any SAM style aux tags in comments too. 
+ if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0) + ret = -2; - htsThreadPool p; - p.pool = hts_tpool_init(nthreads); - p.qsize = nthreads*2; + return ret; +} - int ret = sam_set_thread_pool(fp, &p); +// Internal component of sam_read1 below +static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { + int ret = bam_read1(fp->fp.bgzf, b); + if (h && ret >= 0) { + if (b->core.tid >= h->n_targets || b->core.tid < -1 || + b->core.mtid >= h->n_targets || b->core.mtid < -1) { + errno = ERANGE; + return -3; + } + } + return ret; +} + +// Internal component of sam_read1 below +static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) { + int ret = cram_get_bam_seq(fp->fp.cram, b); if (ret < 0) - return ret; + return cram_eof(fp->fp.cram) ? -1 : -2; - SAM_state *fd = (SAM_state *)fp->state; - fd->own_pool = 1; + if (bam_tag2cigar(*b, 1, 1) < 0) + return -2; - return 0; + return ret; } -// Returns 0 on success, -// -1 on EOF, -// <-1 on error -int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) -{ - switch (fp->format.format) { - case bam: { - int r = bam_read1(fp->fp.bgzf, b); - if (h && r >= 0) { - if (b->core.tid >= h->n_targets || b->core.tid < -1 || - b->core.mtid >= h->n_targets || b->core.mtid < -1) { - errno = ERANGE; - return -3; - } - } - return r; - } - - case cram: { - int ret = cram_get_bam_seq(fp->fp.cram, &b); - if (ret < 0) - return cram_eof(fp->fp.cram) ? 
-1 : -2; +// Internal component of sam_read1 below +static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { + int ret; - if (bam_tag2cigar(b, 1, 1) < 0) - return -2; + // Consume 1st line after header parsing as it wasn't using peek + if (fp->line.l != 0) { + ret = sam_parse1(&fp->line, h, b); + fp->line.l = 0; return ret; } - case sam: { - // Consume 1st line after header parsing as it wasn't using peek - if (fp->line.l != 0) { - int ret = sam_parse1(&fp->line, h, b); - fp->line.l = 0; - return ret; + if (fp->state) { + SAM_state *fd = (SAM_state *)fp->state; + + if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) { + // We don't support multi-threaded SAM parsing with seeks yet. + int ret; + if ((ret = sam_state_destroy(fp)) < 0) { + errno = -ret; + return -2; + } + if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0) + return -1; + fp->fp.bgzf->seeked = 0; + goto err_recover; } - if (fp->state) { - SAM_state *fd = (SAM_state *)fp->state; + if (!fd->h) { + fd->h = h; + fd->h->ref_count++; + // Ensure hrecs is initialised now as we don't want multiple + // threads trying to do this simultaneously. + if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0) + return -2; - if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) { - // We don't support multi-threaded SAM parsing with seeks yet. - int ret; - if ((ret = sam_state_destroy(fp)) < 0) { - errno = -ret; - return -2; - } - if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0) - return -1; - fp->fp.bgzf->seeked = 0; - goto err_recover; - } + // We can only do this once we've got a header + if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read, + fp) != 0) + return -2; + fd->dispatcher_set = 1; + } - if (!fd->h) { - fd->h = h; - fd->h->ref_count++; - // Ensure hrecs is initialised now as we don't want multiple - // threads trying to do this simultaneously. 
- if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0) - return -2; + if (fd->h != h) { + hts_log_error("SAM multi-threaded decoding does not support changing header"); + return -1; + } - // We can only do this once we've got a header - if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read, fp) != 0) - return -2; + sp_bams *gb = fd->curr_bam; + if (!gb) { + if (fd->errcode) { + // In case reader failed + errno = fd->errcode; + return -2; } + hts_tpool_result *r = hts_tpool_next_result_wait(fd->q); + if (!r) + return -2; + fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r); + hts_tpool_delete_result(r, 0); + } + if (!gb) + return fd->errcode ? -2 : -1; + bam1_t *b_array = (bam1_t *)gb->bams; + if (fd->curr_idx < gb->nbams) + if (!bam_copy1(b, &b_array[fd->curr_idx++])) + return -2; + if (fd->curr_idx == gb->nbams) { + pthread_mutex_lock(&fd->lines_m); + gb->next = fd->bams; + fd->bams = gb; + pthread_mutex_unlock(&fd->lines_m); - if (fd->h != h) { - hts_log_error("SAM multi-threaded decoding does not support changing header"); - return -1; - } + fd->curr_bam = NULL; + fd->curr_idx = 0; + } - sp_bams *gb = fd->curr_bam; - if (!gb) { - if (fd->errcode) { - // In case reader failed - errno = fd->errcode; - return -2; - } - hts_tpool_result *r = hts_tpool_next_result_wait(fd->q); - if (!r) - return -2; - fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r); - hts_tpool_delete_result(r, 0); - } - if (!gb) - return fd->errcode ? 
-2 : -1; - bam1_t *b_array = (bam1_t *)gb->bams; - if (fd->curr_idx < gb->nbams) - if (!bam_copy1(b, &b_array[fd->curr_idx++])) - return -2; - if (fd->curr_idx == gb->nbams) { - pthread_mutex_lock(&fd->lines_m); - gb->next = fd->bams; - fd->bams = gb; - pthread_mutex_unlock(&fd->lines_m); + ret = 0; - fd->curr_bam = NULL; - fd->curr_idx = 0; - } + } else { + err_recover: + ret = hts_getline(fp, KS_SEP_LINE, &fp->line); + if (ret < 0) return ret; - return 0; + ret = sam_parse1(&fp->line, h, b); + fp->line.l = 0; + if (ret < 0) { + hts_log_warning("Parse error at line %lld", (long long)fp->lineno); + if (h && h->ignore_sam_err) goto err_recover; + } + } - } else { - int ret; - err_recover: + return ret; +} + +// Returns 0 on success, +// -1 on EOF, +// <-1 on error +int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) +{ + int ret, pass_filter; + + do { + switch (fp->format.format) { + case bam: + ret = sam_read1_bam(fp, h, b); + break; + + case cram: + ret = sam_read1_cram(fp, h, &b); + break; - ret = hts_getline(fp, KS_SEP_LINE, &fp->line); - if (ret < 0) return ret; + case sam: + ret = sam_read1_sam(fp, h, b); + break; - ret = sam_parse1(&fp->line, h, b); - fp->line.l = 0; - if (ret < 0) { - hts_log_warning("Parse error at line %lld", (long long)fp->lineno); - if (h->ignore_sam_err) goto err_recover; + case fasta_format: + case fastq_format: { + fastq_state *x = (fastq_state *)fp->state; + if (!x) { + if (!(fp->state = fastq_state_init(fp->format.format + == fastq_format ? '@' : '>'))) + return -2; } - return ret; + + return fastq_parse1(fp, b); } - } - case empty_format: - errno = EPIPE; - return -3; + case empty_format: + errno = EPIPE; + return -3; - default: - errno = EFTYPE; - return -3; - } + default: + errno = EFTYPE; + return -3; + } + + pass_filter = (ret >= 0 && fp->filter) + ? sam_passes_filter(h, b, fp->filter) + : 1; + } while (pass_filter == 0); + + return pass_filter < 0 ? 
-2 : ret; } @@ -3137,6 +4310,112 @@ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) return sam_format1_append(h, b, str); } +static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end); +int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str) +{ + unsigned flag = b->core.flag; + int i, e = 0, len = b->core.l_qseq; + uint8_t *seq, *qual; + + str->l = 0; + + // Name + if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF) + return -1; + + // /1 or /2 suffix + if (x && x->rnum && (flag & BAM_FPAIRED)) { + int r12 = flag & (BAM_FREAD1 | BAM_FREAD2); + if (r12 == BAM_FREAD1) { + if (kputs("/1", str) == EOF) + return -1; + } else if (r12 == BAM_FREAD2) { + if (kputs("/2", str) == EOF) + return -1; + } + } + + // Illumina CASAVA tag. + // This is ::: + if (x && x->casava) { + int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0; + char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N'; + uint8_t *bc = bam_aux_get(b, x->BC); + if (ksprintf(str, " %d:%c:0:%s", rnum, filtered, + bc ? (char *)bc+1 : "0") < 0) + return -1; + + if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) { + hts_log_warning("BC tag starts with non-sequence base; using '0'"); + str->l -= strlen((char *)bc)-2; // limit to 1 char + str->s[str->l-1] = '0'; + str->s[str->l] = 0; + bc = NULL; + } + + // Replace any non-alpha with '+'. 
Ie seq-seq to seq+seq + if (bc) { + int l = strlen((char *)bc+1); + char *c = (char *)str->s + str->l - l; + for (i = 0; i < l; i++) { + if (!isalpha_c(c[i])) + c[i] = '+'; + else if (islower_c(c[i])) + c[i] = toupper_c(c[i]); + } + } + } + + // Aux tags + if (x && x->aux) { + uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data; + while (s && end - s >= 4) { + int tt = s[0]*256 + s[1]; + if (x->tags == NULL || + kh_get(tag, x->tags, tt) != kh_end(x->tags)) { + e |= kputc_('\t', str) < 0; + if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str))) + return -1; + } else { + s = skip_aux(s+2, end); + } + } + e |= kputsn("", 0, str) < 0; // nul terminate + } + + if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1; + e |= kputc_('\n', str) < 0; + + // Seq line + seq = bam_get_seq(b); + if (flag & BAM_FREVERSE) + for (i = len-1; i >= 0; i--) + e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0; + else + for (i = 0; i < len; i++) + e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0; + + + // Qual line + if (x->nprefix == '@') { + kputsn("\n+\n", 3, str); + qual = bam_get_qual(b); + if (qual[0] == 0xff) + for (i = 0; i < len; i++) + e |= kputc_('B', str) < 0; + else if (flag & BAM_FREVERSE) + for (i = len-1; i >= 0; i--) + e |= kputc_(33 + qual[i], str) < 0; + else + for (i = 0; i < len; i++) + e |= kputc_(33 + qual[i], str) < 0; + + } + e |= kputc('\n', str) < 0; + + return e ? -1 : str->l; +} + // Sadly we need to be able to modify the bam_hdr here so we can // reference count the structure. int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) @@ -3168,13 +4447,15 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) // destroy it later on and sam_hdr_destroy takes non-const. // // We do this because some tools do sam_hdr_destroy; sam_close - // while others do sam_close; sam_hdr_destroy. The former is an - // issue as we need the header still when flushing. + // while others do sam_close; sam_hdr_destroy. 
The former is + // an issue as we need the header still when flushing. fd->h = (sam_hdr_t *)h; fd->h->ref_count++; - if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write, fp) != 0) + if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write, + fp) != 0) return -2; + fd->dispatcher_set = 1; } if (fd->h != h) { @@ -3191,16 +4472,18 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) fd->bams = gb->next; gb->next = NULL; gb->nbams = 0; + gb->bam_mem = 0; pthread_mutex_unlock(&fd->lines_m); } else { pthread_mutex_unlock(&fd->lines_m); if (!(gb = calloc(1, sizeof(*gb)))) return -1; - if (!(gb->bams = calloc(NB, sizeof(*gb->bams)))) { + if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) { free(gb); return -1; } gb->nbams = 0; - gb->abams = NB; + gb->abams = SAM_NBAM; + gb->bam_mem = 0; gb->fd = fd; fd->curr_idx = 0; fd->curr_bam = gb; @@ -3209,11 +4492,11 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) if (!bam_copy1(&gb->bams[gb->nbams++], b)) return -2; + gb->bam_mem += b->l_data + sizeof(*b); // Dispatch if full - if (gb->nbams == NB) { + if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) { gb->serial = fd->serial++; - //fprintf(stderr, "Dispatch another %d bams\n", NB); pthread_mutex_lock(&fd->command_m); if (fd->errcode != 0) { pthread_mutex_unlock(&fd->command_m); @@ -3237,6 +4520,8 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) if (sam_format1(h, b, &fp->line) < 0) return -1; kputc('\n', &fp->line); if (fp->is_bgzf) { + if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) + return -1; if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1; } else { if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; @@ -3263,6 +4548,30 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) return fp->line.l; } + + case fasta_format: + case fastq_format: { + fastq_state *x = (fastq_state *)fp->state; + if (!x) { + if (!(fp->state = 
fastq_state_init(fp->format.format + == fastq_format ? '@' : '>'))) + return -2; + } + + if (fastq_format1(fp->state, b, &fp->line) < 0) + return -1; + if (fp->is_bgzf) { + if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) + return -1; + if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l) + return -1; + } else { + if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l) + return -1; + } + return fp->line.l; + } + default: errno = EBADF; return -1; @@ -3375,31 +4684,42 @@ static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end) } } +uint8_t *bam_aux_first(const bam1_t *b) +{ + uint8_t *s = bam_get_aux(b); + uint8_t *end = b->data + b->l_data; + if (s >= end) { errno = ENOENT; return NULL; } + return s+2; +} + +uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s) +{ + uint8_t *end = b->data + b->l_data; + uint8_t *next = s? skip_aux((uint8_t *) s, end) : end; + if (next == NULL) goto bad_aux; + if (next >= end) { errno = ENOENT; return NULL; } + return next+2; + + bad_aux: + hts_log_error("Corrupted aux data for read %s", bam_get_qname(b)); + errno = EINVAL; + return NULL; +} + uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) { - uint8_t *s, *end, *t = (uint8_t *) tag; - uint16_t y = (uint16_t) t[0]<<8 | t[1]; - s = bam_get_aux(b); - end = b->data + b->l_data; - while (s != NULL && end - s >= 3) { - uint16_t x = (uint16_t) s[0]<<8 | s[1]; - s += 2; - if (x == y) { + uint8_t *s; + for (s = bam_aux_first(b); s; s = bam_aux_next(b, s)) + if (s[-2] == tag[0] && s[-1] == tag[1]) { // Check the tag value is valid and complete - uint8_t *e = skip_aux(s, end); - if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') { - goto bad_aux; // Unterminated string - } - if (e != NULL) { - return s; - } else { - goto bad_aux; - } + uint8_t *e = skip_aux(s, b->data + b->l_data); + if (e == NULL) goto bad_aux; + if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux; + + return s; } - s = skip_aux(s, end); - } - if (s == NULL) goto bad_aux; - errno 
= ENOENT; + + // errno now as set by bam_aux_first()/bam_aux_next() return NULL; bad_aux: @@ -3408,23 +4728,28 @@ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) return NULL; } -// s MUST BE returned by bam_aux_get() int bam_aux_del(bam1_t *b, uint8_t *s) { - uint8_t *p, *aux; - int l_aux = bam_get_l_aux(b); - aux = bam_get_aux(b); - p = s - 2; - s = skip_aux(s, aux + l_aux); - if (s == NULL) goto bad_aux; - memmove(p, s, l_aux - (s - aux)); - b->l_data -= s - p; - return 0; + s = bam_aux_remove(b, s); + return (s || errno == ENOENT)? 0 : -1; +} + +uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s) +{ + uint8_t *end = b->data + b->l_data; + uint8_t *next = skip_aux(s, end); + if (next == NULL) goto bad_aux; + + b->l_data -= next - (s-2); + if (next >= end) { errno = ENOENT; return NULL; } + + memmove(s-2, next, end - next); + return s; bad_aux: hts_log_error("Corrupted aux data for read %s", bam_get_qname(b)); errno = EINVAL; - return -1; + return NULL; } int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data) @@ -3729,6 +5054,14 @@ int sam_open_mode(char *mode, const char *fn, const char *format) else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c"); else if (strcasecmp(format, "sam") == 0) strcpy(mode, ""); else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z"); + else if (strcasecmp(format, "fastq") == 0 || + strcasecmp(format, "fq") == 0) strcpy(mode, "f"); + else if (strcasecmp(format, "fastq.gz") == 0 || + strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz"); + else if (strcasecmp(format, "fasta") == 0 || + strcasecmp(format, "fa") == 0) strcpy(mode, "F"); + else if (strcasecmp(format, "fasta.gz") == 0 || + strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz"); else return -1; return 0; @@ -3790,6 +5123,20 @@ char *sam_open_mode_opts(const char *fn, ; // format mode="" } else if (strncmp(format, "sam.gz", format_len) == 0) { *cp++ = 'z'; + } else if (strncmp(format, "fastq", format_len) == 0 || + strncmp(format, "fq", 
format_len) == 0) { + *cp++ = 'f'; + } else if (strncmp(format, "fastq.gz", format_len) == 0 || + strncmp(format, "fq.gz", format_len) == 0) { + *cp++ = 'f'; + *cp++ = 'z'; + } else if (strncmp(format, "fasta", format_len) == 0 || + strncmp(format, "fa", format_len) == 0) { + *cp++ = 'F'; + } else if (strncmp(format, "fasta.gz", format_len) == 0 || + strncmp(format, "fa", format_len) == 0) { + *cp++ = 'F'; + *cp++ = 'z'; } else { free(mode_opts); return NULL; @@ -3932,7 +5279,7 @@ static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) uint32_t *cigar = bam_get_cigar(b); int k; // determine the current CIGAR operation - //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y); + //fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y); if (s->k == -1) { // never processed p->qpos = 0; if (c->n_cigar == 1) { // just one operation, save a loop @@ -3977,9 +5324,24 @@ static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation int op2 = _cop(cigar[s->k+1]); int l2 = _cln(cigar[s->k+1]); - if (op2 == BAM_CDEL) p->indel = -(int)l2; - else if (op2 == BAM_CINS) p->indel = l2; - else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding + if (op2 == BAM_CDEL && op != BAM_CDEL) { + // At start of a new deletion, merge e.g. 1D2D to 3D. + // Within a deletion (the 2D in 1D2D) we keep p->indel=0 + // and rely on is_del=1 as we would for 3D. 
+ p->indel = -(int)l2; + for (k = s->k+2; k < c->n_cigar; ++k) { + op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); + if (op2 == BAM_CDEL) p->indel -= l2; + else break; + } + } else if (op2 == BAM_CINS) { + p->indel = l2; + for (k = s->k+2; k < c->n_cigar; ++k) { + op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); + if (op2 == BAM_CINS) p->indel += l2; + else if (op2 != BAM_CPAD) break; + } + } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { int l3 = 0; for (k = s->k + 2; k < c->n_cigar; ++k) { op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); @@ -4009,11 +5371,16 @@ static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) * Fills out the kstring with the padded insertion sequence for the current * location in 'p'. If this is not an insertion site, the string is blank. * - * Returns the length of insertion string on success; + * This variant handles base modifications, but only when "m" is non-NULL. + * + * Returns the number of inserted base on success, with string length being + * accessable via ins->l; * -1 on failure. */ -int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { - int j, k, indel; +int bam_plp_insertion_mod(const bam_pileup1_t *p, + hts_base_mod_state *m, + kstring_t *ins, int *del_len) { + int j, k, indel, nb = 0; uint32_t *cigar; if (p->indel <= 0) { @@ -4043,7 +5410,7 @@ int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { } k++; } - ins->l = indel; + nb = ins->l = indel; // Produce sequence if (ks_resize(ins, indel+1) < 0) @@ -4060,9 +5427,43 @@ int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { break; case BAM_CINS: for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) { - c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), - p->qpos + j - p->is_del)]; + c = p->qpos + j - p->is_del < p->b->core.l_qseq + ? 
seq_nt16_str[bam_seqi(bam_get_seq(p->b), + p->qpos + j - p->is_del)] + : 'N'; ins->s[indel++] = c; + int nm; + hts_base_mod mod[256]; + if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del, + m, mod, 256)) > 0) { + int o_indel = indel; + if (ks_resize(ins, ins->l + nm*16+3) < 0) + return -1; + ins->s[indel++] = '['; + int j; + for (j = 0; j < nm; j++) { + char qual[20]; + if (mod[j].qual >= 0) + snprintf(qual, sizeof(qual), "%d", mod[j].qual); + else + *qual=0; + if (mod[j].modified_base < 0) + // ChEBI + indel += snprintf(&ins->s[indel], ins->m - indel, + "%c(%d)%s", + "+-"[mod[j].strand], + -mod[j].modified_base, + qual); + else + indel += snprintf(&ins->s[indel], ins->m - indel, + "%c%c%s", + "+-"[mod[j].strand], + mod[j].modified_base, + qual); + } + ins->s[indel++] = ']'; + ins->l += indel - o_indel; // grow by amount we used + } } break; case BAM_CDEL: @@ -4077,8 +5478,23 @@ int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { k++; } ins->s[indel] = '\0'; + ins->l = indel; // string length + + return nb; // base length +} - return indel; +/* + * Fills out the kstring with the padded insertion sequence for the current + * location in 'p'. If this is not an insertion site, the string is blank. + * + * This is the original interface with no capability for reporting base + * modifications. + * + * Returns the length of insertion string on success; + * -1 on failure. + */ +int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { + return bam_plp_insertion_mod(p, NULL, ins, del_len); } /*********************** @@ -4171,7 +5587,11 @@ void bam_plp_destructor(bam_plp_t plp, * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered, * or -2 on error. 
*/ -static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref) +static inline int cigar_iref2iseq_set(const uint32_t **cigar, + const uint32_t *cigar_max, + hts_pos_t *icig, + hts_pos_t *iseq, + hts_pos_t *iref) { hts_pos_t pos = *iref; if ( pos < 0 ) return -1; @@ -4206,7 +5626,11 @@ static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, hts *iseq = -1; return -1; } -static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref) +static inline int cigar_iref2iseq_next(const uint32_t **cigar, + const uint32_t *cigar_max, + hts_pos_t *icig, + hts_pos_t *iseq, + hts_pos_t *iref) { while ( *cigar < cigar_max ) { @@ -4215,14 +5639,14 @@ static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, ht if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) { - if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; } + if ( *icig >= ncig - 1 ) { *icig = -1; (*cigar)++; continue; } (*iseq)++; (*icig)++; (*iref)++; return BAM_CMATCH; } - if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; } - if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } - if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } - if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; } + if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; } + if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; } + if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; } + if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; } hts_log_error("Unexpected cigar %d", cig); return -2; } @@ -4231,10 +5655,18 @@ static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, ht return -1; } +// Given overlapping 
read 'a' (left) and 'b' (right) on the same +// template, adjust quality values to zero for either a or b. +// Note versions 1.12 and earlier always removed quality from 'b' for +// matching bases. Now we select a or b semi-randomly based on name hash. +// Returns 0 on success, +// -1 on failure static int tweak_overlap_quality(bam1_t *a, bam1_t *b) { - uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar; - uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar; + const uint32_t *a_cigar = bam_get_cigar(a), + *a_cigar_max = a_cigar + a->core.n_cigar; + const uint32_t *b_cigar = bam_get_cigar(b), + *b_cigar_max = b_cigar + b->core.n_cigar; hts_pos_t a_icig = 0, a_iseq = 0; hts_pos_t b_icig = 0, b_iseq = 0; uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); @@ -4243,69 +5675,92 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b) hts_pos_t iref = b->core.pos; hts_pos_t a_iref = iref - a->core.pos; hts_pos_t b_iref = iref - b->core.pos; - int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref); - if ( a_ret<0 ) return a_ret<-1 ? -1:0; // no overlap or error - int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); - if ( b_ret<0 ) return b_ret<-1 ? -1:0; // no overlap or error - #if DBG - fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %"PRIhts_pos"-%"PRIhts_pos"\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, - a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b))); - #endif + int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, + &a_icig, &a_iseq, &a_iref); + if ( a_ret<0 ) + // no overlap or error + return a_ret<-1 ? -1:0; + + int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, + &b_icig, &b_iseq, &b_iref); + if ( b_ret<0 ) + // no overlap or error + return b_ret<-1 ? 
-1:0; + + // Determine which seq is the one getting modified qualities. + uint8_t amul, bmul; + if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) { + amul = 1; + bmul = 0; + } else { + amul = 0; + bmul = 1; + } + // Loop over the overlapping region nulling qualities in either + // seq a or b. int err = 0; while ( 1 ) { - // Increment reference position + // Step to next matching reference position in a and b while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos ) - a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref); - if ( a_ret<0 ) { err = a_ret<-1?-1:0; break; } // done - if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos; + a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, + &a_icig, &a_iseq, &a_iref); + if ( a_ret<0 ) { // done + err = a_ret<-1?-1:0; + break; + } + if ( iref < a_iref + a->core.pos ) + iref = a_iref + a->core.pos; while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos ) - b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); - if ( b_ret<0 ) { err = b_ret<-1?-1:0; break; } // done - if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos; + b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, + &b_iseq, &b_iref); + if ( b_ret<0 ) { // done + err = b_ret<-1?-1:0; + break; + } + if ( iref < b_iref + b->core.pos ) + iref = b_iref + b->core.pos; iref++; - if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels + + if ( a_iref+a->core.pos != b_iref+b->core.pos ) + // only CMATCH positions, don't know what to do with indels + continue; if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq) - return -1; // Fell off end of sequence, bad CIGAR? + // Fell off end of sequence, bad CIGAR? 
+ return -1; - if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) - { - #if DBG - fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]); - #endif - // we are very confident about this base + // We're finally at the same ref base in both a and b. + // Check if the bases match (confident) or mismatch + // (not so confident). + if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) { + // We are very confident about this base. Use sum of quals int qual = a_qual[a_iseq] + b_qual[b_iseq]; - a_qual[a_iseq] = qual>200 ? 200 : qual; - b_qual[b_iseq] = 0; - } - else - { - if ( a_qual[a_iseq] >= b_qual[b_iseq] ) - { - #if DBG - fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower_c(seq_nt16_str[bam_seqi(b_seq,b_iseq)])); - #endif - a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch + a_qual[a_iseq] = amul * (qual>200 ? 200 : qual); + b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);; + } else { + // Not so confident about anymore given the mismatch. + // Reduce qual for lowest quality base. 
+ if ( a_qual[a_iseq] > b_qual[b_iseq] ) { + // A highest qual base; keep + a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; b_qual[b_iseq] = 0; - } - else - { - #if DBG - fprintf(stderr,"[%c/%c]",tolower_c(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]); - #endif + } else if (a_qual[a_iseq] < b_qual[b_iseq] ) { + // B highest qual base; keep b_qual[b_iseq] = 0.8 * b_qual[b_iseq]; a_qual[a_iseq] = 0; + } else { + // Both equal, so pick randomly + a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq]; + b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq]; } } } - #if DBG - fprintf(stderr,"\n"); - #endif + return err; } @@ -4344,8 +5799,6 @@ static int overlap_push(bam_plp_t iter, lbnode_t *node) int err = tweak_overlap_quality(&a->b, &node->b); kh_del(olap_hash, iter->overlaps, kitr); assert(a->end-1 == a->s.end); - a->end = bam_endpos(&a->b); - a->s.end = a->end - 1; return err; } return 0; @@ -4479,9 +5932,14 @@ int bam_plp_push(bam_plp_t iter, const bam1_t *b) iter->error = 1; return -1; } - if (iter->plp_construct) - iter->plp_construct(iter->data, &iter->tail->b, - &iter->tail->cd); + if (iter->plp_construct) { + if (iter->plp_construct(iter->data, &iter->tail->b, + &iter->tail->cd) < 0) { + mp_free(iter->mp, next); + iter->error = 1; + return -1; + } + } if (overlap_push(iter, iter->tail) < 0) { mp_free(iter->mp, next); iter->error = 1; diff --git a/sam_internal.h b/sam_internal.h index 630b57003..b1fce9fe4 100644 --- a/sam_internal.h +++ b/sam_internal.h @@ -36,6 +36,10 @@ int sam_state_destroy(samFile *fp); int sam_set_thread_pool(htsFile *fp, htsThreadPool *p); int sam_set_threads(htsFile *fp, int nthreads); +// Fastq state +int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...); +void fastq_state_destroy(samFile *fp); + // bam1_t data (re)allocation int sam_realloc_bam_data(bam1_t *b, size_t desired); diff --git a/sam_mods.c b/sam_mods.c new file mode 100644 index 000000000..fe8db85f7 --- /dev/null +++ b/sam_mods.c @@ -0,0 +1,683 @@ +/* 
sam_mods.c -- Base modification handling in SAM and BAM. + + Copyright (C) 2020-2023 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h +#include <config.h> + +#include "htslib/sam.h" +#include "textutils_internal.h" + +// --------------------------- +// Base Modification retrieval +// +// These operate by recording state in an opaque type, allocated and freed +// via the functions below. +// +// Initially we call bam_parse_basemod to process the tags and record the +// modifications in the state structure, and then functions such as +// bam_next_basemod can iterate over this cached state. + +/* Overview of API. + +We start by allocating an hts_base_mod_state and parsing the MM, ML and MN +tags into it. This has optional flags controlling how we report base +modifications in "explicit" coordinates. 
See below + + hts_base_mod_state *m = hts_base_mod_state_alloc(); + bam_parse_basemod2(b, m, HTS_MOD_REPORT_UNCHECKED); + // Or: bam_parse_basemod(b, m), which is equiv to flags==0 + //... do something ... + hts_base_mod_state_free(m); + +In the default implicit MM coordinate system, any location not +reported is implicitly assumed to contain no modification. We only +report the places we think are likely modified. + +Some tools however only look for base modifications in particular +contexts, eg CpG islands. Here we need to distinguish between +not-looked-for and looked-for-but-didn't-find. These calls have an +explicit coordinate system, where we only know information about the +coordinates explicitly listed and everything else is considered to be +unverified. + +By default we don't get reports on the other coordinates in an +explicit MM tag, but the HTS_MOD_REPORT_UNCHECKED flag will report +them (with quality HTS_MOD_UNCHECKED) meaning we can do consensus +modification analysis with accurate counting when dealing with a +mixture of explicit and implicit records. + + +We have different ways of processing the base modifications. We can +iterate either mod-by-mod or position-by-position, or we can simply +query a specific coordinate as may be done when processing a pileup. + +To check for base modifications at a specific location within a +sequence we can use bam_mods_at_qpos. This provides complete random +access within the MM string. However currently this is inefficiently +implemented so should only be used for occasional analysis or as a way +to start iterating at a specific location. It modifies the state +position, so after the first use we can then switch to +bam_mods_at_next_pos to iterate position by position from then on. + + hts_base_mod mods[10]; + int n = bam_mods_at_qpos(b, pos, m, mods, 10); + +For base by base, we have bam_mods_at_next_pos. This strictly starts +at the first base and reports entries one at a time. 
It's more +efficient than a loop repeatedly calling ...at-pos. + + hts_base_mod mods[10]; + int n = bam_mods_at_next_pos(b, m, mods, 10); + for (int i = 0; i < n; i++) { + // report mod i of n + } + +Iterating over modifications instead of coordinates is simpler and +more efficient as it skips reporting of unmodified bases. This is +done with bam_next_basemod. + + hts_base_mod mods[10]; + while ((n=bam_next_basemod(b, m, mods, 10, &pos)) > 0) { + for (j = 0; j < n; j++) { + // Report 'j'th mod at sequence position 'pos' + } + } + +There are also functions that query meta-data about the MM line rather +than per-site information. + +bam_mods_recorded returns an array of ints holding the +ve code ('m') +or -ve CHEBI numeric values. + + int ntypes, *types = bam_mods_recorded(m, &ntypes); + +We can then query a specific modification type to get further +information on the strand it is operating on, whether it has implicit +or explicit coordinates, and what its corresponding canonical base +is (The "C" in "C+m"). bam_mods_query_type does this by code name, +while bam_mods_queryi does this by numeric i^{th} type (from 0 to ntypes-1). + + bam_mods_query_type(m, 'c', &strand, &implicit, &canonical); + bam_mods_queryi(m, 2, &strand, &implicit, &canonical); + +*/ + +/* + * Base modifications are stored in MM/Mm tags, defined as + * + * <mm> ::= <mod-chain><mm> | "" + * <mod-chain> ::= <canonical_base><strand><mod-list><delta-list> + * + * <canonical_base> ::= "A" | "C" | "G" | "T" | "N". + * + * <strand> ::= "+" | "-". + * + * <mod-list> ::= <simple-list> | <ChEBI-code> + * <simple-list> ::= <simple-mod-code> | <simple-mod-code><simple-list> + * <ChEBI-code> ::= <integer> + * <simple-mod-code> ::= <letter> + * + * <delta-list> ::= "," <integer> <delta-list> | ";" + * + * We do not allocate additional memory other than the fixed size + * state, thus we track up to 256 pointers to different locations + * within the MM and ML tags. Each pointer is for a distinct + * modification code (simple or ChEBI), meaning some may point to the + * same delta-list when multiple codes are combined together + * (e.g. "C+mh,1,5,18,3;"). This is the MM[] array. 
+ * + * Each numeric in the delta-list is tracked in MMcount[], counted + * down until it hits zero in which case the next delta is fetched. + * + * ML array similarly holds the locations in the quality (ML) tag per + * type, but these are interleaved so C+mhfc,10,15 will have 4 types + * all pointing to the same delta position, but in ML we store + * Q(m0)Q(h0)Q(f0)Q(c0) followed by Q(m1)Q(h1)Q(f1)Q(c1). This ML + * also has MLstride indicating how many positions along ML to jump + * each time we consume a base. (4 in our above example, but usually 1 + * for the simple case). + * + * One complexity of the base modification system is that mods are + * always stored in the original DNA orientation. This is so that + * tools that may reverse-complement a sequence (eg "samtools fastq -T + * MM,ML") can pass through these modification tags irrespective of + * whether they have any knowledge of their internal workings. + * + * Because we don't wish to allocate extra memory, we cannot simply + * reverse the MM and ML tags. Sadly this means we have to manage the + * reverse complementing ourselves on-the-fly. + * For reversed reads we start at the right end of MM and no longer + * stop at the semicolon. Instead we use MMend[] array to mark the + * termination point. + */ +#define MAX_BASE_MOD 256 +struct hts_base_mod_state { + int type[MAX_BASE_MOD]; // char or minus-CHEBI + int canonical[MAX_BASE_MOD];// canonical base, as seqi (1,2,4,8,15) + char strand[MAX_BASE_MOD]; // strand of modification: 0 for '+', 1 for '-' + int MMcount[MAX_BASE_MOD]; // no. canonical bases left until next mod + char *MM[MAX_BASE_MOD]; // next pos delta (string) + char *MMend[MAX_BASE_MOD]; // end of pos-delta string + uint8_t *ML[MAX_BASE_MOD]; // next qual + int MLstride[MAX_BASE_MOD]; // bytes between quals for this type + int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified? + int seq_pos; // current position along sequence + int nmods; // used array size (0 to MAX_BASE_MOD-1). 
+ uint32_t flags; // Bit-field: see HTS_MOD_REPORT_UNCHECKED +}; + +/* + * Allocates a zero-filled state with calloc; release it with + * hts_base_mod_state_free. + */ +hts_base_mod_state *hts_base_mod_state_alloc(void) { + return calloc(1, sizeof(hts_base_mod_state)); +} + +/* + * Frees a state obtained from hts_base_mod_state_alloc. + */ +void hts_base_mod_state_free(hts_base_mod_state *state) { + free(state); +} + +/* + * Count how often each 4-bit base code occurs in the sequence. + * freq[15] (N) is then overwritten with the full sequence length. + */ +static void seq_freq(const bam1_t *b, int freq[16]) { + int i; + + memset(freq, 0, 16*sizeof(*freq)); + uint8_t *seq = bam_get_seq(b); + for (i = 0; i < b->core.l_qseq; i++) + freq[bam_seqi(seq, i)]++; + freq[15] = b->core.l_qseq; // all bases count as N for base mods +} + +//0123456789ABCDEF +//=ACMGRSVTWYHKDBN aka seq_nt16_str[] +//=TGKCYSBAWRDMHVN complement of seq_nt16_str +//084C2A6E195D3B7F +// Maps a 4-bit base code to the code of its complement. +static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; + +/* + * Parse the MM and ML tags to populate the base mod state. + * This structure will have been previously allocated via + * hts_base_mod_state_alloc, but it does not need to be repeatedly + * freed and allocated for each new bam record. (Although obviously + * it requires a new call to this function.) + * + * Flags are copied into the state and used to control reporting functions. + * Currently the only flag is HTS_MOD_REPORT_UNCHECKED, to control whether + * explicit "C+m?" mods report quality HTS_MOD_UNCHECKED for the bases + * outside the explicitly reported region. 
+ * + * Returns 0 on success, including when the record has no MM tag; + * -1 if the tags are malformed or inconsistent with the sequence. + */ +int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state, + uint32_t flags) { + // Reset position, else upcoming calls may fail on + // seq pos - length comparison + state->seq_pos = 0; + state->nmods = 0; + state->flags = flags; + + // Read MM and ML tags + uint8_t *mm = bam_aux_get(b, "MM"); + if (!mm) mm = bam_aux_get(b, "Mm"); + if (!mm) + return 0; + if (mm[0] != 'Z') { + hts_log_error("%s: MM tag is not of type Z", bam_get_qname(b)); + return -1; + } + + uint8_t *mi = bam_aux_get(b, "MN"); + if (mi && bam_aux2i(mi) != b->core.l_qseq) { + // bam_aux2i will set errno = EINVAL and return 0 if the tag + // isn't integer, but 0 will be a seq-length mismatch anyway so + // triggers an error here too. + hts_log_error("%s: MM/MN data length is incompatible with" + " SEQ length", bam_get_qname(b)); + return -1; + } + + uint8_t *ml = bam_aux_get(b, "ML"); + if (!ml) ml = bam_aux_get(b, "Ml"); + if (ml && (ml[0] != 'B' || ml[1] != 'C')) { + hts_log_error("%s: ML tag is not of type B,C", bam_get_qname(b)); + return -1; + } + uint8_t *ml_end = ml ? 
ml+6 + le_to_u32(ml+2) : NULL; + if (ml) ml += 6; + + // Aggregate freqs of ACGTN if reversed, to get final-delta (later) + int freq[16]; + if (b->core.flag & BAM_FREVERSE) + seq_freq(b, freq); + + char *cp = (char *)mm+1; + int mod_num = 0; + int implicit = 1; + while (*cp) { + for (; *cp; cp++) { + // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*; + unsigned char btype = *cp++; + + if (btype != 'A' && btype != 'C' && + btype != 'G' && btype != 'T' && + btype != 'U' && btype != 'N') + return -1; + if (btype == 'U') btype = 'T'; + + btype = seq_nt16_table[btype]; + + // Strand + if (*cp != '+' && *cp != '-') + return -1; // malformed + char strand = *cp++; + + // List of modification types + char *ms = cp, *me; // mod code start and end + char *cp_end = NULL; + int chebi = 0; + if (isdigit_c(*cp)) { + chebi = strtol(cp, &cp_end, 10); + cp = cp_end; + ms = cp-1; // a ChEBI number is one mod: makes me-ms == 1 + } else { + while (*cp && isalpha_c(*cp)) + cp++; + if (*cp == '\0') + return -1; + } + + me = cp; + + // Optional explicit vs implicit marker + implicit = 1; + if (*cp == '.') { + // default is implicit = 1; + cp++; + } else if (*cp == '?') { + implicit = 0; + cp++; + } else if (*cp != ',' && *cp != ';') { + // parse error + return -1; + } + + long delta; + int n = 0; // nth symbol in a multi-mod string + int stride = me-ms; + int ndelta = 0; + + if (b->core.flag & BAM_FREVERSE) { + // We process the sequence in left to right order, + // but delta is successive count of bases to skip + // counting right to left. This also means the number + // of bases to skip at left edge is unrecorded (as it's + // the remainder). + // + // To output mods in left to right, we step through the + // MM list in reverse and need to identify the left-end + // "remainder" delta. + int total_seq = 0; + for (;;) { + cp += (*cp == ','); + if (*cp == 0 || *cp == ';') + break; + + delta = strtol(cp, &cp_end, 10); + if (cp_end == cp) { + hts_log_error("%s: Hit end of MM tag. 
Missing " + "semicolon?", bam_get_qname(b)); + return -1; + } + + cp = cp_end; + total_seq += delta+1; + ndelta++; + } + delta = freq[seqi_rc[btype]] - total_seq; // remainder + } else { + delta = *cp == ',' + ? strtol(cp+1, &cp_end, 10) + : 0; + if (!cp_end) { + // empty list + delta = INT_MAX; + cp_end = cp+1; + } + } + // Now delta is first in list or computed remainder, + // and cp_end is either start or end of the MM list. + while (ms < me) { + state->type [mod_num] = chebi ? -chebi : *ms; + state->strand [mod_num] = (strand == '-'); + state->canonical[mod_num] = btype; + state->MLstride [mod_num] = stride; + state->implicit [mod_num] = implicit; + + if (delta < 0) { + hts_log_error("%s: MM tag refers to bases beyond sequence " + "length", bam_get_qname(b)); + return -1; + } + state->MMcount [mod_num] = delta; + if (b->core.flag & BAM_FREVERSE) { + // NOTE(review): the scan above has already advanced cp to the + // list terminator, so MM (cp+1) lies beyond MMend, yet the + // backwards scan in bam_mods_at_next_pos stops on reaching + // MM - confirm this is intended. + state->MM [mod_num] = cp+1; + state->MMend[mod_num] = cp_end; + state->ML [mod_num] = ml ? ml+n +(ndelta-1)*stride: NULL; + } else { + state->MM [mod_num] = cp_end; + state->MMend[mod_num] = NULL; + state->ML [mod_num] = ml ? ml+n : NULL; + } + + if (++mod_num >= MAX_BASE_MOD) { + hts_log_error("%s: Too many base modification types", + bam_get_qname(b)); + return -1; + } + ms++; n++; + } + + // Skip modification deltas + if (ml) { + if (b->core.flag & BAM_FREVERSE) { + ml += ndelta*stride; + } else { + while (*cp && *cp != ';') { + if (*cp == ',') + ml+=stride; + cp++; + } + } + if (ml > ml_end) { + hts_log_error("%s: Insufficient number of entries in ML " + "tag", bam_get_qname(b)); + return -1; + } + } else { + // cp_end already known if FREVERSE + if (cp_end && (b->core.flag & BAM_FREVERSE)) + cp = cp_end; + else + while (*cp && *cp != ';') + cp++; + } + if (!*cp) { + hts_log_error("%s: Hit end of MM tag. 
Missing semicolon?", + bam_get_qname(b)); + return -1; + } + } + } + + state->nmods = mod_num; + + return 0; +} + +/* + * As bam_parse_basemod2, with flags set to 0. + */ +int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { + return bam_parse_basemod2(b, state, 0); +} + +/* + * Fills out mods[] with the base modifications found. + * Returns the number found (0 if none), which may be more than + * the size of n_mods if more were found than reported. + * Returns <= -1 on error. + * + * This always marches left to right along sequence, irrespective of + * reverse flag or modification strand. + */ +int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods) { + // NOTE(review): only the forward branch compares seq_pos with l_qseq; + // on reverse reads bam_seqi() below is reached for any seq_pos >= 0 - + // confirm callers never step past the end of the sequence. + if (b->core.flag & BAM_FREVERSE) { + if (state->seq_pos < 0) + return -1; + } else { + if (state->seq_pos >= b->core.l_qseq) + return -1; + } + + int i, j, n = 0; + unsigned char base = bam_seqi(bam_get_seq(b), state->seq_pos); + state->seq_pos++; + if (b->core.flag & BAM_FREVERSE) + base = seqi_rc[base]; + + for (i = 0; i < state->nmods; i++) { + int unchecked = 0; + if (state->canonical[i] != base && state->canonical[i] != 15/*N*/) + continue; + + if (state->MMcount[i]-- > 0) { + if (!state->implicit[i] && + (state->flags & HTS_MOD_REPORT_UNCHECKED)) + unchecked = 1; + else + continue; + } + + char *MMptr = state->MM[i]; + if (n < n_mods) { + mods[n].modified_base = state->type[i]; + mods[n].canonical_base = seq_nt16_str[state->canonical[i]]; + mods[n].strand = state->strand[i]; + mods[n].qual = unchecked + ? HTS_MOD_UNCHECKED + : (state->ML[i] ? *state->ML[i] : HTS_MOD_UNKNOWN); + } + n++; + + if (unchecked) + continue; + + if (state->ML[i]) + state->ML[i] += (b->core.flag & BAM_FREVERSE) + ? 
-state->MLstride[i] + : +state->MLstride[i]; + + if (b->core.flag & BAM_FREVERSE) { + // process MM list backwards + char *cp; + for (cp = state->MMend[i]-1; cp != state->MM[i]; cp--) + if (*cp == ',') + break; + state->MMend[i] = cp; + if (cp != state->MM[i]) + state->MMcount[i] = strtol(cp+1, NULL, 10); + else + state->MMcount[i] = INT_MAX; + } else { + if (*state->MM[i] == ',') + state->MMcount[i] = strtol(state->MM[i]+1, &state->MM[i], 10); + else + state->MMcount[i] = INT_MAX; + } + + // Multiple mods at the same coords. + for (j=i+1; j < state->nmods && state->MM[j] == MMptr; j++) { + if (n < n_mods) { + mods[n].modified_base = state->type[j]; + mods[n].canonical_base = seq_nt16_str[state->canonical[j]]; + mods[n].strand = state->strand[j]; + // NOTE(review): the entry for i above falls back to + // HTS_MOD_UNKNOWN; presumably -1 is the same value - confirm. + mods[n].qual = state->ML[j] ? *state->ML[j] : -1; + } + n++; + state->MMcount[j] = state->MMcount[i]; + state->MM[j] = state->MM[i]; + if (state->ML[j]) + state->ML[j] += (b->core.flag & BAM_FREVERSE) + ? -state->MLstride[j] + : +state->MLstride[j]; + } + i = j-1; + } + + return n; +} + +/* + * Return data at the next modified location. + * + * bam_mods_at_next_pos does quite a bit of work, so we don't want to + * repeatedly call it for every location until we find a mod. Instead + * we check how many base types we can consume before the next mod, + * and scan through the sequence looking for them. Once we're at that + * site, we defer back to bam_mods_at_next_pos for the return values. 
+ * + * Returns the number of mods at the site found (as reported by + * bam_mods_at_next_pos) with *pos set to its sequence position, + * 0 when there are no further mods, + * -1 if a forward-strand MM tag refers to bases beyond the sequence. + */ +int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods, int *pos) { + if (state->seq_pos >= b->core.l_qseq) + return 0; + + // Find, per base type, the smallest MMcount: the number of bases of + // that type that can be skipped before the next mod. + int next[16], freq[16] = {0}, i; + // Every byte is set to 0x7f so each entry starts as a large positive value + memset(next, 0x7f, 16*sizeof(*next)); + const int unchecked = state->flags & HTS_MOD_REPORT_UNCHECKED; + if (b->core.flag & BAM_FREVERSE) { + for (i = 0; i < state->nmods; i++) { + // NOTE(review): the forward branch below stores 0 here, not 1 - + // confirm the difference is intended. + if (unchecked && !state->implicit[i]) + next[seqi_rc[state->canonical[i]]] = 1; + else if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i]) + next[seqi_rc[state->canonical[i]]] = state->MMcount[i]; + } + } else { + for (i = 0; i < state->nmods; i++) { + if (unchecked && !state->implicit[i]) + next[state->canonical[i]] = 0; + else if (next[state->canonical[i]] > state->MMcount[i]) + next[state->canonical[i]] = state->MMcount[i]; + } + } + + // Now step through the sequence counting off base types. + for (i = state->seq_pos; i < b->core.l_qseq; i++) { + unsigned char bc = bam_seqi(bam_get_seq(b), i); + if (next[bc] <= freq[bc] || next[15] <= freq[15]) + break; + freq[bc]++; + if (bc != 15) // N + freq[15]++; + } + *pos = state->seq_pos = i; + + if (i >= b->core.l_qseq) { + // Check for more MM elements than bases present. + for (i = 0; i < state->nmods; i++) { + if (!(b->core.flag & BAM_FREVERSE) && + state->MMcount[i] < 0x7f000000) { + hts_log_warning("MM tag refers to bases beyond sequence length"); + return -1; + } + } + return 0; + } + + if (b->core.flag & BAM_FREVERSE) { + for (i = 0; i < state->nmods; i++) + state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]]; + } else { + for (i = 0; i < state->nmods; i++) + state->MMcount[i] -= freq[state->canonical[i]]; + } + + int r = bam_mods_at_next_pos(b, state, mods, n_mods); + return r > 0 ? r : 0; +} + +/* + * As per bam_mods_at_next_pos, but at a specific qpos >= the previous qpos. 
+ * This can only march forwards along the read, but can do so by more than + * one base-pair. + * + * This makes it useful for calling from pileup iterators where qpos may + * start part way through a read for the first occurrence of that record. + * + * Returns the value from the final bam_mods_at_next_pos call, or 0 if + * the state position is already beyond qpos. + */ +int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods) { + // FIXME: for now this is inefficient in implementation. + int r = 0; + while (state->seq_pos <= qpos) + if ((r = bam_mods_at_next_pos(b, state, mods, n_mods)) < 0) + break; + + return r; +} + +/* + * Returns the list of base modification codes provided for this + * alignment record as an array of character codes (+ve) or ChEBI numbers + * (negative). + * + * Returns the array, with *ntype filled out with the size. + * The array returned should not be freed. + * It is a valid pointer until the state is freed using + * hts_base_mod_state_free(). + */ +int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) { + *ntype = state->nmods; + return state->type; +} + +/* + * Returns data about a specific modification type for the alignment record. + * Code is either positive (eg 'm') or negative for ChEBI numbers. + * + * Return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. + */ +int bam_mods_query_type(hts_base_mod_state *state, int code, + int *strand, int *implicit, char *canonical) { + // Find code entry + int i; + for (i = 0; i < state->nmods; i++) { + if (state->type[i] == code) + break; + } + if (i == state->nmods) + return -1; + + // Return data + if (strand) *strand = state->strand[i]; + if (implicit) *implicit = state->implicit[i]; + // The string is indexed by the 4-bit base code (1=A, 2=C, 4=G, 8=T, 15=N) + if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]]; + + return 0; +} + +/* + * Returns data about the ith modification type for the alignment record. + * + * Return 0 on success or -1 if not found. 
The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. + */ +int bam_mods_queryi(hts_base_mod_state *state, int i, + int *strand, int *implicit, char *canonical) { + if (i < 0 || i >= state->nmods) + return -1; + + // Return data + if (strand) *strand = state->strand[i]; + if (implicit) *implicit = state->implicit[i]; + // The string is indexed by the 4-bit base code (1=A, 2=C, 4=G, 8=T, 15=N) + if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]]; + + return 0; +} diff --git a/samples/DEMO.md b/samples/DEMO.md new file mode 100644 index 000000000..911792899 --- /dev/null +++ b/samples/DEMO.md @@ -0,0 +1,1437 @@ +# HTS API + +## HTSLib APIs and samtools + +HTSLib is a C library implementation used to access and process the genome +sequence data. HTSLib implements multiple API interfaces, HTS API, VCF API and +SAM API. HTS API provides a framework for use by other APIs and applications, +implements bgzf compression, htscodecs and provides CRAM format support. VCF +APIs work with variant data in VCF and BCF format. + +SAM API works with sequence data of different formats, SAM / BAM / CRAM / +FASTA / FASTQ, and provides methods to do operations on the data. It uses +methods from HTS API. + +'samtools' is the utility used to read and modify sequence data. It uses SAM +APIs from HTSLib to work on the sequence data. + + +## About this document + +There are a number of demonstration utilities and their source code in +'samples' directory of HTSLib and this document gives the description of them +and the usage of API of HTSLib. The samples are for demonstration +purposes only and proper error handling is required for actual usage. This +document is based on HTSLib version 1.17. + +Updates to this document may be made along with later releases when required. + + +## The sample apps + +Flags - This application showcases the basic read of alignment files and flag +access. It reads and shows the count of read1 and read2 alignments. 
+ +Split - This application showcases the basic read and write of alignment data. +It saves the read1 and read2 as separate files in given directory, one as sam +and other as bam. + +Split2 - This application showcases the output file format selection. It saves +the read1 and read2 as separate files in given directory, both as compressed +sam though the extensions are different. + +Cram - This application showcases the different way in which cram reference +data is used for cram output creation. + +Read_fast - This application showcases the fasta/fastq data read. + +Read_header - This application showcases the read and access of header data. +It can show all header line of given type, data of a given tag on a specific +header line or for all lines of given type. + +Read_ref - This application showcases the read and access of header data. +It shows all reference names which has length equal or greater to given input. + +Read_bam - This application showcases read of different alignment data fields. +It shows contents of each alignment. + +Read_aux - This application showcases read of specific auxiliary tag data in +alignment. It shows the data retrieved using 2 APIs, one as a string with tag +data and other as raw data alternatively. + +Dump_aux - This application showcases read of all auxiliary tag data one by one +in an alignment. It shows the data retrieved. + +Add_header - This application showcases the write of header lines to a file. +It adds header line of types, SQ, RG, PG and CO and writes to standard output. + +Remove_header - This application showcases removal of header line from a file. +It removes either all header lines of given type or one specific line of given +type with given unique identifier. Modified header is written on standard +output. + +Update_header - This application shows the update of header line fields, where +update is allowed. It takes the header line type, unique identifier for the +line, tag to be modified and the new value. 
Updated data is written on standard +output. + +Mod_bam - This application showcases the update of alignment data. It takes +alignment name, position of field to be modified and new value of it. +Modified data is written on standard output. + +Mod_aux - This application showcases the update of auxiliary data in alignment. +It takes alignment name, tag to be modified, its type and new value. Modified +data is written on standard output. + +Mod_aux_ba - This application showcases the update of auxiliary array data in +alignment. It adds count of ATCGN base as an array in auxiliary data, BA:I. +Modified data is written on standard output. + +Write_fast - This application showcases the fasta/fastq data write. It appends +dummy data to given file. + +Index_write - This application showcases the creation of index along with +output creation. Based on file type and shift, it creates bai, csi or crai +files. + +Read_reg - This application showcases the usage of region specification in +alignment read. + +Read_multireg - This application showcases the usage of multiple region +specification in alignment read. + +Pileup - This application showcases the pileup api, where all alignments +covering a reference position are accessed together. It displays the bases +covering each position on standard output. + +Mpileup - This application showcases the mpileup api, which supports multiple +input files for pileup and gives a side by side view of them in pileup format. +It displays the bases covering each position on standard output. + +Modstate - This application showcases the access of base modifications in +alignment. It shows the modifications present in an alignment and accesses them +using available APIs. There are 2 APIs and which one to be used can be selected +through input. + +Pileup_mod - This application showcases the base modification access in pileup +mode. It shows the pileup display with base modifications. 
+ +Flags_field - This application showcases the read of selected fields alone, +reducing the overhead / increasing the performance. It reads the flag field +alone and shows the count of read1 and read2. This has impact only on CRAM +files. + +Split_thread1 - This application showcases the use of threads in file handling. +It saves the read1 and read2 as separate files in given directory, one as sam +and other as bam. 2 threads are used for read and 1 each dedicated for each +output file. + +Split_thread2 - This application showcases the use of thread pool in file +handling. It saves the read1 and read2 as separate files in given directory, +one as sam and other as bam. A pool of 4 threads is created and shared for both +read and write. + + +## Building the sample apps + +The samples expect the HTSLib is installed, libraries and header file path are +part of the PATH environment variable. If not, these paths need to be explicitly +passed during the build time. + +Gcc and compatible compilers can be used to build the samples. + +These applications can be linked statically or dynamically to HTSLib. +For static linking, along with htslib other libraries and/or headers required +to build are, math, pthread, curl, lzma, z and bz2 libraries. + +A makefile is available along with source files which links statically to +htslib. To use dynamic linking, update the makefile's 'LDFLAGS' and 'rpath' +path. The 'rpath' path to be set as the path to lib directory of htslib +installation. + + +## Usage of HTS APIs +### Sequence data file access for read + +The sequence data file for read may be opened using the sam_open method. It +opens the file and returns samFile (htsFile) pointer on success or NULL on +failure. The input can be path to a file in disk, network, cloud or '-' +designating the standard input. + +SAM, BAM and CRAM file formats are supported and the input file format is +detected from the file content. + +Once done with the file, it needs to be closed with sam_close. 
+ +Many times, header details would be required and can be read using +sam_hdr_read api. It returns sam_hdr_t pointer or NULL. The returned header +needs to be destroyed using sam_hdr_destroy when no longer required. + +The sequence data may be compressed or uncompressed on disk and on memory it +is read and kept as uncompressed BAM format. It can be read from a file using +sam_read1 api. samFile pointer, header and bam storage are to be passed as +argument and it returns >= 0 on success, -1 on end of file and < -1 in case of +errors. + +The bam storage has to be initialised using bam_init1 api before the call and +can be reused for successive reads. Once done, it needs to be destroyed using +bam_destroy1. The member field named core - bam1_core_t - in bam storage, +bam1_t, has the sequence data in an easily accessible way. Using the fields +and macros, data can easily be read from it. + + #include <htslib/sam.h> + + int main(int argc, char *argv[]) + { + ... + //initialize + if (!(bamdata = bam_init1())) { + ... + //open input files - r reading + if (!(infile = sam_open(inname, "r"))) { + ... + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + ... + //read data, check flags and update count + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + cntread1++; + } + ... + //clean up + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; + } +Refer: flags_demo.c + +This shows the count of read1 and read2 alignments. + + ./flags /tmp/sample.sam.gz + +To read CRAM files, reference data is required and if it is not available, based +on configuration, library may try to download it from external repositories. + + +### Sequence data file access for write + +File access for write is similar to read with a few additional optional steps. + +The output file can be opened using sam_open api as in read, with "w" instead +of "r" as mode. 
This opens the file for writing and uses mode to select the +output file type. "w" alone denotes SAM, "wb" denotes BAM and "wc" denotes CRAM. + +Another way is to use sam_open_mode method, which sets the output file type and +compression based on the file name and explicit textual format specification. +This method expects a buffer to append type and compression flags. Usually a +buffer with standard file open flag is used, the buffer past the flag is passed +to the method to ensure existing flags and updates from this method are present +in the same buffer without being overwritten. This method will add more flags +indicating file type and compression based on name. If explicit format detail +given, then extension is ignored and the explicit specification is used. This +updated buffer can be used with sam_open to select the file format. + +sam_open_format method may also be used to open the file for output as more +information on the output file can be specified using this. Can use +mode buffer from sam_open_mode api or explicit format structure for this. + +The header data can be written using the sam_hdr_write api. When the header +data is copied to another variable and has different lifetime, it is good to +increase the reference count of the header using sam_hdr_incr_ref and +sam_hdr_destroy called as many times as required. + +The alignment data can be written using the sam_write1 api. It takes a samFile +pointer, header pointer and the alignment data. The header data is required to +set the reference name in the alignment. It returns -ve value on error. + + int main(int argc, char *argv[]) + { + ... + if (!(infile = sam_open(inname, "r"))) { + ... + outfile1 = sam_open(file1, "w"); //as SAM + outfile2 = sam_open(file2, "wb"); //as BAM + ... + if (!(in_samhdr = sam_hdr_read(infile))) { + ... + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || + (sam_hdr_write(outfile2, in_samhdr) == -1)) { + ... 
+ while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0) { + ... + } +Refer: split.c + +This creates 1.sam and 2.bam in /tmp/ containing read1 and read2 respectively. + + ./split /tmp/sample.sam.gz /tmp/ + +Below code excerpt shows sam_open_mode api usage. + + int main(int argc, char *argv[]) + { + ... + //set file open mode based on file name for 1st and as explicit for 2nd + if ((sam_open_mode(mode1+1, file1, NULL) == -1) || + (sam_open_mode(mode2+1, file2, "sam.gz") == -1)) { + ... + if (!(infile = sam_open(inname, "r"))) { + ... + //open output files + outfile1 = sam_open(file1, mode1); //as compressed SAM through sam_open + outfile2 = sam_open_format(file2, mode2, NULL); //as compressed SAM through sam_open_format + ... + } +Refer: split2.c + +This creates 1.sam.gz and 2.sam in /tmp/ both having compressed data. + + ./split2 /tmp/sample.sam.gz /tmp/ + +An htsFormat structure filled appropriately can also be used to specify output +file format while using sam_open_format api. + + +### CRAM writing + +CRAM files uses reference data and compresses alignment data. A CRAM file may +be created with external reference data file - most appropriate, with embedded +reference in it or with no reference data at all. It can also be created using +an autogenerated reference, based on consensus with-in the alignment data. +The reference detail can be set to an htsFormat structure using hts_parse_format +api and used with sam_open_format api to create appropriate CRAM file. + + ... + snprintf(reffmt1, size1, "cram,reference=%s", reffile); + snprintf(reffmt2, size2, "cram,embed_ref=1,reference=%s", reffile); + ... 
+ if (hts_parse_format(&fmt1, reffmt1) == -1 || //using external reference - uses the M5/UR tags to get + reference data during read + hts_parse_format(&fmt2, reffmt2) == -1 || //embed the reference internally + hts_parse_format(&fmt3, "cram,embed_ref=2") == -1 || //embed autogenerated reference + hts_parse_format(&fmt4, "cram,no_ref=1") == -1) { //no reference data encoding at all + ... + outfile1 = sam_open_format(file1, "wc", &fmt1); outfile2 = sam_open_format(file2, "wc", &fmt2); + ... +Refer: cram.c + + +### FASTA/FASTQ data access + +FASTA/FASTQ files have the raw sequence data and the data can be read one by +one using sam_read1 or a selected range using a region. The data can be written +similar to alignment data using sam_write1 api. To write the file, format +can be set by updating mode buffer using sam_open_mode with file name +or explicit format text. This mode buffer can be used with sam_open or can be +used with sam_open_format with explicit format information in htsFormat +structure. + + ... + if (!(bamdata = bam_init1())) { + ... + if (!(infile = sam_open(inname, "r"))) { + ... + if (infile->format.format != fasta_format && infile->format.format != fastq_format) { + ... + if (!(in_samhdr = sam_hdr_read(infile))) { + ... + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + printf("\nsequence: "); + for (c = 0; c < bamdata->core.l_qseq; ++c) { + printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bamdata), c)]); + } + if (infile->format.format == fastq_format) { + printf("\nquality: "); + for (c = 0; c < bamdata->core.l_qseq; ++c) { + printf("%c", bam_get_qual(bamdata)[c]); + ... +Refer: read_fast.c + + ... + char mode[4] = "a"; + ... + if (sam_open_mode(mode + 1, outname, NULL) < 0) { + ... + if (!(outfile = sam_open(outname, mode))) { + ... + if (bam_set1(bamdata, sizeof("test"), "test", BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, 10, "AACTGACTGA", "1234567890", 0) + < 0) { + ... 
+ if (sam_write1(outfile, out_samhdr, bamdata) < 0) { + printf("Failed to write data\n"); + ... +Refer: write_fast.c + + +### Header data read + +The header gives the version, reference details, read group, change history +and comments. These data are stored inside the sam_hdr_t. Each of these +entries, except comments, have their unique identifier and it is required to +access different fields of them. The api sam_hdr_count_lines gives the count +of the specified type of header line. The value of a unique identifier to a +specific type of header line can be retrieved with sam_hdr_line_name api. The +api sam_hdr_find_tag_id and sam_hdr_find_tag_pos can get the field data from a +header line using unique identifier values or using position. The full header +line can be retrieved using sam_hdr_find_line_pos or sam_hdr_find_line_id with +position and unique identifier values respectively. + + ... + if (!(in_samhdr = sam_hdr_read(infile))) { + ... + ret = sam_hdr_find_tag_id(in_samhdr, header, id, idval, tag, &data); + ... + ret = sam_hdr_find_line_id(in_samhdr, header, id, idval, &data); + ... + linecnt = sam_hdr_count_lines(in_samhdr, header); + ... + ret = sam_hdr_find_tag_pos(in_samhdr, header, c, tag, &data); + ... + ret = sam_hdr_find_line_pos(in_samhdr, header, c, &data); + ... +Refer: read_header.c + +This will show the VN tag's value from HD header. + + ./read_header /tmp/sample.sam.gz HD VN + +Shows the 2nd SQ line's LN field value. + + ./read_header /tmp/sample.sam.gz SQ SN T2 LN + +Below code excerpt shows the reference names which have a length above a given value. + + ... + linecnt = sam_hdr_count_lines(in_samhdr, "SQ"); //get reference count + ... + //iterate and check each reference's length + for (pos = 1, c = 0; c < linecnt; ++c) { + if ((ret = sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "LN", &data)) == -2) { + ...
+ size = atoll(data.s); + if (size < minsize) { + //not required + continue; + } + if (!(id = sam_hdr_line_name(in_samhdr, "SQ", c))) { + //sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "SN", &data) can also do the same! + ... + printf("%d,%s,%s\n", pos, id, data.s); + ... +Refer: read_refname.c + + +### Alignment data read + +The alignment / sequence data contains many fields. Mainly the read/query +name, flags indicating the properties of the read, reference sequence name, +position in reference to which it matches, quality of the read, CIGAR string +indicating the match status, position of mate / reverse strand, name of +reference sequence to which mate matches, the insert length, base sequence, +quality value of each base and auxiliary fields. + +Header data would be required to retrieve the reference names as alignment +contains the position of the reference in the header. + +A few of the data are directly visible in bam1_t and the rest are hidden +inside data member of bam1_t and can easily be retrieved using macros. +bam_get_qname gives the name of the read, sam_hdr_tid2name gives the reference +name. bam_get_cigar retrieves the cigar operation array, which can be decoded +using bam_cigar_oplen to get count of bases to which that operation applicable +and bam_cigar_opchr to get the cigar operation. bam_seqi retrieves the base +data at a given position in alignment and it can be converted to character by +indexing the seq_nt16_str array. + + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) + { + //QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL [TAG:TYPE:VALUE] + printf("NAME: %s\n", bam_get_qname(bamdata)); //get the query name using the macro + flags = bam_flag2str(bamdata->core.flag); //flags as string + ... + tidname = sam_hdr_tid2name(in_samhdr, bamdata->core.tid); + ... 
+ printf("MQUAL: %d\n", bamdata->core.qual); //map quality value + cigar = bam_get_cigar(bamdata); //retrieves the cigar data + for (i = 0; i < bamdata->core.n_cigar; ++i) { //no. of cigar data entries + printf("%d%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i])); //the macros gives the count of operation + and the symbol of operation for given cigar entry + } + printf("\nTLEN/ISIZE: %"PRIhts_pos"\n", bamdata->core.isize); + data = bam_get_seq(bamdata); + //get the sequence data + if (bamdata->core.l_qseq != bam_cigar2qlen(bamdata->core.n_cigar, cigar)) { //checks the length with CIGAR and query + ... + for (i = 0; i < bamdata->core.l_qseq ; ++i) { //sequence length + printf("%c", seq_nt16_str[bam_seqi(data, i)]); //retrieves the base from (internal compressed) sequence data + ... + printf("%c", bam_get_qual(bamdata)[i]+33); //retrieves the quality value + ... +Refer: read_bam.c + +Shows the data from alignments. + + ./read_bam /tmp/sample.sam.gz + + +### Aux data read + +Auxiliary data gives extra information about the alignment. There can be a +number of such data and can be accessed by specifying required tag or by +iterating one by one through them once the alignment is read as bam1_t. The +auxiliary data are stored along with the variable length data in the data +field of bam1_t. There are macros defined to retrieve information about +auxiliary data from the data field of bam1_t. + +Data for a specific tag can be retrieved as a string or can be retrieved as raw +data. bam_aux_get_str retrieves as a string, with tag name, tag type and data. +bam_aux_get can get raw data and with bam_aux_type and bam_aux2A, bam_aux2f etc. +the raw data can be extracted. + +To iterate through all data, the start of aux data is retrieved using macro +bam_aux_first and successive ones using bam_aux_next. Macro bam_aux_tag gives +the tag of the aux field and bam_aux_type gives the information about type of +the aux field.
+ +Bam_aux2i, bam_aux2f, bam_aux2Z macros retrieve the aux data's value as +integer, float and string respectively. The integer value may be of different +precision / size and the bam_aux_type character indicates how to use the +value. The string/hex data are NULL terminated. + +For array data, bam_aux_type will return 'B' and bam_auxB_len gives the length +of the array. bam_aux_type with the next byte will give the type of data in +the array. bam_auxB2i, bam_auxB2f will give integer and float data from a +given position of the array. + + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (i % 2) { //use options alternatively to demonstrate both + //option 1 - get data as string with tag and type + if ((c = bam_aux_get_str(bamdata, tag, &sdata)) == 1) { + printf("%s\n",sdata.s); + ... + //option 2 - get raw data + if (!(data = bam_aux_get(bamdata, tag))) { + ... + if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) { + ... +Refer: read_aux.c + +Shows the MD aux tag from alignments. + + ./read_aux ../../samtools/test/mpileup/mpileup.1.bam MD + + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + data = bam_aux_first(bamdata); //get the first aux data + while (data) { + printf("%.2s:%c:", bam_aux_tag(data), NULL != strchr("cCsSiI", bam_aux_type(data)) ? 'i' : bam_aux_type(data)); + //macros gets the tag and type of aux data + //dump the data + if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) { + ... + data = bam_aux_next(bamdata, data); //get the next aux data + ... +Refer: dump_aux.c + +Shows all the tags from all alignments. + + ./dump_aux ../../samtools/test/mpileup/mpileup.1.bam + + +### Add/Remove/Update header + +There are specific types of data that can be part of header data. They have +a tag from HD, SQ, RG, PG and CO. Fully formatted header lines, separated by new +line, can be added with sam_hdr_add_lines api. 
A single header line can be added +using sam_hdr_add_line api where the header type, tag and value pair are passed +as arguments, terminated with a NULL argument. The PG header lines are special +that they have a kind of linkage to previous PG lines. This linkage can be auto +generated by using sam_hdr_add_pg api which sets the 'PP' field used in linkage. +sam_hdr_write api does the write of the header data to file. + + ... + //add SQ line with SN as TR1 and TR2 + if (sam_hdr_add_lines(in_samhdr, &sq[0], 0)) { //length as 0 for NULL terminated data + ... + //add RG line with ID as RG1 + if (sam_hdr_add_line(in_samhdr, "RG", "ID", "RG1", "LB", "Test", "SM", "S1", NULL)) { + ... + //add pg line + if (sam_hdr_add_pg(in_samhdr, "add_header", "VN", "Test", "CL", data.s, NULL)) { //NULL is to indicate end of args + ... + if (sam_hdr_add_line(in_samhdr, "CO", "Test data", NULL)) { //NULL is to indicate end of args + ... + //write output + if (sam_hdr_write(outfile, in_samhdr) < 0) { + ... +Refer: add_header.c + +Not all type of header data can be removed but where it is possible, either a +specific header line can be removed or all of a header type can be removed. To +remove a specific line, header type, unique identifier field tag and its value +to be used. To remove all lines of a type, header type and unique identifier +field tag are to be used. + + ... + //remove specific line + if (sam_hdr_remove_line_id(in_samhdr, header, id, idval)) { + ... + //remove multiple lines of a header type + if (sam_hdr_remove_lines(in_samhdr, header, id, NULL)) { + ... + if (sam_hdr_write(outfile, in_samhdr) < 0) { + ... +Refer: rem_header.c + +Shows the file content after removing SQ line with SN 2. + ./rem_header ../../samtools/test/mpileup/mpileup.1.bam SQ 2 + +The unique identifier for the line needs to be found to update a field, though +not all types in the header may be modifiable. 
The api sam_hdr_update_line +takes the unique identifier for the header line type, its value, the field +which needs to be modified and the new value with which to modify it, followed +by a NULL. +e.g. To change LN field from 2000 to 2250 in SQ line with unique identifier SN +as 'chr1', sam_hdr_update_line( header, "SQ", "SN", "chr1", "LN", "2250", +NULL). To change PP field from ABC to DEF in PG line with ID APP.10, +sam_hdr_update_line( header, "PG", "ID", "APP.10", "PP", "DEF", NULL). + + ... + //update with new data + if (sam_hdr_update_line(in_samhdr, header, id, idval, tag, val, NULL) < 0) { + printf("Failed to update data\n"); + goto end; + } + ... +Refer: update_header.c + +Shows new sam file with 2nd SQ line having length as 38. + + ./update_header /tmp/sample.sam.gz SQ T1 LN 38 + + +### Update alignment data + +Many of the bam data fields may be updated by setting new value to appropriate +field in bam1_core_t structure and for a few, creating a new bam1_t record would +be easier than update of existing record. + + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) + { + ... + case 1:// QNAME + ret = bam_set_qname(bamdata, val); + break; + case 2:// FLAG + bamdata->core.flag = atol(val) & 0xFFFF; + break; + case 3:// RNAME + case 7:// RNEXT + if ((ret = sam_hdr_name2tid(in_samhdr, val)) < 0) { + ... + if (field == 3) { + //reference + bamdata->core.tid = ret; + } + else { + //mate reference + bamdata->core.mtid = ret; + } + break; + case 4:// POS + bamdata->core.pos = atoll(val); + break; + case 5:// MAPQ + bamdata->core.qual = atoi(val) & 0x0FF; + break; + case 6:// CIGAR + { + ... + //get cigar array and set all data in new bam record + if ((ncigar = sam_parse_cigar(val, NULL, &cigar, &size)) < 0) { + ... 
+ if (bam_set1(newbam, bamdata->core.l_qname, bam_get_qname(bamdata), bamdata->core.flag, bamdata->core.tid, + bamdata->core.pos, bamdata->core.qual, ncigar, cigar, bamdata->core.mtid, bamdata->core.mpos, + bamdata->core.isize, bamdata->core.l_qseq, (const char*)bam_get_seq(bamdata), + (const char*)bam_get_qual(bamdata), bam_get_l_aux(bamdata)) < 0) { + ... + //correct sequence data as input is expected in ascii format and not as compressed inside bam! + memcpy(bam_get_seq(newbam), bam_get_seq(bamdata), (bamdata->core.l_qseq + 1) / 2); + //copy the aux data + memcpy(bam_get_aux(newbam), bam_get_aux(bamdata), bam_get_l_aux(bamdata)); + ... + break; + case 8:// PNEXT + bamdata->core.mpos = atoll(val); + break; + case 9:// TLEN + bamdata->core.isize = atoll(val); + break; + case 10:// SEQ + ... + for( c = 0; c < i; ++c) { + bam_set_seqi(bam_get_seq(bamdata), c, seq_nt16_table[(unsigned char)val[c]]); + } + break; + case 11:// QUAL + ... + for (c = 0; c < i; ++c) { + val[c] -= 33; //phred score from ascii value + } + memcpy(bam_get_qual(bamdata), val, i); + ... +Refer: mod_bam.c + +Shows data with RNAME modified to T2. + + ./mod_bam /tmp/sample.sam ITR1 3 T2 + +The auxiliary data in bam1_t structure can be modified using +bam_aux_update_float, bam_aux_update_int etc. apis. If the aux field is not +present at all, it can be appended using bam_aux_append. + + ... + //matched to qname, update aux + if (!(data = bam_aux_get(bamdata, tag))) { + //tag not present append + ... + if (bam_aux_append(bamdata, tag, type, length, (const uint8_t*)val)) { + ... + else { + char auxtype = bam_aux_type(data); + //update the tag with newer value + switch (type) { + case 'f': + case 'd': + ... + if (bam_aux_update_float(bamdata, tag, atof(val))) { + ... + case 'C': + case 'S': + case 'I': + ... + if (bam_aux_update_int(bamdata, tag, atoll(val))) { + ... + case 'Z': + ... + if (bam_aux_update_str(bamdata, tag, length, val)) { + ... + case 'A': + ... 
+ //update the char data directly on buffer + *(data+1) = val[0]; + ... +Refer: mod_aux.c + +Shows the given record's MD tag set to Test. + + ./mod_aux samtools/test/mpileup/mpileup.1.bam ERR013140.6157908 MD Z Test + +The array aux fields can be updated using bam_aux_update_array api. + + ... + if (bam_aux_update_array(bamdata, "BA", 'I', sizeof(cnt)/sizeof(cnt[0]), cnt)) { + ... +Refer: mod_aux_ba.c + +Shows the records updated with an array of integers, containing count of ACGT +and N in that order. + + ./mod_aux_ba samtools/test/mpileup/mpileup.1.bam + + +### Create an index + +Indexes help to read data faster without iterating sequentially through the +file. Indexes contain the position information about alignments and that they +can be read easily. There are different type of indices, BAI, CSI, CRAI, TBI, +FAI etc. and are usually used with iterators. + +Indexing of plain/textual files are not supported, compressed SAM&FASTA/Q, BAM, +and CRAM files can be indexed. CRAM files are indexed as .crai and the other two +can be indexed as .bai or .csi files. Each of these types have different +internal representations of the index information. Bai uses a fixed +configuration values where as csi has them dynamically updated based on the +alignment data. + +Indexes can be created either with save of alignment data or explicitly by +read of existing alignment file. + +To create index along with alignment write, the sam_idx_init api need to be +invoked before the start of alignment data write. This api takes the output +samFile pointer, header pointer, minimum shift and index file path. For BAI +index, the min shift has to be 0. + +At the end of write, sam_idx_save api need to be invoked to save the index. + + //write header + if (sam_hdr_write(outfile, in_samhdr)) { + ... + // initialize indexing, before start of write + if (sam_idx_init(outfile, in_samhdr, size, fileidx)) { + ... + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + ... 
+ if (sam_idx_save(outfile)) { + ... +Refer:index_write.c + +Creates mpileup.1.bam and mpileup.1.bam.bai in /tmp/. + + ./idx_on_write ../../samtools/test/mpileup/mpileup.1.bam 0 /tmp/ + +To create index explicitly on an existing alignment data file, the +sam_index_build api or its alike can be used. sam_index_build takes the +alignment file path, min shift for the index and creates the index file in +same path. The output name will be based on the alignment file format and min +shift passed. + +The sam_index_build2 api takes the index file path as well and gives more +control than the previous one. The sam_index_build3 api provides an option to +configure the number of threads in index creation. + + +### Read with iterators + +Index file helps to read required data without sequentially accessing the file +and are required to use iterators. The interested reference, start and end +position etc. are required to read data with iterators. With index and these +information, an iterator is created and relevant alignments can be accessed by +iterating it. + +The api sam_index_load and the like does the index loading. It takes input +samFile pointer and file path. It loads the index file based on the input file +name, from the same path and with implicit index file extension - cram file +with .crai and others with .bai. The sam_index_load2 api accepts explicit path +to index file, which allows loading it from a different location and explicit +extensions. The sam_index_load3 api supports download/save of the index +locally from a remote location. These apis return NULL on failure and index +pointer on success. + +The index file path can be appended to alignment file path and used as well. +In this case the paths are expected to be separated by '##idx##'. + +The sam_itr_queryi or sam_itr_querys apis may be used to create an iterator +and sam_itr_next api does the alignment data retrieval.
Along with retrieval +of current data, it advances the iterator to next relevant data. The +sam_itr_queryi takes the interested positions as numeric values and +sam_itr_querys takes the interested position as a string. + +With sam_itr_queryi, the reference id can be the 0 based index of reference +data, -2 for unmapped alignments, -3 to start read from beginning of file, -4 +to continue from current position, -5 to return nothing. Based on the +reference id given, alignment covering the given start and end positions will +be read with sam_itr_next api. + +With sam_itr_querys, the reference sequence is identified with the name and +interested positions can be described with start and end separated by '-' as +string. When sequence is identified as '.', it begins from the start of file +and when it is '*', unmapped alignments are read. Reference with `<name>[:]`, +`<name>:S`, `<name>:S-E`, `<name>:-E` retrieves all data, all data covering position +S onwards, all data covering position S to E, all data covering up to position +E of reference with ID `<name>` respectively on read using sam_itr_next. + +The index and iterator created are to be destroyed once the need is over. +sam_itr_destroy and hts_idx_destroy apis do this. + + ... + //load index file + if (!(idx = sam_index_load2(infile, inname, idxfile))) { + ... + //create iterator + if (!(iter = sam_itr_querys(idx, in_samhdr, region))) { + ... + //read using iterator + while ((c = sam_itr_next(infile, iter, bamdata)) >= 0) { + ... + if (iter) { + sam_itr_destroy(iter); + } + if (idx) { + hts_idx_destroy(idx); + ... +Refer:index_reg_read.c + +With sample.sam, region as \* will show alignments with name UNMAP2 and UNMAP3 + + ./read_reg /tmp/sample.sam.gz \* + +With region as \., it shows all alignments + + ./read_reg /tmp/sample.sam.gz \. + +With region as T1:1-4, start 1 and end 4 it shows nothing and with T1:1-5 it +shows alignment with name ITR1.
+ + ./read_reg /tmp/sample.sam.gz T1:1-5 + +With region as T2:30-100, it shows alignment with name ITR2M which refers to the +reference data T2. + + ./read_reg /tmp/sample.sam.gz T2:30-100 + + +Multiple interested regions can be specified for read using sam_itr_regarray. +It takes the index, header, count of regions and region descriptions as array +of char array / string. The array passed needs to be released by the user +itself. + + ... + //load index file, assume it to be present in same location + if (!(idx = sam_index_load(infile, inname))) { + ... + //create iterator + if (!(iter = sam_itr_regarray(idx, in_samhdr, regions, regcnt))) { + ... + if (regions) { + //can be freed as it is no longer required + free(regions); + regions = NULL; + } + //get required area + while ((c = sam_itr_multi_next(infile, iter, bamdata)) >= 0) { + ... +Refer:index_multireg_read.c + +With compressed sample.sam and 2 regions from reference T1 (30 to 32) and 1 +region from T2 (34 onwards), alignments with name A1, B1, A2 and ITR2M would +be shown. + + ./read_multireg /tmp/sample.sam.gz 2 T1:30-32,T2:34 + +To use numeric indices instead of textual regions, sam_itr_regions can be used. +It takes the index, header, count of regions and an array of region +description (hts_reglist_t*), which has the start end positions as numerals. + +The index and iterators are to be destroyed using the sam_itr_destroy and +hts_idx_destroy. The hts_reglist_t* array passed is destroyed by the library +on iterator destroy. The regions array (array of char array/string) needs to be +destroyed by the user itself. + + +### Pileup and MPileup + +Pileup shows the transposed view of the SAM alignment data, i.e. it shows the +reference positions and bases which cover that position through different +reads side by side. MPileup facilitates the piling up of multiple sam files +against each other and same reference at the same time. + +Mpileup has replaced the pileup.
The input expects the data to be sorted by +position. + +Pileup needs to be initialized with bam_plp_init method which takes pointer +to a method, which will be called by pileup to read data from required files, +and pointer to data which might be required for this read method to do the +read operation. It returns a pointer to the pileup iterator. + +User can specify methods which need to be invoked during the load and unload +of an alignment, like constructor and destructor of objects. +The bam_plp_constructor and bam_plp_destructor methods do the setup of +these methods in the pileup iterator. During invocation of these methods, the +pointer to data passed in the initialization is passed as well. If the user wants +to do any custom status handling or actions during load or unload, it can be +done in these methods. Alignment specific data can be created and stored in +an argument passed to the constructor and the same will be accessible during +pileup status return. The same will be accessible during destructor as well +where any deallocation can be made. + +User is expected to invoke bam_plp_auto api to get the pileup status. It +returns the pileup status or NULL on end. During this all alignments are read +one by one, using the method given in initialization for data read, until one +for a new reference is found or all alignment covering a position is read. On +such condition, the pileup status is returned and the same continues on next +bam_plp_auto call. The pileup status returned is an array for all positions +for which the processing is completed. Along with the result, the reference +index, position in reference data and number of alignments which cover this +position are passed. User can iterate the result array and get bases from each +alignment which covers the given reference position. The alignment specific +custom data which were created in constructor function will also be available +in the result.
+ +The bam_plp_auto api invokes the data read method to load an alignment and the +constructor method is invoked during the load. Once the end of alignment is +passed, it is removed from the processing and destructor method is invoked, +that user could do deallocations and custom actions as in load during this +time. The custom data passed during the initialization is passed to the +constructor and destructor methods during invocation. + +Once the forward and reverse strands are identified, the better of the quality +is identified and used. Both reads are required for this and hence reads are +cached until its mate is read. The maximum number of reads that can be cached +is controlled by bam_plp_set_maxcnt. Reads covering a position are cached and +as soon as mate is found, quality is adjusted and is removed from cache. Reads +above the cache limit are discarded. + +Once done, the pileup iterator is to be discarded using the bam_plp_destroy api. + + ... + if (!(plpiter = bam_plp_init(readdata, &conf))) { + ... + //set constructor destructor callbacks + bam_plp_constructor(plpiter, plpconstructor); + bam_plp_destructor(plpiter, plpdestructor); + + while ((plp = bam_plp_auto(plpiter, &tid, &refpos, &n))) { + printf("%d\t%d\t", tid+1, refpos+1); + for (j = 0; j < n; ++j) { + //doesnt detect succeeding insertion and deletion together here, only insertion is identified + //deletion is detected in plp->is_del as and when pos reaches the position + //if detection ahead is required, use bam_plp_insertion here which gives deletion length along with insertion + if (plp[j].is_del || plp[j].is_refskip) { + printf("*"); + continue; + } + //start and end are displayed in UPPER and rest on LOWER + printf("%c", plp[j].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + (plp[j].is_tail ?
toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]))); + if (plp[j].indel > 0) { + //insertions, anyway not start or end + printf("+%d", plp[j].indel); + for (k = 0; k < plp[j].indel; ++k) { + printf("%c", tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos + k + 1)])); + } + } + else if (plp[j].indel < 0) { + printf("%d", plp[j].indel); + for (k = 0; k < -plp[j].indel; ++k) { + printf("?"); + } + ... + if (plpiter) { + bam_plp_destroy(plpiter); + ... +Refer:pileup.c + +The read method may use a simple read or it could be an advanced read using +indices, iterators and region specifications based on the need. The constructor +method may create any custom data and store it in the pointer passed to it. The +same needs to be released by the user in the destructor method. + +MPileup works same as the pileup and supports multiple inputs against the same +reference, giving side by side view of reference and alignments from different +inputs. + +MPileup needs to be initialized with bam_mplp_init method which takes +pointer to a method, which will be called by pileup to read data from required +files, and an array of pointer to data which might be required for this read +method to do the read operation. It returns a pointer to the mpileup iterator. + +User can specify methods which need to be invoked during the load and unload +of an alignment, like constructor and destructor of objects. +bam_mplp_constructor and bam_mplp_destructor methods do the setup +of these methods in the pileup iterator. During invocation of these methods, +the pointer to data passed in the initialization is passed as well. If the user +wants to do any custom status handling or actions during load or unload, it can +be done in these methods. Alignment specific data can be created and +stored in the custom data pointer and the same will be accessible during +return of pileup status.
The same will be accessible during destructor as well +where any deallocation can be made. + +User is expected to invoke bam_mplp_auto api to get the pileup status. It +returns the pileup status. During this all alignments are read one by one, +using the method given in initialization for data read, until one for a new +reference is found or all alignments covering a position are read. On such +condition, the pileup status is returned and the same continues on next +bam_mplp_auto call. + +The pileup status is returned through a parameter in the method itself, which is an +array for all inputs, each containing array for positions on which the +processing is completed. Along with the result, the reference index, position +in reference data and number of alignments which cover this position are +passed. User can iterate the result array and get bases from each alignment +which covers the given reference position. The alignment specific custom data +which were created in constructor function will also be available in the +result. + +Once the forward and reverse strands are identified, the better of the quality +is identified and used. Both reads are required for this and hence reads are +cached until its mate is read. The maximum number of reads that can be cached +is controlled by bam_mplp_set_maxcnt. Reads covering a position are cached and +as soon as mate is found, quality is adjusted and is removed from cache. Reads +above the cache limit are discarded. + +Once done, the pileup iterator is to be discarded by bam_mplp_destroy api. + + ... + if (!(mplpiter = bam_mplp_init(argc - 1, readdata, (void**) conf))) { + ... 
+ //set constructor destructor callbacks + bam_mplp_constructor(mplpiter, plpconstructor); + bam_mplp_destructor(mplpiter, plpdestructor); + + while (bam_mplp64_auto(mplpiter, &tid, &refpos, depth, plp) > 0) { + printf("%d\t%"PRIhts_pos"\t", tid+1, refpos+1); + + for (input = 0; input < argc - 1; ++input) { + for (dpt = 0; dpt < depth[input]; ++dpt) { + if (plp[input][dpt].is_del || plp[input][dpt].is_refskip) { + printf("*"); + continue; + } + //start and end are displayed in UPPER and rest on LOWER + printf("%c", plp[input][dpt].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), + plp[input][dpt].qpos)]) : (plp[input]->is_tail ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), + plp[input][dpt].qpos)]) : tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), + plp[input][dpt].qpos)]))); + if (plp[input][dpt].indel > 0) { + //insertions, anyway not start or end + printf("+%d", plp[input][dpt].indel); + for (k = 0; k < plp[input][dpt].indel; ++k) { + printf("%c", tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), + plp[input][dpt].qpos + k + 1)])); + } + } + else if (plp[input][dpt].indel < 0) { + printf("%d", plp[input][dpt].indel); + for (k = 0; k < -plp[input][dpt].indel; ++k) { + printf("?"); + ... + if (mplpiter) { + bam_mplp_destroy(mplpiter); + } + ... + if (plp) { + free(plp); + ... +Refer:mpileup.c + +This sample takes multiple sam files and shows the pileup of data side by side. + + ./mpileup /tmp/mp.bam /tmp/mp.sam + + +### Base modifications + +The alignment data may contain base modification information as well. This +gives the base, modifications found, orientation in which it was found and the +quality for the modification. The base modification can be identified using +bam_parse_basemod api. It stores the modification details on hts_base_mod_state +and this has to be initialized using hts_base_mod_state_alloc api. 
+ +Once the modifications are identified, they can be accessed through different +ways. bam_mods_recorded api gives the modifications identified for an alignment. +Modifications can be queried for each base position iteratively using +bam_mods_at_next_pos api. Check the returned value with buffer size to see +whether the buffer is big enough to retrieve all modifications. +Instead of querying for each position, the next modified position can be +retrieved directly using bam_next_basemod api. An alignment can be +queried to have a specific modification using bam_mods_query_type api. At the +end of processing, the state needs to be released using hts_base_mod_state_free +api. + + ... + if (!(ms = hts_base_mod_state_alloc())) { + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) + { + ... + if (bam_parse_basemod(bamdata, ms)) { + ... + bm = bam_mods_recorded(ms, &cnt); + for (k = 0; k < cnt; ++k) { + printf("%c", bm[k]); + } + printf("\n"); + hts_base_mod mod[5] = {0}; //for ATCGN + if (opt) { + //option 1 + for (; i < bamdata->core.l_qseq; ++i) { + if ((r = bam_mods_at_next_pos(bamdata, ms, mod, sizeof(mod)/sizeof(mod[0]))) <= -1) { + printf("Failed to get modifications\n"); + goto end; + } + else if (r > (sizeof(mod) / sizeof(mod[0]))) { + printf("More modifications than this app can handle, update the app\n"); + goto end; + } + else if (!r) { + //no modification at this pos + printf("%c", seq_nt16_str[bam_seqi(data, i)]); + } + //modifications + for (j = 0; j < r; ++j) { + printf("%c%c%c", mod[j].canonical_base, mod[j].strand ? '-' : '+', mod[j].modified_base); + ... + else { + //option 2 + while ((r = bam_next_basemod(bamdata, ms, mod, sizeof(mod)/sizeof(mod[0]), &pos)) >= 0) { + for (; i < bamdata->core.l_qseq && i < pos; ++i) { + printf("%c", seq_nt16_str[bam_seqi(data, i)]); + } + //modifications + for (j = 0; j < r; ++j) { + printf("%c%c%c", mod[j].canonical_base, mod[j].strand ? '-' : '+', mod[j].modified_base); + } + ... 
+ //check last alignment's base modification + int strand = 0, impl = 0; + char canonical = 0, modification[] = "mhfcgebaon"; //possible modifications + printf("\n\nLast alignment has \n"); + for (k = 0; k < sizeof(modification) - 1; ++k) { //avoiding NUL termination + if (bam_mods_query_type(ms, modification[k], &strand, &impl, &canonical)) { + printf ("No modification of %c type\n", modification[k]); + } + else { + printf("%s strand has %c modified with %c, can %sassume unlisted as unmodified\n", strand ? "-/bottom/reverse" : + "+/top/forward", canonical, modification[k], impl?"" : "not " ); + } + } + ... + if (ms) { + hts_base_mod_state_free(ms); + ... +Refer:modstate.c + +The modification can be accessed in pileup mode as well. bam_mods_at_qpos gives +the modification at given pileup position. Insertion and deletion to the given +position with possible modification can be retrieved using bam_plp_insertion_mod +api. + + ... + int plpconstructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { + //when using cd, initialize and use as it will be reused after destructor + cd->p = hts_base_mod_state_alloc(); + //parse the bam data and gather modification data from MM tags + return (-1 == bam_parse_basemod(b, (hts_base_mod_state*)cd->p)) ? 1 : 0; + } + + int plpdestructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { + if (cd->p) { + hts_base_mod_state_free((hts_base_mod_state *)cd->p); + cd->p = NULL; + } + return 0; + } + + int main(int argc, char *argv[]) + { + ... + if (!(plpiter = bam_plp_init(readdata, &conf))) { + ... 
+ //set constructor destructor callbacks + bam_plp_constructor(plpiter, plpconstructor); + bam_plp_destructor(plpiter, plpdestructor); + + while ((plp = bam_plp_auto(plpiter, &tid, &refpos, &depth))) { + memset(&mods, 0, sizeof(mods)); + printf("%d\t%d\t", tid+1, refpos+1); + + for (j = 0; j < depth; ++j) { + dellen = 0; + if (plp[j].is_del || plp[j].is_refskip) { + printf("*"); + continue; + } + /*invoke bam mods_mods_at_qpos before bam_plp_insertion_mod that the base modification + is retrieved before change in pileup pos thr' plp_insertion_mod call*/ + if ((modlen = bam_mods_at_qpos(plp[j].b, plp[j].qpos, plp[j].cd.p, mods, NMODS)) == -1) { + ... + //use plp_insertion/_mod to get insertion and del at the same position + if ((inslen = bam_plp_insertion_mod(&plp[j], (hts_base_mod_state*)plp[j].cd.p, &insdata, &dellen)) == -1) { + ... + //start and end are displayed in UPPER and rest on LOWER, only 1st modification considered + //base and modification + printf("%c%c%c", plp[j].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + (plp[j].is_tail ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)])), + modlen > 0 ? mods[0].strand ? '-' : '+' : '\0', modlen > 0 ? mods[0].modified_base : '\0'); + //insertion and deletions + if (plp[j].indel > 0) { + //insertion + /*insertion data from plp_insertion_mod, note this shows the quality value as well + which is different from base and modification above;the lower case display is not attempted either*/ + printf("+%d%s", plp[j].indel, insdata.s); + //handle deletion if any + if (dellen) { + printf("-%d", dellen); + for (k = 0; k < dellen; ++k) { + printf("?"); + ... + else if (plp[j].indel < 0) { + //deletion + printf("%d", plp[j].indel); + for (k = 0; k < -plp[j].indel; ++k) { + printf("?"); + } + } + ... 
+Refer:pileup_mod.c + + +### Read selected fields + +At times the whole alignment data may not be of interest and it would be +better to read required fields alone from the alignment data. CRAM file format +supports such specific data read and HTSLib provides an option to use this. +This can improve the performance on read operation. + +The hts_set_opt method does the selection of specified fields. There are flags +indicating specific fields, like SAM_FLAG, SAM_SEQ, SAM_QNAME, in alignment +data and a combination of flags for the required fields can be passed with +CRAM_OPT_REQUIRED_FIELDS to this api. + + ... + //select required field alone, this is useful for CRAM alone + if (hts_set_opt(infile, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG) < 0) { + ... + //read header + in_samhdr = sam_hdr_read(infile); + ... + //read data, check flags and update count + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + cntread1++; + ... +Refer: flags_htsopt_field.c + + +### Thread-pool to read / write + +The HTSLib api supports thread pooling for better performance. There are a few +ways in which this can be used. The pool can be made specific for a file or a +generic pool can be created and shared across multiple files. Another way to +use thread pool is to schedule tasks explicitly to queues which gets executed +using threads in pool. + +To have a thread pool specific for a file, hts_set_opt api can be used with the +file pointer, HTS_OPT_NTHREADS and the number of threads to use in the pool. +Closure of file releases the thread pool as well. To have a thread pool which +can be shared across different files, it needs to be initialized using +hts_tpool_init api, passing number of threads as argument. This thread pool can +be associated with a file using hts_set_opt api. The file pointer, +HTS_OPT_THREAD_POOL and the thread pool address are to be passed as arguments +to api. The thread pool has to be released with hts_tpool_destroy. 
+ +Below excerpt shows file specific thread pool, + + ... + //create file specific threads + if (hts_set_opt(infile, HTS_OPT_NTHREADS, 2) < 0 || //2 thread specific for reading + hts_set_opt(outfile1, HTS_OPT_NTHREADS, 1) < 0 || //1 thread specific for sam write + hts_set_opt(outfile2, HTS_OPT_NTHREADS, 1) < 0) { //1 thread specific for bam write + printf("Failed to set thread options\n"); + goto end; + } +Refer: split_thread1.c + +Below excerpt shows thread pool shared across files, + + ... + //create a pool of 4 threads + if (!(tpool.pool = hts_tpool_init(4))) { + ... + //share the pool with all the 3 files + if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 || + hts_set_opt(outfile1, HTS_OPT_THREAD_POOL, &tpool) < 0 || + hts_set_opt(outfile2, HTS_OPT_THREAD_POOL, &tpool) < 0) { + ... + if (tpool.pool) { + hts_tpool_destroy(tpool.pool); + } + ... +Refer: split_thread2.c + + +## More Information + +### CRAM reference files + +The cram reference data is required for the read of sequence data in CRAM +format. The sequence data file may have it as embedded or as a reference to +the actual file. When it is a reference, it is downloaded locally, in the +cache directory for later usage. It will be stored in a directory structure +based on the MD5 checksum in the cache directory. + +Each chromosome in a reference file gets saved as a separate file with md5sum +as its path and name. The initial 4 numerals make the directory name and rest +as the file name (/<1st 2 of md5sum>/<2nd 2 of md5sum>/). + +The download would be attempted from standard location, EBI ENA +(https://www.ebi.ac.uk/ena). + + +### Bam1_t + +This structure holds the sequence data in BAM format. There are fixed and +variable size fields, basic and extended information on sequence +data. Variable size data and extended information are kept together in a +buffer, named data in bam1_t. 
Fields in the member named core, bam1_core_t, +and a few macros together support the storage and handling of the whole +sequence data. + +- core has a link to reference as a 0 based index in field tid. The mate / + reverse strand's link to reference is given by mtid. + +- Fields pos and mpos give the position in reference to which the sequence and + its mate / reverse strand match. + +- Field flag gives the properties of the given alignment. It shows the + alignment's orientation, mate status, read order etc. + +- Field qual gives the quality of the alignment read. + +- l_qname gives the length of the name of the alignment / read, l_extranul gives + the extra space used internally in the data field. + +- l_qseq gives the length of the alignment / read in the data field. + +- n_cigar gives the number of CIGAR operations for the given alignment. + +- isize gives the insert size of the read / alignment. + +The bases in sequence data are stored by compressing 2 bases together in a +byte. When the reverse flag is set, the base data is reversed and +complemented from the actual read (i.e. if the forward read is ACTG, the +reverse read will be CAGT; it will be stored in SAM format with reversed and +complemented format as ACTG with reverse flag set). + +Macros bam_get_qname, bam_get_seq, bam_get_qual, bam_get_aux, bam_get_l_aux, +bam_seqi etc access the data field and retrieve the required data. The aux +macros support the retrieval of auxiliary data from the data field. + + +### Sam_hdr_t + +This structure holds the header information. This holds the number of targets +/ SQ lines in the file, each one's length, name and reference count to this +structure. It also has this information in an internal data structure for +easier access of each field of this data. + +When this data is shared or assigned to another variable of a different scope +or purpose, the reference count needs to be incremented to ensure that it is +valid till the end of the variable's scope. 
This is done with sam_hdr_incr_ref, and the header then needs to +be destroyed as many times with sam_hdr_destroy api. + + +### Index + +Indices need the data to be sorted by position. They can be of different +types with extension .bai, .csi or .tbi for compressed SAM/BAM files and .crai +for CRAM files. The index name can be passed along with the alignment file +itself by appending a specific character sequence. The apis can detect this +sequence and extract the index path. ##idx## is the sequence which separates +the file path and index path. + + +### Data files + +The data files can be a local file, a network file, a file accessible through +the web or in cloud storage like google and amazon. The data files can be +represented with URIs like file://, file://localhost/.., ftp://.., +gs+http[s].., s3+http[s]:// + diff --git a/samples/Makefile b/samples/Makefile new file mode 100644 index 000000000..40991d78f --- /dev/null +++ b/samples/Makefile @@ -0,0 +1,106 @@ +HTS_DIR = ../ +include $(HTS_DIR)/htslib_static.mk + +CC = gcc +CFLAGS = -Wall -g -O0 + +#to statically link to libhts +LDFLAGS = $(HTS_DIR)/libhts.a -L$(HTS_DIR) $(HTSLIB_static_LDFLAGS) $(HTSLIB_static_LIBS) + +#to dynamically link to libhts +#LDFLAGS = -L $(HTS_DIR) -lhts -Wl,-rpath, + +PRGS = flags split split2 cram read_fast read_header read_ref read_bam \ + read_aux dump_aux add_header rem_header update_header mod_bam mod_aux \ + mod_aux_ba write_fast idx_on_write read_reg read_multireg pileup \ + mpileup modstate pileup_mod flags_field split_t1 split_t2 + +all: $(PRGS) + +flags: + $(CC) $(CFLAGS) -I $(HTS_DIR) flags_demo.c -o $@ $(LDFLAGS) + +split: + $(CC) $(CFLAGS) -I $(HTS_DIR) split.c -o $@ $(LDFLAGS) + +split2: + $(CC) $(CFLAGS) -I $(HTS_DIR) split2.c -o $@ $(LDFLAGS) + +cram: + $(CC) $(CFLAGS) -I $(HTS_DIR) cram.c -o $@ $(LDFLAGS) + +read_fast: + $(CC) $(CFLAGS) -I $(HTS_DIR) read_fast.c -o $@ $(LDFLAGS) + +read_header: + $(CC) $(CFLAGS) -I $(HTS_DIR) read_header.c -o $@ $(LDFLAGS) + +read_ref: + $(CC) $(CFLAGS) -I 
$(HTS_DIR) read_refname.c -o $@ $(LDFLAGS) + +read_bam: + $(CC) $(CFLAGS) -I $(HTS_DIR) read_bam.c -o $@ $(LDFLAGS) + +read_aux: + $(CC) $(CFLAGS) -I $(HTS_DIR) read_aux.c -o $@ $(LDFLAGS) + +dump_aux: + $(CC) $(CFLAGS) -I $(HTS_DIR) dump_aux.c -o $@ $(LDFLAGS) + +add_header: + $(CC) $(CFLAGS) -I $(HTS_DIR) add_header.c -o $@ $(LDFLAGS) + +rem_header: + $(CC) $(CFLAGS) -I $(HTS_DIR) rem_header.c -o $@ $(LDFLAGS) + +update_header: + $(CC) $(CFLAGS) -I $(HTS_DIR) update_header.c -o $@ $(LDFLAGS) + +mod_bam: + $(CC) $(CFLAGS) -I $(HTS_DIR) mod_bam.c -o $@ $(LDFLAGS) + +mod_aux: + $(CC) $(CFLAGS) -I $(HTS_DIR) mod_aux.c -o $@ $(LDFLAGS) + +mod_aux_ba: + $(CC) $(CFLAGS) -I $(HTS_DIR) mod_aux_ba.c -o $@ $(LDFLAGS) + +write_fast: + $(CC) $(CFLAGS) -I $(HTS_DIR) write_fast.c -o $@ $(LDFLAGS) + +idx_on_write: + $(CC) $(CFLAGS) -I $(HTS_DIR) index_write.c -o $@ $(LDFLAGS) + +read_reg: + $(CC) $(CFLAGS) -I $(HTS_DIR) index_reg_read.c -o $@ $(LDFLAGS) + +read_multireg: + $(CC) $(CFLAGS) -I $(HTS_DIR) index_multireg_read.c -o $@ $(LDFLAGS) + +pileup: + $(CC) $(CFLAGS) -I $(HTS_DIR) pileup.c -o $@ $(LDFLAGS) + +mpileup: + $(CC) $(CFLAGS) -I $(HTS_DIR) mpileup.c -o $@ $(LDFLAGS) + +modstate: + $(CC) $(CFLAGS) -I $(HTS_DIR) modstate.c -o $@ $(LDFLAGS) + +pileup_mod: + $(CC) $(CFLAGS) -I $(HTS_DIR) pileup_mod.c -o $@ $(LDFLAGS) + +flags_field: + $(CC) $(CFLAGS) -I $(HTS_DIR) flags_htsopt_field.c -o $@ $(LDFLAGS) + +split_t1: + $(CC) $(CFLAGS) -I $(HTS_DIR) split_thread1.c -o $@ $(LDFLAGS) + +split_t2: + $(CC) $(CFLAGS) -I $(HTS_DIR) split_thread2.c -o $@ $(LDFLAGS) + +clean: + find . -name "*.o" | xargs rm -rf + find . 
-name "*.dSYM" | xargs rm -rf + rm $(PRGS) + + diff --git a/samples/README.md b/samples/README.md new file mode 100644 index 000000000..ab5481dea --- /dev/null +++ b/samples/README.md @@ -0,0 +1,227 @@ +HTSlib is an implementation of a unified C library for accessing common file +formats, such as [SAM, CRAM and VCF][1], used for high-throughput sequencing +data, and is the core library used by [samtools][2] and [bcftools][3]. + +A set of sample programs are available which showcase the usage of APIs in HTSlib. +They are based on version 1.17 of HTSLib and are mainly for demonstration of API usage. +Further optimization and error handling might be required for actual usage. + + +[1]: http://samtools.github.io/hts-specs/ +[2]: http://github.com/samtools/samtools +[3]: http://samtools.github.io/bcftools/ + +### Building and using sample programs + +GCC and compatible compilers can be used to build these samples. + +A makefile is available along with source files which links statically to +htslib. To use dynamic linking, update the makefile's 'LDFLAGS' and 'rpath' +path. The 'rpath' path is to be set to the path of the lib directory of the htslib +installation. + +```sh + +# linking statically on a linux machine +gcc -g -o <binary name> <source name> -I <htslib include path> \ + <htslib lib path>/libhts.a -lcrypto -lm -lpthread -lcurl -llzma -lz -lbz2 + +# dynamically linking with libhts +gcc -g -o <binary name> <source name> -I <htslib include path> \ + -L <htslib lib path> -lhts -Wl,-rpath,<htslib lib path> + +``` + +In many cases, the alignment data are expected as sorted, compressed and +indexed. + +### The samples... + +[Flags][Flags] + + This application showcases the basic read of alignment files and flag + access. It reads and shows the count of read1 and read2 alignments. + +[Split][Split] + + This application showcases the basic read and write of alignment data. It + saves the read1 and read2 as separate files in given directory, one as sam + and other as bam. + +[Split2][Split2] + + This application showcases the output file format selection. 
It saves the + read1 and read2 as separate files in given directory, both as compressed + sam though the extensions are different. + +[Cram][Cram] + + This application showcases the different ways in which cram reference data + is used for cram output creation. + +[Read_fast][Read_fast] + + This application showcases the fasta/fastq data read. + +[Read_header][Read_header] + + This application showcases the read and access of header data. It can show + all header lines of given type, data of a given tag on a specific header + line or for all lines of given type. + +[Read_ref][Read_ref] + + This application showcases the read and access of header data. It shows + all reference names which have length equal to or greater than the given input. + +[Read_bam][Read_bam] + + This application showcases read of different alignment data fields. It + shows contents of each alignment. + +[Read_aux][Read_aux] + + This application showcases read of specific auxiliary tag data in + alignment. It shows the data retrieved using 2 APIs, one as a string with + tag data and other as raw data alternatively. + +[Dump_aux][Dump_aux] + + This application showcases read of all auxiliary tag data one by one in an + alignment. It shows the data retrieved. + +[Add_header][Add_header] + + This application showcases the write of header lines to a file. It adds + header line of types, SQ, RG, PG and CO and writes to standard output. + +[Remove_header][Remove_header] + + This application showcases removal of header line from a file. It removes + either all header lines of given type or one specific line of given type + with given unique identifier. Modified header is written on standard + output. + +[Update_header][Update_header] + + This application shows the update of header line fields, where update is + allowed. It takes the header line type, unique identifier for the line, + tag to be modified and the new value. Updated data is written on standard + output. 
+ +[Mod_bam][Mod_bam] + + This application showcases the update of alignment data. It takes + alignment name, position of field to be modified and new value of + it. Modified data is written on standard output. + +[Mod_aux][Mod_aux] + + This application showcases the update of auxiliary data in alignment. It + takes alignment name, tag to be modified, its type and new value. Modified + data is written on standard output. + +[Mod_aux_ba][Mod_aux_ba] + + This application showcases the update of auxiliary array data in + alignment. It adds count of ATCGN base as an array in auxiliary data, + BA:I. Modified data is written on standard output. + +[Write_fast][Write_fast] + + This application showcases the fasta/fastq data write. It appends a dummy + data to given file. + +[Index_write][Index_write] + + This application showcases the creation of index along with output + creation. Based on file type and shift, it creates bai, csi or crai files. + +[Read_reg][Read_reg]: + + This application showcases the usage of region specification in alignment + read. + +[Read_multireg][Read_multireg]: + + This application showcases the usage of multiple region specification in + alignment read. + +[Pileup][Pileup]: + + This application showcases the pileup api, where all alignments covering a + reference position are accessed together. It displays the bases covering + each position on standard output. + +[Mpileup][Mpileup]: + + This application showcases the mpileup api, which supports multiple input + files for pileup and gives a side by side view of them in pileup + format. It displays the bases covering each position on standard output. + +[Modstate][Modstate]: + + This application showcases the access of base modifications in + alignment. It shows the modifications present in an alignment and accesses + them using available APIs. There are 2 APIs and which one to be used can + be selected through input. 
+ +[Pileup_mod][Pileup_mod]: + + This application showcases the base modification access in pileup mode. It + shows the pileup display with base modifications. + +[Flags_field][Flags_field] + + This application showcases the read of selected fields alone, reducing the + overhead / increasing the performance. It reads the flag field alone and + shows the count of read1 and read2. This has impact only on CRAM files. + +[Split_thread1][Split_thread1] + + This application showcases the use of threads in file handling. It saves + the read1 and read2 as separate files in given directory, one as sam and + other as bam. 2 threads are used for read and 1 each dedicated for each + output file. + +[Split_thread2][Split_thread2] + + This application showcases the use of thread pool in file handling. It + saves the read1 and read2 as separate files in given directory, one as sam + and other as bam. A pool of 4 threads is created and shared for both read + and write. + +### More Information + +More detailed documentation is available in the [DEMO.md][DEMO] with worked +examples per demonstration tool. 
+ + +[Flags]: flags_demo.c +[Split]: split.c +[Split2]: split2.c +[Cram]: cram.c +[Read_fast]: read_fast.c +[Read_header]: read_header.c +[Read_ref]: read_refname.c +[Read_bam]: read_bam.c +[Read_aux]: read_aux.c +[Dump_aux]: dump_aux.c +[Add_header]: add_header.c +[Remove_header]: rem_header.c +[Update_header]: update_header.c +[Mod_bam]: mod_bam.c +[Mod_aux]: mod_aux.c +[Mod_aux_ba]: mod_aux_ba.c +[Write_fast]: write_fast.c +[Index_write]: index_write.c +[Read_reg]: index_reg_read.c +[Read_multireg]: index_multireg_read.c +[Pileup]: pileup.c +[Mpileup]: mpileup.c +[Modstate]: modstate.c +[Pileup_mod]: pileup_mod.c +[Flags_field]: flags_htsopt_field.c +[Split_thread1]: split_thread1.c +[Split_thread2]: split_thread2.c +[DEMO]: DEMO.md diff --git a/samples/add_header.c b/samples/add_header.c new file mode 100644 index 000000000..d1a2fc13c --- /dev/null +++ b/samples/add_header.c @@ -0,0 +1,128 @@ +/* add_header.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: add_header infile\n\ +Adds new header lines of SQ, RG, PG and CO typs\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, sq[] = "@SQ\tSN:TR1\tLN:100\n@SQ\tSN:TR2\tLN:50"; + int c = 0, ret = EXIT_FAILURE; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + kstring_t data = KS_INITIALIZE; + + //update_header infile header idval tag value + if (argc != 2) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + if (!(outfile = sam_open("-", "w"))) { //use stdout as the output file for ease of display of update + printf("Could not open stdout\n"); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + //dump command line arguments for PG line + for (c = 0; c < argc; ++c) { + kputs(argv[c], &data); + kputc(' ', &data); + } + + //add SQ line with SN as TR1 and TR2 + if (sam_hdr_add_lines(in_samhdr, &sq[0], 0)) { //length as 0 for NULL terminated data + printf("Failed to add SQ lines\n"); + goto end; + } + + //add RG line with ID as RG1 + if 
(sam_hdr_add_line(in_samhdr, "RG", "ID", "RG1", "LB", "Test", "SM", "S1", NULL)) { + printf("Failed to add RG line\n"); + goto end; + } + + //add pg line + if (sam_hdr_add_pg(in_samhdr, "add_header", "VN", "Test", "CL", data.s, NULL)) { //NULL is to indicate end of args + printf("Failed to add PG line\n"); + goto end; + } + + if (sam_hdr_add_line(in_samhdr, "CO", "Test data", NULL)) { //NULL is to indicate end of args + printf("Failed to add PG line\n"); + goto end; + } + + //write output + if (sam_hdr_write(outfile, in_samhdr) < 0) { + printf("Failed to write output\n"); + goto end; + } + ret = EXIT_SUCCESS; + //bam data write to follow.... +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + ks_free(&data); + return ret; +} diff --git a/samples/cram.c b/samples/cram.c new file mode 100644 index 000000000..5f55e65d2 --- /dev/null +++ b/samples/cram.c @@ -0,0 +1,168 @@ +/* cram.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the usage text of this demo +/** @param fp pointer to the file / terminal to which the usage text is written +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: cram infile reffile outdir\n\ +Dumps the input file alignments in cram format in given directory\n\ +1.cram has external reference\n\ +2.cram has reference embedded\n\ +3.cram has autogenerated reference\n\ +4.cram has no reference data in it\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments, expected to be 4 (program name, infile, reffile, outdir) + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *outdir = NULL, *reffile = NULL; + char *file1 = NULL, *file2 = NULL, *file3 = NULL, *file4 = NULL, *reffmt1 = NULL, *reffmt2 = NULL; + int c = 0, ret = EXIT_FAILURE, size1 = 0, size2 = 0, size3 = 0; + samFile *infile = NULL, *outfile1 = NULL, *outfile2 = NULL, *outfile3 = NULL, *outfile4 = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + htsFormat fmt1 = {0}, fmt2 = {0}, fmt3 = {0}, fmt4 = {0}; + + //cram infile reffile outdir + if (argc != 4) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + reffile = argv[2]; + outdir = argv[3]; + + //allocate space for option string and output file names; sizeof of a literal already counts its NUL terminator + size1 = sizeof(char) * (strlen(reffile) + sizeof("cram,reference=") + 1); + size2 = sizeof(char) * (strlen(reffile) + sizeof("cram,embed_ref=1,reference=") + 1); + size3 = sizeof(char) * (strlen(outdir) + sizeof("/1.cram") + 1); + + reffmt1 = malloc(size1); 
reffmt2 = malloc(size2); + file1 = malloc(size3); file2 = malloc(size3); + file3 = malloc(size3); file4 = malloc(size3); + + if (!file1 || !file2 || !file3 || !file4 || !reffmt1 || !reffmt2) { + printf("Failed to create buffers\n"); + goto end; + } + + snprintf(reffmt1, size1, "cram,reference=%s", reffile); + snprintf(reffmt2, size2, "cram,embed_ref=1,reference=%s", reffile); + snprintf(file1, size3, "%s/1.cram", outdir); snprintf(file2, size3, "%s/2.cram", outdir); + snprintf(file3, size3, "%s/3.cram", outdir); snprintf(file4, size3, "%s/4.cram", outdir); + + if (hts_parse_format(&fmt1, reffmt1) == -1 || //using external reference - uses the M5/UR tags to get reference data during read + hts_parse_format(&fmt2, reffmt2) == -1 || //embed the reference internally + hts_parse_format(&fmt3, "cram,embed_ref=2") == -1 || //embed autogenerated reference + hts_parse_format(&fmt4, "cram,no_ref=1") == -1) { //no reference data encoding at all + printf("Failed to set output option\n"); + goto end; + } + + //bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input file - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //open output files - w write as SAM, wb write as BAM, wc as CRAM (equivalent to fmt3) + outfile1 = sam_open_format(file1, "wc", &fmt1); outfile2 = sam_open_format(file2, "wc", &fmt2); + outfile3 = sam_open_format(file3, "wc", &fmt3); outfile4 = sam_open_format(file4, "wc", &fmt4); + if (!outfile1 || !outfile2 || !outfile3 || !outfile4) { + printf("Could not open output file\n"); + goto end; + } + + //read header, required to resolve the target names to proper ids + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1) || + (sam_hdr_write(outfile3, in_samhdr) == -1) || 
(sam_hdr_write(outfile4, in_samhdr) == -1)) { + printf("Failed to write header\n"); + goto end; + } + + //check flags and write + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0 || + sam_write1(outfile2, in_samhdr, bamdata) < 0 || + sam_write1(outfile3, in_samhdr, bamdata) < 0 || + sam_write1(outfile4, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + if (-1 == c) { + //EOF + ret = EXIT_SUCCESS; + } + else { + printf("Error in reading data\n"); + } +end: +#define IF_OL(X,Y) if((X)) {(Y);} //if one liner + //cleanup + IF_OL(in_samhdr, sam_hdr_destroy(in_samhdr)); + IF_OL(infile, sam_close(infile)); + IF_OL(outfile1, sam_close(outfile1)); + IF_OL(outfile2, sam_close(outfile2)); + IF_OL(outfile3, sam_close(outfile3)); + IF_OL(outfile4, sam_close(outfile4)); + IF_OL(file1, free(file1)); + IF_OL(file2, free(file2)); + IF_OL(file3, free(file3)); + IF_OL(file4, free(file4)); + IF_OL(reffmt1, free(reffmt1)); + IF_OL(reffmt2, free(reffmt2)); + IF_OL(fmt1.specific, hts_opt_free(fmt1.specific)); + IF_OL(fmt2.specific, hts_opt_free(fmt2.specific)); + IF_OL(fmt3.specific, hts_opt_free(fmt3.specific)); + IF_OL(fmt4.specific, hts_opt_free(fmt4.specific)); + IF_OL(bamdata, bam_destroy1(bamdata)); + + return ret; +} diff --git a/samples/dump_aux.c b/samples/dump_aux.c new file mode 100644 index 000000000..49251fe04 --- /dev/null +++ b/samples/dump_aux.c @@ -0,0 +1,188 @@ +/* dump_aux.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: dump_aux infile\n\ +Dump the aux tags from alignments\n"); +} + +/// printauxdata - prints aux data +/** @param fp - file to which it to be printed - stdout or null + * @param type - aux type + * @param idx - index in array, -1 when not an array type + * @param data - data + * recurses when the data is array type +returns 1 on failure 0 on success +*/ +int printauxdata(FILE *fp, char type, int32_t idx, const uint8_t *data) +{ + uint32_t auxBcnt = 0; + int i = 0; + char auxBType = 'Z'; + + //the tag is already queried and ensured to exist and the type is retrieved from 
the tag data, also iterated within index for arrays, so no error is expected here. + //when these apis are used explicitly, these error conditions needs to be handled based on return value and errno + switch(type) { + case 'A': + fprintf(fp, "%c", bam_aux2A(data)); //byte data + break; + case 'c': + fprintf(fp, "%d", (int8_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //signed 1 byte data; bam_auxB2i - from array or bam_aux2i - non array data + break; + case 'C': + fprintf(fp, "%u", (uint8_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //unsigned 1 byte data + break; + case 's': + fprintf(fp, "%d", (int16_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //signed 2 byte data + break; + case 'S': + fprintf(fp, "%u", (uint16_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //unsigned 2 byte data + break; + case 'i': + fprintf(fp, "%d", (int32_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //signed 4 byte data + break; + case 'I': + fprintf(fp, "%u", (uint32_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //unsigned 4 byte data + break; + case 'f': + case 'd': + fprintf(fp, "%g", (float)(idx > -1 ? 
bam_auxB2f(data, idx) : bam_aux2f(data))); //floating point data, 4 bytes + break; + case 'H': + case 'Z': + fprintf(fp, "%s", bam_aux2Z(data)); //array of char or hex data + break; + case 'B': //array of char/int/float + auxBcnt = bam_auxB_len(data); //length of array + auxBType = bam_aux_type(data + 1); //type of element in array + fprintf(fp, "%c", auxBType); + for (i = 0; i < auxBcnt; ++i) { //iterate the array + fprintf(fp, ","); + //calling recurssively with index to reuse a few lines + if (printauxdata(fp, auxBType, i, data) == EXIT_FAILURE) { + return EXIT_FAILURE; + } + } + break; + default: + printf("Invalid aux tag?\n"); + return EXIT_FAILURE; + break; + } + return EXIT_SUCCESS; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; + int ret = EXIT_FAILURE; + sam_hdr_t *in_samhdr = NULL; + samFile *infile = NULL; + int ret_r = 0; + bam1_t *bamdata = NULL; + uint8_t *data = NULL; + + //dump_aux infile + if (argc != 2) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + + if (!(bamdata = bam_init1())) { + printf("Failed to allocate data memory!\n"); + goto end; + } + + //open input file + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + errno = 0; + data = NULL; + data = bam_aux_first(bamdata); //get the first aux data + while (data) { + printf("%.2s:%c:", bam_aux_tag(data), NULL != strchr("cCsSiI", bam_aux_type(data)) ? 
'i' : bam_aux_type(data)); //macros gets the tag and type of aux data + //dump the data + if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) { + printf("Failed to dump aux data\n"); + goto end; + } + else { + printf(" "); + } + data = bam_aux_next(bamdata, data); //get the next aux data + } + if (ENOENT != errno) { + printf("\nFailed to get aux data\n"); + goto end; + } + printf("\n"); + } + if (ret_r < -1) { + //read error + printf("Failed to read data\n"); + goto end; + } + + ret = EXIT_SUCCESS; +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/flags_demo.c b/samples/flags_demo.c new file mode 100644 index 000000000..e03fc6cd8 --- /dev/null +++ b/samples/flags_demo.c @@ -0,0 +1,110 @@ +/* flags_demo.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - show flags_demo usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: flags \n\ +Shows the count of read1 and read2 alignments\n\ +This shows basic reading and alignment flag access\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; //input file name + int c = 0, ret = EXIT_FAILURE; + int64_t cntread1 = 0, cntread2 = 0; //count + samFile *infile = NULL; //sam file + sam_hdr_t *in_samhdr = NULL; //header of file + bam1_t *bamdata = NULL; //to hold the read data + + if (argc != 2) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + + //initialize + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input files - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf( "Failed to read header from file\n"); + goto end; + } + + //read data, check flags and update count + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + cntread1++; + } + if (bamdata->core.flag & BAM_FREAD2) { + cntread2++; + } + } + if (c != -1) { + //error + printf("Failed to get data\n"); + goto end; + } + //else -1 / EOF + printf("File %s 
has %"PRIhts_pos" read1 and %"PRIhts_pos" read2 alignments\n", inname, cntread1, cntread2); + ret = EXIT_SUCCESS; +end: + //clean up + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/flags_htsopt_field.c b/samples/flags_htsopt_field.c new file mode 100644 index 000000000..4b64445e3 --- /dev/null +++ b/samples/flags_htsopt_field.c @@ -0,0 +1,115 @@ +/* flags_htsopt_field.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - show flags_demo usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: flags_field \n\ +Shows the count of read1 and read2 alignments\n\ +This shows reading selected fields from CRAM file\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; //input file name + int c = 0, ret = EXIT_FAILURE; + int64_t cntread1 = 0, cntread2 = 0; //count + samFile *infile = NULL; //sam file + sam_hdr_t *in_samhdr = NULL; //header of file + bam1_t *bamdata = NULL; //to hold the read data + + if (argc != 2) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + + //initialize + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input files - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //select required field alone, this is useful for CRAM alone + if (hts_set_opt(infile, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG) < 0) { + printf("Failed to set htsoption\n"); + goto end; + } + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file\n"); + goto end; + } + + //read data, check flags and update count + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + 
cntread1++; + } + if (bamdata->core.flag & BAM_FREAD2) { + cntread2++; + } + } + if (c != -1) { + //error + printf("Failed to get data\n"); + goto end; + } + //else -1 / EOF + printf("File %s has %"PRIhts_pos" read1 and %"PRIhts_pos" read2 alignments\n", inname, cntread1, cntread2); + ret = EXIT_SUCCESS; +end: + //clean up + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/index_multireg_read.c b/samples/index_multireg_read.c new file mode 100644 index 000000000..dbe8f15f9 --- /dev/null +++ b/samples/index_multireg_read.c @@ -0,0 +1,150 @@ +/* index_multireg_read.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the print_usage +/** @param fp pointer to the file / terminal to which print_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_multireg infile count regspec_csv\n\ + Reads alignment of a target matching to given region specifications\n\ + read_multireg infile.sam 2 R1:10-100,R2:200"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; + char *ptr = NULL; + int c = 0, ret = EXIT_FAILURE; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + hts_idx_t *idx = NULL; + hts_itr_t *iter = NULL; + unsigned int regcnt = 0; + char **regions = NULL; + + //read_multireg infile count regspec_csv + if (argc != 4) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + regcnt = atoi(argv[2]); + regions = calloc(regcnt, sizeof(char*)); + //set each regspec as separate entry in region array + ptr = argv[3]; + for (c = 0; ptr && (c < regcnt); ++c) { + regions[c] = ptr; + ptr = strchr(ptr, ','); + if (ptr) { *ptr = '\0'; ++ptr; } + } + + if (regcnt == 0) { + printf("Region count can not be 0\n"); + goto end; + } + //initialize bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open files, use stdout as output SAM file for ease of display + infile = sam_open(inname, "r"); + outfile = 
sam_open("-", "w"); + if (!outfile || !infile) { + printf("Could not open in/out files\n"); + goto end; + } + //load index file, assume it to be present in same location + if (!(idx = sam_index_load(infile, inname))) { + printf("Failed to load the index\n"); + goto end; + } + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //create iterator + if (!(iter = sam_itr_regarray(idx, in_samhdr, regions, regcnt))) { + printf("Failed to get iterator\n"); + goto end; + } + if (regions) { + //can be freed as it is no longer required + free(regions); + regions = NULL; + } + + //get required area + while ((c = sam_itr_multi_next(infile, iter, bamdata) >= 0)) { + //write to output + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + printf("Failed to write output\n"); + goto end; + } + } + if (c != -1) { + printf("Error during read\n"); + goto end; + } + ret = EXIT_SUCCESS; + +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (iter) { + sam_itr_destroy(iter); + } + if (idx) + hts_idx_destroy(idx); + return ret; +} diff --git a/samples/index_reg_read.c b/samples/index_reg_read.c new file mode 100644 index 000000000..346d5428f --- /dev/null +++ b/samples/index_reg_read.c @@ -0,0 +1,143 @@ +/* index_reg_read.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the print_usage +/** @param fp pointer to the file / terminal to which print_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: readreg infile idxfile region\n\ +Reads alignments matching to a specific region\n\ +\\. 
from start of file\n\ +\\* only unmapped reads\n\ +REFNAME all reads referring REFNAME\n\ +REFNAME:S all reads referring REFNAME and overlapping from S onwards\n\ +REFNAME:S-E all reads referring REFNAME overlapping from S to E\n\ +REFNAME:-E all reads referring REFNAME overlapping upto E\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *region = NULL; + char *idxfile = NULL; + int c = 0, ret = EXIT_FAILURE; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + hts_idx_t *idx = NULL; + hts_itr_t *iter = NULL; + + //readreg infile indexfile region + if (argc != 4) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + idxfile = argv[2]; + region = argv[3]; + + //initialize bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + + //open files + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open input file\n"); + goto end; + } + //using stdout as output file for ease of dumping data + if (!(outfile = sam_open("-", "w"))) { + printf("Could not open out file\n"); + goto end; + } + //load index file + if (!(idx = sam_index_load2(infile, inname, idxfile))) { + printf("Failed to load the index\n"); + goto end; + } + //can use sam_index_load if the index file is present in same location and follows standard naming conventions (i.e. .) 
+ + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //create iterator + if (!(iter = sam_itr_querys(idx, in_samhdr, region))) { + printf("Failed to get iterator\n"); + goto end; + } + //read using iterator + while ((c = sam_itr_next(infile, iter, bamdata)) >= 0) { + //write to output + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + printf("Failed to write output\n"); + goto end; + } + } + if (c != -1) { + printf("Error during read\n"); + goto end; + } + ret = EXIT_SUCCESS; + +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (iter) { + sam_itr_destroy(iter); + } + if (idx) { + hts_idx_destroy(idx); + } + return ret; +} diff --git a/samples/index_write.c b/samples/index_write.c new file mode 100644 index 000000000..8fd2bc968 --- /dev/null +++ b/samples/index_write.c @@ -0,0 +1,166 @@ +/* index_write.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: idx_on_write infile shiftsize outdir\n\ +Creates compressed sam file and index file for it in given directory\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *outdir = NULL; + char *inname = NULL, *fileidx = NULL, *outname = NULL, outmode[4] = "w"; + int c = 0, ret = EXIT_FAILURE, size = 0; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + + //idx_on_write infile sizeshift outputdirectory + if (argc != 4) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + size = atoi(argv[2]); + outdir = argv[3]; + + //allocate space for output name - outdir/filename.ext.idxextNUL + c = strlen(basename(inname)) + strlen(outdir) + 10; + fileidx = malloc(sizeof(char) * c); + outname = malloc(sizeof(char) * c); + if (!fileidx || !outname) { + printf("Couldnt allocate memory\n"); + goto end; + } + //initialize bam storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + + //open files + if ((infile = sam_open(inname, "r"))) { + //get file type and create output names + if (infile->format.format == cram) { + //set as crai + snprintf(fileidx, c, "%s/%s.crai", outdir, 
basename(inname)); + snprintf(outname, c, "%s/%s", outdir, basename(inname)); + } + else { + //set as either bai or csi based on interval + if (infile->format.format == sam && infile->format.compression == no_compression) { + //create as gzip compressed + snprintf(outname, c, "%s/%s.gz", outdir, basename(inname)); + snprintf(fileidx, c, "%s/%s.gz.%s", outdir, basename(inname), !size ? "bai" : "csi"); + } + else { + //with same name as input + snprintf(outname, c, "%s/%s", outdir, basename(inname)); + snprintf(fileidx, c, "%s/%s.%s", outdir, basename(inname), !size ? "bai" : "csi"); + } + } + } + c = 0; + sam_open_mode(outmode + 1, outname, NULL); //set extra write options based on name + outfile = sam_open(outname, outmode); + if (!outfile || !infile) { + printf("Could not open files\n"); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if (sam_hdr_write(outfile, in_samhdr)) { + printf("Failed to write header\n"); + goto end; + } + + // initialize indexing, before start of write + if (sam_idx_init(outfile, in_samhdr, size, fileidx)) { + printf("idx initialization failed\n"); + goto end; + } + //read and write alignments + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + printf("Failed to write data\n"); + goto end; + } + } + if (c != -1) { + printf("Error in reading data\n"); + goto end; + } + //else EOF, save index + if (sam_idx_save(outfile)) { + printf("Could not save index\n"); + goto end; + } + ret = EXIT_SUCCESS; +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (fileidx) { + free(fileidx); + } + if (outname) { + free(outname); + } + if (outfile) { + sam_close(outfile); + } + return ret; +} diff --git a/samples/mod_aux.c b/samples/mod_aux.c new file mode 100644 index 
000000000..d5ed18cde
--- /dev/null
+++ b/samples/mod_aux.c
@@ -0,0 +1,221 @@
+/*  mod_aux.c --  showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library APIs; it still needs proper error handling and optimization */
+
+#include <errno.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <htslib/sam.h>
+
+/// print_usage - print the demo_usage
+/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: mod_aux infile QNAME tag type val\n\
+Add/update the given aux tag to all alignments\n\
+type A-char C-int F-float Z-string\n");
+}
+
+/// put_le32 - store a 32 bit value in little-endian byte order
+/** @param v value to store
+ *  @param out buffer of at least 4 bytes which receives the encoded value
+returns nothing
+*/
+static void put_le32(uint32_t v, uint8_t *out)
+{
+    out[0] = v & 0xFF;
+    out[1] = (v >> 8) & 0xFF;
+    out[2] = (v >> 16) & 0xFF;
+    out[3] = (v >> 24) & 0xFF;
+}
+
+/// main_demo - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, *tag = NULL, *qname = NULL, *val = NULL;
+    char type = '\0';
+    int ret = EXIT_FAILURE, ret_r = 0, length = 0;
+    sam_hdr_t *in_samhdr = NULL;
+    samFile *infile = NULL, *outfile = NULL;
+    bam1_t *bamdata = NULL;
+    uint8_t *data = NULL;
+
+    //mod_aux infile QNAME tag type val
+    if (argc != 6) {
+        print_usage(stderr);
+        goto end;
+    }
+    inname = argv[1];
+    qname = argv[2];
+    tag = argv[3];
+    type = argv[4][0];
+    val = argv[5];
+
+    //the aux api reads exactly 2 characters from the tag
+    if (strlen(tag) != 2) {
+        printf("Invalid tag, it has to be 2 characters long\n");
+        goto end;
+    }
+
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to allocate data memory!\n");
+        goto end;
+    }
+
+    //open input file
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+    //open output file
+    if (!(outfile = sam_open("-", "w"))) {
+        printf("Could not open std output\n");
+        goto end;
+    }
+
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    if (sam_hdr_write(outfile, in_samhdr) < 0) {
+        printf("Failed to write header\n");
+        goto end;
+    }
+
+    while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+        if (strcasecmp(bam_get_qname(bamdata), qname)) {
+            if (sam_write1(outfile, in_samhdr, bamdata) < 0) {
+                printf("Failed to write output\n");
+                goto end;
+            }
+            continue;   //not matching
+        }
+
+        errno = 0;
+        //matched to qname, update aux
+        if (!(data = bam_aux_get(bamdata, tag))) {
+            //the encoded value lives in locals; val and type are left untouched
+            //as they are needed again for the next alignment with the same QNAME
+            const uint8_t *auxdata = (const uint8_t *)val;
+            char newtype = type;
+            float f = 0;
+            uint32_t u = 0;
+            uint8_t le[4] = {0};
+
+            if (errno != ENOENT) {
+                //NULL with errno other than ENOENT means the aux data could not be parsed
+                printf("Failed to get aux data, errno: %d\n", errno);
+                goto end;
+            }
+            //tag not present append
+            switch (type) {
+            case 'F':
+            case 'f':
+            case 'd':
+                //always stored as a 4 byte float ('f'); assumes sizeof(float) is 4
+                f = atof(val);
+                memcpy(&u, &f, sizeof(u));
+                put_le32(u, le);
+                auxdata = le;
+                length = sizeof(u);
+                newtype = 'f';
+                break;
+            case 'C':
+            case 'S':
+            case 'I':
+                //aux integers are stored little-endian using 1 / 2 / 4 bytes for C / S / I,
+                //values wider than the requested type are truncated
+                u = (uint32_t)atoi(val);
+                put_le32(u, le);
+                auxdata = le;
+                length = (type == 'C') ? 1 : ((type == 'S') ? 2 : 4);
+                break;
+            case 'Z':
+                length = strlen(val) + 1;   //1 for NUL termination
+                break;
+            case 'A':
+                length = 1;
+                break;
+            default:
+                printf("Invalid type mentioned\n");
+                goto end;
+                break;
+            }
+            if (bam_aux_append(bamdata, tag, newtype, length, auxdata)) {
+                printf("Failed to append aux data, errno: %d\n", errno);
+                goto end;
+            }
+        }
+        else {
+            char auxtype = bam_aux_type(data);
+            //update the tag with newer value
+            switch (type) {
+            case 'F':
+            case 'f':
+            case 'd':
+                if (auxtype != 'f' && auxtype != 'd') {
+                    printf("Invalid aux type passed\n");
+                    goto end;
+                }
+                if (bam_aux_update_float(bamdata, tag, atof(val))) {
+                    printf("Failed to update float data, errno: %d\n", errno);
+                    goto end;
+                }
+                break;
+            case 'C':
+            case 'S':
+            case 'I':
+                if (auxtype != 'c' && auxtype != 'C' && auxtype != 's' && auxtype != 'S' && auxtype != 'i' && auxtype != 'I') {
+                    printf("Invalid aux type passed\n");
+                    goto end;
+                }
+                if (bam_aux_update_int(bamdata, tag, atoll(val))) {
+                    printf("Failed to update int data, errno: %d\n", errno);
+                    goto end;
+                }
+                break;
+            case 'Z':
+                if (auxtype != 'Z') {
+                    printf("Invalid aux type passed\n");
+                    goto end;
+                }
+                length = strlen(val) + 1;   //1 for NUL termination
+                if (bam_aux_update_str(bamdata, tag, length, val)) {
+                    //with length as -1, length will be detected based on null terminated val data
+                    printf("Failed to update string data, errno: %d\n", errno);
+                    goto end;
+                }
+                break;
+            case 'A':
+                if (auxtype != 'A') {
+                    printf("Invalid aux type passed\n");
+                    goto end;
+                }
+                //update the char data directly on buffer, data[0] is the type and data[1] the value
+                *(data+1) = val[0];
+                break;
+            default:
+                printf("Invalid data type\n");
+                goto end;
+                break;
+            }
+        }
+        if (sam_write1(outfile, in_samhdr, bamdata) < 0) {
+            printf("Failed to write output\n");
+            goto end;
+        }
+    }
+    if (ret_r < -1) {
+        //read error
+        printf("Failed to read data\n");
+        goto end;
+    }
+
+    ret = EXIT_SUCCESS;
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (outfile) {
+        sam_close(outfile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    return ret;
+}
diff --git a/samples/mod_aux_ba.c b/samples/mod_aux_ba.c
new file mode 100644
index 000000000..8ef90ee1e
--- /dev/null
+++ b/samples/mod_aux_ba.c
@@ -0,0 +1,147 @@
+/*  mod_aux_ba.c --  showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library APIs; it still needs proper error handling and optimization */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <htslib/sam.h>
+
+/// print_usage - print the demo_usage
+/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    //the counts are stored in the order A C G T N, see cnt[] in main
+    fprintf(fp, "Usage: mod_aux_ba infile\n\
+Updates the count of bases as an aux array on all alignments\n\
+BA:B:I,count of ACGTN\n");
+}
+
+/// main_demo - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL;
+    int i = 0, ret = EXIT_FAILURE, ret_r = 0;
+    uint32_t cnt[5] = {0};  //A C G T N
+    sam_hdr_t *in_samhdr = NULL;
+    samFile *infile = NULL, *outfile = NULL;
+    bam1_t *bamdata = NULL;
+    uint8_t *seq = NULL;
+
+    //mod_aux infile
+    if (argc != 2) {
+        print_usage(stderr);
+        goto end;
+    }
+    inname = argv[1];
+
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to allocate data memory!\n");
+        goto end;
+    }
+
+    //open input file
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+
+    //open output file
+    if (!(outfile = sam_open("-", "w"))) {
+        printf("Could not open std output\n");
+        goto end;
+    }
+
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    if (sam_hdr_write(outfile, in_samhdr) < 0) {
+        printf("Failed to write header\n");
+        goto end;
+    }
+
+    while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+        errno = 0;
+        //counts are per alignment, start afresh for each one
+        memset(cnt, 0, sizeof(cnt));
+        seq = bam_get_seq(bamdata);
+        for (i = 0; i < bamdata->core.l_qseq; ++i) {
+            switch (seq_nt16_str[bam_seqi(seq, i)]) {
+            case 'A':
+                ++cnt[0];
+                break;
+            case 'C':
+                ++cnt[1];
+                break;
+            case 'G':
+                ++cnt[2];
+                break;
+            case 'T':
+                ++cnt[3];
+                break;
+            default:    //N and every other ambiguity code
+                ++cnt[4];
+                break;
+            }
+        }
+
+        //add / replace the BA tag with the 5 counts as an array of uint32
+        if (bam_aux_update_array(bamdata, "BA", 'I', sizeof(cnt)/sizeof(cnt[0]), cnt)) {
+            printf("Failed to update base array, errno %d\n", errno);
+            goto end;
+        }
+
+        if (sam_write1(outfile, in_samhdr, bamdata) < 0) {
+            printf("Failed to write output\n");
+            goto end;
+        }
+    }
+    if (ret_r < -1) {
+        //read error
+        printf("Failed to read data\n");
+        goto end;
+    }
+
+    ret = EXIT_SUCCESS;
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (outfile) {
+        sam_close(outfile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    return ret;
+}
diff --git a/samples/mod_bam.c b/samples/mod_bam.c
new file mode 100644
index 000000000..9f1eb324e
--- /dev/null
+++ b/samples/mod_bam.c
@@ -0,0 +1,229 @@
+/*  mod_bam.c --  showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library APIs; it still needs proper error handling and optimization */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <sys/types.h>
+#include <htslib/sam.h>
+
+/// print_usage - print the demo_usage
+/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: mod_bam infile QNAME fieldpos newval\n\
+Modifies the alignment data field\n\
+fieldpos - 1 QNAME 2 FLAG 3 RNAME 4 POS 5 MAPQ 6 CIGAR 7 RNEXT 8 PNEXT 9 TLEN 10 SEQ 11 QUAL\n");
+}
+
+/// main_demo - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, *qname = NULL, *val = NULL;
+    //ret is the exit status only; rc holds the result of the per alignment update
+    int c = 0, ret = EXIT_FAILURE, field = 0, rc = 0;
+    sam_hdr_t *in_samhdr = NULL;
+    samFile *infile = NULL, *outfile = NULL;
+    int ret_r = 0, i = 0;
+    bam1_t *bamdata = NULL;
+
+    //mod_bam infile QNAME fieldpos newval
+    if (argc != 5) {
+        print_usage(stderr);
+        goto end;
+    }
+    inname = argv[1];
+    qname = argv[2];
+    //1 QNAME 2 FLAG 3 RNAME 4 POS 5 MAPQ 6 CIGAR 7 RNEXT 8 PNEXT 9 TLEN 10 SEQ 11 QUAL
+    field = atoi(argv[3]);
+    val = argv[4];
+
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to allocate data memory!\n");
+        goto end;
+    }
+
+    //open input file
+    if (!(infile = sam_open(inname, "r")) || !(outfile = sam_open("-", "w"))) {
+        printf("Could not open input/output\n");
+        goto end;
+    }
+    //read header
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    if (sam_hdr_write(outfile, in_samhdr) < 0) {
+        printf("Failed to write header\n");
+        goto end;
+    }
+
+    while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0)
+    {
+        //QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL [TAG:TYPE:VALUE]…
+        rc = 0;
+        if (!strcasecmp(qname, bam_get_qname(bamdata))) {
+            //the required one
+            switch(field) {
+            case 1:// QNAME
+                rc = bam_set_qname(bamdata, val);
+                break;
+            case 2:// FLAG
+                bamdata->core.flag = atol(val) & 0xFFFF;
+                break;
+            case 3:// RNAME
+            case 7:// RNEXT
+                if ((rc = sam_hdr_name2tid(in_samhdr, val)) < 0) {
+                    printf("Invalid reference name\n");
+                    rc = -1;
+                    break;
+                }
+                if (field == 3) {
+                    //reference
+                    bamdata->core.tid = rc;
+                }
+                else {
+                    //mate reference
+                    bamdata->core.mtid = rc;
+                }
+                rc = 0;
+                break;
+            case 4:// POS
+                bamdata->core.pos = atoll(val);
+                break;
+            case 5:// MAPQ
+                bamdata->core.qual = atoi(val) & 0x0FF;
+                break;
+            case 6:// CIGAR
+                {
+                    uint32_t *cigar = NULL;
+                    size_t size = 0;
+                    ssize_t ncigar = 0;
+                    int l_aux = bam_get_l_aux(bamdata);
+                    bam1_t *newbam = bam_init1();
+                    if (!newbam) {
+                        printf("Failed to create new bam data\n");
+                        rc = -1;
+                        break;
+                    }
+                    //get cigar array and set all data in new bam record
+                    if ((ncigar = sam_parse_cigar(val, NULL, &cigar, &size)) < 0) {
+                        printf("Failed to parse cigar\n");
+                        free(cigar);
+                        bam_destroy1(newbam);
+                        rc = -1;
+                        break;
+                    }
+                    //core.l_qname counts the NUL padding as well, pass the name length alone
+                    if (bam_set1(newbam, strlen(bam_get_qname(bamdata)), bam_get_qname(bamdata), bamdata->core.flag, bamdata->core.tid, bamdata->core.pos, bamdata->core.qual,
+                            ncigar, cigar, bamdata->core.mtid, bamdata->core.mpos, bamdata->core.isize, bamdata->core.l_qseq, (const char*)bam_get_seq(bamdata), (const char*)bam_get_qual(bamdata), l_aux) < 0) {
+                        printf("Failed to set bamdata\n");
+                        free(cigar);
+                        bam_destroy1(newbam);
+                        rc = -1;
+                        break;
+                    }
+                    //cigar is copied into the record by now, the parse buffer is ours to release
+                    free(cigar);
+                    //correct sequence data as input is expected in ascii format and not as compressed inside bam!
+                    memcpy(bam_get_seq(newbam), bam_get_seq(bamdata), (bamdata->core.l_qseq + 1) / 2);
+                    //copy the aux data
+                    memcpy(bam_get_aux(newbam), bam_get_aux(bamdata), l_aux);
+                    //make the record length cover the copied aux data; bam_set1 is understood to
+                    //only reserve room for it - TODO confirm against the bam_set1 documentation
+                    newbam->l_data = (int)(bam_get_aux(newbam) - newbam->data) + l_aux;
+
+                    bam_destroy1(bamdata);
+                    bamdata = newbam;
+                }
+                break;
+            case 8:// PNEXT
+                bamdata->core.mpos = atoll(val);
+                break;
+            case 9:// TLEN
+                bamdata->core.isize = atoll(val);
+                break;
+            case 10:// SEQ
+                i = strlen(val);
+                if (bamdata->core.l_qseq != i) {
+                    printf("SEQ length different\n");
+                    rc = -1;
+                    //as it is different, have to update quality data and cigar data as well and more info is required for it, which is not handled in this sample
+                    //accessing raw memory and moving is one option; creating and using new bam1_t object is another option.
+                    break;
+                }
+                for( c = 0; c < i; ++c) {
+                    bam_set_seqi(bam_get_seq(bamdata), c, seq_nt16_table[(unsigned char)val[c]]);
+                }
+                break;
+            case 11:// QUAL
+                i = strlen(val);
+                if (i != bamdata->core.l_qseq) {
+                    printf("Qual length different than sequence\n");
+                    rc = -1;
+                    break;
+                }
+                //phred score from ascii value; val itself is kept intact as it is
+                //needed again when another alignment has the same QNAME
+                for (c = 0; c < i; ++c) {
+                    bam_get_qual(bamdata)[c] = val[c] - 33;
+                }
+                break;
+            default:
+                printf("Invalid input\n");
+                goto end;
+                break;
+            }
+            if (rc < 0) {
+                printf("Failed to set new data\n");
+                goto end;
+            }
+        }
+        if (sam_write1(outfile, in_samhdr, bamdata) < 0) {
+            printf("Failed to write bam data\n");
+            goto end;
+        }
+    }
+
+    if (ret_r == -1) {
+        // EOF reached, no error!
+        ret = EXIT_SUCCESS;
+    }
+    else {
+        printf("Failed to read data\n");
+    }
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (outfile) {
+        sam_close(outfile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    return ret;
+}
diff --git a/samples/modstate.c b/samples/modstate.c
new file mode 100644
index 000000000..976391684
--- /dev/null
+++ b/samples/modstate.c
@@ -0,0 +1,190 @@
+/*  modstate.c --  showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library APIs; it still needs proper error handling and optimization */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <htslib/sam.h>
+
+/// print_usage - print the demo_usage
+/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: modstate infile option\n\
+Shows the base modifications on the alignment\n\
+Option can be 1 or 2 to select the api to use\n");
+}
+
+/// print_modcode - print a base modification code
+/** @param code modification code, a character code when positive and a negated ChEBI number otherwise
+returns nothing
+*/
+static void print_modcode(int code)
+{
+    if (code > 0) {
+        printf("%c", code);
+    }
+    else {
+        //ChEBI numbers have no single character form, show the number
+        printf("(%d)", -code);
+    }
+}
+
+/// main_demo - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL;
+    int ret = EXIT_FAILURE;
+    sam_hdr_t *in_samhdr = NULL;
+    samFile *infile = NULL;
+
+    int ret_r = 0, i = 0 , r = 0, j = 0, pos = 0, opt = 0, k = 0, cnt = 0, *bm = NULL;
+    bam1_t *bamdata = NULL;
+    uint8_t *data = NULL;
+    hts_base_mod_state *ms = NULL;
+
+
+    //modstate infile 1/2
+    if (argc != 3) {
+        print_usage(stderr);
+        goto end;
+    }
+    inname = argv[1];
+    opt = atoi(argv[2]);    //option 1 or 2?
+    if (opt != 1 && opt != 2) {
+        print_usage(stderr);
+        goto end;
+    }
+
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to allocate data memory!\n");
+        goto end;
+    }
+
+    if (!(ms = hts_base_mod_state_alloc())) {
+        printf("Failed to allocate state memory\n");
+        goto end;
+    }
+
+    //open input file
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+    //read header
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0)
+    {
+        hts_base_mod mod[5] = {0};  //for ATCGN
+        const int nmod = sizeof(mod) / sizeof(mod[0]);
+
+        i = 0;
+        data = bam_get_seq(bamdata);
+        if (bam_parse_basemod(bamdata, ms)) {
+            printf("Failed to parse the base mods\n");
+            goto end;
+        }
+        //dump the modifications
+        printf("Modifications:");
+        bm = bam_mods_recorded(ms, &cnt);
+        for (k = 0; k < cnt; ++k) {
+            print_modcode(bm[k]);
+        }
+        printf("\n");
+        if (opt == 2) {
+            //option 2, ask for the modifications position by position
+            for (; i < bamdata->core.l_qseq; ++i) {
+                if ((r = bam_mods_at_next_pos(bamdata, ms, mod, nmod)) <= -1) {
+                    printf("Failed to get modifications\n");
+                    goto end;
+                }
+                else if (r > nmod) {
+                    printf("More modifications than this app can handle, update the app\n");
+                    goto end;
+                }
+                else if (!r) {
+                    //no modification at this pos
+                    printf("%c", seq_nt16_str[bam_seqi(data, i)]);
+                }
+                //modifications
+                for (j = 0; j < r; ++j) {
+                    printf("%c%c", mod[j].canonical_base, mod[j].strand ? '-' : '+');
+                    print_modcode(mod[j].modified_base);
+                }
+            }
+        }
+        else {
+            //option 1, jump from one modified position to the next
+            while ((r = bam_next_basemod(bamdata, ms, mod, nmod, &pos)) >= 0) {
+                if (!r) {
+                    //no more modifications, pos is not relied upon here; show the rest as it is
+                    for (; i < bamdata->core.l_qseq; ++i) {
+                        printf("%c", seq_nt16_str[bam_seqi(data, i)]);
+                    }
+                    break;
+                }
+                if (r > nmod) {
+                    //only nmod entries of mod are valid, do not read past them
+                    printf("More modifications than this app can handle, update the app\n");
+                    goto end;
+                }
+                //bases before the modified position
+                for (; i < bamdata->core.l_qseq && i < pos; ++i) {
+                    printf("%c", seq_nt16_str[bam_seqi(data, i)]);
+                }
+                //modifications
+                for (j = 0; j < r; ++j) {
+                    printf("%c%c", mod[j].canonical_base, mod[j].strand ? '-' : '+');
+                    print_modcode(mod[j].modified_base);
+                }
+                if (i == pos)
+                    i++;    //skip the modification already displayed
+            }
+            if (r <= -1) {
+                printf("Failed to get modifications\n");
+                goto end;
+            }
+        }
+        printf("\n");
+    }
+
+    if (ret_r == -1) {
+        //check last alignment's base modification
+        int strand = 0, impl = 0;
+        char canonical = 0, modification[] = "mhfcgebaon";  //possible modifications
+        printf("\n\nLast alignment has \n");
+        for (k = 0; k < (int)sizeof(modification) - 1; ++k) {   //avoiding NUL termination
+            if (bam_mods_query_type(ms, modification[k], &strand, &impl, &canonical)) {
+                printf ("No modification of %c type\n", modification[k]);
+            }
+            else {
+                printf("%s strand has %c modified with %c, can %sassume unlisted as unmodified\n", strand?"-/bottom/reverse":"+/top/forward", canonical, modification[k], impl?"" : "not " );
+            }
+        }
+        // no error!
+        ret = EXIT_SUCCESS;
+    }
+    else {
+        printf("Failed to read data\n");
+    }
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+
+    if (ms) {
+        hts_base_mod_state_free(ms);
+    }
+    return ret;
+}
diff --git a/samples/mpileup.c b/samples/mpileup.c
new file mode 100644
index 000000000..fe933748e
--- /dev/null
+++ b/samples/mpileup.c
@@ -0,0 +1,204 @@
+/*  mpileup.c --  showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library APIs; it still needs proper error handling and optimization */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <htslib/sam.h>
+
+/// print_usage - show mpileup usage
+/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: mpileup infile ...\n\
+Shows the mpileup api usage.\n");
+}
+
+/// plpconf - per input file data handed over to the pileup callbacks
+typedef struct plpconf {
+    char *inname;           //input file name
+    samFile *infile;        //opened input file
+    sam_hdr_t *in_samhdr;   //header read from the input file
+} plpconf;
+
+/// @brief plpconstructor
+/// @param data client data?
+/// @param b bam being loaded
+/// @param cd client data
+/// @return 0 always, nothing is set up for an alignment in this sample
+int plpconstructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    return 0;
+}
+
+/// @brief plpdestructor
+/// @param data client data?
+/// @param b bam being released
+/// @param cd client data
+/// @return 0 always, nothing is released for an alignment in this sample
+int plpdestructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    return 0;
+}
+
+/// @brief bam_plp_auto_f reads alignment data for pileup operation
+/// @param data client callback data holding alignment file handle
+/// @param b bamdata read
+/// @return same as sam_read1
+int readdata(void *data, bam1_t *b)
+{
+    plpconf *conf = (plpconf*)data;
+    if (!conf || !conf->infile) {
+        return -2;  //cant read data
+    }
+
+    //read alignment and send, using the header main read from this very file
+    return sam_read1(conf->infile, conf->in_samhdr, b);
+}
+
+/// main_demo - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    int ret = EXIT_FAILURE;
+    bam1_t *bamdata = NULL;
+    plpconf** conf = NULL;
+    bam_mplp_t mplpiter = NULL;
+    int tid = -1, input = 0, k = 0, dpt = 0, *depth = NULL, plpret = 0;
+    hts_pos_t refpos = -1;
+    const bam_pileup1_t **plp = NULL;
+
+    //infile ...
+    if (argc < 2) {
+        print_usage(stderr);
+        goto end;
+    }
+    if ((conf = calloc(argc - 1, sizeof(plpconf*)))) {
+        for (input = 0; input < argc - 1; ++input) {
+            if (!(conf[input] = calloc(1, sizeof(plpconf)))) {
+                //entries are dereferenced below, do not go on with a missing one
+                printf("Failed to allocate memory\n");
+                goto end;
+            }
+        }
+    }
+    depth = calloc(argc - 1, sizeof(int));
+    plp = calloc(argc - 1, sizeof(bam_pileup1_t*));
+    if (!conf || !depth || !plp) {
+        printf("Failed to allocate memory\n");
+        goto end;
+    }
+    for (input = 0; input < argc - 1; ++input) {
+        conf[input]->inname = argv[input+1];
+    }
+
+    //initialize
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to initialize bamdata\n");
+        goto end;
+    }
+    //open input files
+    for(input = 0; input < argc - 1; ++input) {
+        if (!(conf[input]->infile = sam_open(conf[input]->inname, "r"))) {
+            printf("Could not open %s\n", conf[input]->inname);
+            goto end;
+        }
+        //read header
+        if (!(conf[input]->in_samhdr = sam_hdr_read(conf[input]->infile))) {
+            printf("Failed to read header from file!\n");
+            goto end;
+        }
+    }
+
+    if (!(mplpiter = bam_mplp_init(argc - 1, readdata, (void**) conf))) {
+        printf("Failed to initialize mpileup data\n");
+        goto end;
+    }
+
+    //set constructor destructor callbacks
+    bam_mplp_constructor(mplpiter, plpconstructor);
+    bam_mplp_destructor(mplpiter, plpdestructor);
+
+    while ((plpret = bam_mplp64_auto(mplpiter, &tid, &refpos, depth, plp)) > 0) {
+        printf("%d\t%"PRIhts_pos"\t", tid+1, refpos+1);
+
+        for (input = 0; input < argc - 1; ++input) {
+            for (dpt = 0; dpt < depth[input]; ++dpt) {
+                const bam_pileup1_t *p = &plp[input][dpt];
+                uint8_t *seq = NULL;
+                unsigned char base = 0;
+
+                if (p->is_del || p->is_refskip) {
+                    printf("*");
+                    continue;
+                }
+                seq = bam_get_seq(p->b);
+                base = seq_nt16_str[bam_seqi(seq, p->qpos)];
+                //start and end are displayed in UPPER and rest on LOWER
+                printf("%c", (p->is_head || p->is_tail) ? toupper(base) : tolower(base));
+                if (p->indel > 0) {
+                    //insertions, anyway not start or end
+                    printf("+%d", p->indel);
+                    for (k = 0; k < p->indel; ++k) {
+                        printf("%c", tolower((unsigned char)seq_nt16_str[bam_seqi(seq, p->qpos + k + 1)]));
+                    }
+                }
+                else if (p->indel < 0) {
+                    printf("%d", p->indel);
+                    for (k = 0; k < -p->indel; ++k) {
+                        printf("?");
+                    }
+                }
+            }
+            printf(" ");
+        }
+        printf("\n");
+        fflush(stdout);
+    }
+    if (plpret < 0) {
+        //a negative value is a failure and not the end of data
+        printf("Failed to get pileup data\n");
+        goto end;
+    }
+
+    ret = EXIT_SUCCESS;
+end:
+    //clean up
+    //the iterator goes first, it was given the conf entries as callback data
+    if (mplpiter) {
+        bam_mplp_destroy(mplpiter);
+    }
+    if (conf) {
+        for (input = 0; input < argc - 1; ++input) {
+            if (conf[input] && conf[input]->in_samhdr) {
+                sam_hdr_destroy(conf[input]->in_samhdr);
+            }
+            if (conf[input] && conf[input]->infile) {
+                sam_close(conf[input]->infile);
+            }
+            if (conf[input]) {
+                free(conf[input]);
+            }
+        }
+        free(conf);
+    }
+
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    if (depth) {
+        free(depth);
+    }
+    if (plp) {
+        free(plp);
+    }
+    return ret;
+}
diff --git a/samples/pileup.c b/samples/pileup.c
new file mode 100644
index 000000000..11e2fb02f
--- /dev/null
+++ b/samples/pileup.c
@@ -0,0 +1,183 @@
+/*  pileup.c --  showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library APIs; it still needs proper error handling and optimization */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <htslib/sam.h>
+
+/// print_usage - show pileup usage
+/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: pileup infile\n\
+Shows the pileup api usage.\n");
+}
+
+/// plpconf - input file data handed over to the pileup callbacks
+typedef struct plpconf {
+    char *inname;           //input file name
+    samFile *infile;        //opened input file
+    sam_hdr_t *in_samhdr;   //header read from the input file
+} plpconf;
+
+/// @brief plpconstructor
+/// @param data client data?
+/// @param b bam being loaded
+/// @param cd client data
+/// @return 0 always, nothing is set up for an alignment in this sample
+int plpconstructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    /*plpconf *conf= (plpconf*)data;
+    can access the data passed to pileup init from data
+    can do any alignment specific allocation / data storage here in param cd
+    it can hold either a float, 64 bit int or a pointer
+    when using cd, initialize and use as it will be reused after destructor*/
+    return 0;
+}
+
+/// @brief plpdestructor
+/// @param data client data?
+/// @param b bam being released
+/// @param cd client data
+/// @return 0 always, nothing is released for an alignment in this sample
+int plpdestructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    /*plpconf *conf= (plpconf*)data;
+    can access the data passed to pileup init from data
+    deallocate any alignment specific allocation made in constructor and stored in param cd*/
+    return 0;
+}
+
+/// @brief bam_plp_auto_f reads alignment data for pileup operation
+/// @param data client callback data holding alignment file handle
+/// @param b bamdata read
+/// @return same as sam_read1
+int readdata(void *data, bam1_t *b)
+{
+    plpconf *conf = (plpconf*)data;
+    if (!conf || !conf->infile) {
+        return -2;  //cant read data
+    }
+
+    //read alignment and send, using the header main read from this very file
+    return sam_read1(conf->infile, conf->in_samhdr, b);
+}
+
+/// main_demo - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    int ret = EXIT_FAILURE;
+    bam1_t *bamdata = NULL;
+    plpconf conf = {0};
+    bam_plp_t plpiter = NULL;
+    int tid = -1, n = -1, j = 0, k = 0;
+    int refpos = -1;
+    const bam_pileup1_t *plp = NULL;
+
+    //infile
+    if (argc != 2) {
+        print_usage(stderr);
+        goto end;
+    }
+    conf.inname = argv[1];
+
+    //initialize
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to initialize bamdata\n");
+        goto end;
+    }
+    //open input files
+    if (!(conf.infile = sam_open(conf.inname, "r"))) {
+        printf("Could not open %s\n", conf.inname);
+        goto end;
+    }
+    //read header
+    if (!(conf.in_samhdr = sam_hdr_read(conf.infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    if (!(plpiter = bam_plp_init(readdata, &conf))) {
+        printf("Failed to initialize pileup data\n");
+        goto end;
+    }
+
+    //set constructor destructor callbacks
+    bam_plp_constructor(plpiter, plpconstructor);
+    bam_plp_destructor(plpiter, plpdestructor);
+
+    while ((plp = bam_plp_auto(plpiter, &tid, &refpos, &n))) {
+        printf("%d\t%d\t", tid+1, refpos+1);
+
+        for (j = 0; j < n; ++j) {
+            uint8_t *seq = NULL;
+            unsigned char base = 0;
+
+            //doesnt detect succeeding insertion and deletion together here, only insertion is identified
+            //deletion is detected in plp->is_del as and when pos reaches the position
+            //if detection ahead is required, use bam_plp_insertion here which gives deletion length along with insertion
+            if (plp[j].is_del || plp[j].is_refskip) {
+                printf("*");
+                continue;
+            }
+            seq = bam_get_seq(plp[j].b);
+            base = seq_nt16_str[bam_seqi(seq, plp[j].qpos)];
+            //start and end are displayed in UPPER and rest on LOWER
+            printf("%c", (plp[j].is_head || plp[j].is_tail) ? toupper(base) : tolower(base));
+            if (plp[j].indel > 0) {
+                //insertions, anyway not start or end
+                printf("+%d", plp[j].indel);
+                for (k = 0; k < plp[j].indel; ++k) {
+                    printf("%c", tolower((unsigned char)seq_nt16_str[bam_seqi(seq, plp[j].qpos + k + 1)]));
+                }
+            }
+            else if (plp[j].indel < 0) {
+                printf("%d", plp[j].indel);
+                for (k = 0; k < -plp[j].indel; ++k) {
+                    printf("?");
+                }
+            }
+            printf(" ");
+        }
+        printf("\n");
+        fflush(stdout);
+    }
+    if (n < 0) {
+        //NULL along with a negative count is a failure and not the end of data
+        printf("Failed to get pileup data\n");
+        goto end;
+    }
+
+    ret = EXIT_SUCCESS;
+end:
+    //clean up
+    //the iterator goes first, it was given conf as callback data
+    if (plpiter) {
+        bam_plp_destroy(plpiter);
+    }
+    if (conf.in_samhdr) {
+        sam_hdr_destroy(conf.in_samhdr);
+    }
+    if (conf.infile) {
+        sam_close(conf.infile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    return ret;
+}
diff --git a/samples/pileup_mod.c b/samples/pileup_mod.c
new file mode 100644
index 000000000..24d6cf539
--- /dev/null
+++ b/samples/pileup_mod.c
@@ -0,0 +1,218 @@
+/*  pileup_mod.c --  showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library APIs; it still needs proper error handling and optimization */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <htslib/sam.h>
+
+/// print_usage - show pileup_mod usage
+/** @param fp pointer to the file / terminal to which demo_usage to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: pileup_mod infile\n\
+Shows the pileup api usage with base modification.\n");
+}
+
+/// plpconf - input file data handed over to the pileup callbacks
+typedef struct plpconf {
+    char *inname;           //input file name
+    samFile *infile;        //opened input file
+    sam_hdr_t *in_samhdr;   //header read from the input file
+} plpconf;
+
+/// @brief plpconstructor
+/// @param data client data?
+/// @param b bam being loaded +/// @param cd client data +/// @return +int plpconstructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { + //plpconf *conf= (plpconf*)data; can use this to access anything required from the data in pileup init + + //when using cd, initialize and use as it will be reused after destructor + cd->p = hts_base_mod_state_alloc(); + if (!cd->p) { + printf("Failed to allocate base modification state\n"); + return 1; + } + + //parse the bam data and gather modification data from MM tags + return (-1 == bam_parse_basemod(b, (hts_base_mod_state*)cd->p)) ? 1 : 0; +} + +int plpdestructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { + if (cd->p) { + hts_base_mod_state_free((hts_base_mod_state *)cd->p); + cd->p = NULL; + } + return 0; +} + +/// @brief bam_plp_auto_f reads alignment data for pileup operation +/// @param data client callback data holding alignment file handle +/// @param b bamdata read +/// @return same as sam_read1 +int readdata(void *data, bam1_t *b) +{ + plpconf *conf = (plpconf*)data; + if (!conf || !conf->infile) { + return -2; //cant read data + } + + //read alignment and send + return sam_read1(conf->infile, conf->infile->bam_header, b); +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + int ret = EXIT_FAILURE; + bam1_t *bamdata = NULL; + plpconf conf = {0}; + bam_plp_t plpiter = NULL; + int tid = -1, depth = -1, j = 0, k = 0, inslen = 0, dellen = 0, modlen = 0; + #define NMODS 5 + hts_base_mod mods[NMODS] = {0}; //ACGT N + int refpos = -1; + const bam_pileup1_t *plp = NULL; + kstring_t insdata = KS_INITIALIZE; + + //infile + if (argc != 2) { + print_usage(stderr); + goto end; + } + conf.inname = argv[1]; + + //initialize + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input files + if (!(conf.infile = 
sam_open(conf.inname, "r"))) { + printf("Could not open %s\n", conf.inname); + goto end; + } + //read header + if (!(conf.in_samhdr = sam_hdr_read(conf.infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + if (!(plpiter = bam_plp_init(readdata, &conf))) { + printf("Failed to initialize pileup data\n"); + goto end; + } + + //set constructor destructor callbacks + bam_plp_constructor(plpiter, plpconstructor); + bam_plp_destructor(plpiter, plpdestructor); + + while ((plp = bam_plp_auto(plpiter, &tid, &refpos, &depth))) { + memset(&mods, 0, sizeof(mods)); + printf("%d\t%d\t", tid+1, refpos+1); + + for (j = 0; j < depth; ++j) { + dellen = 0; + + if (plp[j].is_del || plp[j].is_refskip) { + printf("*"); + continue; + } + /*invoke bam_mods_at_qpos before bam_plp_insertion_mod that the base modification + is retrieved before change in pileup pos thr' plp_insertion_mod call*/ + if ((modlen = bam_mods_at_qpos(plp[j].b, plp[j].qpos, plp[j].cd.p, mods, NMODS)) == -1) { + printf("Failed to get modifications\n"); + goto end; + } + + //use plp_insertion/_mod to get insertion and del at the same position + if ((inslen = bam_plp_insertion_mod(&plp[j], (hts_base_mod_state*)plp[j].cd.p, &insdata, &dellen)) == -1) { + printf("Failed to get insertion status\n"); + goto end; + } + + //start and end are displayed in UPPER and rest on LOWER, only 1st modification considered + //base and modification + printf("%c%c%c", plp[j].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + (plp[j].is_tail ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)])), + modlen > 0 ? mods[0].strand ? '-' : '+' : '\0', + modlen > 0 ? 
mods[0].modified_base : '\0'); + //insertion and deletions + if (plp[j].indel > 0) { + //insertion + /*insertion data from plp_insertion_mod, note this shows the quality value as well + which is different from base and modification above;the lower case display is not attempted either*/ + printf("+%d%s", plp[j].indel, insdata.s); + //handle deletion if any + if (dellen) { + printf("-%d", dellen); + for (k = 0; k < dellen; ++k) { + printf("?"); + } + } + } + else if (plp[j].indel < 0) { + //deletion + printf("%d", plp[j].indel); + for (k = 0; k < -plp[j].indel; ++k) { + printf("?"); + } + } + printf(" "); + } + printf("\n"); + fflush(stdout); + } + + ret = EXIT_SUCCESS; +end: + //clean up + if (conf.in_samhdr) { + sam_hdr_destroy(conf.in_samhdr); + } + if (conf.infile) { + sam_close(conf.infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (plpiter) { + bam_plp_destroy(plpiter); + } + ks_free(&insdata); + return ret; +} diff --git a/samples/read_aux.c b/samples/read_aux.c new file mode 100644 index 000000000..cbf972b98 --- /dev/null +++ b/samples/read_aux.c @@ -0,0 +1,207 @@ +/* read_aux.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_aux infile tag\n\ +Read the given aux tag from alignments either as SAM string or as raw data\n"); +} + +/// printauxdata - prints aux data +/** @param fp - file to which it to be printed - stdout or null + * @param type - aux type + * @param idx - index in array, -1 when not an array type + * @param data - data + * recurses when the data is array type +returns 1 on failure 0 on success +*/ +int printauxdata(FILE *fp, char type, int32_t idx, const uint8_t *data) +{ + uint32_t auxBcnt = 0; + int i = 0; + char auxBType = 'Z'; + + //the tag is already queried and ensured to exist and the type is retrieved from the tag data, also iterated within index for arrays, so no error is expected here. + //when these apis are used explicitly, these error conditions needs to be handled based on return value and errno + switch(type) { + case 'A': + fprintf(fp, "%c", bam_aux2A(data)); //byte data + break; + case 'c': + fprintf(fp, "%d", (int8_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //signed 1 byte data; bam_auxB2i - from array or bam_aux2i - non array data + break; + case 'C': + fprintf(fp, "%u", (uint8_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //unsigned 1 byte data + break; + case 's': + fprintf(fp, "%d", (int16_t)(idx > -1 ? 
bam_auxB2i(data, idx) : bam_aux2i(data))); //signed 2 byte data + break; + case 'S': + fprintf(fp, "%u", (uint16_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //unsigned 2 byte data + break; + case 'i': + fprintf(fp, "%d", (int32_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //signed 4 byte data + break; + case 'I': + fprintf(fp, "%u", (uint32_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data))); //unsigned 4 byte data + break; + case 'f': + case 'd': + fprintf(fp, "%g", (float)(idx > -1 ? bam_auxB2f(data, idx) : bam_aux2f(data))); //floating point data, 4 bytes + break; + case 'H': + case 'Z': + fprintf(fp, "%s", bam_aux2Z(data)); //array of char or hex data + break; + case 'B': //array of char/int/float + auxBcnt = bam_auxB_len(data); //length of array + auxBType = bam_aux_type(data + 1); //type of element in array + fprintf(fp, "%c", auxBType); + for (i = 0; i < auxBcnt; ++i) { //iterate the array + fprintf(fp, ","); + //calling recurssively with index to reuse a few lines + if (printauxdata(fp, auxBType, i, data) == EXIT_FAILURE) { + return EXIT_FAILURE; + } + } + break; + default: + printf("Invalid aux tag?\n"); + return EXIT_FAILURE; + break; + } + return EXIT_SUCCESS; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *tag = NULL; + int c = 0, ret = EXIT_FAILURE, ret_r = 0, i = 0; + sam_hdr_t *in_samhdr = NULL; + samFile *infile = NULL; + bam1_t *bamdata = NULL; + uint8_t *data = NULL; + kstring_t sdata = KS_INITIALIZE; + + //read_aux infile tag + if (argc != 3) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + tag = argv[2]; + + if (!(bamdata = bam_init1())) { + printf("Failed to allocate data memory!\n"); + goto end; + } + + //open input file + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + 
+ if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + errno = 0; i++; + ks_clear(&sdata); + if (i % 2) { //use options alternatively to demonstrate both + //option 1 - get data as string with tag and type + if ((c = bam_aux_get_str(bamdata, tag, &sdata)) == 1) { + printf("%s\n",sdata.s); + } + else if (c == 0 && errno == ENOENT) { + //tag not present + printf("Tag not present\n"); + } + else { + //error + printf("Failed to get tag\n"); + goto end; + } + } + else { + //option 2 - get raw data + if (!(data = bam_aux_get(bamdata, tag))) { + //tag data not returned, errono gives the reason + if (errno == ENOENT) { + printf("Tag not present\n"); + } + else { + printf("Invalid aux data\n"); + } + } + else { + //got the tag, read and print + if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) { + printf("Failed to read aux data\n"); + goto end; + } + printf("\n"); + } + } + } + if (ret_r < -1) { + //read error + printf("Failed to read data\n"); + goto end; + } + + ret = EXIT_SUCCESS; +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + ks_free(&sdata); + return ret; +} diff --git a/samples/read_bam.c b/samples/read_bam.c new file mode 100644 index 000000000..7fca8c55d --- /dev/null +++ b/samples/read_bam.c @@ -0,0 +1,139 @@ +/* read_bam.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_bam infile\n\ +Shows the alignment data from file\n"); +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *tidname = NULL, *flags = NULL; + int ret = EXIT_FAILURE; + sam_hdr_t *in_samhdr = NULL; + samFile *infile = NULL; + + int ret_r = 0, i = 0; + bam1_t *bamdata = NULL; + uint8_t *data = NULL; + uint32_t *cigar = NULL; + + + //read_bam infile + if (argc != 2) { + print_usage(stderr); + 
goto end; + } + inname = argv[1]; + + if (!(bamdata = bam_init1())) { + printf("Failed to allocate data memory!\n"); + goto end; + } + + //open input file + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) + { + //QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL [TAG:TYPE:VALUE]… + printf("NAME: %s\n", bam_get_qname(bamdata)); //get the query name using the macro + flags = bam_flag2str(bamdata->core.flag); //flags as string + printf("FLG: %d - %s\n", bamdata->core.flag, flags); //flag is available in core structure + free((void*)flags); + tidname = sam_hdr_tid2name(in_samhdr, bamdata->core.tid); + printf("RNAME/TID: %d - %s\n", bamdata->core.tid, tidname? tidname: "" ); //retrieves the target name using the value in bam and by referring the header + printf("POS: %"PRIhts_pos"\n", bamdata->core.pos + 1); //internally position is 0 based and on text output / SAM it is 1 based + printf("MQUAL: %d\n", bamdata->core.qual); //map quality value + + cigar = bam_get_cigar(bamdata); //retrieves the cigar data + printf("CGR: "); + for (i = 0; i < bamdata->core.n_cigar; ++i) { //no. 
of cigar data entries + printf("%d%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i])); //the macros gives the count of operation and the symbol of operation for given cigar entry + } + printf("\nTLEN/ISIZE: %"PRIhts_pos"\n", bamdata->core.isize); + + data = bam_get_seq(bamdata); //get the sequence data + if (bamdata->core.l_qseq != bam_cigar2qlen(bamdata->core.n_cigar, cigar)) { //checks the length with CIGAR and query + printf("\nLength doesnt matches to cigar data\n"); + goto end; + } + + printf("SEQ: "); + for (i = 0; i < bamdata->core.l_qseq ; ++i) { //sequence length + printf("%c", seq_nt16_str[bam_seqi(data, i)]); //retrieves the base from (internal compressed) sequence data + } + printf("\nQUAL: "); + for (int i = 0; i < bamdata->core.l_qseq ; ++i) { + printf("%c", bam_get_qual(bamdata)[i]+33); //retrives the quality value + } + printf("\n\n"); + } + + if (ret_r == -1) { + // no error! + ret = EXIT_SUCCESS; + } + else { + printf("Failed to read data\n"); + } +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/read_fast.c b/samples/read_fast.c new file mode 100644 index 000000000..f74b25515 --- /dev/null +++ b/samples/read_fast.c @@ -0,0 +1,116 @@ +/* read_fast.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - show flags_demo usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_fast \n\ +Reads the fasta/fastq file and shows the content.\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; //input file name + int c = 0, ret = EXIT_FAILURE; + samFile *infile = NULL; //sam file + sam_hdr_t *in_samhdr = NULL; //header of file + bam1_t *bamdata = NULL; //to hold the read data + + if (argc != 2) { + print_usage(stdout); + goto end; + } + inname = 
argv[1]; + + //initialize + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input files - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + if (infile->format.format != fasta_format && infile->format.format != fastq_format) { + printf("Invalid file specified\n"); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf( "Failed to read header from file\n"); + goto end; + } + + //read data + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + printf("\nsequence: "); + for (c = 0; c < bamdata->core.l_qseq; ++c) { + printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bamdata), c)]); + } + if (infile->format.format == fastq_format) { + printf("\nquality: "); + for (c = 0; c < bamdata->core.l_qseq; ++c) { + printf("%c", bam_get_qual(bamdata)[c] + 33); //qualities are kept as raw phred values, add 33 to show the printable FASTQ character + } + } + } + if (c != -1) { + //error + printf("Failed to get data\n"); + goto end; + } + //else -1 / EOF + ret = EXIT_SUCCESS; +end: + //clean up + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/read_header.c b/samples/read_header.c new file mode 100644 index 000000000..eb14daea5 --- /dev/null +++ b/samples/read_header.c @@ -0,0 +1,173 @@ +/* read_header.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_header infile header [id val] [tag]\n\ +This shows given tag from given header or the whole line\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *header = NULL, *tag = NULL, *idval = NULL; + char *id = NULL; + int c = 0, ret = EXIT_FAILURE, linecnt = 0; + samFile *infile = NULL; + sam_hdr_t *in_samhdr = NULL; + kstring_t data = KS_INITIALIZE; + + //read_header infile 
header tag + if (argc < 3 || argc > 6) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + header = argv[2]; + if (argc == 4) { //header and tag + tag = argv[3]; + //find unique identifier field name for requested header type + if (header[0] == 'H' && header[1] == 'D') { + id = NULL; + } + else if (header[0] == 'S' && header[1] == 'Q') { + id = "SN"; + } + else if (header[0] == 'R' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'P' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'C' && header[1] == 'O') { + id = ""; + } + else { + printf("Invalid header type\n"); + goto end; + } + } + else if (argc == 5) { //header id val + id = argv[3]; + idval = argv[4]; + } + else if (argc == 6) { //header id val tag + id = argv[3]; + idval = argv[4]; + tag = argv[5]; + } + + //open input files + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + if (id && idval) { + if (tag) { + ret = sam_hdr_find_tag_id(in_samhdr, header, id, idval, tag, &data); + } + else { + ret = sam_hdr_find_line_id(in_samhdr, header, id, idval, &data); + } + + if (ret == 0) { + printf("%s\n", data.s); + } + else if (ret == -1) { + printf("No matching tag found\n"); + goto end; + } + else { + printf("Failed to find header line\n"); + goto end; + } + } + else { + //get count of given header type + linecnt = sam_hdr_count_lines(in_samhdr, header); + if (linecnt == 0) { + printf("No matching line found\n"); + goto end; + } + for (c = 0; c < linecnt; ++c ) { + if (tag) { + //non CO, get the tag requested + ret = sam_hdr_find_tag_pos(in_samhdr, header, c, tag, &data); + } + else { + //CO header, there are no tags but the whole line + ret = sam_hdr_find_line_pos(in_samhdr, header, c, &data); + } + + if (ret == 0) { + printf("%s\n", data.s); + continue; + } + else if (ret == -1) { + printf("Tag 
not present\n"); + continue; + } + else { + printf("Failed to get tag\n"); + goto end; + } + } + } + ret = EXIT_SUCCESS; + +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + ks_free(&data); + return ret; +} diff --git a/samples/read_refname.c b/samples/read_refname.c new file mode 100644 index 000000000..adbc71183 --- /dev/null +++ b/samples/read_refname.c @@ -0,0 +1,125 @@ +/* read_refname.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_refname infile minsize\n\ +This shows name of references which has length above the given size\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *id = NULL; + int c = 0, ret = EXIT_FAILURE, linecnt = 0, pos = 0, status = 0; //status - result of header queries, ret - exit status only + samFile *infile = NULL; + sam_hdr_t *in_samhdr = NULL; + kstring_t data = KS_INITIALIZE; + int64_t minsize = 0, size = 0; + + if (argc != 3 && argc != 2) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + if (argc == 3) { + minsize = atoll(argv[2]); + } + + //open input files + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + linecnt = sam_hdr_count_lines(in_samhdr, "SQ"); //get reference count + if (linecnt <= 0) { + if (!linecnt) { + printf("No reference line present\n"); + } + else { + printf("Failed to get reference line count\n"); + } + goto end; + } + //iterate and check each reference's length + for (pos = 1, c = 0; c < linecnt; ++c) { + if ((status = sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "LN", &data)) == -2) { //assignment parenthesised so that status holds the api result and not the comparison result; ret stays EXIT_FAILURE for every error exit + printf("Failed to get length\n"); + goto 
end; + } + else if (status == -1) { + //length not present, ignore + continue; + } + //else have length + size = atoll(data.s); + if (size < minsize) { + //not required + continue; + } + if (!(id = sam_hdr_line_name(in_samhdr, "SQ", c))) { //sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "SN", &data) can also do the same! + printf("Failed to get id for reference data\n"); + goto end; + } + printf("%d,%s,%s\n", pos, id, data.s); + pos++; + } + + ret = EXIT_SUCCESS; + +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + ks_free(&data); + return ret; +} diff --git a/samples/rem_header.c b/samples/rem_header.c new file mode 100644 index 000000000..a0b6510fb --- /dev/null +++ b/samples/rem_header.c @@ -0,0 +1,138 @@ +/* rem_header.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: rem_header infile header [id]\n\ +Removes header line of given type and id\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *header = NULL, *idval = NULL; + char *id = NULL; + int ret = EXIT_FAILURE; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + + //update_header infile header idval tag value + if (argc <3 || argc > 4) { + //3 & 4 are ok, 3-> all of given header type, 4->given id of given header type to be removed + print_usage(stderr); + goto end; + } + inname = argv[1]; + header = argv[2]; + if (argc == 4) { + idval = argv[3]; + } + + //unique identifier for each of the header types + if (header[0] == 'H' && header[1] == 'D') { + id = NULL; + } + else if (header[0] == 'S' && header[1] == 'Q') { + id = "SN"; + } + else if (header[0] == 'R' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'P' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'C' && header[1] == 'O') { + //CO field can be removed using the position of it using sam_hdr_remove_line_pos + id = ""; + } + else { + printf("Invalid header type\n"); + goto end; + } + + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); 
+ goto end; + } + if (!(outfile = sam_open("-", "w"))) { //use stdout as the output file for ease of display of update + printf("Could not open stdout\n"); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + if (idval) { + //remove specific line + if (sam_hdr_remove_line_id(in_samhdr, header, id, idval)) { + printf("Failed to remove header line\n"); + goto end; + } + } + else { + //remove multiple lines of a header type + if (sam_hdr_remove_lines(in_samhdr, header, id, NULL)) { + printf("Failed to remove header line\n"); + goto end; + } + } + //write output + if (sam_hdr_write(outfile, in_samhdr) < 0) { + printf("Failed to write output\n"); + goto end; + } + ret = EXIT_SUCCESS; + //bam data write to follow.... +end: + //cleanupq + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + return ret; +} diff --git a/samples/sample.ref.fa b/samples/sample.ref.fa new file mode 100644 index 000000000..5789e8c42 --- /dev/null +++ b/samples/sample.ref.fa @@ -0,0 +1,4 @@ +>T1 T1:1-40 +AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT +>T2 T2:1:40 +TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT diff --git a/samples/sample.sam b/samples/sample.sam new file mode 100644 index 000000000..e56efd69f --- /dev/null +++ b/samples/sample.sam @@ -0,0 +1,29 @@ +@HD VN:1.17 SO:unknown +@SQ SN:T1 LN:40 +@SQ SN:T2 LN:40 +@CO @SQ SN* LN* AH AN AS DS M5 SP TP UR +@CO @RG ID* BC CN DS DT FO KS LB PG PI PL PM PU SM +@CO @PG ID* PN CL PP DS VN +@CO this is a dummy alignment file to demonstrate different abilities of hts apis +@CO QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL [TAG:TYPE:VALUE]… +@CO 1234567890123456789012345678901234567890 +@CO AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT T1 +@CO TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT T2 +@CO ITR1-ITR2M, ITR2-ITR2M are proper pairs in T1 and T2, UNMP1 is partly mapped and pair is 
unmapped, UNMP2 & 3 are unmappped +@CO A1-A2, A4-A3 are proper pairs with A4-A3 in different read order. A5 is secondary alignment +ITR1 99 T1 5 40 4M = 33 10 ACTG ()() +ITR2 147 T2 23 49 2M = 35 -10 TT ** +ITR2M 99 T2 35 51 2M = 23 10 AA && +ITR1M 147 T1 33 37 4M = 5 -10 ACTG $$$$ +UNMP1 73 T1 21 40 3M * 0 5 GGG &&1 +UNMP2 141 * 0 0 * * 0 7 AA && +UNMP3 77 * 0 0 * * 0 5 GGG &&2 +A1 99 T1 25 35 6M = 31 8 ACTGTT ****** +A2 147 T1 31 33 6M = 25 -8 ACTGTT ()()() +A3 147 T2 23 47 2M1X = 12 -5 TTG ((( +A4 99 T2 12 50 3M = 23 5 GAA ()( +A5 355 T1 25 35 4M = 33 5 ACTG PPPP +B1 99 T1 25 35 6M = 31 8 GCTATT ****** +B3 147 T2 23 47 2M1X = 12 -5 TAG ((( +B4 99 T2 12 50 3M = 23 5 GAT ()( +B5 355 T1 25 35 4M = 33 5 AGTG PPPP diff --git a/samples/split.c b/samples/split.c new file mode 100644 index 000000000..2eb9e6b79 --- /dev/null +++ b/samples/split.c @@ -0,0 +1,153 @@ +/* split.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: split infile outdir\n\ +Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given directory\n\ +Shows the basic writing of output\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *outdir = NULL; + char *file1 = NULL, *file2 = NULL; + int c = 0, ret = EXIT_FAILURE, size = 0; + samFile *infile = NULL, *outfile1 = NULL, *outfile2 = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + + if (argc != 3) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + outdir = argv[2]; + + //allocate space for output + size = sizeof(char) * (strlen(outdir) + sizeof("/1.sam") + 1); //space for output file name and null termination + file1 = malloc(size); + file2 = malloc(size); + if (!file1 || !file2) { + printf("Failed to set output path\n"); + goto end; + } + + //output file names + snprintf(file1, size, "%s/1.sam", outdir); //for SAM output + snprintf(file2, size, "%s/2.bam", outdir); //for BAM output + //bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input file - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + 
goto end; + } + //open output files - w write as SAM, wb write as BAM + outfile1 = sam_open(file1, "w"); //as SAM + outfile2 = sam_open(file2, "wb"); //as BAM + if (!outfile1 || !outfile2) { + printf("Could not open output file\n"); + goto end; + } + + //read header, required to resolve the target names to proper ids + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1)) { + printf("Failed to write header\n"); + goto end; + } + + //check flags and write + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + else if (bamdata->core.flag & BAM_FREAD2) { + if (sam_write1(outfile2, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + } + if (-1 == c) { + //EOF + ret = EXIT_SUCCESS; + } + else { + printf("Error in reading data\n"); + } +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (file1) { + free(file1); + } + if (file2) { + free(file2); + } + if (outfile1) { + sam_close(outfile1); + } + if (outfile2) { + sam_close(outfile2); + } + return ret; +} diff --git a/samples/split2.c b/samples/split2.c new file mode 100644 index 000000000..2354abfe3 --- /dev/null +++ b/samples/split2.c @@ -0,0 +1,158 @@ +/* split2.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: split infile outdir\n\ +Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given directory\n\ +Shows file type selection through name and format api\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *outdir = NULL; + char *file1 = NULL, *file2 = NULL, mode1[5] = "w", mode2[5] = "w"; + int c = 0, ret = EXIT_FAILURE, size = 0; + samFile *infile = 
NULL, *outfile1 = NULL, *outfile2 = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + + if (argc != 3) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + outdir = argv[2]; + + //allocate space for output + size = sizeof(char) * (strlen(outdir) + sizeof("/1.sam.gz") + 1); //space for output file name and null termination + file1 = malloc(size); + file2 = malloc(size); + if (!file1 || !file2) { + printf("Failed to set output path\n"); + goto end; + } + + //output file names + snprintf(file1, size, "%s/1.sam.gz", outdir); //name of Read1 file + snprintf(file2, size, "%s/2.sam", outdir); //name of Read2 file + //bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //set file open mode based on file name for 1st and as explicit for 2nd + if ((sam_open_mode(mode1+1, file1, NULL) == -1) || (sam_open_mode(mode2+1, file2, "sam.gz") == -1)) { + printf("Failed to set open mode\n"); + goto end; + } + //open input file + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //open output files + outfile1 = sam_open(file1, mode1); //as compressed SAM through sam_open + outfile2 = sam_open_format(file2, mode2, NULL); //as compressed SAM through sam_open_format + if (!outfile1 || !outfile2) { + printf("Could not open output file\n"); + goto end; + } + + //read header, required to resolve the target names to proper ids + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1)) { + printf("Failed to write header\n"); + goto end; + } + + //check flags and write + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + else if 
(bamdata->core.flag & BAM_FREAD2) { + if (sam_write1(outfile2, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + } + if (-1 == c) { + //EOF + ret = EXIT_SUCCESS; + } + else { + printf("Error in reading data\n"); + } +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (file1) { + free(file1); + } + if (file2) { + free(file2); + } + if (outfile1) { + sam_close(outfile1); + } + if (outfile2) { + sam_close(outfile2); + } + return ret; +} diff --git a/samples/split_thread1.c b/samples/split_thread1.c new file mode 100644 index 000000000..40d2dfdc2 --- /dev/null +++ b/samples/split_thread1.c @@ -0,0 +1,161 @@ +/* split_thread1.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: split_t1 infile outdir\n\ +Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given directory\n\ +Shows the usage of basic thread in htslib\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *outdir = NULL; + char *file1 = NULL, *file2 = NULL; + int c = 0, ret = EXIT_FAILURE, size = 0; + samFile *infile = NULL, *outfile1 = NULL, *outfile2 = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + + if (argc != 3) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + outdir = argv[2]; + + //allocate space for output + size = sizeof(char) * (strlen(outdir) + sizeof("/1.sam") + 1); //space for output file name and null termination + file1 = malloc(size); + file2 = malloc(size); + if (!file1 || !file2) { + printf("Failed to set output path\n"); + goto end; + } + + //output file names + snprintf(file1, size, "%s/1.sam", outdir); //for SAM output + snprintf(file2, size, "%s/2.bam", outdir); //for BAM output + //bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input file - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", 
inname); + goto end; + } + //open output files - w write as SAM, wb write as BAM + outfile1 = sam_open(file1, "w"); //as SAM + outfile2 = sam_open(file2, "wb"); //as BAM + if (!outfile1 || !outfile2) { + printf("Could not open output file\n"); + goto end; + } + + //create file specific threads + if (hts_set_opt(infile, HTS_OPT_NTHREADS, 2) < 0 || //2 thread specific for reading + hts_set_opt(outfile1, HTS_OPT_NTHREADS, 1) < 0 || //1 thread specific for sam write + hts_set_opt(outfile2, HTS_OPT_NTHREADS, 1) < 0) { //1 thread specific for bam write + printf("Failed to set thread options\n"); + goto end; + } + + //read header, required to resolve the target names to proper ids + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1)) { + printf("Failed to write header\n"); + goto end; + } + + //check flags and write + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + else if (bamdata->core.flag & BAM_FREAD2) { + if (sam_write1(outfile2, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + } + if (-1 == c) { + //EOF + ret = EXIT_SUCCESS; + } + else { + printf("Error in reading data\n"); + } +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (file1) { + free(file1); + } + if (file2) { + free(file2); + } + if (outfile1) { + sam_close(outfile1); + } + if (outfile2) { + sam_close(outfile2); + } + return ret; +} diff --git a/samples/split_thread2.c b/samples/split_thread2.c new file mode 100644 index 000000000..dab897b5f --- /dev/null +++ b/samples/split_thread2.c @@ -0,0 +1,171 @@ +/* 
split_thread2.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: split_t2 infile outdir\n\ +Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given directory\n\ +Shows the usage of thread pool\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *outdir = NULL; + char *file1 = NULL, *file2 = NULL; + int c = 
0, ret = EXIT_FAILURE, size = 0; + samFile *infile = NULL, *outfile1 = NULL, *outfile2 = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + htsThreadPool tpool = {NULL, 0}; + + if (argc != 3) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + outdir = argv[2]; + + //allocate space for output + size = sizeof(char) * (strlen(outdir) + sizeof("/1.sam") + 1); //space for output file name and null termination + file1 = malloc(size); + file2 = malloc(size); + if (!file1 || !file2) { + printf("Failed to set output path\n"); + goto end; + } + + //output file names + snprintf(file1, size, "%s/1.sam", outdir); //for SAM output + snprintf(file2, size, "%s/2.bam", outdir); //for BAM output + //bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input file - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //open output files - w write as SAM, wb write as BAM + outfile1 = sam_open(file1, "w"); //as SAM + outfile2 = sam_open(file2, "wb"); //as BAM + if (!outfile1 || !outfile2) { + printf("Could not open output file\n"); + goto end; + } + + //create a pool of 4 threads + if (!(tpool.pool = hts_tpool_init(4))) { + printf("Failed to initialize the thread pool\n"); + goto end; + } + //share the pool with all the 3 files + if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 || + hts_set_opt(outfile1, HTS_OPT_THREAD_POOL, &tpool) < 0 || + hts_set_opt(outfile2, HTS_OPT_THREAD_POOL, &tpool) < 0) { + printf("Failed to set thread options\n"); + goto end; + } + + //read header, required to resolve the target names to proper ids + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1)) { + printf("Failed to write header\n"); + goto end; + } + + //check flags and write + 
while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + else if (bamdata->core.flag & BAM_FREAD2) { + if (sam_write1(outfile2, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + } + if (-1 == c) { + //EOF + ret = EXIT_SUCCESS; + } + else { + printf("Error in reading data\n"); + } +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (file1) { + free(file1); + } + if (file2) { + free(file2); + } + if (outfile1) { + sam_close(outfile1); + } + if (outfile2) { + sam_close(outfile2); + } + if (tpool.pool) { + hts_tpool_destroy(tpool.pool); + } + return ret; +} diff --git a/samples/update_header.c b/samples/update_header.c new file mode 100644 index 000000000..f6b1680cd --- /dev/null +++ b/samples/update_header.c @@ -0,0 +1,131 @@ +/* update_header.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: update_header infile header idval tag value\n\ +Updates the tag's value on line given in id on header of given type\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *tag = NULL, *idval = NULL, *val = NULL, *header = NULL; + char *id = NULL; + int ret = EXIT_FAILURE; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + + //update_header infile header idval tag value + if (argc != 6) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + header = argv[2]; + idval = argv[3]; + tag = argv[4]; + val = argv[5]; + + //unique identifier for each of the header types + if (header[0] == 'H' && header[1] == 'D') { + id = NULL; + printf("This sample doesnt not support modifying HD fields\n"); + } + else if (header[0] == 'S' && header[1] == 'Q') { + id = "SN"; + } + else if (header[0] == 'R' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'P' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'C' && header[1] == 'O') { + tag = NULL; + id = ""; + printf("This sample doesnt not support modifying CO fields\n"); + } + else { + printf("Invalid header type\n"); + goto end; + } + + if (!(infile = sam_open(inname, "r"))) { + 
printf("Could not open %s\n", inname); + goto end; + } + if (!(outfile = sam_open("-", "w"))) { //use stdout as the output file for ease of display of update + printf("Could not open stdout\n"); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + //update with new data + if (sam_hdr_update_line(in_samhdr, header, id, idval, tag, val, NULL) < 0) { + printf("Failed to update data\n"); + goto end; + } + //write output + if (sam_hdr_write(outfile, in_samhdr) < 0) { + printf("Failed to write output\n"); + goto end; + } + ret = EXIT_SUCCESS; + //bam data write to follow.... +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + return ret; +} diff --git a/samples/write_fast.c b/samples/write_fast.c new file mode 100644 index 000000000..ef7817683 --- /dev/null +++ b/samples/write_fast.c @@ -0,0 +1,101 @@ +/* write_fast.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - show flags_demo usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: write_fast \n\ +Appends a fasta/fastq file.\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *outname = NULL; //output file name + int ret = EXIT_FAILURE; + samFile *outfile = NULL; //sam file + sam_hdr_t *out_samhdr = NULL; //header of file + bam1_t *bamdata = NULL; //to hold the read data + char mode[4] = "a"; + + if (argc != 2) { + print_usage(stdout); + goto end; + } + outname = argv[1]; + + //initialize + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + if (sam_open_mode(mode + 1, outname, NULL) < 0) { + printf("Invalid file name\n"); + goto end; + } + //open output file + if (!(outfile = sam_open(outname, mode))) { + printf("Could not open %s\n", outname); + goto end; + } + //dummy data + if (bam_set1(bamdata, sizeof("test"), "test", BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, 10, "AACTGACTGA", "1234567890", 0) < 0) { + printf("Failed to set data\n"); + goto end; + } + if (sam_write1(outfile, out_samhdr, bamdata) < 0) { + printf("Failed to write data\n"); + goto end; + } + + ret = EXIT_SUCCESS; +end: + //clean up + if (out_samhdr) { + sam_hdr_destroy(out_samhdr); + } + if (outfile) { + sam_close(outfile); + } + if (bamdata) { + 
bam_destroy1(bamdata); + } + return ret; +} diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index f51723300..acb208488 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1,6 +1,6 @@ /* synced_bcf_reader.c -- stream through multiple VCF files. - Copyright (C) 2012-2020 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. Author: Petr Danecek @@ -67,6 +67,7 @@ region_t; typedef struct { sr_sort_t sort; + int regions_overlap, targets_overlap; } aux_t; @@ -74,6 +75,8 @@ static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, static bcf_sr_regions_t *_regions_init_string(const char *str); static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec); static void _regions_sort_and_merge(bcf_sr_regions_t *reg); +static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end, int missed_reg_handler); +static void bcf_sr_seek_start(bcf_srs_t *readers); char *bcf_sr_strerror(int errnum) { @@ -123,6 +126,18 @@ int bcf_sr_set_opt(bcf_srs_t *readers, bcf_sr_opt_t opt, ...) 
BCF_SR_AUX(readers)->sort.pair = va_arg(args, int); return 0; + case BCF_SR_REGIONS_OVERLAP: + va_start(args, opt); + BCF_SR_AUX(readers)->regions_overlap = va_arg(args, int); + if ( readers->regions ) readers->regions->overlap = BCF_SR_AUX(readers)->regions_overlap; + return 0; + + case BCF_SR_TARGETS_OVERLAP: + va_start(args, opt); + BCF_SR_AUX(readers)->targets_overlap = va_arg(args, int); + if ( readers->targets ) readers->targets->overlap = BCF_SR_AUX(readers)->targets_overlap; + return 0; + default: break; } @@ -171,21 +186,29 @@ static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters) int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file) { - assert( !readers->regions ); - if ( readers->nreaders ) + if ( readers->nreaders || readers->regions ) { - hts_log_error("Must call bcf_sr_set_regions() before bcf_sr_add_reader()"); - return -1; + if ( readers->regions ) bcf_sr_regions_destroy(readers->regions); + readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2); + bcf_sr_seek_start(readers); + return 0; } + readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2); if ( !readers->regions ) return -1; readers->explicit_regs = 1; readers->require_index = REQUIRE_IDX_; + readers->regions->overlap = BCF_SR_AUX(readers)->regions_overlap; return 0; } + int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles) { - assert( !readers->targets ); + if ( readers->nreaders || readers->targets ) + { + hts_log_error("Must call bcf_sr_set_targets() before bcf_sr_add_reader()"); + return -1; + } if ( targets[0]=='^' ) { readers->targets_exclude = 1; @@ -194,6 +217,7 @@ int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int readers->targets = bcf_sr_regions_init(targets,is_file,0,1,-2); if ( !readers->targets ) return -1; readers->targets_als = alleles; + readers->targets->overlap = BCF_SR_AUX(readers)->targets_overlap; return 0; } @@ -386,6 +410,8 @@ bcf_srs_t 
*bcf_sr_init(void) bcf_srs_t *files = (bcf_srs_t*) calloc(1,sizeof(bcf_srs_t)); files->aux = (aux_t*) calloc(1,sizeof(aux_t)); bcf_sr_sort_init(&BCF_SR_AUX(files)->sort); + bcf_sr_set_opt(files,BCF_SR_REGIONS_OVERLAP,1); + bcf_sr_set_opt(files,BCF_SR_TARGETS_OVERLAP,0); return files; } @@ -394,8 +420,12 @@ static void bcf_sr_destroy1(bcf_sr_t *reader) free(reader->fname); if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx); if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx); - bcf_hdr_destroy(reader->header); - hts_close(reader->file); + if (reader->header) { + bcf_hdr_destroy(reader->header); + } + if (reader->file) { + hts_close(reader->file); + } if ( reader->itr ) tbx_itr_destroy(reader->itr); int j; for (j=0; jmbuffer; j++) @@ -540,6 +570,35 @@ static int _readers_next_region(bcf_srs_t *files) return 0; } +static void _set_variant_boundaries(bcf1_t *rec, hts_pos_t *beg, hts_pos_t *end) +{ + hts_pos_t off; + if ( rec->n_allele ) + { + off = rec->rlen; + bcf_unpack(rec, BCF_UN_STR); + int i; + for (i=1; in_allele; i++) + { + // Make symbolic alleles start at POS, although this is not strictly true for + // , where POS should be the position BEFORE the deletion/insertion. + // However, since arbitrary symbolic alleles can be defined by the user, we + // will simplify the interpretation of --targets-overlap and --region-overlap. 
+ int j = 0; + char *ref = rec->d.allele[0]; + char *alt = rec->d.allele[i]; + while ( ref[j] && alt[j] && ref[j]==alt[j] ) j++; + if ( off > j ) off = j; + if ( !off ) break; + } + } + else + off = 0; + + *beg = rec->pos + off; + *end = rec->pos + rec->rlen - 1; +} + /* * _reader_fill_buffer() - buffers all records with the same coordinate */ @@ -571,7 +630,9 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) { if ( reader->file->format.format==vcf ) { - if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break; // no more lines + ret = hts_getline(reader->file, KS_SEP_LINE, &files->tmps); + if ( ret < -1 ) files->errnum = bcf_read_error; + if ( ret < 0 ) break; // no more lines or an error ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); if ( ret<0 ) { files->errnum = vcf_parse_error; break; } } @@ -589,7 +650,9 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) } else if ( reader->tbx_idx ) { - if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break; // no more lines + ret = tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps); + if ( ret < -1 ) files->errnum = bcf_read_error; + if ( ret < 0 ) break; // no more lines or an error ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); if ( ret<0 ) { files->errnum = vcf_parse_error; break; } } @@ -601,8 +664,27 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]); } - // prevent creation of duplicates from records overlapping multiple regions - if ( files->regions && reader->buffer[reader->nbuffer+1]->pos <= files->regions->prev_end ) continue; + // Prevent creation of duplicates from records overlapping multiple regions + // and recognize true variant overlaps vs record overlaps (e.g. 
TA>T vs A>-) + if ( files->regions ) + { + hts_pos_t beg, end; + if ( BCF_SR_AUX(files)->regions_overlap==0 ) + beg = end = reader->buffer[reader->nbuffer+1]->pos; + else if ( BCF_SR_AUX(files)->regions_overlap==1 ) + { + beg = reader->buffer[reader->nbuffer+1]->pos; + end = reader->buffer[reader->nbuffer+1]->pos + reader->buffer[reader->nbuffer+1]->rlen - 1; + } + else if ( BCF_SR_AUX(files)->regions_overlap==2 ) + _set_variant_boundaries(reader->buffer[reader->nbuffer+1], &beg,&end); + else + { + hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap); + exit(1); + } + if ( beg <= files->regions->prev_end || end < files->regions->start || beg > files->regions->end ) continue; + } // apply filter if ( !reader->nfilter_ids ) @@ -614,8 +696,8 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) } reader->nbuffer++; - if ( files->require_index==ALLOW_NO_IDX_ && reader->buffer[reader->nbuffer]->rid != reader->buffer[1]->rid ) break; - if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full + if ( reader->buffer[reader->nbuffer]->rid != reader->buffer[1]->rid ) break; + if ( reader->read_one_record_only || reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full } if ( ret<0 ) { @@ -632,22 +714,18 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) } /* - * _readers_shift_buffer() - removes the first line and all subsequent lines with the same position + * _readers_shift_buffer() - removes the first line */ static void _reader_shift_buffer(bcf_sr_t *reader) { + if ( !reader->nbuffer ) return; int i; + bcf1_t *tmp = reader->buffer[1]; for (i=2; i<=reader->nbuffer; i++) - if ( reader->buffer[i]->pos!=reader->buffer[1]->pos ) break; - if ( i<=reader->nbuffer ) - { - // A record with a different position follows, swap it. Because of the reader's logic, - // only one such line can be present. 
- bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[i]; reader->buffer[i] = tmp; - reader->nbuffer = 1; - } - else - reader->nbuffer = 0; // no other line + reader->buffer[i-1] = reader->buffer[i]; + if ( reader->nbuffer > 1 ) + reader->buffer[reader->nbuffer] = tmp; + reader->nbuffer--; } static int next_line(bcf_srs_t *files) @@ -698,19 +776,38 @@ static int next_line(bcf_srs_t *files) // Skip this position if not present in targets if ( files->targets ) { - int ret = bcf_sr_regions_overlap(files->targets, chr, min_pos, min_pos); - if ( (!files->targets_exclude && ret<0) || (files->targets_exclude && !ret) ) + int match = 0; + for (i=0; inreaders; i++) + { + if ( !files->readers[i].nbuffer || files->readers[i].buffer[1]->pos!=min_pos ) continue; + hts_pos_t beg, end; + if ( BCF_SR_AUX(files)->targets_overlap==0 ) + beg = end = min_pos; + else if ( BCF_SR_AUX(files)->targets_overlap==1 ) + { + beg = min_pos; + end = min_pos + files->readers[i].buffer[1]->rlen - 1; + } + else if ( BCF_SR_AUX(files)->targets_overlap==2 ) + _set_variant_boundaries(files->readers[i].buffer[1], &beg,&end); + else + { + hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap); + exit(1); + } + int overlap = bcf_sr_regions_overlap(files->targets, chr, beg, end)==0 ? 
1 : 0; + if ( (!files->targets_exclude && !overlap) || (files->targets_exclude && overlap) ) + _reader_shift_buffer(&files->readers[i]); + else + match = 1; + } + if ( !match ) { - // Remove all lines with this position from the buffer - for (i=0; inreaders; i++) - if ( files->readers[i].nbuffer && files->readers[i].buffer[1]->pos==min_pos ) - _reader_shift_buffer(&files->readers[i]); min_pos = HTS_POS_MAX; chr = NULL; continue; } } - break; // done: chr and min_pos are set } if ( !chr ) return 0; @@ -752,6 +849,11 @@ static void bcf_sr_seek_start(bcf_srs_t *readers) for (i=0; inseqs; i++) reg->regs[i].creg = -1; reg->iseq = 0; + reg->start = -1; + reg->end = -1; + reg->prev_seq = -1; + reg->prev_start = -1; + reg->prev_end = -1; } @@ -765,8 +867,18 @@ int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos) bcf_sr_seek_start(readers); return 0; } - bcf_sr_regions_overlap(readers->regions, seq, pos, pos); + int i, nret = 0; + + // Need to position both the readers and the regions. The latter is a bit of a mess + // because we can have in memory or external regions. The safe way is: + // - reset all regions as if they were not read from at all (bcf_sr_seek_start) + // - find the requested iseq (stored in the seq_hash) + // - position regions to the requested position (bcf_sr_regions_overlap) + bcf_sr_seek_start(readers); + if ( khash_str2int_get(readers->regions->seq_hash, seq, &i)>=0 ) readers->regions->iseq = i; + _bcf_sr_regions_overlap(readers->regions, seq, pos, pos, 0); + for (i=0; inreaders; i++) { nret += _reader_seek(&readers->readers[i],seq,pos,MAX_CSI_COOR-1); @@ -924,6 +1036,9 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg) } // File name or a list of genomic locations. If file name, NULL is returned. +// Recognises regions in the form chr, chr:pos, chr:beg-end, chr:beg-, {weird-chr-name}:pos. +// Cannot use hts_parse_region() as that requires the header and if header is not present, +// wouldn't learn the chromosome name. 
static bcf_sr_regions_t *_regions_init_string(const char *str) { bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); @@ -935,9 +1050,23 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) hts_pos_t from, to; while ( 1 ) { - while ( *ep && *ep!=',' && *ep!=':' ) ep++; tmp.l = 0; - kputsn(sp,ep-sp,&tmp); + if ( *ep=='{' ) + { + while ( *ep && *ep!='}' ) ep++; + if ( !*ep ) + { + hts_log_error("Could not parse the region, mismatching braces in: \"%s\"", str); + goto exit_nicely; + } + ep++; + kputsn(sp+1,ep-sp-2,&tmp); + } + else + { + while ( *ep && *ep!=',' && *ep!=':' ) ep++; + kputsn(sp,ep-sp,&tmp); + } if ( *ep==':' ) { sp = ep+1; @@ -945,7 +1074,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( sp==ep ) { hts_log_error("Could not parse the region(s): %s", str); - free(reg); free(tmp.s); return NULL; + goto exit_nicely; } if ( !*ep || *ep==',' ) { @@ -956,7 +1085,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( *ep!='-' ) { hts_log_error("Could not parse the region(s): %s", str); - free(reg); free(tmp.s); return NULL; + goto exit_nicely; } ep++; sp = ep; @@ -964,22 +1093,32 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( *ep && *ep!=',' ) { hts_log_error("Could not parse the region(s): %s", str); - free(reg); free(tmp.s); return NULL; + goto exit_nicely; } if ( sp==ep ) to = MAX_CSI_COOR-1; _regions_add(reg, tmp.s, from, to); if ( !*ep ) break; sp = ep; } - else + else if ( !*ep || *ep==',' ) { if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1); if ( !*ep ) break; sp = ++ep; } + else + { + hts_log_error("Could not parse the region(s): %s", str); + goto exit_nicely; + } } free(tmp.s); return reg; + +exit_nicely: + bcf_sr_regions_destroy(reg); + free(tmp.s); + return NULL; } // ichr,ifrom,ito are 0-based; @@ -1009,7 +1148,7 @@ static int _regions_parse_line(char *line, int ichr, int ifrom, int ito, char ** if ( k==l ) { *from = *to = hts_parse_decimal(ss, 
&tmp, 0); - if ( tmp==ss ) return -1; + if ( tmp==ss || (*tmp && *tmp!='\t') ) return -1; } else { @@ -1017,7 +1156,7 @@ static int _regions_parse_line(char *line, int ichr, int ifrom, int ito, char ** *from = hts_parse_decimal(ss, &tmp, 0); else *to = hts_parse_decimal(ss, &tmp, 0); - if ( ss==tmp ) return -1; + if ( ss==tmp || (*tmp && *tmp!='\t') ) return -1; for (i=k; ifile); reg->file = NULL; free(reg); return NULL; } + ito = ifrom; } + else if ( ito<0 ) + ito = abs(ito); if ( !ret ) continue; if ( is_bed ) from++; *chr_end = 0; @@ -1320,14 +1462,20 @@ static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *re } int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end) +{ + return _bcf_sr_regions_overlap(reg,seq,start,end,1); +} + +static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end, int missed_reg_handler) { int iseq; if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1; // no such sequence + if ( missed_reg_handler && !reg->missed_reg_handler ) missed_reg_handler = 0; if ( reg->prev_seq==-1 || iseq!=reg->prev_seq || reg->prev_start > start ) // new chromosome or after a seek { // flush regions left on previous chromosome - if ( reg->missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 ) + if ( missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 ) bcf_sr_regions_flush(reg); bcf_sr_regions_seek(reg, seq); @@ -1341,7 +1489,7 @@ int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t sta { if ( bcf_sr_regions_next(reg) < 0 ) return -2; // no more regions left if ( reg->iseq != iseq ) return -1; // does not overlap any regions - if ( reg->missed_reg_handler && reg->end < start ) reg->missed_reg_handler(reg, reg->missed_reg_data); + if ( missed_reg_handler && reg->end < start ) reg->missed_reg_handler(reg, reg->missed_reg_data); } if ( reg->start <= end ) return 0; // region overlap return -1; // no overlap 
diff --git a/tabix.1 b/tabix.1 index 0257fc47e..b069a51df 100644 --- a/tabix.1 +++ b/tabix.1 @@ -1,10 +1,10 @@ -.TH tabix 1 "19 December 2019" "htslib-1.10.2" "Bioinformatics tools" +.TH tabix 1 "25 July 2023" "htslib-1.18" "Bioinformatics tools" .SH NAME .PP tabix \- Generic indexer for TAB-delimited genome position files .\" .\" Copyright (C) 2009-2011 Broad Institute. -.\" Copyright (C) 2014, 2016, 2018, 2020 Genome Research Ltd. +.\" Copyright (C) 2014, 2016, 2018, 2020, 2022 Genome Research Ltd. .\" .\" Author: Heng Li .\" @@ -81,8 +81,8 @@ greater than that, you will need to use a CSI index. .SH INDEXING OPTIONS .TP 10 .B -0, --zero-based -Specify that the position in the data file is 0-based (e.g. UCSC files) -rather than 1-based. +Specify that the position in the data file is 0-based half-open +(e.g. UCSC files) rather than 1-based. .TP .BI "-b, --begin " INT Column of start chromosomal position. [4] @@ -101,7 +101,7 @@ start column. [5] Force to overwrite the index file if it is present. .TP .BI "-m, --min-shift " INT -set minimal interval size for CSI indices to 2^INT [14] +Set minimal interval size for CSI indices to 2^INT [14] .TP .BI "-p, --preset " STR Input format for indexing. Valid values are: gff, bed, sam, vcf. @@ -169,7 +169,7 @@ The default is 3, which turns on error and warning messages; Values higher than 3 produce additional informational and debugging messages. .PP .SH EXAMPLE -(grep ^"#" in.gff; grep -v ^"#" in.gff | sort -k1,1 -k4,4n) | bgzip > sorted.gff.gz; +(grep "^#" in.gff; grep -v "^#" in.gff | sort -t"`printf '\(rst'`" -k1,1 -k4,4n) | bgzip > sorted.gff.gz; tabix -p gff sorted.gff.gz; @@ -199,6 +199,5 @@ implemented by Bob Handsaker and modified by Heng Li for remote file access and in-memory caching. 
.SH SEE ALSO -.PP -.BR bgzip (1), -.BR samtools (1) +.IR bgzip (1), +.IR samtools (1) diff --git a/tabix.c b/tabix.c index 01cbf801c..0798b279f 100644 --- a/tabix.c +++ b/tabix.c @@ -52,7 +52,8 @@ typedef struct } args_t; -HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) static void error(const char *format, ...) +static void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN +error(const char *format, ...) { va_list ap; fflush(stdout); @@ -63,7 +64,8 @@ HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) static void error(const char *format, ...) exit(EXIT_FAILURE); } -HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) static void error_errno(const char *format, ...) +static void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN +error_errno(const char *format, ...) { va_list ap; int eno = errno; @@ -261,7 +263,7 @@ static int query_regions(args_t *args, tbx_conf_t *conf, char *fname, char **reg if (ret < -1) { error_errno("Reading \"%s\" failed", fname); } - tbx_itr_destroy(itr); + bcf_itr_destroy(itr); } bcf_destroy(rec); } @@ -496,6 +498,7 @@ int main(int argc, char *argv[]) memset(&args,0,sizeof(args_t)); args.cache_megs = 10; args.download_index = 1; + int32_t new_line_skip = -1; static const struct option loptions[] = { @@ -568,7 +571,7 @@ int main(int argc, char *argv[]) detect = 0; break; case 'S': - conf.line_skip = strtol(optarg,&tmp,10); + new_line_skip = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: -S %s\n", optarg); detect = 0; break; @@ -578,7 +581,7 @@ int main(int argc, char *argv[]) case 1: printf( "tabix (htslib) %s\n" -"Copyright (C) 2020 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2023 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 2: return usage(stdout, EXIT_SUCCESS); @@ -603,6 +606,9 @@ int main(int argc, char *argv[]) } } + if (new_line_skip >= 0) + conf.line_skip = new_line_skip; + if ( optind==argc ) return usage(stderr, EXIT_FAILURE); if ( list_chroms ) diff --git a/tbx.c b/tbx.c index 0d486ad9c..c2c5c6f9d 100644 --- a/tbx.c +++ b/tbx.c @@ 
-1,6 +1,6 @@ /* tbx.c -- tabix API functions. - Copyright (C) 2009, 2010, 2012-2015, 2017-2020 Genome Research Ltd. + Copyright (C) 2009, 2010, 2012-2015, 2017-2020, 2022-2023 Genome Research Ltd. Copyright (C) 2010-2012 Broad Institute. Author: Heng Li @@ -91,23 +91,35 @@ int tbx_name2id(tbx_t *tbx, const char *ss) return get_tid(tbx, ss, 0); } -int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) +int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv) { - int i, b = 0, id = 1, ncols = 0; + size_t i, b = 0; + int id = 1; char *s; intv->ss = intv->se = 0; intv->beg = intv->end = -1; for (i = 0; i <= len; ++i) { if (line[i] == '\t' || line[i] == 0) { - ++ncols; if (id == conf->sc) { intv->ss = line + b; intv->se = line + i; } else if (id == conf->bc) { // here ->beg is 0-based. - intv->beg = intv->end = strtoll(line + b, &s, 0); + intv->beg = strtoll(line + b, &s, 0); + + if (conf->bc <= conf->ec) // don't overwrite an already set end point + intv->end = intv->beg; + if ( s==line+b ) return -1; // expected int - if (!(conf->preset&TBX_UCSC)) --intv->beg; - else ++intv->end; - if (intv->beg < 0) intv->beg = 0; + + if (!(conf->preset&TBX_UCSC)) + --intv->beg; + else if (conf->bc <= conf->ec) + ++intv->end; + + if (intv->beg < 0) { + hts_log_warning("Coordinate <= 0 detected. 
" + "Did you forget to use the -0 option?"); + intv->beg = 0; + } if (intv->end < 1) intv->end = 1; } else { if ((conf->preset&0xffff) == TBX_GENERIC) { @@ -318,8 +330,11 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) continue; } if (first == 0) { - if (fmt == HTS_FMT_CSI) + if (fmt == HTS_FMT_CSI) { + if (!max_ref_len) + max_ref_len = (int64_t)100*1024*1024*1024; // 100G default n_lvls = adjust_n_lvls(min_shift, n_lvls, max_ref_len); + } tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); if (!tbx->idx) goto fail; first = 1; diff --git a/test/base_mods/MM-MNf1.sam b/test/base_mods/MM-MNf1.sam new file mode 100644 index 000000000..f973d274c --- /dev/null +++ b/test/base_mods/MM-MNf1.sam @@ -0,0 +1,5 @@ +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MN:i:37 +r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 +r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MN:i:36 diff --git a/test/base_mods/MM-MNf2.sam b/test/base_mods/MM-MNf2.sam new file mode 100644 index 000000000..a88924122 --- /dev/null +++ b/test/base_mods/MM-MNf2.sam @@ -0,0 +1,5 @@ +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MN:i:36 +r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A 
Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 +r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MN:f:36 diff --git a/test/base_mods/MM-MNp.sam b/test/base_mods/MM-MNp.sam new file mode 100644 index 000000000..7bdca0f31 --- /dev/null +++ b/test/base_mods/MM-MNp.sam @@ -0,0 +1,5 @@ +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MN:i:36 +r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 +r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MN:i:36 diff --git a/test/base_mods/MM-chebi.out b/test/base_mods/MM-chebi.out new file mode 100644 index 000000000..89970ddf9 --- /dev/null +++ b/test/base_mods/MM-chebi.out @@ -0,0 +1,48 @@ +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m102 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n212 +16 A +17 C C+m128 +18 G +19 C C+(76792)161 +20 C C+m153 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m179 +32 A +33 C +34 C C+m204 C+(76792)33 +35 A +--- +Present: m. #-76792. n. 
+6 C C+m102 +15 N N+n212 +17 C C+m128 +19 C C+(76792)161 +20 C C+m153 +31 C C+m179 +34 C C+m204 C+(76792)33 + +=== + diff --git a/test/base_mods/MM-chebi.sam b/test/base_mods/MM-chebi.sam new file mode 100644 index 000000000..0ec8b9ddb --- /dev/null +++ b/test/base_mods/MM-chebi.sam @@ -0,0 +1,2 @@ +@CO Separate m, h and N modifications +* 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+76792,6,7;N+n,15; Ml:B:C,102,128,153,179,204,161,33,212,169 diff --git a/test/base_mods/MM-double.out b/test/base_mods/MM-double.out new file mode 100644 index 000000000..431dfff07 --- /dev/null +++ b/test/base_mods/MM-double.out @@ -0,0 +1,48 @@ +0 A +1 G G-m115 +2 G +3 A +4 T +5 C +6 T +7 C C+m128 +8 T +9 A +10 G +11 C +12 G G-m141 +13 G G-m166 G+o102 +14 A +15 T +16 C +17 G +18 G +19 C +20 G +21 G +22 G G-m192 +23 G +24 G +25 A +26 T +27 A +28 T +29 G +30 C C+m153 +31 C C+m179 +32 A +33 T +34 A +35 T +--- +Present: m. m. o. +1 G G-m115 +7 C C+m128 +12 G G-m141 +13 G G-m166 G+o102 +22 G G-m192 +30 C C+m153 +31 C C+m179 + +=== + diff --git a/test/base_mods/MM-double.sam b/test/base_mods/MM-double.sam new file mode 100644 index 000000000..608516fc1 --- /dev/null +++ b/test/base_mods/MM-double.sam @@ -0,0 +1,3 @@ +@CO Modifications called on both strands of the same record, +@CO including potentially at the same location simultaneously. +* 0 * 0 0 * * 0 0 AGGATCTCTAGCGGATCGGCGGGGGATATGCCATAT * Mm:Z:C+m,1,3,0;G-m,0,2,0,4;G+o,4; Ml:B:C,128,153,179,115,141,166,192,102 diff --git a/test/base_mods/MM-explicit-f.out b/test/base_mods/MM-explicit-f.out new file mode 100644 index 000000000..0f7326cd8 --- /dev/null +++ b/test/base_mods/MM-explicit-f.out @@ -0,0 +1,111 @@ +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C +14 C C+m160 C+h20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m. h. 
+9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 + +=== + +0 A +1 T +2 C C+m# C+h# +3 A +4 T +5 C C+m# C+h# +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C C+m10 C+h5 +14 C C+m160 C+h20 +15 G +16 C C+m10 C+h5 +17 T +18 A +19 T +20 A +21 G +22 C C+m# C+h# +23 C C+m# C+h# +24 T +--- +Present: m? h? +2 C C+m# C+h# +5 C C+m# C+h# +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 +22 C C+m# C+h# +23 C C+m# C+h# + +=== + +0 A +1 T +2 C C+h# +3 A +4 T +5 C C+h# +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+h170 +11 T +12 A +13 C C+h5 +14 C C+m160 C+h20 +15 G +16 C C+h5 +17 T +18 A +19 T +20 A +21 G +22 C C+h# +23 C C+h# +24 T +--- +Present: m. h? +2 C C+h# +5 C C+h# +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 +22 C C+h# +23 C C+h# + +=== + diff --git a/test/base_mods/MM-explicit-x.out b/test/base_mods/MM-explicit-x.out new file mode 100644 index 000000000..8acfbf2fe --- /dev/null +++ b/test/base_mods/MM-explicit-x.out @@ -0,0 +1,103 @@ +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m.200 C+h.10 +10 C C+m.50 C+h.170 +11 T +12 A +13 C +14 C C+m.160 C+h.20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m. h. +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 + +=== + +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m?200 C+h?10 +10 C C+m?50 C+h?170 +11 T +12 A +13 C C+m?10 C+h?5 +14 C C+m?160 C+h?20 +15 G +16 C C+m?10 C+h?5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m? h? +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 + +=== + +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m.200 C+h?10 +10 C C+h?170 +11 T +12 A +13 C C+h?5 +14 C C+m.160 C+h?20 +15 G +16 C C+h?5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m. h? 
+9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 + +=== + diff --git a/test/base_mods/MM-explicit.out b/test/base_mods/MM-explicit.out new file mode 100644 index 000000000..0f3701fcd --- /dev/null +++ b/test/base_mods/MM-explicit.out @@ -0,0 +1,103 @@ +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C +14 C C+m160 C+h20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m. h. +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 + +=== + +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C C+m10 C+h5 +14 C C+m160 C+h20 +15 G +16 C C+m10 C+h5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m? h? +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 + +=== + +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+h170 +11 T +12 A +13 C C+h5 +14 C C+m160 C+h20 +15 G +16 C C+h5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m. h? +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 + +=== + diff --git a/test/base_mods/MM-explicit.sam b/test/base_mods/MM-explicit.sam new file mode 100644 index 000000000..e85afa293 --- /dev/null +++ b/test/base_mods/MM-explicit.sam @@ -0,0 +1,27 @@ +@CO Testing explicit vs implicit base modifications. +@CO This covers the case where a lack of a signal could be either +@CO implicitly assumed to be no-mod (default) or assumed to be +@CO unchecked and require an explicit statement to indicate it was +@CO looked at and no base modification was observed. +@CO +@CO ATCATCATTCCTACCGCTATAGCCT r1; implicit +@CO - - .. -. - -- +@CO Mm M +@CO - - .. -. - -- +@CO hH h +@CO +@CO ATCATCATTCCTACCGCTATAGCCT r2; explicit to a small region +@CO - - ?? ?? ? -- +@CO Mm mM m +@CO - - ?? ?? ? -- +@CO hH hh h +@CO +@CO ATCATCATTCCTACCGCTATAGCCT r3; mixture +@CO - - . -. - -- +@CO M M +@CO - - ?? ?? ? 
-- +@CO hH hh h -- +@CO +r1 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh,2,0,1; Ml:B:C,200,10,50,170,160,20 +r2 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh?,2,0,0,0,0; Ml:B:C,200,10,50,170,10,5,160,20,10,5 +r3 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+m.,2,2;C+h?,2,0,0,0,0; Ml:B:C,200,160,10,170,5,20,5 diff --git a/test/base_mods/MM-multi.out b/test/base_mods/MM-multi.out new file mode 100644 index 000000000..41054a7c0 --- /dev/null +++ b/test/base_mods/MM-multi.out @@ -0,0 +1,97 @@ +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m128 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n215 +16 A +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m204 +32 A +33 C +34 C C+m230 C+h6 +35 A +--- +Present: m. h. n. +6 C C+m128 +15 N N+n215 +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +31 C C+m204 +34 C C+m230 C+h6 + +=== + +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m77 C+h159 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n240 +16 A +17 C C+m103 C+h133 +18 G +19 C C+m128 C+h108 +20 C C+m154 C+h82 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m179 C+h57 +32 A +33 C +34 C C+m204 C+h31 +35 A +--- +Present: m. h. n. +6 C C+m77 C+h159 +15 N N+n240 +17 C C+m103 C+h133 +19 C C+m128 C+h108 +20 C C+m154 C+h82 +31 C C+m179 C+h57 +34 C C+m204 C+h31 + +=== + diff --git a/test/base_mods/MM-multi.sam b/test/base_mods/MM-multi.sam new file mode 100644 index 000000000..ac2831bc1 --- /dev/null +++ b/test/base_mods/MM-multi.sam @@ -0,0 +1,7 @@ +@CO Testing multiple m, h and N modifications on the same read. +@CO r1 has them separated out. +@CO r2 has them combined together, for example as produced by +@CO a joint basecaller which assigns probabilities to all +@CO trained events simultaneously. 
+r1 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MN:i:36 +r2 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+mh,2,2,0,0,4,1;N+n,15; Ml:B:C,77,159,103,133,128,108,154,82,179,57,204,31,240 diff --git a/test/base_mods/MM-not-all-modded.out b/test/base_mods/MM-not-all-modded.out new file mode 100644 index 000000000..64fc847e2 --- /dev/null +++ b/test/base_mods/MM-not-all-modded.out @@ -0,0 +1,180 @@ +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m128 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n215 +16 A +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m204 +32 A +33 C +34 C C+m230 C+h6 +35 A +--- +Present: m. h. n. +6 C C+m128 +15 N N+n215 +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +31 C C+m204 +34 C C+m230 C+h6 + +=== + +0 A +1 G +2 C +3 T +4 C +5 T +6 C +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N +16 A +17 C +18 G +19 C +20 C +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C +32 A +33 C +34 C +35 A +--- +Present: + +=== + +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m128 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n215 +16 A +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m204 +32 A +33 C +34 C C+m230 C+h6 +35 A +--- +Present: m. h. n. 
+6 C C+m128 +15 N N+n215 +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +31 C C+m204 +34 C C+m230 C+h6 + +=== + +0 A +1 G +2 C +3 T +4 C +5 T +6 C +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N +16 A +17 C +18 G +19 C +20 C +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C +32 A +33 C +34 C +35 A +--- +Present: + +=== + diff --git a/test/base_mods/MM-not-all-modded.sam b/test/base_mods/MM-not-all-modded.sam new file mode 100644 index 000000000..0858c766d --- /dev/null +++ b/test/base_mods/MM-not-all-modded.sam @@ -0,0 +1,5 @@ +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r1b 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A +r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r2b 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A diff --git a/test/base_mods/MM-orient.sam b/test/base_mods/MM-orient.sam new file mode 100644 index 000000000..363e7c2be --- /dev/null +++ b/test/base_mods/MM-orient.sam @@ -0,0 +1,6 @@ +@CO Testing mods on top and bottom strand, but also in +@CO original vs reverse-complemented orientation +top-fwd 0 * 0 0 * * 0 0 AGGATCTCTAGCGGATCGGCGGGGGATATGCCATAT * Mm:Z:C+m,1,3,0; Ml:B:C,128,153,179 +top-rev 16 * 0 0 * * 0 0 ATATGGCATATCCCCCGCCGATCCGCTAGAGATCCT * Mm:Z:C+m,1,3,0; Ml:B:C,128,153,179 +bot-fwd 0 * 0 0 * * 0 0 AGGATCTCTAGCGGATCGGCGGGGGATATGCCATAT * Mm:Z:G-m,0,0,4,3; Ml:B:C,115,141,166,192 +bot-rev 16 * 0 0 * * 0 0 ATATGGCATATCCCCCGCCGATCCGCTAGAGATCCT * Mm:Z:G-m,0,0,4,3; Ml:B:C,115,141,166,192 diff --git a/test/base_mods/MM-pileup.out b/test/base_mods/MM-pileup.out new file mode 100644 index 000000000..8c603d661 --- /dev/null +++ b/test/base_mods/MM-pileup.out @@ -0,0 
+1,36 @@ +I 0 AA DD +I 1 GG FF +I 2 CC ?? +I 3 TTT GGG +I 4 CCC CCC +I 5 TTT HHH +I 6 C[+m128]C[-m128]C[+m128] 888 +I 7 CCC 888 +I 8 AAA ... +I 9 GGG EEE +I 10 AAAA GGGG +I 11 GGGG 8888 +I 12 TTTT .... +I 13 CCCC 7777 +I 14 GGGG @@@@ +I 15 N[+n215]N[-n215]N[+n215]N[+n215] EEEE +I 16 AAAA 9999 +I 17 C[+m153]C[-m153]C[+m153]C[+m153+h0] GGGG +I 18 G[+n240]G[-n240]G[+n240]G[+n240] 8888 +I 19 C[+h159]C[-h159]C[+h159]C[+m0+h159] AAAA +I 20 C[+m179]C[-m179]C[+m179]C[+m179+h0] ???? +I 21 AAAA HHHH +I 22 TTTT 9999 +I 23 YYYY .... +I 24 CCCC :::: +I 25 GGGG CCCC +I 26 CCCC ???? +I 27 GGGG 8888 +I 28 CCCC ,,,, +I 29 GGGG @@@@ +I 30 CCC ,,, +I 31 C[+m204]C[-m204]C[+m204] ,,, +I 32 AAA 999 +I 33 CCC FFF +I 34 C[+m230+h6]C[-m230-h6]C[+m230+h6] @@@ +I 35 AAA AAA diff --git a/test/base_mods/MM-pileup.sam b/test/base_mods/MM-pileup.sam new file mode 100644 index 000000000..0021c7271 --- /dev/null +++ b/test/base_mods/MM-pileup.sam @@ -0,0 +1,5 @@ +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 +r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 diff --git a/test/base_mods/MM-pileup2.out b/test/base_mods/MM-pileup2.out new file mode 100644 index 000000000..51e6973d3 --- /dev/null +++ b/test/base_mods/MM-pileup2.out @@ -0,0 +1,36 @@ +I 0 AA D~ +I 1 GG F~ +I 2 CC ?~ +I 3 TT G~ +I 4 CC C~ +I 5 TT H~ +I 6 C[+m128]C[+m77+h159] 8~ +I 7 CC 8~ +I 8 AA .~ +I 9 GG E~ +I 10 AA 
G~ +I 11 GG 8~ +I 12 TT .~ +I 13 CC 7~ +I 14 GG @~ +I 15 NN[+n240] E~ +I 16 AA 9~ +I 17 C[+m153]C[+m103+h133] G~ +I 18 GG 8~ +I 19 C[+(76792)159]C[+m128+h108] A~ +I 20 C[+m179]C[+m154+h82] ?~ +I 21 AA H~ +I 22 TT 9~ +I 23 YY .~ +I 24 CC :~ +I 25 GG C~ +I 26 CC ?~ +I 27 GG 8~ +I 28 CC ,~ +I 29 GG @~ +I 30 CC ,~ +I 31 C[+m204]C[+m179+h57] ,~ +I 32 AA 9~ +I 33 CC F~ +I 34 C[+m230+(76792)6]C[+m204+h31] @~ +I 35 AA A~ diff --git a/test/base_mods/MM-pileup2.sam b/test/base_mods/MM-pileup2.sam new file mode 100644 index 000000000..7aa9ab3ab --- /dev/null +++ b/test/base_mods/MM-pileup2.sam @@ -0,0 +1,4 @@ +@CO With added CHeBI codes and mod-code combining. +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+76792,6,7; Ml:B:C,128,153,179,204,230,159,6 +r2 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+mh,2,2,0,0,4,1;N+n,15; Ml:B:C,77,159,103,133,128,108,154,82,179,57,204,31,240 diff --git a/test/base_mods/base-mods.sh b/test/base_mods/base-mods.sh new file mode 100755 index 000000000..f3f3ca4b7 --- /dev/null +++ b/test/base_mods/base-mods.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# +# Copyright (C) 2020 Genome Research Ltd. +# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Load in the test driver +. ../simple_test_driver.sh + +echo "Testing base-modifications..." + +test_mod="../test_mod" +pileup_mod="../pileup_mod" + +test_driver $@ + +exit $? diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst new file mode 100644 index 000000000..889c3780e --- /dev/null +++ b/test/base_mods/base-mods.tst @@ -0,0 +1,59 @@ +# Copyright (C) 2020, 2023 Genome Research Ltd. +# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# First field: +# INIT = initialisation, not counted in testing +# P = expected to pass +# N = expected to return non-zero +# F = expected to fail + +# Second field: +# Filename of expected output + +# Third onwards; command to execute. $fmt is replaced by the current file +# format, ie sam, bam or cram. $samtools is a pointer to the desired +# samtools binary. This can be useful for testing older versions. + +# Test files from SAM spec +P MM-chebi.out $test_mod MM-chebi.sam +P MM-double.out $test_mod MM-double.sam +P MM-multi.out $test_mod MM-multi.sam +P MM-explicit.out $test_mod MM-explicit.sam +P MM-explicit-x.out $test_mod -x MM-explicit.sam + +# Report bases outside the explicitly called ranges, so we could exclude +# these in any depth based consensus analysis and only gather statistics +# for sites known to be have been scanned. +P MM-explicit-f.out $test_mod -f 1 MM-explicit.sam + +# Ensure state gets reset correctly between reads +P MM-not-all-modded.out $test_mod MM-not-all-modded.sam + +# Pileup testing +P MM-pileup.out $pileup_mod < MM-pileup.sam +P MM-pileup2.out $pileup_mod < MM-pileup2.sam + +# Validation testing. We just care about exit status here, but the +# test data is a copy of MM-pileup.sam so that suffices too. +P MM-pileup.out $pileup_mod < MM-MNp.sam +N MM-pileup.out $pileup_mod < MM-MNf1.sam +N MM-pileup.out $pileup_mod < MM-MNf2.sam diff --git a/test/bcf-sr/weird-chr-names.1.out b/test/bcf-sr/weird-chr-names.1.out new file mode 100644 index 000000000..5705c7575 --- /dev/null +++ b/test/bcf-sr/weird-chr-names.1.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . C T . . 
. +1 2 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.2.out b/test/bcf-sr/weird-chr-names.2.out new file mode 100644 index 000000000..980818a71 --- /dev/null +++ b/test/bcf-sr/weird-chr-names.2.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.3.out b/test/bcf-sr/weird-chr-names.3.out new file mode 100644 index 000000000..5b3ac8e18 --- /dev/null +++ b/test/bcf-sr/weird-chr-names.3.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1 1 . C T . . . +1:1 2 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.4.out b/test/bcf-sr/weird-chr-names.4.out new file mode 100644 index 000000000..0d9e274ab --- /dev/null +++ b/test/bcf-sr/weird-chr-names.4.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1 1 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.5.out b/test/bcf-sr/weird-chr-names.5.out new file mode 100644 index 000000000..6cb41e14f --- /dev/null +++ b/test/bcf-sr/weird-chr-names.5.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1-1 1 . C T . . . +1:1-1 2 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.6.out b/test/bcf-sr/weird-chr-names.6.out new file mode 100644 index 000000000..a707ed85c --- /dev/null +++ b/test/bcf-sr/weird-chr-names.6.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1-1 1 . C T . . . 
diff --git a/test/bcf-sr/weird-chr-names.vcf b/test/bcf-sr/weird-chr-names.vcf new file mode 100644 index 000000000..c367be477 --- /dev/null +++ b/test/bcf-sr/weird-chr-names.vcf @@ -0,0 +1,12 @@ +##fileformat=VCFv4.3 +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . C T . . . +1 2 . C T . . . +1:1 1 . C T . . . +1:1 2 . C T . . . +1:1-1 1 . C T . . . +1:1-1 2 . C T . . . diff --git a/test/compare_sam.pl b/test/compare_sam.pl index 23b67d3a4..499cb2390 100755 --- a/test/compare_sam.pl +++ b/test/compare_sam.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # # Copyright (C) 2013-2018 Genome Research Ltd. # @@ -26,6 +26,7 @@ # Optionally can skip header or ignore specific types of diff. use strict; +use warnings; use Getopt::Long; my %opts; diff --git a/test/faidx/ce.1.expected.fa b/test/faidx/ce.1.expected.fa new file mode 100644 index 000000000..d606105c4 --- /dev/null +++ b/test/faidx/ce.1.expected.fa @@ -0,0 +1,8 @@ +>CHROMOSOME_I:5001-5125 length: 125 +AACTGGTTCAAAAACAAAAATTTTTTAAACTGTACAAACTGTCCAAAAAT +TCGTCGTAAATCGACACACCCTTCTCATTTTTTCAAAATTTTAATTGTTT +TCGAATGTTTTTTTTGCAGAATAAT +>CHROMOSOME_X:101-225 length: 125 +GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC +CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCT +AAGCCTAAGCCTAAGCCTAAGCCTA diff --git a/test/faidx/faidx.1.expected.fa b/test/faidx/faidx.1.expected.fa new file mode 100644 index 000000000..d14656e9f --- /dev/null +++ b/test/faidx/faidx.1.expected.fa @@ -0,0 +1,6 @@ +>trailingblank2:28-33 length: 6 +GGGCCC +>trailingblank3:4-5 length: 2 +TA +>bar:4-5 length: 2 +TA diff --git a/test/faidx.fa b/test/faidx/faidx.fa similarity index 100% rename from test/faidx.fa rename to test/faidx/faidx.fa diff --git a/test/faidx/faidx.fa.expected.fai b/test/faidx/faidx.fa.expected.fai new file mode 100644 index 000000000..b4d1aff26 --- /dev/null +++ b/test/faidx/faidx.fa.expected.fai @@ -0,0 +1,6 @@ + 4 2 4 5 +trailingblank1 33 23 12 13 +trailingblank2 
72 111 24 25 +trailingblank3 5 234 4 6 +foo 8 252 6 7 +bar 8 280 8 9 diff --git a/test/faidx/faidx.tst b/test/faidx/faidx.tst new file mode 100644 index 000000000..b6bd7cac1 --- /dev/null +++ b/test/faidx/faidx.tst @@ -0,0 +1,74 @@ +# Copyright (C) 2022 Genome Research Ltd. +# +# Author: Robert Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# First field: +# INIT = initialisation, not counted in testing +# P = expected to pass (zero return; expected output matches, if present) +# N = expected to return non-zero +# F = expected to fail +# +# Second field (P/N/F only): +# Filename of expected output. If '.', output is not checked +# +# Rest: +# Command to execute. $bgzip and $test_faidx are replaced with the path to +# bgzip and test_faidx. + +# Index fasta +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -e faidx.fa.expected.fai + +# Test various functions on the fasta index +P . 
$test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t fai_line_length -e 24 trailingblank2 +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_has_seq -e 1 foo +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_has_seq -e 0 absent +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_iseq -e trailingblank3 3 +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_seq_len -e 33 trailingblank1 +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_seq_len64 -e 72 trailingblank2 + +# Index fastq +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -e fastqs.fq.expected.fai + +# Test various functions on the fastq index +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t fai_line_length -e 63 FAKE0005_3 +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t fai_line_length -e 144 SRR014849.203935_3 +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_has_seq -e 1 SRR014849.203935_3 +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_has_seq -e 0 absent +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_iseq -e FAKE0005_1 0 +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_seq_len -e 453 FSRRS4401CM938_1 +P . 
$test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_seq_len64 -e 309 FSRRS4401AOV6A_4 + +# Fasta retrieval tests +P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai trailingblank2:28-33 trailingblank3:4-5 bar:4-5 +P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t fai_fetch trailingblank2:28-33 trailingblank3:4-5 bar:4-5 +P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_fetch_seq64 trailingblank2:28-33 trailingblank3:4-5 bar:4-5 +P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t fai_adjust_region trailingblank2:28-33 trailingblank3:4-5 bar:4-5 + +# Fastq retrieval tests +P fastqs.1.expected.fq $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90 +P fastqs.1.expected.fq $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t fai_fetch FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90 +P fastqs.1.expected.fq $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t faidx_fetch_seq64 FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90 +P fastqs.2.expected.fa $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90 + +# Indexing and retrieval on bgzip compressed fasta +INIT $bgzip -c < ../ce.fa > ce.fa.tmp.gz +P . 
$test_faidx -i ce.fa.tmp.gz -f ce.fa.tmp.gz.fai -g ce.fa.tmp.gz.gzi -e ../ce.fa.fai +P ce.1.expected.fa $test_faidx -i ce.fa.tmp.gz -f ce.fa.tmp.gz.fai -g ce.fa.tmp.gz.gzi CHROMOSOME_I:5001-5125 CHROMOSOME_X:101-225 diff --git a/test/faidx/fastqs.1.expected.fq b/test/faidx/fastqs.1.expected.fq new file mode 100644 index 000000000..729393837 --- /dev/null +++ b/test/faidx/fastqs.1.expected.fq @@ -0,0 +1,16 @@ +@FAKE0006_1:4-12 length: 9 +TGCATGCAT ++ +{zyxwvuts +@FSRRS4401BE7HA_1:81-120 length: 40 +GCCCGTTTGTCGATATTTGtatttaaagtaatccgtcaca ++ +c^^^YRPOSNVU\YTMMMSMRKKKRUUNNNNS[`aa```\ +@FAKE0010_2 length: 30 +gatcrywsmkhbvdnGATCRYWSMKHBVDN ++ +I?5+I?5+I?5+I?5+I?5+I?5+I?5+I? +@SRR014849.50939_3:71-90 length: 20 +CAATAAATCAATACATAAAA ++ +\aZ\d`OY[aY[[\[[e`WP diff --git a/test/faidx/fastqs.2.expected.fa b/test/faidx/fastqs.2.expected.fa new file mode 100644 index 000000000..9b67d15e7 --- /dev/null +++ b/test/faidx/fastqs.2.expected.fa @@ -0,0 +1,8 @@ +>FAKE0006_1:4-12 length: 9 +TGCATGCAT +>FSRRS4401BE7HA_1:81-120 length: 40 +GCCCGTTTGTCGATATTTGtatttaaagtaatccgtcaca +>FAKE0010_2 length: 30 +gatcrywsmkhbvdnGATCRYWSMKHBVDN +>SRR014849.50939_3:71-90 length: 20 +CAATAAATCAATACATAAAA diff --git a/test/fastqs.fq b/test/faidx/fastqs.fq similarity index 100% rename from test/fastqs.fq rename to test/faidx/fastqs.fq diff --git a/test/faidx/fastqs.fq.expected.fai b/test/faidx/fastqs.fq.expected.fai new file mode 100644 index 000000000..77ba04a5d --- /dev/null +++ b/test/faidx/fastqs.fq.expected.fai @@ -0,0 +1,105 @@ +FAKE0005_1 63 85 63 64 151 +FAKE0006_1 63 300 63 64 366 +FAKE0005_2 63 515 63 64 581 +FAKE0006_2 63 730 63 64 796 +FAKE0005_3 63 945 63 64 1011 +FAKE0006_3 63 1160 63 64 1226 +FAKE0005_4 63 1375 63 64 1441 +FAKE0006_4 63 1590 63 64 1656 +FSRRS4401BE7HA_1 395 1823 395 396 2221 +FSRRS4401BRRTC_1 145 2720 145 146 2868 +FSRRS4401B64ST_1 382 3118 382 383 3503 +FSRRS4401EJ0YH_1 381 3990 381 382 4374 +FSRRS4401BK0IB_1 507 4860 507 508 5370 +FSRRS4401ARCCB_1 258 5982 258 259 
6243 +FSRRS4401CM938_1 453 6606 453 454 7062 +FSRRS4401EQLIK_1 411 7620 411 412 8034 +FSRRS4401AOV6A_1 309 8550 309 310 8862 +FSRRS4401EG0ZW_1 424 9276 424 425 9703 +FSRRS4401BE7HA_2 395 10231 395 396 10629 +FSRRS4401BRRTC_2 145 11128 145 146 11276 +FSRRS4401B64ST_2 382 11526 382 383 11911 +FSRRS4401EJ0YH_2 381 12398 381 382 12782 +FSRRS4401BK0IB_2 507 13268 507 508 13778 +FSRRS4401ARCCB_2 258 14390 258 259 14651 +FSRRS4401CM938_2 453 15014 453 454 15470 +FSRRS4401EQLIK_2 411 16028 411 412 16442 +FSRRS4401AOV6A_2 309 16958 309 310 17270 +FSRRS4401EG0ZW_2 424 17684 424 425 18111 +FSRRS4401BE7HA_3 395 18639 395 396 19037 +FSRRS4401BRRTC_3 145 19536 145 146 19684 +FSRRS4401B64ST_3 382 19934 382 383 20319 +FSRRS4401EJ0YH_3 381 20806 381 382 21190 +FSRRS4401BK0IB_3 507 21676 507 508 22186 +FSRRS4401ARCCB_3 258 22798 258 259 23059 +FSRRS4401CM938_3 453 23422 453 454 23878 +FSRRS4401EQLIK_3 411 24436 411 412 24850 +FSRRS4401AOV6A_3 309 25366 309 310 25678 +FSRRS4401EG0ZW_3 424 26092 424 425 26519 +FSRRS4401BE7HA_4 395 27047 80 81 27449 +FSRRS4401BRRTC_4 145 27952 80 81 28101 +FSRRS4401B64ST_4 382 28352 80 81 28741 +FSRRS4401EJ0YH_4 381 29232 80 81 29620 +FSRRS4401BK0IB_4 507 30110 80 81 30626 +FSRRS4401ARCCB_4 258 31244 80 81 31508 +FSRRS4401CM938_4 453 31874 80 81 32335 +FSRRS4401EQLIK_4 411 32898 80 81 33317 +FSRRS4401AOV6A_4 309 33838 80 81 34153 +FSRRS4401EG0ZW_4 424 34570 80 81 35002 +FAKE0007_1 41 35549 41 42 35593 +FAKE0008_1 41 35752 41 42 35796 +FAKE0009_1 41 35955 41 42 35999 +FAKE0010_1 30 36143 30 31 36176 +FAKE0007_2 41 36324 41 42 36368 +FAKE0008_2 41 36527 41 42 36571 +FAKE0009_2 41 36730 41 42 36774 +FAKE0010_2 30 36918 30 31 36951 +FAKE0007_3 41 37099 41 42 37143 +FAKE0008_3 41 37302 41 42 37346 +FAKE0009_3 41 37505 41 42 37549 +FAKE0010_3 30 37693 30 31 37726 +FAKE0007_4 41 37874 41 42 37918 +FAKE0008_4 41 38077 41 42 38121 +FAKE0009_4 41 38280 41 42 38324 +FAKE0010_4 30 38468 30 31 38501 +FAKE0011_1 41 38649 41 42 38693 +FAKE0012_1 41 38852 41 42 38896 
+FAKE0013_1 41 39055 41 42 39099 +FAKE0014_1 30 39250 30 31 39283 +FAKE0011_2 41 39431 41 42 39475 +FAKE0012_2 41 39634 41 42 39678 +FAKE0013_2 41 39837 41 42 39881 +FAKE0014_2 30 40032 30 31 40065 +FAKE0011_3 41 40213 41 42 40257 +FAKE0012_3 41 40416 41 42 40460 +FAKE0013_3 41 40619 41 42 40663 +FAKE0014_3 30 40814 30 31 40847 +FAKE0011_4 41 40995 41 42 41039 +FAKE0012_4 41 41198 41 42 41242 +FAKE0013_4 41 41401 41 42 41445 +FAKE0014_4 30 41596 30 31 41629 +FAKE0001_1 94 41745 94 95 41842 +FAKE0002_1 94 42022 94 95 42119 +FAKE0001_2 94 42299 94 95 42396 +FAKE0002_2 94 42576 94 95 42673 +FAKE0001_3 94 42853 94 95 42950 +FAKE0002_3 94 43130 94 95 43227 +FAKE0001_4 94 43407 94 95 43504 +FAKE0002_4 94 43684 94 95 43781 +FAKE0003_1 68 43963 68 69 44034 +FAKE0004_1 68 44190 68 69 44261 +FAKE0003_2 68 44417 68 69 44488 +FAKE0004_2 68 44644 68 69 44715 +FAKE0003_3 68 44871 68 69 44942 +FAKE0004_3 68 45098 68 69 45169 +FAKE0003_4 68 45325 68 69 45396 +FAKE0004_4 68 45552 68 69 45623 +SRR014849.50939_1 135 45737 135 136 45875 +SRR014849.110027_1 131 46057 131 132 46191 +SRR014849.203935_1 144 46369 144 145 46516 +SRR014849.50939_2 135 46706 135 136 46844 +SRR014849.110027_2 131 47026 131 132 47160 +SRR014849.203935_2 144 47338 144 145 47485 +SRR014849.50939_3 135 47675 135 136 47813 +SRR014849.110027_3 131 47995 131 132 48129 +SRR014849.203935_3 144 48307 144 145 48454 diff --git a/test/faidx/test-faidx.sh b/test/faidx/test-faidx.sh new file mode 100755 index 000000000..ae501e086 --- /dev/null +++ b/test/faidx/test-faidx.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# +# Copyright (C) 2022 Genome Research Ltd. 
+# +# Author: Robert Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Load in the test driver +. ../simple_test_driver.sh + +echo "Testing faidx..." + +bgzip="../../bgzip" +test_faidx="../test_faidx" + +test_driver $@ + +exit $? diff --git a/test/fastq/fastq.tst b/test/fastq/fastq.tst new file mode 100644 index 000000000..3b5fd9f4f --- /dev/null +++ b/test/fastq/fastq.tst @@ -0,0 +1,116 @@ +# Copyright (C) 2020 Genome Research Ltd. 
+# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# First field: +# INIT = initialisation, not counted in testing +# P = expected to pass (zero return; expected output matches, if present) +# N = expected to return non-zero +# F = expected to fail +# +# Second field (P/N/F only): +# Filename of expected output. If '.', output is not checked +# +# Rest: +# Command to execute. 
$pileup is replaced with the path to the pileup test +# program + +# -------------------- +# Reading + +# Minimal +P minimal.sam $tview minimal.fq +P minimal-q.sam $tview minimal.fa + +# Multi-line FASTQ +P multiline.sam $tview multiline.fq +P multiline-q.sam $tview multiline.fa + +# FASTQ with a very long header line +P longline.sam $tview -i fastq_aux longline.fq + +# Single file, unpaired data, with / without aux tags +P single_noaux.sam $tview single.fq +P single_noaux-q.sam $tview single.fa +P single_aux.sam $tview -i fastq_aux single.fq +P single_aux-q.sam $tview -i fastq_aux single.fa + +# Single file, interleaved paired data, no aux +P inter_noaux.sam $tview interleaved.fq +P inter_noaux-q.sam $tview interleaved.fa + +# Single file, interleaved paired data, with aux +P inter_aux.sam $tview -i fastq_aux interleaved.fq +P inter_aux-q.sam $tview -i fastq_aux interleaved.fa + +# Single file, interleaved paired data, using CASAVA +P inter_casava.sam $tview -i fastq_casava interleaved_casava.fq +P inter_casavaOX.sam $tview -i fastq_barcode=OX -i fastq_casava interleaved_casava.fq +P inter_casava-q.sam $tview -i fastq_casava interleaved_casava.fa +P inter_casavaOX-q.sam $tview -i fastq_barcode=OX -i fastq_casava interleaved_casava.fa + +# CASAVA with filtering +P filter_casava.sam $tview -i fastq_casava filter_casava.fq +P filter_casava-q.sam $tview -i fastq_casava filter_casava.fa + +# Paired data is mainly tested by the Samtools test harness. +# Basically though it's just reading two files and relying on either +# this code or explicit overloading of READ1/READ2. +# We simply test here we can read r1 and r2 as separate files +P r1.sam $tview -i fastq_aux r1.fq +P r2.sam $tview -i fastq_aux r2.fq +P r1-q.sam $tview -i fastq_aux r1.fa +P r2-q.sam $tview -i fastq_aux r2.fa + +# Simple tests for the FASTQ_NAME2 option. 
+P name2.sam $tview -i fastq_name2 name2.fq +P name2-q.sam $tview -i fastq_name2 name2.fa + +# -------------------- +# Writing + +# Minimal +P minimal.fq $tview -f minimal.sam +P minimal.fa $tview -F minimal.sam + +# Single file with unpaired data plus aux tags +P single.fq $tview -f -o fastq_aux single_aux.sam +P single.fa $tview -F -o fastq_aux single_aux.sam + +# Single file, interleaved paired data, with aux and /rnum +P interleaved.fq $tview -f -o fastq_aux -o fastq_rnum inter_aux.sam +P interleaved.fa $tview -F -o fastq_aux -o fastq_rnum inter_aux.sam + +# CASAVA with interleaved data +P interleaved_casava.fq $tview -f -o fastq_casava inter_casava.sam +P interleaved_casava.fq $tview -f -o fastq_barcode=OX -o fastq_casava inter_casavaOX.sam +P interleaved_casava.fa $tview -F -o fastq_casava inter_casava.sam +P interleaved_casava.fa $tview -F -o fastq_barcode=OX -o fastq_casava inter_casavaOX.sam + +# CASAVA with filtering +P filter_casava.fq $tview -f -o fastq_casava filter_casava.sam +P filter_casava.fa $tview -F -o fastq_casava filter_casava.sam + +# Paired data +P r1.fq $tview -f -o fastq_aux -o fastq_rnum r1.sam +P r2.fq $tview -f -o fastq_aux -o fastq_rnum r2.sam +P r1.fa $tview -F -o fastq_aux -o fastq_rnum r1.sam +P r2.fa $tview -F -o fastq_aux -o fastq_rnum r2.sam diff --git a/test/fastq/filter_casava-q.sam b/test/fastq/filter_casava-q.sam new file mode 100644 index 000000000..6d7a2ea06 --- /dev/null +++ b/test/fastq/filter_casava-q.sam @@ -0,0 +1,4 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * BC:Z:NGTCTATC +HS25_09827:2:1201:1505:59795#49 141 * 0 0 * * 0 0 AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT * BC:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 589 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * BC:Z:NGTCTATC 
+HS25_09827:2:1201:1559:70726#49 653 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC * BC:Z:NGTCTATC diff --git a/test/fastq/filter_casava.fa b/test/fastq/filter_casava.fa new file mode 100644 index 000000000..ba58034c9 --- /dev/null +++ b/test/fastq/filter_casava.fa @@ -0,0 +1,8 @@ +>HS25_09827:2:1201:1505:59795#49 1:N:0:NGTCTATC +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG +>HS25_09827:2:1201:1505:59795#49 2:N:0:NGTCTATC +AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT +>HS25_09827:2:1201:1559:70726#49 1:Y:0:NGTCTATC +TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA +>HS25_09827:2:1201:1559:70726#49 2:Y:0:NGTCTATC +TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC diff --git a/test/fastq/filter_casava.fq b/test/fastq/filter_casava.fq new file mode 100644 index 000000000..adb191365 --- /dev/null +++ b/test/fastq/filter_casava.fq @@ -0,0 +1,16 @@ +@HS25_09827:2:1201:1505:59795#49 1:N:0:NGTCTATC +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG ++ +CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE +@HS25_09827:2:1201:1559:70726#49 2:Y:0:NGTCTATC +TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ++ +;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? 
diff --git a/test/fastq/filter_casava.sam b/test/fastq/filter_casava.sam new file mode 100644 index 000000000..9a278537f --- /dev/null +++ b/test/fastq/filter_casava.sam @@ -0,0 +1,4 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE BC:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 653 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? BC:Z:NGTCTATC diff --git a/test/fastq/inter_aux-q.sam b/test/fastq/inter_aux-q.sam new file mode 100644 index 000000000..cd5fbc407 --- /dev/null +++ b/test/fastq/inter_aux-q.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=BDDDF +HS25_09827:2:1201:1505:59795#49 141 * 0 0 * * 0 0 AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT * RG:Z:1#49 +HS25_09827:2:1201:1559:70726#49 77 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DBDFD +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC * RG:Z:1#49 +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DDDFE +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 
AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT * RG:Z:1#49 +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA * RG:Z:1#49 +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC * RG:Z:1#49 diff --git a/test/fastq/inter_aux.sam b/test/fastq/inter_aux.sam new file mode 100644 index 000000000..91e4acaca --- /dev/null +++ b/test/fastq/inter_aux.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DBDFD +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? 
RG:Z:1#49 +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DDDFE +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG RG:Z:1#49 +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG RG:Z:1#49 +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG RG:Z:1#49 diff --git a/test/fastq/inter_casava-q.sam b/test/fastq/inter_casava-q.sam new file mode 100644 index 000000000..31f8e8bf6 --- /dev/null +++ b/test/fastq/inter_casava-q.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * BC:Z:NGTCTATC +HS25_09827:2:1201:1505:59795#49 141 * 0 0 * * 0 0 AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT * BC:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 77 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * BC:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC * BC:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * BC:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT * BC:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * BC:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA * BC:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA * BC:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC * BC:Z:NGTCTATC diff --git a/test/fastq/inter_casava.sam b/test/fastq/inter_casava.sam new file mode 100644 index 000000000..823d78b19 --- /dev/null +++ b/test/fastq/inter_casava.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG 
CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE BC:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? BC:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E BC:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG BC:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE BC:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG BC:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF BC:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC 
;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG BC:Z:NGTCTATC diff --git a/test/fastq/inter_casavaOX-q.sam b/test/fastq/inter_casavaOX-q.sam new file mode 100644 index 000000000..1273184ef --- /dev/null +++ b/test/fastq/inter_casavaOX-q.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * OX:Z:NGTCTATC +HS25_09827:2:1201:1505:59795#49 141 * 0 0 * * 0 0 AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT * OX:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 77 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * OX:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC * OX:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * OX:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT * OX:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * OX:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA * OX:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA * OX:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC * OX:Z:NGTCTATC diff --git 
a/test/fastq/inter_casavaOX.sam b/test/fastq/inter_casavaOX.sam new file mode 100644 index 000000000..85fbbdf55 --- /dev/null +++ b/test/fastq/inter_casavaOX.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE OX:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? OX:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E OX:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG OX:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE OX:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG OX:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 
GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF OX:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG OX:Z:NGTCTATC diff --git a/test/fastq/inter_noaux-q.sam b/test/fastq/inter_noaux-q.sam new file mode 100644 index 000000000..107f31ae1 --- /dev/null +++ b/test/fastq/inter_noaux-q.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * +HS25_09827:2:1201:1505:59795#49 141 * 0 0 * * 0 0 AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT * +HS25_09827:2:1201:1559:70726#49 77 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC * +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT * +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA * +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 
GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA * +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC * diff --git a/test/fastq/inter_noaux.sam b/test/fastq/inter_noaux.sam new file mode 100644 index 000000000..bc7edff0e --- /dev/null +++ b/test/fastq/inter_noaux.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? 
+HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG diff --git a/test/fastq/interleaved.fa b/test/fastq/interleaved.fa new file mode 100644 index 000000000..bd5ca44b0 --- /dev/null +++ b/test/fastq/interleaved.fa @@ -0,0 +1,20 @@ +>HS25_09827:2:1201:1505:59795#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=BDDDF +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG +>HS25_09827:2:1201:1505:59795#49/2 RG:Z:1#49 
+AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT +>HS25_09827:2:1201:1559:70726#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DBDFD +TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA +>HS25_09827:2:1201:1559:70726#49/2 RG:Z:1#49 +TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC +>HS25_09827:2:1201:1564:39627#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DDDFE +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA +>HS25_09827:2:1201:1564:39627#49/2 RG:Z:1#49 +AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT +>HS25_09827:2:1201:1565:91731#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA +>HS25_09827:2:1201:1565:91731#49/2 RG:Z:1#49 +AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA +>HS25_09827:2:1201:1624:69925#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA +>HS25_09827:2:1201:1624:69925#49/2 RG:Z:1#49 +TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC diff --git a/test/fastq/interleaved.fq b/test/fastq/interleaved.fq new file mode 100644 index 000000000..5aa88e129 --- /dev/null +++ b/test/fastq/interleaved.fq @@ -0,0 +1,40 @@ +@HS25_09827:2:1201:1505:59795#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=BDDDF +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG ++ +CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE +@HS25_09827:2:1201:1559:70726#49/2 RG:Z:1#49 
+TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ++ +;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? +@HS25_09827:2:1201:1564:39627#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DDDFE +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA ++ +BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +@HS25_09827:2:1201:1564:39627#49/2 RG:Z:1#49 +AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ++ +;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG +@HS25_09827:2:1201:1565:91731#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA ++ +CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE +@HS25_09827:2:1201:1565:91731#49/2 RG:Z:1#49 +AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ++ +;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG +@HS25_09827:2:1201:1624:69925#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA ++ +CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF +@HS25_09827:2:1201:1624:69925#49/2 RG:Z:1#49 +TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ++ +;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG diff --git a/test/fastq/interleaved_casava.fa b/test/fastq/interleaved_casava.fa new file mode 100644 index 000000000..8ea7447f9 --- /dev/null +++ b/test/fastq/interleaved_casava.fa @@ -0,0 
+1,20 @@ +>HS25_09827:2:1201:1505:59795#49 1:N:0:NGTCTATC +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG +>HS25_09827:2:1201:1505:59795#49 2:N:0:NGTCTATC +AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT +>HS25_09827:2:1201:1559:70726#49 1:N:0:NGTCTATC +TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA +>HS25_09827:2:1201:1559:70726#49 2:N:0:NGTCTATC +TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC +>HS25_09827:2:1201:1564:39627#49 1:N:0:NGTCTATC +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA +>HS25_09827:2:1201:1564:39627#49 2:N:0:NGTCTATC +AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT +>HS25_09827:2:1201:1565:91731#49 1:N:0:NGTCTATC +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA +>HS25_09827:2:1201:1565:91731#49 2:N:0:NGTCTATC +AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA +>HS25_09827:2:1201:1624:69925#49 1:N:0:NGTCTATC +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA +>HS25_09827:2:1201:1624:69925#49 2:N:0:NGTCTATC +TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC diff --git a/test/fastq/interleaved_casava.fq b/test/fastq/interleaved_casava.fq new file mode 100644 index 000000000..39a19457c --- /dev/null +++ b/test/fastq/interleaved_casava.fq @@ -0,0 +1,40 @@ +@HS25_09827:2:1201:1505:59795#49 1:N:0:NGTCTATC +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG ++ 
+CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE +@HS25_09827:2:1201:1559:70726#49 2:N:0:NGTCTATC +TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ++ +;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? +@HS25_09827:2:1201:1564:39627#49 1:N:0:NGTCTATC +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA ++ +BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +@HS25_09827:2:1201:1564:39627#49 2:N:0:NGTCTATC +AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ++ +;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG +@HS25_09827:2:1201:1565:91731#49 1:N:0:NGTCTATC +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA ++ +CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE +@HS25_09827:2:1201:1565:91731#49 2:N:0:NGTCTATC +AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ++ +;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG +@HS25_09827:2:1201:1624:69925#49 1:N:0:NGTCTATC +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA ++ +CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF +@HS25_09827:2:1201:1624:69925#49 2:N:0:NGTCTATC +TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ++ +;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG diff --git 
a/test/fastq/longline.fq b/test/fastq/longline.fq new file mode 100644 index 000000000..09cabd1a3 --- /dev/null +++ b/test/fastq/longline.fq @@ -0,0 +1,4 @@ +@readname XX:Z:baaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab +ATGC ++ +qqqq diff --git a/test/fastq/longline.sam b/test/fastq/longline.sam new file mode 100644 index 000000000..4dc5e8215 --- /dev/null +++ b/test/fastq/longline.sam @@ -0,0 +1 @@ +readname 4 * 0 0 * * 0 0 ATGC qqqq 
XX:Z:baaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab diff --git a/test/fastq/minimal-q.sam b/test/fastq/minimal-q.sam new file mode 100644 index 000000000..d2daaf47f --- /dev/null +++ b/test/fastq/minimal-q.sam @@ -0,0 +1 @@ +x 4 * 0 0 * * 0 0 A * diff --git a/test/fastq/minimal.fa b/test/fastq/minimal.fa new file mode 100644 index 000000000..4f65d8c17 --- /dev/null +++ b/test/fastq/minimal.fa @@ -0,0 +1,2 @@ +>x +A diff --git a/test/fastq/minimal.fq b/test/fastq/minimal.fq new file mode 100644 index 000000000..6f13b948d --- /dev/null +++ b/test/fastq/minimal.fq @@ -0,0 +1,4 @@ +@x +A ++ ++ diff --git a/test/fastq/minimal.sam b/test/fastq/minimal.sam new file mode 100644 index 000000000..e12f4a6b4 --- /dev/null +++ b/test/fastq/minimal.sam @@ -0,0 +1 @@ +x 4 * 0 0 * * 0 0 A + diff --git a/test/fastq/multiline-q.sam b/test/fastq/multiline-q.sam new file mode 100644 index 000000000..d31407b36 --- /dev/null +++ b/test/fastq/multiline-q.sam @@ -0,0 +1,2 @@ +seq1 4 * 0 0 * * 0 0 
NAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTN * +seq2 4 * 0 0 * * 0 0 RAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTY * diff --git a/test/fastq/multiline.fa b/test/fastq/multiline.fa new file mode 100644 index 000000000..2917e066f --- /dev/null +++ b/test/fastq/multiline.fa @@ -0,0 +1,10 @@ +>seq1 +NAAAAAAAAA +CCCCCCCCCC +GGGGGGGGGG +TTTTTTTTN +>seq2 +RAAAAAAAAA +CCCCCCCCCC +GGGGGGGGGG +TTTTTTTTY diff --git a/test/fastq/multiline.fq b/test/fastq/multiline.fq new file mode 100644 index 000000000..0c94aec53 --- /dev/null +++ b/test/fastq/multiline.fq @@ -0,0 +1,20 @@ +@seq1 +NAAAAAAAAA +CCCCCCCCCC +GGGGGGGGGG +TTTTTTTTN ++ +++++++++++ +@@@@@@@@@@ +########## +........> +@seq2 +RAAAAAAAAA +CCCCCCCCCC +GGGGGGGGGG +TTTTTTTTY ++ +<<<<<<<<<< +!!!!!!!!!! +########## +@@@@@@@@@ diff --git a/test/fastq/multiline.sam b/test/fastq/multiline.sam new file mode 100644 index 000000000..70442ef3e --- /dev/null +++ b/test/fastq/multiline.sam @@ -0,0 +1,2 @@ +seq1 4 * 0 0 * * 0 0 NAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTN ++++++++++@@@@@@@@@@##########........> +seq2 4 * 0 0 * * 0 0 RAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTY <<<<<<<<<ignore_001 name_001 +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG +>ignore_002 name_002 +TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA +>ignore_003 name_003 +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA +>name_004 +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA diff --git a/test/fastq/name2.fq b/test/fastq/name2.fq new file mode 100644 index 000000000..6ac19c92b --- /dev/null +++ b/test/fastq/name2.fq @@ -0,0 +1,16 @@ +@ignore_001 name_001 +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG ++ +CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGEFCFHGGGHEJEAJE 
+@ignore_003 name_003 +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA ++ +BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +@name_004 +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA ++ +CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE diff --git a/test/fastq/name2.sam b/test/fastq/name2.sam new file mode 100644 index 000000000..a79a05805 --- /dev/null +++ b/test/fastq/name2.sam @@ -0,0 +1,4 @@ +name_001 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGEFCFHGGGHEJEAJE +name_003 4 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +name_004 4 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE diff --git a/test/fastq/r1-q.sam b/test/fastq/r1-q.sam new file mode 100644 index 000000000..c87f07fe4 --- /dev/null +++ b/test/fastq/r1-q.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=BDDDF +HS25_09827:2:1201:1559:70726#49 77 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DBDFD +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * RG:Z:1#49 
BC:Z:NGTCTATC QT:Z:!1:DDDFE +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB diff --git a/test/fastq/r1.fa b/test/fastq/r1.fa new file mode 100644 index 000000000..4b7af5cf4 --- /dev/null +++ b/test/fastq/r1.fa @@ -0,0 +1,10 @@ +>HS25_09827:2:1201:1505:59795#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=BDDDF +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG +>HS25_09827:2:1201:1559:70726#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DBDFD +TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA +>HS25_09827:2:1201:1564:39627#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DDDFE +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA +>HS25_09827:2:1201:1565:91731#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA +>HS25_09827:2:1201:1624:69925#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA diff --git a/test/fastq/r1.fq b/test/fastq/r1.fq new file mode 100644 index 000000000..3728b9a98 --- /dev/null +++ b/test/fastq/r1.fq @@ -0,0 +1,20 @@ +@HS25_09827:2:1201:1505:59795#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=BDDDF +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG ++ +CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGEFCFHGGGHEJEAJE +@HS25_09827:2:1201:1564:39627#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DDDFE 
+ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA ++ +BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +@HS25_09827:2:1201:1565:91731#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA ++ +CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE +@HS25_09827:2:1201:1624:69925#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA ++ +CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF diff --git a/test/fastq/r1.sam b/test/fastq/r1.sam new file mode 100644 index 000000000..ffc06191a --- /dev/null +++ b/test/fastq/r1.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGEFCFHGGGHEJEAJE RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DBDFD +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DDDFE +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA 
CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB diff --git a/test/fastq/r2-q.sam b/test/fastq/r2-q.sam new file mode 100644 index 000000000..033fa98cf --- /dev/null +++ b/test/fastq/r2-q.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 141 * 0 0 * * 0 0 AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT * RG:Z:1#49 +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC * RG:Z:1#49 +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT * RG:Z:1#49 +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA * RG:Z:1#49 +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC * RG:Z:1#49 diff --git a/test/fastq/r2.fa b/test/fastq/r2.fa new file mode 100644 index 000000000..4d6f08e85 --- /dev/null +++ b/test/fastq/r2.fa @@ -0,0 +1,10 @@ +>HS25_09827:2:1201:1505:59795#49/2 RG:Z:1#49 +AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT +>HS25_09827:2:1201:1559:70726#49/2 RG:Z:1#49 +TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC +>HS25_09827:2:1201:1564:39627#49/2 RG:Z:1#49 +AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT +>HS25_09827:2:1201:1565:91731#49/2 RG:Z:1#49 +AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA +>HS25_09827:2:1201:1624:69925#49/2 RG:Z:1#49 
+TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC diff --git a/test/fastq/r2.fq b/test/fastq/r2.fq new file mode 100644 index 000000000..a766bf9d5 --- /dev/null +++ b/test/fastq/r2.fq @@ -0,0 +1,20 @@ +@HS25_09827:2:1201:1505:59795#49/2 RG:Z:1#49 +AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT ++ +DCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHGCFCFFGGCEBF? +@HS25_09827:2:1201:1564:39627#49/2 RG:Z:1#49 +AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ++ +;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG +@HS25_09827:2:1201:1565:91731#49/2 RG:Z:1#49 +AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ++ +;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG +@HS25_09827:2:1201:1624:69925#49/2 RG:Z:1#49 +TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ++ +;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG diff --git a/test/fastq/r2.sam b/test/fastq/r2.sam new file mode 100644 index 000000000..f75edc9d1 --- /dev/null +++ b/test/fastq/r2.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 141 * 0 0 * * 0 0 AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT DCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHGCFCFFGGCEBF? 
RG:Z:1#49 +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG RG:Z:1#49 +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG RG:Z:1#49 +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG RG:Z:1#49 diff --git a/test/fastq/single.fa b/test/fastq/single.fa new file mode 100644 index 000000000..cdd58d8dc --- /dev/null +++ b/test/fastq/single.fa @@ -0,0 +1,10 @@ +>HS25_09827:2:1201:1505:59795#49 RG:Z:1#49 +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG +>HS25_09827:2:1201:1559:70726#49 RG:Z:1#49 +TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA +>HS25_09827:2:1201:1564:39627#49 RG:Z:1#49 +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA +>HS25_09827:2:1201:1565:91731#49 RG:Z:1#49 +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA +>HS25_09827:2:1201:1624:69925#49 RG:Z:1#49 +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA diff --git a/test/fastq/single.fq b/test/fastq/single.fq new file mode 100644 index 000000000..c6ad15dd6 --- /dev/null +++ b/test/fastq/single.fq @@ -0,0 +1,20 @@ +@HS25_09827:2:1201:1505:59795#49 RG:Z:1#49 
+CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG ++ +CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGEFCFHGGGHEJEAJE +@HS25_09827:2:1201:1564:39627#49 RG:Z:1#49 +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA ++ +BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +@HS25_09827:2:1201:1565:91731#49 RG:Z:1#49 +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA ++ +CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE +@HS25_09827:2:1201:1624:69925#49 RG:Z:1#49 +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA ++ +CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF diff --git a/test/fastq/single_aux-q.sam b/test/fastq/single_aux-q.sam new file mode 100644 index 000000000..68af8bc89 --- /dev/null +++ b/test/fastq/single_aux-q.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * RG:Z:1#49 +HS25_09827:2:1201:1559:70726#49 4 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * RG:Z:1#49 +HS25_09827:2:1201:1564:39627#49 4 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * RG:Z:1#49 +HS25_09827:2:1201:1565:91731#49 4 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * RG:Z:1#49 +HS25_09827:2:1201:1624:69925#49 4 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA * RG:Z:1#49 diff --git a/test/fastq/single_aux.sam 
b/test/fastq/single_aux.sam new file mode 100644 index 000000000..bfe355e0f --- /dev/null +++ b/test/fastq/single_aux.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGEFCFHGGGHEJEAJE RG:Z:1#49 +HS25_09827:2:1201:1564:39627#49 4 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E RG:Z:1#49 +HS25_09827:2:1201:1565:91731#49 4 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE RG:Z:1#49 +HS25_09827:2:1201:1624:69925#49 4 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF RG:Z:1#49 diff --git a/test/fastq/single_noaux-q.sam b/test/fastq/single_noaux-q.sam new file mode 100644 index 000000000..24542c69c --- /dev/null +++ b/test/fastq/single_noaux-q.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * +HS25_09827:2:1201:1559:70726#49 4 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * +HS25_09827:2:1201:1564:39627#49 4 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * +HS25_09827:2:1201:1565:91731#49 4 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * 
+HS25_09827:2:1201:1624:69925#49 4 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA * diff --git a/test/fastq/single_noaux.sam b/test/fastq/single_noaux.sam new file mode 100644 index 000000000..9a98989e8 --- /dev/null +++ b/test/fastq/single_noaux.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGEFCFHGGGHEJEAJE +HS25_09827:2:1201:1564:39627#49 4 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +HS25_09827:2:1201:1565:91731#49 4 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE +HS25_09827:2:1201:1624:69925#49 4 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF diff --git a/test/fastq/test-fastq.sh b/test/fastq/test-fastq.sh new file mode 100755 index 000000000..d1f771980 --- /dev/null +++ b/test/fastq/test-fastq.sh @@ -0,0 +1,32 @@ +#!/bin/sh +# +# Copyright (C) 2020 Genome Research Ltd. 
+# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +. ../simple_test_driver.sh + +echo "Testing fastq support" + +# Eg set TEST_PRECMD=valgrind +tview="${TEST_PRECMD} ../test_view" +test_driver fastq.tst +exit $? diff --git a/test/formatcols.vcf b/test/formatcols.vcf index c46cf46fe..6bbdb3af6 100644 --- a/test/formatcols.vcf +++ b/test/formatcols.vcf @@ -2,5 +2,5 @@ ##FILTER= ##contig= ##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S² S3 1 100 a A T . . . S a bbbbbbb ccccccccc diff --git a/test/fuzz/hts_open_fuzzer.c b/test/fuzz/hts_open_fuzzer.c index 5c239d4f1..355f790a5 100644 --- a/test/fuzz/hts_open_fuzzer.c +++ b/test/fuzz/hts_open_fuzzer.c @@ -1,7 +1,7 @@ /* test/fuzz/hts_open_fuzzer.c -- Fuzz driver for hts_open. Copyright (C) 2018 Google LLC. - Copyright (C) 2019 Genome Research Ltd. + Copyright (C) 2019-2020 Genome Research Ltd. 
Author: Markus Kusano diff --git a/test/header_syms.pl b/test/header_syms.pl index fc9cfa303..fe5128a78 100755 --- a/test/header_syms.pl +++ b/test/header_syms.pl @@ -31,7 +31,7 @@ # Roughly equivalent Exuberant-ctags command is: # ctags -f - -n -I HTS_RESULT_USED -I HTS_DEPRECATED+ -I HTS_FORMAT+ \ -# -I KS_ATTR_PRINTF+ -I knet_win32_destroy+ -I knet_win32_init+ +# -I KS_ATTR_PRINTF+ # Unfortunately this is not the default ctags on all platforms, hence this # script. @@ -45,8 +45,9 @@ GetOptions('show-processed' => \$show_processed); -# List of functions to strip from the output -my %ignore = map { $_ => 1 } qw(knet_win32_init knet_win32_destroy); +# List of functions to strip from the output. Currently empty, +# but this functionality is retained for potential future use. +my %ignore = map { $_ => 1 } qw( ); foreach my $file (@ARGV) { extract_symbols($file, $show_processed, \%ignore); diff --git a/test/hfile.c b/test/hfile.c index f6ba0d7cf..8f06a971f 100644 --- a/test/hfile.c +++ b/test/hfile.c @@ -176,7 +176,7 @@ int main(void) original = slurp("vcf.c"); for (i = 1; i <= 6; i++) { char *text; - sprintf(buffer, "test/hfile%d.tmp", i); + snprintf(buffer, sizeof(buffer), "test/hfile%d.tmp", i); text = slurp(buffer); if (strcmp(original, text) != 0) { fprintf(stderr, "%s differs from vcf.c\n", buffer); diff --git a/test/index.vcf.gz.csi b/test/index.vcf.gz.csi index 644832d83..250339624 100644 Binary files a/test/index.vcf.gz.csi and b/test/index.vcf.gz.csi differ diff --git a/test/index.vcf.gz.tbi b/test/index.vcf.gz.tbi index 4d6e99781..e9ab7b60d 100644 Binary files a/test/index.vcf.gz.tbi and b/test/index.vcf.gz.tbi differ diff --git a/test/index2.sam b/test/index2.sam new file mode 100644 index 000000000..97d39e65b --- /dev/null +++ b/test/index2.sam @@ -0,0 +1,11 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:1 LN:249250621 M5:1b22b98cdeb4a9304cb5d48026a85128 +@SQ SN:2 LN:243199373 M5:a0d9851da00400dec1098a9255ac712e +um1 69 1 1000000 0 * * 0 0 AAAAAAAAAA * +um1 137 1 
1000000 44 10M * 0 0 AAAAAAAAAA * +um2 69 1 2000000 0 * * 0 0 AAAAAAAAAA * +um2 137 1 2000000 44 10M * 0 0 AAAAAAAAAA * +mu1 137 2 1000000 44 10M * 0 0 AAAAAAAAAA * +mu1 69 2 1000000 0 * * 0 0 AAAAAAAAAA * +mu2 137 2 2000000 44 10M * 0 0 AAAAAAAAAA * +mu2 69 2 2000000 0 * * 0 0 AAAAAAAAAA * diff --git a/test/index_dos.sam b/test/index_dos.sam new file mode 100644 index 000000000..b006aa77d --- /dev/null +++ b/test/index_dos.sam @@ -0,0 +1,190 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:CHROMOSOME_I LN:1009800 M5:8ede36131e0dbf3417807e48f77f3ebd +@SQ SN:CHROMOSOME_II LN:5000 M5:8e7993f7a93158587ee897d7287948ec +@SQ SN:CHROMOSOME_III LN:5000 M5:3adcb065e1cf74fafdbba1e8c352b323 +@SQ SN:CHROMOSOME_IV LN:5000 M5:251af66a69ee589c9f3757340ec2de6f +@SQ SN:CHROMOSOME_V LN:5000 M5:cf200a65fb754836dcc56b24b3170ee8 +@SQ SN:CHROMOSOME_X LN:5000 M5:6f9368fd2192c89c613718399d2d31fc +@SQ SN:CHROMOSOME_MtDNA LN:5000 M5:cd05857ece6411f40257a565ccfe15bb +@PG ID:bowtie2 PN:bowtie2 VN:2.0.0-beta5 +SRR065390.17240207 16 CHROMOSOME_I 999901 42 100M * 0 0 ATGTTTACAGGACTTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAG CACAC?CBBAA@?@?BADDBBDBBAB>DDDBBDDABBBCCADDDDDCBCBCCCDBDDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.15493040 0 CHROMOSOME_I 999912 42 100M * 0 0 ACTTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDBCCBDBCCBDDA@>DC?5@?@@??:><<>8>39<37 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.6144221 0 CHROMOSOME_I 999914 42 100M * 0 0 TTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCDCCCCBDCDDBBDDBDBDD@BBB@DBABDB AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.8057275 0 CHROMOSOME_I 999916 42 100M * 0 0 
CAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTT CCCCCCCBCCC@CCCCCCCCCCC>BBB>BB?4CCCCCC;>====ACCCA@CCCBBCCBC;>@==>BBBBA?<;@<@######################## AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24679913 16 CHROMOSOME_I 999917 42 100M * 0 0 AAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTT ==56>??>AB?>D>?A?DBDABBB=BDBDACDBBCCDBBBBDDCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.25513175 0 CHROMOSOME_I 999934 42 100M * 0 0 ATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCBC@CADCDDAABA=B?=A=B.>AA?AADA########################## AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17492782 0 CHROMOSOME_I 999935 42 100M * 0 0 TGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCDCDCCDCCBDCDDBDDBDD@BBBBBBACBBAB=AB>BBBAB>?BA@CAAA? 
AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17146364 16 CHROMOSOME_I 999942 42 100M * 0 0 CAAAAATTGTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAG #######@/A@@<:BBBBB>ABBDADC@=DDBDDDCDCCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:8T91 YT:Z:UU +SRR065390.14459471 16 CHROMOSOME_I 999944 42 100M * 0 0 AAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGC @@@@=B@CCCBAABACCC@DCCCCCDCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.6968616 16 CHROMOSOME_I 999947 42 100M * 0 0 ATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTA BDB>B@DDDD@DDDDBCACB@DCBCCACCCCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.9052825 16 CHROMOSOME_I 999952 42 100M * 0 0 GGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACC ?B;DABDABDDBDDADCCCD@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.22926164 0 CHROMOSOME_I 999967 42 100M * 0 0 TTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBCCBCCCCCCCCDCCDCDDDDCCDACDCADBDDBBCBCBCCABBA@BABABCBABC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.27108093 16 CHROMOSOME_I 999969 42 100M * 0 0 AAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACN ##########AAAAA388333-533')''+AA8AAAAAAAAAA8AAAAAA67788AAAA888887AAA5AAAAAAAAAAAA8AAAAAAAA+*++)))))! 
AS:i:-1 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:99C0 YT:Z:UU +SRR065390.19145675 0 CHROMOSOME_I 999970 42 100M * 0 0 AGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCBCCCCCCCCCCADCBDBBCBBBBBDCBABBBABAABB??DDAACCAACC>AC?C?= AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.22660118 16 CHROMOSOME_I 999972 42 100M * 0 0 CTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCC B9ABABDB>DBBBD8CBDCDBCDBCDBCBCCBCCCCCCCCCCCCCCC>CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1589310 0 CHROMOSOME_I 999973 42 100M * 0 0 NTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCG !++((22221AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA7A8AAAAAAAA8AAAAAAAAAAAAA7A7AA768655 AS:i:-1 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:0T99 YT:Z:UU +SRR065390.32984687 0 CHROMOSOME_I 999978 42 100M * 0 0 GTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCT CCCCCCCCCCCBCCCCCCCCCCCCCCCCC@CC@CCCBCCCCCCBDACDCC>@B@CDBADB@BCBD@B=BBB@BD>C@BBCBACAABAB;D9<4:<66 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.28347129 16 CHROMOSOME_I 999978 42 100M * 0 0 GTTATGTTTAGGCGTAGGCTTAGACATACGCTTAGGTTTCGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCT ##############################################@B?BB@A@ABBBDABD@DDBBB@@B;C@BACBC@CC@CCCCCBCCCCCCCCCCC AS:i:-10 XN:i:0 XM:i:5 XO:i:0 XG:i:0 NM:i:5 MD:Z:23G1T2G2G7A60 YT:Z:UU +SRR065390.17964692 16 CHROMOSOME_I 999984 42 100M * 0 0 TTTGGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAA #####@<@=<53.830;>.?A5@@?ABAAADBDBC<@CB@D@BCB@CBCDCDBBDC=C@C@CAAC@C@ACCCCCCCCCCCCCCCCCCCCC AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:3A96 YT:Z:UU +SRR065390.16701032 0 
CHROMOSOME_I 999987 42 100M * 0 0 AGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCACCCCCCCCCCDCBCCCCCCDCCBAA@BBBBBC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24060716 16 CHROMOSOME_I 999989 42 100M * 0 0 GCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAA @8>68BD?B??B@DB>ABB?BA@A=ADBCC@?AA@CCBBCBCCDBCDCCBCBCCC@CCCCBCCCCCCCACCCCCCCCACCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24907628 16 CHROMOSOME_I 999989 42 100M * 0 0 GCGTAGGCTGAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAA ################################BDDBB?BB>?>BADABBBDBDBABDBDC;?>9=C?B>CC@CCCCDCCCCCCCCCCCCCCCCCCCCCCC AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:9T90 YT:Z:UU +SRR065390.21366278 16 CHROMOSOME_I 999991 42 100M * 0 0 GTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAAAG ######?9>A09=@?=>BBDBBBB8B>DBCDCCDCBCBCBDCCC@CCCCCCCBCCCCCCC@@CCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.27662957 0 CHROMOSOME_I 999995 42 100M * 0 0 GCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAAAGAGTA CCCCCCCCCCCCCCCCCCC@ACCCCCCCCCCCCCCADCCCBC?CDDDDAC=BA?@B@DBDB>?>>D?#################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.29477959 0 CHROMOSOME_I 999997 42 100M * 0 0 TTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAAAGAGTAGG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=AB?DAB@3=@8@=@?@ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.13030274 16 CHROMOSOME_I 1000208 42 100M * 0 0 TCAATTAAACTGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTC 
955576>0@BBBBBBDBBD?DABDDDDCD@DCDDCCDCDDCACBACCCCCCBCCCCCCCCCCCCCCBCCCCCCCCCCBBCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.18054898 16 CHROMOSOME_I 1000209 42 100M * 0 0 CAATTAAACTGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCC CAC@CAA?BC?D??BCABB8=>@@?#### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.26866653 16 CHROMOSOME_I 1000217 42 100M * 0 0 CTGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACC ###########??????4D;AA?AAD?A>>?CABCBABBBBAA@AD>ADAAC@CCCCBCCBCCC?CCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCBCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.23714265 0 CHROMOSOME_I 1000218 0 78M2I20M * 0 0 TGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGAGATCGGAAGAGCGGTTCAGCAGGAA CCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCADDCCBBBBBDBBBB+=7=0?==>A#################### AS:i:-48 XN:i:0 XM:i:16 XO:i:1 XG:i:2 NM:i:18 MD:Z:75T0T1T0T0G0T2T0T0T3C0T0T0T0T0T1C0 YT:Z:UU +SRR065390.20744360 16 CHROMOSOME_I 1000218 42 100M * 0 0 TGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCG #####@ABBBBDBD@BA@DCDBABBBBBDA>@CBBDBBAD=BBDCBACBCCCCCCCBCBCCCCACCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCBCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.3611567 16 CHROMOSOME_I 1000225 42 100M * 0 0 CGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTT #####@<2@=BBBBAC=DBBB@BBACBBBB=C;BBCCBACC@CCACCCCBCCCCCCBCCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.10053218 16 CHROMOSOME_I 1000225 42 100M * 0 0 CGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTT @@=@6AA=AAC?CAC>BB>?A>>CBB@@CBAD>CC;>C@BC>A################################################# AS:i:-8 XN:i:0 XM:i:4 XO:i:0 
XG:i:0 NM:i:4 MD:Z:66A7A14C2A7 YT:Z:UU +SRR065390.21951837 0 CHROMOSOME_I 1000229 42 100M * 0 0 AATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCBCDCACCCCCCBCCB>AACCC@1/?@?CCC@@BABCB=?@@+:A?B###### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.21381202 0 CHROMOSOME_I 1000232 40 100M * 0 0 TATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGGTTTTTTTAGTTTTTTCTTTTTTCCCAATTTTTTTGGATA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCDCDC?=8@';4@AA############################################# AS:i:-16 XN:i:0 XM:i:8 XO:i:0 XG:i:0 NM:i:8 MD:Z:60A5G8C6A2G7A0G1A3 YT:Z:UU +SRR065390.22184926 16 CHROMOSOME_I 1000235 42 100M * 0 0 TGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCT ??CAACCBAADD?DBB?@>BBB;BABBBBB@>CCCDBCDBACCCCAACACACCACCC@@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17603173 0 CHROMOSOME_I 1000236 42 100M * 0 0 GGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTG CCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCCDCCCCCCCCB>CAB@ACCC################################################ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17587471 16 CHROMOSOME_I 1000250 42 100M * 0 0 GAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTT 10?8;;?;AA??:AA@BBBBB?BDDDDDBCDA>@DDDCCCDACCCDDCCDCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.635026 0 CHROMOSOME_I 1000255 42 100M * 0 0 TTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGA CCCCCCCCCCCCCCBCCCCCCCBD@CCCCB0:>8:=BBBBC6:=7@>?B?B43/+2>@@/@########## AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.33333470 0 CHROMOSOME_I 1000257 42 100M * 0 0 
TTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTTTGTTT CCCCBCCCCCCC?CCC?CCCCDBCADCCCCCA@@:;CCCC?7.)8;>???-3>>;A?3?6;/2;>?A:24775=4B<@@<4)+75:70(4@>::)9,B>BB?BBD:>BADDD=ABBBDDDBD@DBCCCDCCDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:0T99 YT:Z:UU +SRR065390.18670433 0 CHROMOSOME_I 1000260 40 100M * 0 0 TGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTTTGAATATCTGGGGATTTTTCGTTTTTTTTTTTTT CCCCCCCCCCCCC>CCCCBBC4A@ACCC8@;5/8;A?A/6,>==AAC6<@################################################## AS:i:-14 XN:i:0 XM:i:7 XO:i:0 XG:i:0 NM:i:7 MD:Z:65A0G11A4C7C1G0A5 YT:Z:UU +SRR065390.5800524 0 CHROMOSOME_I 1000261 42 100M * 0 0 GGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTTCCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTT CCCCCCCCCCCCCCCCCDDC*/,0/??/<<508BAA@@BCBCAC?BAADBCD@@@CBCCBA9CCCACCCCCCCCCDCCCCCC?CCCCCCCCCCCCCCCBCCCCCCCCCCCCCCCCBBBCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1793614 16 CHROMOSOME_I 1000274 42 100M * 0 0 ACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTT A:CAADB=DBDD@CBACC>@CACCCCCCCDCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCCBBBCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.20107270 0 CHROMOSOME_I 1000276 42 100M * 0 0 GTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAG CCCCCCCCCCCCCCCCCCCCCCC@@CCCCCCCCCCCCCCCCADDCCCCCDCC?ACACDCCCCC@CCCDCD@BCDCBB3>B@BCCC@@9=3BB?@B@>85; AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.8268806 16 CHROMOSOME_I 1000276 42 100M * 0 0 GTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAG ##########D?:BBA>;BBABBAABBBBBDDB>DDDDBDCDDCDCDDCCCDCCCDCCCCDCCCCCCCCCCCCCCCCC@BBCCCCCCCCBBBCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.19264263 16 
CHROMOSOME_I 1000280 42 100M * 0 0 CACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCC ##BB?>CBABBB?:BBBBABABABB@DBCBBDAABDCCCCCCBCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCCBBBCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.18391831 0 CHROMOSOME_I 1000283 42 100M * 0 0 TTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCC CCCCCCCCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC?ACCCCCCCBCC@CC8BBCCCCCB@>A>CCCDDC@@@DBBBC?:CCDBAC;CDDDDCBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCC@BBCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24029537 16 CHROMOSOME_I 1000284 42 100M * 0 0 TTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCA DB>B8BB<9;?>ABDDAADB@DD@C@BBAABBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCCBBBCCCCCCCCCCCCCDCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.28630205 0 CHROMOSOME_I 1000286 42 100M * 0 0 TTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCACG CCCCCCCCCCCCC@BCCCCCCCCCCCCCCCCCDBCCCCCDDBBBCBCDCDB@=?BBBBDBBABBBBBB@@CBBDB>>>A>BCBCCB:;:>=<9:@A#### AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:98A1 YT:Z:UU +SRR065390.15799530 0 CHROMOSOME_I 1000295 42 100M * 0 0 TTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCAAGCCTAAGCCT CCCCCCCCCCCCCCCCCCCCCCAACCCCCCCCCACCBCBCCCC?B@CCCCB@93=@B5>BB>>3/77:7:B>CDBDDB@>;B>BBBBDACAAB@D@<9<9<7 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.22494349 0 CHROMOSOME_I 1000297 42 100M * 0 0 TGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCAAGCCTAAGCCTAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACDCCCCCD@@CCDCDCBBDCDDDBADDDDCD>B;@>DAABBB@>5A>BDBB?6??@D?9@####### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.12445253 0 
CHROMOSOME_I 1000298 42 100M * 0 0 GTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCAAGCCTAAGCATAAC CCCCCCCCCCCCCCCCCCCCACCCCCCCCC@DCCCCCCCC?BACCBC@CBDCCACB?BBBCDC@@;4BCBABDC@B56?B@96=4A>BAB;;5;:@19A;@;;;6?BBBBB3BBB??@@@>@BBB;@AA@9@AA9BABBBAA@@AABAABAB@BB:;??>:?DBAB?BBDDBBABB;ACBDB?BBB@CCCBDD@CD@CCDBCDDDCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.27194079 0 CHROMOSOME_II 2920 42 100M * 0 0 CTAATTTTCAGAGAGACTGAAAGAGTTTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAA CCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCC=BBBCB?BBBA?BBBDB?>BB=CBCCAACAC;DAB=ACAC?##### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.21775125 16 CHROMOSOME_II 2934 42 100M * 0 0 AACTGAAAGAGTTTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACC #####ABA?=<<=5=@BBA?=@>:A:7.44?B?8B@@>BBB=@B?ADBBBCBBACBD9CBD?A9?=A?.AABADDABBB@BABDDBACBBCCDCBCCDCCCCDCCCCDCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.3790175 16 CHROMOSOME_II 2944 42 100M * 0 0 GTTTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACCGCGGTAAGTG 8BDD@:=7)/>B>ABBB?BB?>?DB@B:BBB?BBADDC@BDCDDCDBCDCCCBADCCCCCCCBCCCCCCCCCCCCCCCCCCCDCCCCDCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.4091455 0 CHROMOSOME_II 2946 42 100M * 0 0 TTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACCGCGGTAAGTGTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC?CCCCCCCCCCDCCCCBCCCDACBCDCACC@C@CA@CBAAD=BBAADD06@##### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.8676436 0 CHROMOSOME_II 2947 42 100M * 0 0 TAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCCCCGCGGTCCGTGTGC <:>:>/000/:<<:BAB?>8A?A;:A873;3?>?>A>>A8B############################################# AS:i:-8 XN:i:0 
XM:i:4 XO:i:0 XG:i:0 NM:i:4 MD:Z:84A7A0A5T0 YT:Z:UU +SRR065390.28734084 0 CHROMOSOME_II 2948 42 100M * 0 0 AAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACCGCGGTAAGTGTGTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCBCBCDCBCCCBBDDDCADABADBBABB:BB=D?B<@B@>CA?CA>BACADAA########### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.10526869 0 CHROMOSOME_II 2956 40 100M * 0 0 TACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACGCCACCGCGGGGAGGGGGGTTGTTTTAT CCCCCCCCCCCCCCCCCCCCADCBBDDDDDDDBBB8BA@B>6<:>9=789=0>D>AA<@<8B>1>A9>;@5=@8C:48;*AAA=<>9>9>>:>>AB?D>BBDBCBDBBCCBABBB>@CDCCBCDCAACCCCCACCCCCCCCBCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.10879394 16 CHROMOSOME_V 938 42 100M * 0 0 TATGTTTTTCTTGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCA B;B:B>@B?>@>7BBDABADADBBCBDCCBACBCCBBB@CCCCCBCCACACCCCCC>CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1520161 0 CHROMOSOME_V 941 42 100M * 0 0 GTTTTTCTTGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATG CCCCCCCBBBCCCCCCCCCCCCCCCCCCCCACCCCCCDCC@CCCCCCCCCCCCCCCCCCACCCBCCAD=D@BC?C?C?C##################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17468019 16 CHROMOSOME_V 943 42 100M * 0 0 TTTTCTTGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAA >ABBBABBDDDB=DBCD?DDBDBDADDADDBDCCCCCCC=CCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.12403970 0 CHROMOSOME_V 949 42 100M * 0 0 TGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCC@CBBCACBC@?144:>><@@DAB?:=9@<>/>9?;=927= AS:i:0 XN:i:0 XM:i:0 XO:i:0 
XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.16193993 16 CHROMOSOME_V 949 42 100M * 0 0 TGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTC ;;/67AAC@ADCCDBCDCCCCCCD@CCCCCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCDC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.30032741 16 CHROMOSOME_V 950 42 100M * 0 0 GAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCA AACBBAB?BB>BABBCDBBDABDBADDDDBDDBBADDDDBACCDCBDDDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.32455256 0 CHROMOSOME_V 956 42 100M * 0 0 GTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGGTATAATACAGCGACTCAATGAAAAAATCAAAAAAA CCCCACCCCCBB=?ABB?BBA?BAABBBBBB@BBABBBBBBBBBBBBBAA@BBBBBBB>B######################################## AS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:64T25C9 YT:Z:UU +SRR065390.15571530 16 CHROMOSOME_V 966 42 100M * 0 0 CTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTT B;:B;B?D?@?BBBB5-=<:@@AA@BBA>BBADBBDDDDCDCDCDBBDCCCDCCCCCCCCCDCCCCCDCCCCCCCCCCCACCBBBCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.9595122 0 CHROMOSOME_V 967 42 100M * 0 0 TGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCC?@:AAA>C@CBB@@>?B=A?BBBBBCB>@/@>=>=>BB# AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.3600239 16 CHROMOSOME_V 969 42 100M * 0 0 ATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCA DB>DBBBBA@AAB?DCA@CB@ABB@BB=AAAA>@==>>6/>:>5:688/85A?AAA>>657==BBB<;;;9>>8>>BBBB> AS:i:-6 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:28T25T45 YT:Z:UU +SRR065390.31266674 0 CHROMOSOME_V 971 42 100M * 0 0 
AATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTTACTTTGCACG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCADDCCBC@CBC5<5<7?:83;+471/0<4=8;??BBD(.94;9?@?################ AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:89G10 YT:Z:UU +SRR065390.23187971 16 CHROMOSOME_V 972 42 100M * 0 0 ATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGC 647:0BBB?B==@?@@BDBDBBBDDDBDDBDBDDDCBCCCCBBCCCCCDCCBCCCCCCCCCCCBBBCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.28661392 16 CHROMOSOME_V 975 42 100M * 0 0 TGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTAT ACCACAA5BDABAA>BDBDBDCBCBA@DBDB>DBBBBBAABDBDBDDBCCCCDCCCCCDCCCCCCCCCCCCCCBBBCCCCCCCCCCDCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1859967 0 CHROMOSOME_V 979 42 100M * 0 0 AAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGCAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCA>=>C<@@;:@A@A=53@?AB::?@CCACC=B/<;53;7BB:>B=::=A@?@?ACCC>C@CCCCCB:/&-7735@B7B>B?;@@CC@35A@@CCBC@######################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.12435485 16 CHROMOSOME_V 981 42 100M * 0 0 ATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAA B<=BB>B@>>BBBD@>?DABBBBBDDDDDDDDCADCDCCDCCCDCDBCCCCCDCCCCCCCCCCCCCCBBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.7485987 0 CHROMOSOME_V 983 42 100M * 0 0 TATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGGTAAAA C@?C@CCCCCCCCCCCC@CCCCC@?C8CCC@BC?@CC############################################################### AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:94T5 YT:Z:UU +SRR065390.17264189 0 CHROMOSOME_V 983 42 100M * 0 0 
TATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACCCTATGGTTAAAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=C=/////=?5=;:@8???AA############################ AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:87G12 YT:Z:UU +SRR065390.6356855 0 CHROMOSOME_V 986 42 100M * 0 0 AATTTAAAACTTAAACGAAGCTAAAATGTGGCTGGTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAAAA CCBCCBCCCCCCCCCC@CC?@CCCCC@CCC>A=@.88/45+()/.=>2==BBCB659?9?'))10;9??############################### AS:i:-7 XN:i:0 XM:i:3 XO:i:0 XG:i:0 NM:i:3 MD:Z:34T63T0G0 YT:Z:UU +SRR065390.20107175 0 CHROMOSOME_V 989 42 100M * 0 0 TTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAAT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC@C@?./..):3872A=@=A<=:;=B>B>>87777@>&@9A@@@8:@>88 AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:87T12 YT:Z:UU +SRR065390.6431660 16 CHROMOSOME_V 994 42 100M * 0 0 ACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATT AACBABABDC@@ADABBDDCDCDBCDDDCCDCDBCACCCBCCDCCCCCCCCBCC@@@CCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.4439503 16 CHROMOSOME_V 997 42 100M * 0 0 TAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTT ###########B>:AAAAA@C@=;937<ACCC8@@@AABCC>@+/662BBBC?B>BBB?BBBB#################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.11492188 0 CHROMOSOME_V 998 42 100M * 0 0 AAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTTG CCCCCCCCCCCCCCCCDCCCCC>A@AAAAAACA??B@@BBD>BACACC08;;AAACB==/*/1//:=@99BBABA@;<@;<:9>>B??>B??:?6B??B9 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.9605367 0 CHROMOSOME_V 999 42 100M * 0 0 AACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTTGG 
CCCCCCCCCCCCCCCDCCCCC@C<>>A9<4=9>=B###################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.29302896 0 CHROMOSOME_V 1000 42 100M * 0 0 ACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTTGGT CCCCCCCCCCCCCCDCCCCC6?:??AABCCCC8?C@BCCCC@@5;><9>>>B>>AB=<)6=4:):9>>@@################ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.13754 4 * 0 0 * * 0 0 TCGCTGCTGTGATGTTGCGTTTTTATCAGCACAAAGGCGGTCAGGCCGAGGCCTATTTTTTCCGGATCCAGCAGGGCGACTTTGCCGATAAGGATACCGT CCCCCCCCCCCCCCCCCCCCCCCCCBBCCCCCCC@CCCCCDCCCCCCCDCCCC@ACCCC@>>CCD?>>>@@@ YT:Z:UU +SRR065390.13765 4 * 0 0 * * 0 0 CGTGGTCGTGCCGGTTACAAGCCTGCCGTGAAAAGCCGTTTCAGTAAGTCAGCCAATAGCAAATTCTCCCATACTATCGCTTTTGCCTGATCCTGAACTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCDCCACCCCCCCC@CCCCCCCCCACCCBB@?CBDABDDADB<=ABBB@B@BB@ YT:Z:UU +SRR065390.13778 4 * 0 0 * * 0 0 TTTTATACCAACAAAAAACGGAAAGCAGATAACCCAGCAGCCCGAGTAACAGTATCCGGGCATCCAGGCCAAAAGCTAACAGAGCCGCGATAAAATCCCA CCCCCCCCCCCCCCBBBBBCCCCCCCBCCCCCCCCCBCCCCCCCCCDCCCCCACCCCCCCCCDCCCCDCBCA@AC>@=@CC?B>CBBCC>=?8A8=?>66 YT:Z:UU +SRR065390.13779 4 * 0 0 * * 0 0 ATAATGGACAACTTTAATGGCAATCACTAAATCAACTCCGGCACCATTAACCGGTGGGACGTTATGGTGCGTCACTATTGCATTGTCATTAGCGACATTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCC=CC?BCCBCCACBABCCDCDBDADBBCDCBD>DBBD==BAA:>5<> YT:Z:UU +SRR065390.13802 4 * 0 0 * * 0 0 AAGGCGTTTATTATATACACTCGCATGGCTTTTCTTCTGAAAATGTAGAATAATTGAGTAATTTTTAAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG ??B:??????BBB>B99;;;>B>>>:BBBBB;;B=70///0-/01/BA>BABB>B>BBB@BB>>ABABA>BA>6BB88183,<8;<4>:@><>A>> YT:Z:UU +SRR065390.13808 4 * 0 0 * * 0 0 GTTTGCTGACTGGCCAGCCAGCTCAAGGCATCAAAAGCGTCTTTGAGAGGGAAAGGAATATCGATAACCCGAATGCCGGGTGGCGCCGGAATCTCTTGCG CCCCBCCCBCCC@CCC@CCCAA:A=BB?BBBB@>B>A#################### YT:Z:UU +SRR065390.13853 4 * 0 0 * * 0 0 CTGGTACGTCACCACACGCCGCGATGGCGTCATCCACCGACTTCACCCACGTTACGCGATCGTCCGTACCCGGGTGACCGTTGGGGATAATATTTTTGCG 
#################################################################################################### YT:Z:UU +SRR065390.13861 4 * 0 0 * * 0 0 TTCAGAAACTGGATGAACAGTGCGCAGCCATCTGCAAATATGAATTAGTTCAAGTCACTCAAAAGCTATTTATTTGAATGGAAGAAATTTTTGAACTATA CCCCCCCCCCCBCCB@@CCCBCCBCCCCCCCCCCCCC?CCCC@CCC@C@CCC@CACCBCC?BBBC@C7CBCBCB@@ABCCBBBC=BABCCBBBBAB@@CA YT:Z:UU +SRR065390.13907 4 * 0 0 * * 0 0 CATTACCATTCAGTTGTATTGTTTGCGCACCAGAAAAATGAGACTGCACAGAATAAATTATACTGACCAGAAATTGTAAAATTCGTATATTCTTATTCAT 8998;9:;9;>9:9>?BABBAAA2A@@@@@>:3'3A################################################################ YT:Z:UU +SRR065390.13946 4 * 0 0 * * 0 0 TTTCCTCGAGTTCTTGATGAAATGGTCCATTATTTGTCAACCATTTATTTTTCCATATTTTTTCCAGGTAAGGCATGAATTCTGCAAGTTCCGGCAAAGA CCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCBBBBCACCCCCCB@BCCCCBC?CCCCACCBCBCCCCC@BBDCDDBCA4@@A YT:Z:UU +SRR065390.13956 4 * 0 0 * * 0 0 CGGCGCAACAATACTCAGCAGTTAATTGCAAAGGTATCGCACACCATTAAAAGCATTAAGCCGGGAGTCGAATTTGGTGTTAGCCCGGCAGGCGTGTGGC CCCCCCCCCCCBCCCCCCCCCDCCCCCCCCCCCC>CCCCCCCCCCCDCCCCBDCCDDCBDCC@?@BA@B@B>BBABAABB6?BB>B@?B??2?=+>->60 YT:Z:UU +SRR065390.13964 4 * 0 0 * * 0 0 NTTGAGGTGCTCCAGTGGCTTCTGTTTCTATCAGCTGTCCCTCCTGTTCAGCTACTGACGGGGGGGTGCGCAACGGCAAAAGCACCGCCGGGCATCAGCG !))))++++*AAAAA8AAAA################################################################################ YT:Z:UU +SRR065390.13969 4 * 0 0 * * 0 0 CGGGCGATAGTCAAAAACTTATTTTCACAATTTTCGGCTAGGGAGTATATTTACAGTTAATTTGCGATGTGTTAGATCGGAAGAGCGGTTCAGCAGGAAT CCCCCCCCCCCCCCCCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCACCCCCCCCCCDCCCACCCCCDCCACBCCCCCCBCBBCDCBCC?BCBBCBCBC;A YT:Z:UU +SRR065390.13978 4 * 0 0 * * 0 0 AGACGGTAACTTTCAATTTGCACCCATGATTAAATTTTATGTTGATTAAAATAGAAGCAAAAATCATTACATTACACTACAAAATACGCCGAAATGTTAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBACCCDCBCCCABCADCCAABC? 
YT:Z:UU +SRR065390.13985 4 * 0 0 * * 0 0 TAACCAAAAACTGGATTATGCAAATAACTAAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATATCGTATGCCGTCTTCTGCTTGAAAAAAAAA CCCCCCCCCCCCCC?CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCBCCDCDCCD@CBD5>@=:=><9A@3>=B?BB>CBACB?BBA YT:Z:UU +SRR065390.14000 4 * 0 0 * * 0 0 TAGGTGAGAAAAGCGTTATTGGTCCGGTATACCTGCGAAGCGACAAAGCAATAAGGCAACAATGGCAGGTAATGCTGCTCAAAAAAGCGTTTACTGATCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCBADCCCB?@B>B@BADAAABBD@C5;B9?:?;ACABAB YT:Z:UU +SRR065390.14032 4 * 0 0 * * 0 0 GAAGGTCCAAGTGCCTTGAAGATAGAAAATTATAGCATTTCTCTTTAATTTCAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATATCGCATGA CCCACCCCCC>AAAACBCCCCCCCBCC?CCCCCCCCCCCCCCCCCCCCCCCB@CCCCCCBAC@CDCA/@B<;8=?@B>BC>?>?BB=:A########### YT:Z:UU +SRR065390.14061 4 * 0 0 * * 0 0 TGAAGCCGACAATTTGAGGCCAAACATCTTACATTCGACAGTAAATATTTGGGGATTAAGACTTATGTTAGATCGGAAGAGCGGTTCAGCAGGAATGCCG CCCCCCCCCCCCCCCCCCCCCCCCCCC=CCCCCCCCCCADCBCCCC=CCCCCCCBCCC=CBCCCCCCCABCCCCCCBACBC@CCBB;@B;?A@A@=?99A YT:Z:UU +SRR065390.14072 4 * 0 0 * * 0 0 TGAGTGAGGCTCAGGATTTTGAGTGAGGCTCAGGATTATGAGTGAGGTTGAAGAATTTGAGAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGAG @B=@@BB@B@<@BB>BB>>@BB@==2;:;8BBBBBB@B@@@:@?1B@B@B@3@@@>3;@;@<@?>;@B@@##### YT:Z:UU +SRR065390.14100 4 * 0 0 * * 0 0 AAGCCTGAGGGATAATTTTCGTCAAATTAAGGCAATTGCCGAGTGTTTCATCCCTGGCAAGCAGAACGGCTTTTTCGTTATTTATATCGGGAGAATTTAT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCBCDCDCDCCCCDBBCDADBBBABDDBBBBBBBBDCBCD?BABB>B>AB>BCABAA>C YT:Z:UU +SRR065390.14105 4 * 0 0 * * 0 0 AAATTGTCCCCAAATAAAACAATTCCAGTGATCTTCCGATTCTAGGTGCCAAATAACCCAAATAGTCACTGCATTAGTTTTTATCTCACTTTTCTCCCCC #################################################################################################### YT:Z:UU +SRR065390.14107 4 * 0 0 * * 0 0 TGAAATTTCAAGAAAAATGTTAATTACCACCGTATTAAAAAAAAAAAACTTAAAATCAAAGATCGGAAAAGGGGTCAGGCAGGAATGCCAAAACCGACAC CCCCBCCCCCCCBCB>>>ACCCBCACCC?CCCCCCCCCCC?B########################################################## YT:Z:UU +SRR065390.14137 4 * 0 0 * * 0 0 
CTGTGGCGTTTTTATCAAATTGGCAGAGCCACGTTCAGAGCTGAAAAAGCCACAGAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTTGGA BCCC@CCCCC?CCC>>CCBCCC?>C@CCCB=6?AA>=>3?>@?@86;86.@A@==378::68829>B9B############################### YT:Z:UU +SRR065390.14141 4 * 0 0 * * 0 0 GGTCACCAATCATAAGAGGAACAGCGACTGCACCTGCGTACATGACAAGGACGTGTTGCAGACCGAGTATGATCAGCTTTCCTGGTGATAGTATGCGCTC AAA@A?AA8:>A######################################################################################## YT:Z:UU +SRR065390.14162 4 * 0 0 * * 0 0 ATACTTCACCGGATGGTGGAATTAACGAAAACAACAACTGGTGTCACATCCCGCAGGCAAAAGAGGCAGCGGCTAACTAAGCGGCCTGCTGACTTTCTCG CCCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCCBBCCCCCCDCDCCCCCCCCCC?C?CCCCCACD@CAD@AB<>@CB;6B#################### YT:Z:UU +SRR065390.14168 4 * 0 0 * * 0 0 TCGAGGGTGAGGGCGTCTGCCAGATCGGAAGAGCGGGTCAGCAGGAATGCCGAGACCGATATCGGATGCCGTCTCCTGCTGGACAAAAAATGAGAATGGG AACC@0@>@6:<>??>?BBBBB?+B6BBB>B?B=:?BBB=BBBBB>B######################################## YT:Z:UU +SRR065390.14173 4 * 0 0 * * 0 0 AAGAAACTCAACAAACCGGACTTGCAGGTGAAACTGATTCCGATTACCTCACAAAACCGTATTCCACTGCTGCAAAACGGCACTTTCGATTTTGAATGTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=@CCCCCCCCAA?ABC@CCA=CCCABDCCAABDD?BB@BBA YT:Z:UU +SRR065390.14178 4 * 0 0 * * 0 0 GCGCTTTGTTTACCTGATACGGAATTTCGTGGACGATAATGGTTTCACGACCGGTTTTGGCGTCAACTTCCACTTCTGCGCGAGCGCGGATATACACCTT BCCCCCCCCC@@CCCCCCCCCCCCCCCCCCA=AA>AAA=ACCBCCCCC?CCCCAB@ACCC?A?<CB?=CAB9B@BA################### YT:Z:UU +SRR065390.14182 4 * 0 0 * * 0 0 ATTTACTCTAATGTTCTGAAAAATAATTTACTCTAATGTTCTGCCAAATAATTTACTCTAACGTTCTGCCAAATAATTTACTCTAATGTTCTGCCAAATA CCCCCCCCCBCCCCCCCBCC@CCB@@@BCCCCCCC@CCCCCCBBCCCCCCBCCCC@CCC?CCC>>CCBCCCCA@CCCC;CBCCBDCCB@CCBCAACB@BB;B?B0B=8??9>??BB>B?@?B>A>A########## YT:Z:UU +SRR065390.14197 4 * 0 0 * * 0 0 GTACCTCGCCGTTGTTCTCGACCTGTTCGCAAGAAAACCAGTGGGCTGGGCCATGTCGTTCTCGCCGGACAGCAGGCTCACCATGCAAGCGCTGGAAATG CCCCCCCCCCCCCDCBB=B@?BB@BBBBBB@@@B@B==BBB9B@@@@B@=BBB@BBB=@BBABBB@@@BB<@BA@BBB=B;B?BBACA YT:Z:UU +SRR065390.14284 4 * 0 0 * * 0 0 
CGGTGCATGATGCGGATTCCAGGAATCAACGTACAGCGTCGGGCTAAACCAGAACCAGCCAATAATGCACAGACCGACGACCGGAATAATAACCCCCCAC BCACCCCCCCCBCCBCCCCCCCBCC@CCCCCCCAC@CBACCACCCCC@CBCCCCCCCA8CC?A@9@AB@9CACC8=81B@CC9CCCCCCC,<8??CBC@BB?@C@ACBCB################################################## YT:Z:UU +SRR065390.14312 4 * 0 0 * * 0 0 ACAGTAACATTCAACGTTAAATATGTTAATAAGACGTTGCATTATTGTCCTGAAGTTGAAGATAGCAGGTATGGCGGTTGGATAGCACGGCGTTGGTTTA CCCCCCCCCCCCCCCCCCCCBCCCCCCBCCCCCCCCCCCCCCCCCCCBCCCCCDCCCCCCCCCACCBC@?CACBC######################### YT:Z:UU +SRR065390.14331 4 * 0 0 * * 0 0 GAATAATGAAGATGATGCGACGCGTCTGGCGCGTTTGAACGAACGCTTTAAACGCGAAGGTAAACCGGAGTTGAAGAAGCTGGATGATCTACCTAAAGAT CCCCCCCCCCCCCCBCCBCCCCCCBCCCC@CCCB@CCCCACCDDCBCC?CAC@B@DABA?BAB@@@?C?C@BC?9A::>=@@C;?############### YT:Z:UU +SRR065390.14335 4 * 0 0 * * 0 0 TCCATTTGATGAACCTGAAGTTTAAGTATTGACTTGAGAGGAAAAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCT CCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCC@BBA>CCCCCCCCBBCCBC=CBCCCB################### YT:Z:UU +SRR065390.14342 4 * 0 0 * * 0 0 AAGTTCATGAATTAAAGCCGACTCAAACACTCTGTTTAAAAACTGGATAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGAGATCGTAGGCCGTC 0000079;9;AAAA?;;;>9>3>9BB8BBBB@############################ YT:Z:UU +SRR065390.14359 4 * 0 0 * * 0 0 GCATCAGTACGATAAAACGCGTACCGAACTACTGAATGATGTCGCAGGGGCGCTGGCTCTTGATGACAAACTCGGACGTAGCACCAATCAACTTTCCGGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCBABACCBB@BBBBBB>BDBBDBBB>B@@@>>?BCBAC?CBC?> YT:Z:UU +SRR065390.14364 4 * 0 0 * * 0 0 GGTCGCCGATCCGATTTGCACTTTAACCACTTTCGGTAAAGAAACCGTTGTTAGTGAAAGCGAAAAACGCACAACGACCACTGATGACCCGCTACAGGTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC@@CCCCCCCCCBCCCCCCCCCACCDACCBCACA@CACCAA=B=CBC=ACAAAC=)>? 
YT:Z:UU +SRR065390.14392 4 * 0 0 * * 0 0 GTTATCCTTTTCCGTGATATGTGCGGTACTGCAGCGTATGCCGGCAAGGGTTGCAAACGGTGGTAGTGTGCAGGTTGACTGTTGGTCGGATTCCTCCACC CCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCDCCCCCCCCCBCCCA@CCB@AACB?CB?BA=A8@BACB?:===@C@-A6==?@@<@@AA##### YT:Z:UU +SRR065390.14393 4 * 0 0 * * 0 0 AGAAATTTACTGGCTCGCCGCAGCCAACTCCTCTTCTGACACCCCGGTAAAGCGCATGATGTCTGTAAGAGGGGCCCCGGATTCAAGCATTATTTTGGCT CCCCCCCCCCCCCCCBA9::<4A>AAAA:?A#################################### YT:Z:UU +SRR065390.14434 4 * 0 0 * * 0 0 GGTAGATTCCCATAAAAATCGCCAGCGGAATGGTGAACGCAACGGTATACGTTCCCCACGGGCTATGAGTCAGGGCTTTCACCACGATCATCGCCAGTAC DCACCCBCCCCCCCC>CBBCCCCCCCCCCCCCC?CCCCCCCCCCCACACCC@BCCCCBCD=ABB@BCBD?@@B6BC8B@B>BABCBB@AB=@2C###### YT:Z:UU diff --git a/test/maintainer/check_copyright.pl b/test/maintainer/check_copyright.pl index 161a7214c..22556df01 100755 --- a/test/maintainer/check_copyright.pl +++ b/test/maintainer/check_copyright.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # check_copyright.pl : Basic source file checks for copyright boilerplate # # Author : Rob Davies @@ -43,9 +43,13 @@ sub check { # Only check C, perl and shell files return unless (/(?:\.[ch]|\.pl|\.sh)$/); + # Exclude htscodecs submodule + return if (/\/htscodecs\//); + # Exclusions: my %exclude = map { ("$root/$_", 1) } ( 'config.h', # Auto-generated +'config_vars.h', # Auto-generated 'version.h', # Auto-generated 'cram/rANS_byte.h', # "Public domain" 'os/lzma_stub.h', # "Public domain" diff --git a/test/maintainer/check_spaces.pl b/test/maintainer/check_spaces.pl index 0daf24406..e48518f25 100755 --- a/test/maintainer/check_spaces.pl +++ b/test/maintainer/check_spaces.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # check_spaces.pl : Check source files for tabs and trailing spaces # # Author : Rob Davies @@ -43,16 +43,17 @@ sub check { # Only check C, perl and shell files return unless (/(?:\.[ch]|\.pl|\.sh)$/); + # Exclude htscodecs submodule + return if (/\/htscodecs\//); + my %allow_tabs = map { ("$root/$_", 
1) } ( 'kfunc.c', -'knetfile.c', 'kstring.c', 'md5.c', 'htslib/khash.h', 'htslib/kseq.h', 'htslib/ksort.h', 'htslib/kstring.h', -'htslib/knetfile.h', 'htslib/klist.h', 'htslib/kbitset.h', 'os/rand.c', diff --git a/test/modhdr.expected.vcf b/test/modhdr.expected.vcf new file mode 100644 index 000000000..bad663c7e --- /dev/null +++ b/test/modhdr.expected.vcf @@ -0,0 +1,4 @@ +##fileformat=VCFv4.3 +##FILTER= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO diff --git a/test/modhdr.vcf.gz b/test/modhdr.vcf.gz new file mode 100644 index 000000000..f97e06ab3 Binary files /dev/null and b/test/modhdr.vcf.gz differ diff --git a/test/modhdr.vcf.gz.csi b/test/modhdr.vcf.gz.csi new file mode 100644 index 000000000..61b60e79a Binary files /dev/null and b/test/modhdr.vcf.gz.csi differ diff --git a/test/mpileup/mp_N2.sam b/test/mpileup/mp_N2.sam index 292cfcb58..8ea072db9 100644 --- a/test/mpileup/mp_N2.sam +++ b/test/mpileup/mp_N2.sam @@ -40,7 +40,7 @@ @CO sD1 0 z 1 0 4M2I5D2I4M * 0 0 TAGCAATTAGGT ABCDEFGHIJKL sD2 0 z 1 0 4M1I1P5D1P1I4M * 0 0 TAGCATAGGT ABCDEHIJKL -sD3 0 z 1 0 4M1P1II5D1I1P4M * 0 0 TAGCATAGGT ABCDFGIJKL +sD3 0 z 1 0 4M1P1I5D1I1P4M * 0 0 TAGCATAGGT ABCDFGIJKL sN1 0 z 1 0 4M2I5N2I4M * 0 0 TAGCAATTAGGT ABCDEFGHIJKL sN2 0 z 1 0 4M1I1P5N1P1I4M * 0 0 TAGCATAGGT ABCDEHIJKL sN3 0 z 1 0 4M1P1I5N1I1P4M * 0 0 TAGCATAGGT ABCDFGIJKL diff --git a/test/pileup_mod.c b/test/pileup_mod.c new file mode 100644 index 000000000..323c0c6c2 --- /dev/null +++ b/test/pileup_mod.c @@ -0,0 +1,226 @@ +/* test/pileup_mod.c -- simple pileup tester with base modifications + + Copyright (C) 2020 Genome Research Ltd. 
+ + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include + +#include +#include +#include +#include "../htslib/sam.h" + +typedef struct { + samFile *fp; + sam_hdr_t *h; +} plp_dat; + +static int readaln(void *data, bam1_t *b) { + plp_dat *dat = (plp_dat *)data; + return sam_read1(dat->fp, dat->h, b); +} + +#ifndef MIN +# define MIN(a,b) ((a)<(b)?(a):(b)) +#endif + +// No modification reporting. +// This is just a simple base-line for comparison against mod_pileup1 for +// performance testing. 
+void process_pileup(sam_hdr_t *h, const bam_pileup1_t *p, + int tid, int pos, int n) { + kstring_t s = {0,0}; + printf("%s\t%d\t", sam_hdr_tid2name(h, tid), pos); + int i; + for (i = 0; i < n; i++, p++) { + if (p->is_del) { + putchar('*'); + continue; + } + + uint8_t *seq = bam_get_seq(p->b); + uint8_t *qual = bam_get_qual(p->b); + unsigned char c = seq_nt16_str[bam_seqi(seq, p->qpos)]; + putchar(c); + kputc(MIN('~','!'+qual[p->qpos]), &s); + } + putchar('\t'); + puts(s.l ? s.s : ""); + + free(s.s); +} + +// Initialise and destroy the base modifier state data. This is called +// as each new read is added or removed from the pileups. +int pileup_cd_create(void *data, const bam1_t *b, bam_pileup_cd *cd) { + hts_base_mod_state *m = hts_base_mod_state_alloc(); + if (bam_parse_basemod(b, m) < 0) + return -1; + cd->p = m; + return 0; +} + +int pileup_cd_destroy(void *data, const bam1_t *b, bam_pileup_cd *cd) { + hts_base_mod_state_free(cd->p); + return 0; +} + +// Report a line of pileup, including base modifications inline with +// the sequence (including qualities), as [...] 
+void process_mod_pileup1(sam_hdr_t *h, const bam_pileup1_t *p, + int tid, int pos, int n) { + kstring_t s = {0,0}; + printf("%s\t%d\t", sam_hdr_tid2name(h, tid), pos); + int i; + for (i = 0; i < n; i++, p++) { + if (p->is_del) { + putchar('*'); + continue; + } + + uint8_t *seq = bam_get_seq(p->b); + uint8_t *qual = bam_get_qual(p->b); + unsigned char c = seq_nt16_str[bam_seqi(seq, p->qpos)]; + putchar(c); + kputc(MIN('~','!'+qual[p->qpos]), &s); + + // Simple mod detection; assumes at most 5 mods + hts_base_mod_state *m = p->cd.p; + hts_base_mod mod[5]; + int nm; + if ((nm = bam_mods_at_qpos(p->b, p->qpos, m, mod, 5)) > 0) { + int j; + putchar('['); + for (j = 0; j < nm && j < 5; j++) { + if (mod[j].modified_base < 0) + // ChEBI + printf("%c(%d)%d", "+-"[mod[j].strand], + -mod[j].modified_base, mod[j].qual); + else + printf("%c%c%d", "+-"[mod[j].strand], + mod[j].modified_base, mod[j].qual); + } + putchar(']'); + } + } + putchar('\t'); + puts(s.l ? s.s : ""); + + free(s.s); +} + +// Report a line of pileup, including base modifications. +// This replaces the base with the mod call (NB this can be confusing +// as both C and G can map to m depending on orientation). +// It also reports qualities in the QUAl column, remapped to +// phred scale as only one single mod is supported and hence extreme +// unlikely probabilities shouldn't be reported (although we don't +// scan to pick the highest). 
+void process_mod_pileup2(sam_hdr_t *h, const bam_pileup1_t *p, + int tid, int pos, int n) { + kstring_t s = {0,0}; + printf("%s\t%d\t%d\t", sam_hdr_tid2name(h, tid), pos, n); + int i; + for (i = 0; i < n; i++, p++) { + if (p->is_del) { + putchar('*'); + continue; + } + + uint8_t *seq = bam_get_seq(p->b); + uint8_t *qual = bam_get_qual(p->b); + unsigned char c = seq_nt16_str[bam_seqi(seq, p->qpos)]; + + // Simple mod detection; assumes at most 2 non-ChEBI mods + hts_base_mod_state *m = p->cd.p; + int n, is_rev = bam_is_rev(p->b); + hts_base_mod mod; + char base; + uint8_t q = qual[p->qpos]; + if ((n = bam_mods_at_qpos(p->b, p->qpos, m, &mod, 1)) > 0) { + base = mod.modified_base; + // base mod as phred scale + q = -10 * log10(1-((mod.qual+0.5)/256)) + 0.5; + } else { + base = c; + } + + // Case is inappropriate here as some mods (eg "a") are lc. + // So we dim/bold them instead using ANSI escape codes. + // It's a test script, so I'm not going to care about curses. + if (is_rev) { + printf("\033[2m%c\033[0m", base); + } else { + printf("\033[1m%c\033[0m", base); + } + kputc(MIN('~','!'+q), &s); + } + putchar('\t'); + puts(s.l ? s.s : ""); + + free(s.s); +} + +int main(int argc, char **argv) { + int compact = 0; + while (argc > 1 && strcmp(argv[1], "-c") == 0) { + compact++; + argc--; + argv++; + } + + samFile *in = sam_open(argc > 1 ? 
argv[1] : "-", "r"); + bam1_t *b = bam_init1(); + sam_hdr_t *h = sam_hdr_read(in); + + // Pileup iterator with constructor/destructor to parse base mod tags + plp_dat dat = { + .fp = in, + .h = h, + }; + bam_plp_t iter = bam_plp_init(readaln, &dat); + bam_plp_constructor(iter, pileup_cd_create); + bam_plp_destructor(iter, pileup_cd_destroy); + + const bam_pileup1_t *p; + int tid, pos, n = 0; + while ((p = bam_plp_auto(iter, &tid, &pos, &n)) != 0) { + switch (compact) { + case 0: + process_mod_pileup1(h, p, tid, pos, n); + break; + case 1: + process_mod_pileup2(h, p, tid, pos, n); + break; + default: + process_pileup(h, p, tid, pos, n); + break; + } + } + bam_plp_destroy(iter); + + sam_close(in); + bam_destroy1(b); + sam_hdr_destroy(h); + + return n != 0; +} diff --git a/test/plugins-dlhts.c b/test/plugins-dlhts.c index 0e4638d1f..f90e3bd74 100644 --- a/test/plugins-dlhts.c +++ b/test/plugins-dlhts.c @@ -37,9 +37,9 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include #include #include +#include #ifndef EPROTONOSUPPORT #define EPROTONOSUPPORT ENOSYS @@ -101,6 +101,7 @@ void verbose_log(const char *message) int main(int argc, char **argv) { int dlflags = RTLD_NOW; + int skip = 0; int c; while ((c = getopt(argc, argv, "glv")) >= 0) @@ -133,15 +134,26 @@ int main(int argc, char **argv) hclose_abruptly_p = (hclose_abruptly_func *) func(htslib, "hclose_abruptly"); test_hopen("bad-scheme:unsupported", 0); + +#ifdef __APPLE__ + /* Skip -l tests as we don't link plugins back to libhts on macOS, as this + would conflict with a statically linked libhts.a on this platform. */ + skip = (dlflags & RTLD_LOCAL) != 0; +#endif + + if (! 
skip) { #ifdef HAVE_LIBCURL - test_hopen("https://localhost:99999/invalid_port", 1); + test_hopen("https://localhost:99999/invalid_port", 1); #endif #ifdef ENABLE_GCS - test_hopen("gs:invalid", 1); + test_hopen("gs:invalid", 1); #endif #ifdef ENABLE_S3 - test_hopen("s3:invalid", 1); + test_hopen("s3:invalid", 1); #endif + } + else + verbose_log("Skipping most tests"); verbose_log("Calling hts_lib_shutdown()"); (func(htslib, "hts_lib_shutdown"))(); diff --git a/test/realn03.fa b/test/realn03.fa new file mode 100644 index 000000000..9ac86c11e --- /dev/null +++ b/test/realn03.fa @@ -0,0 +1,2 @@ +>MX +CGTCTACTACG diff --git a/test/realn03.fa.fai b/test/realn03.fa.fai new file mode 100644 index 000000000..dfaa59355 --- /dev/null +++ b/test/realn03.fa.fai @@ -0,0 +1 @@ +MX 11 4 11 12 diff --git a/test/realn03.sam b/test/realn03.sam new file mode 100644 index 000000000..50266242d --- /dev/null +++ b/test/realn03.sam @@ -0,0 +1,4 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:MX LN:11 +M 64 MX 1 60 11M * 0 0 CGTCTCCTACG IIIIIIIIIII +X 64 MX 1 60 5=1X5= * 0 0 CGTCTCCTACG IIIIIIIIIII diff --git a/test/realn03_exp.sam b/test/realn03_exp.sam new file mode 100644 index 000000000..3b608d0c6 --- /dev/null +++ b/test/realn03_exp.sam @@ -0,0 +1,4 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:MX LN:11 +M 64 MX 1 60 11M * 0 0 CGTCTCCTACG IIIIIIIIIII BQ:Z:D@@@@@@@@@D +X 64 MX 1 60 5=1X5= * 0 0 CGTCTCCTACG IIIIIIIIIII BQ:Z:D@@@@@@@@@D diff --git a/test/sam.c b/test/sam.c index 1c65c4556..eb404bd65 100644 --- a/test/sam.c +++ b/test/sam.c @@ -1,6 +1,6 @@ /* test/sam.c -- SAM/BAM/CRAM API test cases. - Copyright (C) 2014-2020 Genome Research Ltd. + Copyright (C) 2014-2020, 2022 Genome Research Ltd. Author: John Marshall @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include // Suppress message for faidx_fetch_nseq(), which we're intentionally testing #include "../htslib/hts_defs.h" @@ -69,6 +70,11 @@ static void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) fail(const char *fmt, ...) 
status = EXIT_FAILURE; } +#define VERIFY(test, message) if (!(test)) { \ + fail("%s: %s", __func__, (message)); \ + goto cleanup; \ +} + uint8_t *check_bam_aux_get(const bam1_t *aln, const char *tag, char type) { uint8_t *p = bam_aux_get(aln, tag); @@ -81,6 +87,15 @@ uint8_t *check_bam_aux_get(const bam1_t *aln, const char *tag, char type) return NULL; } +static void check_aux_count(const bam1_t *aln, int expected, const char *what) +{ + const uint8_t *itr; + int n = 0; + for (itr = bam_aux_first(aln); itr; itr = bam_aux_next(aln, itr)) n++; + if (n != expected) + fail("%s has %d aux fields, expected %d", what, n, expected); +} + static void check_int_B_array(bam1_t *aln, char *tag, uint32_t nvals, int64_t *vals) { uint8_t *p; @@ -279,10 +294,30 @@ static int aux_fields1(void) if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k') fail("XA field is '%c', expected 'k'", bam_aux2A(p)); + check_aux_count(aln, 24, "Original record"); + bam_aux_del(aln,p); if (bam_aux_get(aln,"XA")) fail("XA field was not deleted"); + check_aux_count(aln, 23, "Record post-XA-deletion"); + + p = bam_aux_get(aln, "Y2"); + if (p == NULL || strncmp(bam_aux_tag(p), "Y2", 2) != 0 || bam_aux_type(p) != 'i') + fail("bam_aux_get() missed Y2 field"); + + p = bam_aux_next(aln, p); + if (p == NULL || strncmp(bam_aux_tag(p), "Y3", 2) != 0 || bam_aux_type(p) != 'c') + fail("bam_aux_next() missed Y3 field"); + + p = bam_aux_get(aln, "Y8"); + if (p == NULL || strncmp(bam_aux_tag(p), "Y8", 2) != 0 || bam_aux_type(p) != 'I') + fail("bam_aux_get() missed Y8 field"); + + p = bam_aux_next(aln, p); + if (p != NULL || errno != ENOENT) + fail("bam_aux_next missed the end of fields"); + if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37) fail("Xi field is %"PRId64", expected 37", bam_aux2i(p)); @@ -486,6 +521,16 @@ static int aux_fields1(void) if (strcmp(ks.s, r1) != 0) fail("record formatted incorrectly: \"%s\"", ks.s); + + // Test field removal APIs -- after the strcmp(..., r1) 
check so that + // can also check the formatting of the to-be-removed fields. + + p = bam_aux_remove(aln, check_bam_aux_get(aln, "XH", 'H')); + if (bam_aux_get(aln, "XH")) + fail("XH field was not removed"); + check_aux_count(aln, 31, "Record post-XH-removal"); + if (strncmp(bam_aux_tag(p), "XB", 2) != 0 || bam_aux_type(p) != 'B') + fail("bam_aux_remove() missed XB field"); } else fail("can't read record"); @@ -1459,7 +1504,7 @@ static void faidx1(const char *filename) fin = fopen(filename, "rb"); if (fin == NULL) fail("can't open %s", filename); - sprintf(tmpfilename, "%s.tmp", filename); + snprintf(tmpfilename, sizeof(tmpfilename), "%s.tmp", filename); fout = fopen(tmpfilename, "wb"); if (fout == NULL) fail("can't create temporary %s", tmpfilename); while (fgets(line, sizeof line, fin)) { @@ -1519,7 +1564,11 @@ static void test_text_file(const char *filename, int nexp) if (in) { kstring_t str = KS_INITIALIZE; int ret, n = 0; - while ((ret = hts_getline(in, '\n', &str)) >= 0) n++; + while ((ret = hts_getline(in, '\n', &str)) >= 0) { + size_t len = strlen(str.s); + n++; + if (ret != len) fail("hts_getline read length %d (expected %zu)", ret, len); + } if (ret != -1) fail("hts_getline got an error from %s", filename); if (n != nexp) fail("hts_getline read %d lines from %s (expected %d)", n, filename, nexp); @@ -1649,6 +1698,37 @@ static int read_data_block(const char *in_name, samFile *fp_in, return ret; } +static void test_parse_decimal1(long long exp, const char *str, size_t exp_consumed, int flags, const char *warning) +{ + if (warning) fprintf(stderr, "(Expect %s message for \"%s\")\n", warning, str); + + long long val = hts_parse_decimal(str, NULL, flags); + if (val != exp) fail("hts_parse_decimal(\"%s\", NULL, %d) returned %lld, expected %lld", str, flags, val, exp); + + char *end; + val = hts_parse_decimal(str, &end, flags); + if (val != exp) fail("hts_parse_decimal(\"%s\", ..., %d) returned %lld, expected %lld", str, flags, val, exp); + size_t consumed = end 
- str; + if (consumed != exp_consumed) fail("hts_parse_decimal(\"%s\", ..., %d) consumed %zu chars, expected %zu", str, flags, consumed, exp_consumed); +} + +static void test_parse_decimal(void) +{ + test_parse_decimal1(37, "+37", 3, 0, NULL); + test_parse_decimal1(-1001, " \t -1,001x", 9, HTS_PARSE_THOUSANDS_SEP, "trailing 'x'"); + test_parse_decimal1(LLONG_MAX, "+9223372036854775807", 20, 0, NULL); + test_parse_decimal1(LLONG_MIN, "-9,223,372,036,854,775,808", 26, HTS_PARSE_THOUSANDS_SEP, NULL); + test_parse_decimal1(1500, "1.5e3", 5, 0, NULL); + test_parse_decimal1(1500, "1.5e+3k", 6, 0, "trailing 'k'"); + test_parse_decimal1(1500000000, "1.5G", 4, 0, NULL); + test_parse_decimal1(12345, "12.345k", 7, 0, NULL); + test_parse_decimal1(12345, "12.3456k", 8, 0, "dropped fraction"); + test_parse_decimal1(0, "A", 0, 0, "invalid numeric"); + test_parse_decimal1(0, "G", 0, 0, "invalid numeric"); + test_parse_decimal1(0, " +/-", 0, 0, "invalid numeric"); + test_parse_decimal1(0, " \t -.e+9999", 0, 0, "invalid numeric"); +} + static void test_mempolicy(void) { size_t bufsz = MAX_RECS * REC_LENGTH, nrecs = 0, i; @@ -1837,6 +1917,334 @@ static void test_mempolicy(void) } } +static void test_bam_set1_minimal() +{ + int r; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, 0, NULL, -1, 0, 0, 0, NULL, NULL, 0); + // expected number of bytes written is qname: 4, cigar: 0, sequence: 0, qual: 0, aux: 0. 
+ VERIFY(r == 4, "call to bam_set1() failed or did not write the correct number of bytes."); + + VERIFY(bam->core.l_qname == 4, "l_qname should include terminating null and be padded to the nearest 32-bit boundary."); + VERIFY(bam->core.l_extranul == 2, "l_extranul not set correctly"); + VERIFY(strcmp(bam_get_qname(bam), "*") == 0, "qname not set correctly."); + + VERIFY(bam->core.pos == 0, "pos not set correctly."); + VERIFY(bam->core.tid == -1, "tid not set correctly."); + VERIFY(bam->core.bin == hts_reg2bin(0, 1, 14, 5), "bin not set correctly."); + VERIFY(bam->core.qual == 0xff, "mapq not set correctly."); + VERIFY(bam->core.flag == BAM_FUNMAP, "flag not set correctly."); + VERIFY(bam->core.n_cigar == 0, "n_cigar not set correctly."); + VERIFY(bam->core.mtid == -1, "mtid not set correctly."); + VERIFY(bam->core.mpos == 0, "mpos not set correctly."); + VERIFY(bam->core.isize == 0, "isize not set correctly."); + VERIFY(bam->core.l_qseq == 0, "l_seq not set correctly."); + VERIFY(bam_get_l_aux(bam) == 0, "l_aux not set correctly."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_full() +{ + const char *qname = "!??AAA~~~~"; + const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH }; + const char *seq = "TGGACTACGA"; + const char *qual = "DBBBB+=7=0"; + + int r, i; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + r = bam_set1(bam, strlen(qname), qname, + BAM_FREVERSE, 1, 1000, 42, + sizeof(cigar) / 4, cigar, 2, 2000, 3000, + strlen(seq), seq, qual, 64); + // expected number of bytes written is qname: 12, cigar: 12, sequence: 5, qual: 10, aux: 0. 
+ VERIFY(r == 39, "call to bam_set1() failed or did not write the correct number of bytes."); + + VERIFY(bam->core.l_qname == 12, "l_qname should include terminating null and be padded to the nearest 32-bit boundary."); + VERIFY(bam->core.l_extranul == 1, "l_extranul not set correctly"); + VERIFY(strcmp(bam_get_qname(bam), qname) == 0, "qname not set correctly."); + + VERIFY(bam->core.n_cigar == sizeof(cigar) / 4, "n_cigar not set correctly."); + VERIFY(memcmp(bam_get_cigar(bam), cigar, sizeof(cigar)) == 0, "cigar not set correctly."); + + VERIFY(bam->core.l_qseq == strlen(seq), "l_seq not set correctly."); + for (i = 0; i < strlen(seq); i++) { + VERIFY(bam_seqi(bam_get_seq(bam), i) == seq_nt16_table[(uint8_t)seq[i]], "seq not set correctly."); + } + VERIFY(memcmp(bam_get_qual(bam), qual, strlen(seq)) == 0, "qual not set correctly."); + + VERIFY(bam->core.pos == 1000, "pos not set correctly."); + VERIFY(bam->core.tid == 1, "tid not set correctly."); + VERIFY(bam->core.bin == hts_reg2bin(1000, 1010, 14, 5), "bin not set correctly."); + VERIFY(bam->core.qual == 42, "mapq not set correctly."); + VERIFY(bam->core.flag == BAM_FREVERSE, "flag not set correctly."); + VERIFY(bam->core.mtid == 2, "mtid not set correctly."); + VERIFY(bam->core.mpos == 2000, "mpos not set correctly."); + VERIFY(bam->core.isize == 3000, "isize not set correctly."); + VERIFY(bam_get_l_aux(bam) == 0, "l_aux not set correctly."); + VERIFY(bam->m_data - bam->l_data >= 64, "not enough memory allocated for aux data."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_even_and_odd_seq_len() +{ + const char *seq_even = "TGGACTACGA"; + const char *seq_odd = "TGGACTACGAC"; + + int r, i; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, 0, 0, 0, 0, NULL, 0, 0, 0, + strlen(seq_even), seq_even, NULL, 0); + VERIFY(r >= 0, "call to bam_set1() failed."); + VERIFY(bam->core.l_qseq == 
strlen(seq_even), "l_seq not set correctly."); + for (i = 0; i < strlen(seq_even); i++) { + VERIFY(bam_seqi(bam_get_seq(bam), i) == seq_nt16_table[(uint8_t)seq_even[i]], "seq not set correctly."); + } + + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, 0, 0, 0, 0, NULL, 0, 0, 0, + strlen(seq_odd), seq_odd, NULL, 0); + VERIFY(r >= 0, "call to bam_set1() failed."); + VERIFY(bam->core.l_qseq == strlen(seq_odd), "l_seq not set correctly."); + for (i = 0; i < strlen(seq_odd); i++) { + VERIFY(bam_seqi(bam_get_seq(bam), i) == seq_nt16_table[(uint8_t)seq_odd[i]], "seq not set correctly."); + } + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_with_seq_but_no_qual() +{ + const char *seq = "TGGACTACGA"; + + int r, i; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + r = bam_set1(bam, 0, NULL, + BAM_FUNMAP, 0, 0, 0, + 0, NULL, 0, 0, 0, + strlen(seq), seq, NULL, 0); + VERIFY(r >= 0, "call to bam_set1() failed."); + VERIFY(bam->core.l_qseq == strlen(seq), "l_seq not set correctly."); + for (i = 0; i < strlen(seq); i++) { + VERIFY(bam_seqi(bam_get_seq(bam), i) == seq_nt16_table[(uint8_t)seq[i]], "seq not set correctly."); + VERIFY(bam_get_qual(bam)[i] == 0xff, "qual not set correctly"); + } + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_validate_qname() +{ + int r; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + // qname too long + const char too_long[255] = { 'A' }; + r = bam_set1(bam, sizeof(too_long), too_long, BAM_FUNMAP, -1, 0, 0xff, 0, NULL, -1, 0, 0, 0, NULL, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_validate_seq() +{ + int r; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + // seq too long 
+ const char *sequence = "C"; + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, 0, NULL, -1, 0, 0, (size_t)INT32_MAX + 1, sequence, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_validate_cigar() +{ + const uint32_t cigar[] = { 20 << BAM_CIGAR_SHIFT | BAM_CMATCH }; + const char *seq = "TGGACTACGA"; + + int r; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + // mapped query must have a CIGAR + r = bam_set1(bam, 0, NULL, 0, -1, 0, 0xff, 0, NULL, -1, 0, 0, strlen(seq), seq, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + + // pos + ref len from CIGAR should be <= HTS_POS_MAX + r = bam_set1(bam, 0, NULL, 0, -1, HTS_POS_MAX - 10, 0xff, sizeof(cigar) / 4, cigar, -1, 0, 0, 0, NULL, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + + // query len from CIGAR should match the sequence length + r = bam_set1(bam, 0, NULL, 0, -1, 0, 0xff, sizeof(cigar) / 4, cigar, -1, 0, 0, strlen(seq), seq, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_validate_size_limits() +{ + const uint32_t cigar[] = { 20 << BAM_CIGAR_SHIFT | BAM_CMATCH }; + const char *seq = "TGGACTACGA"; + + int r; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + // very long sequence. each base counts for 1/2 byte of sequence data and + // 1 byte of sequence quality data. the sum of all components may not exceed + // INT32_MAX, which is the maximum possible value that can be stored in l_data. 
+ // In this case the 4 bytes of qname will cause it to overflow. + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, 0, NULL, -1, 0, 0, 2 * (size_t)INT32_MAX / 3, seq, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + + // very long CIGAR + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, (size_t)INT32_MAX / 4, cigar, -1, 0, 0, 0, NULL, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + + // very long aux + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, 0, NULL, -1, 0, 0, 0, NULL, NULL, INT32_MAX); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_write_and_read_back() +{ + const char *qname = "q1"; + const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH }; + const char *seq = "TGGACTACGA"; + const char *qual = "DBBBB+=7=0"; + const char *temp_fname = "test/test_bam_set1_write_and_read_back.tmp.bam"; + + int r; + htsFile *writer = NULL, *reader = NULL; + sam_hdr_t *w_header = NULL, *r_header = NULL; + bam1_t *w_bam = NULL, *r_bam = NULL; + kstring_t ks = KS_INITIALIZE; + + // open file for writing + writer = hts_open(temp_fname, "wb"); + VERIFY(writer != NULL, "failed to open bam file for writing."); + + // write header + w_header = bam_hdr_init(); + VERIFY(w_header != NULL, "failed to initialize bam header."); + r = sam_hdr_add_line(w_header, "SQ", "SN", "t1", "LN", "5000", NULL); + VERIFY(r == 0, "failed to add SQ header line."); + r = sam_hdr_write(writer, w_header); + VERIFY(r == 0, "failed to write bam header."); + + // write alignments + w_bam = bam_init1(); + VERIFY(w_bam != NULL, "failed to initialize BAM struct."); + r = bam_set1(w_bam, strlen(qname), qname, + BAM_FREVERSE, 0, 1000, 42, + 
sizeof(cigar) / 4, cigar, 0, 2000, 3000, + strlen(seq), seq, qual, 64); + VERIFY(r >= 0, "call to bam_set1() failed."); + r = sam_write1(writer, w_header, w_bam); + VERIFY(r >= 0, "failed to write alignment."); + bam_destroy1(w_bam); + + // close file + r = hts_close(writer); + VERIFY(r == 0, "failed to close bam file for writing."); + sam_hdr_destroy(w_header); + + // open file for reading + reader = hts_open(temp_fname, "rb"); + VERIFY(reader != NULL, "failed to open bam file for reading."); + + // read header + r_header = sam_hdr_read(reader); + VERIFY(r_header != NULL, "failed to read bam header."); + r = sam_hdr_find_tag_id(r_header, "SQ", NULL, NULL, "SN", &ks); + VERIFY(r == 0, "failed to read SQ/SN value"); + VERIFY(strcmp(ks_c_str(&ks), "t1") == 0, "expected reference sequence name in the header == 't1'"); + VERIFY(r_header->n_targets == 1, "expected number of reference sequences == 1"); + VERIFY(strcmp(r_header->target_name[0], "t1") == 0, "expected reference sequence name == 't1'"); + VERIFY(r_header->target_len[0] == 5000, "expected reference sequence length == 5000"); + + // read alignments + r_bam = bam_init1(); + VERIFY(r_bam != NULL, "failed to initialize BAM struct."); + r = sam_read1(reader, r_header, r_bam); + VERIFY(r >= 0, "failed to read alignment."); + VERIFY(strcmp(bam_get_qname(r_bam), qname) == 0, "qname does not match."); + VERIFY(r_bam->core.n_cigar == sizeof(cigar) / 4, "cigar length does not match."); + VERIFY(memcmp(bam_get_cigar(r_bam), cigar, sizeof(cigar)) == 0, "cigar data does not match."); + VERIFY(r_bam->core.l_qseq == strlen(seq), "sequence length does not match."); + + r = sam_read1(reader, r_header, r_bam); + VERIFY(r < 0, "expected no more alignments."); + bam_destroy1(r_bam); + + // close file + r = hts_close(reader); + VERIFY(r == 0, "failed to close bam file for reading."); + sam_hdr_destroy(r_header); + +cleanup: + ks_free(&ks); +} + +static void test_cigar_api(void) +{ + uint32_t *buf = NULL; + char *cig = "*"; + char 
*end; + size_t m = 0; + int n; + n = sam_parse_cigar(cig, &end, &buf, &m); + VERIFY(n == 0 && m == 0 && (end-cig) == 1, "failed to parse undefined CIGAR"); + cig = "2M3X1I10M5D"; + n = sam_parse_cigar(cig, &end, &buf, &m); + VERIFY(n == 5 && m > 0 && (end-cig) == 11, "failed to parse CIGAR string: 2M3X1I10M5D"); + n = sam_parse_cigar("722M15D187217376188323783284M67I", NULL, &buf, &m); + VERIFY(n == -1, "failed to flag CIGAR string with long op length: 722M15D187217376188323783284M67I"); + n = sam_parse_cigar("53I722MD8X", NULL, &buf, &m); + VERIFY(n == -1, "failed to flag CIGAR string with no op length: 53I722MD8X"); + +cleanup: + free(buf); +} + int main(int argc, char **argv) { int i; @@ -1855,14 +2263,27 @@ int main(int argc, char **argv) test_text_file("test/emptyfile", 0); test_text_file("test/xx#pair.sam", 7); test_text_file("test/xx.fa", 7); - test_text_file("test/fastqs.fq", 500); + test_text_file("test/faidx/fastqs.fq", 500); check_enum1(); check_cigar_tab(); check_big_ref(0); check_big_ref(1); + test_parse_decimal(); test_mempolicy(); set_qname(); for (i = 1; i < argc; i++) faidx1(argv[i]); + hts_set_log_level(HTS_LOG_OFF); + test_bam_set1_minimal(); + test_bam_set1_full(); + test_bam_set1_even_and_odd_seq_len(); + test_bam_set1_with_seq_but_no_qual(); + test_bam_set1_validate_qname(); + test_bam_set1_validate_seq(); + test_bam_set1_validate_cigar(); + test_bam_set1_validate_size_limits(); + test_bam_set1_write_and_read_back(); + test_cigar_api(); + return status; } diff --git a/test/sam_filter/filter.sh b/test/sam_filter/filter.sh new file mode 100755 index 000000000..575cb13ae --- /dev/null +++ b/test/sam_filter/filter.sh @@ -0,0 +1,34 @@ +#!/bin/sh +# +# Copyright (C) 2020 Genome Research Ltd. 
+# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Load in the test driver +. ../simple_test_driver.sh + +echo "Testing sam_filter..." + +tv="../test_view" + +test_driver $@ + +exit $? diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst new file mode 100644 index 000000000..e74c307e6 --- /dev/null +++ b/test/sam_filter/filter.tst @@ -0,0 +1,59 @@ +# Copyright (C) 2020, 2022 Genome Research Ltd. 
+# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# First field: +# INIT = initialisation, not counted in testing +# P = expected to pass (zero return; expected output matches, if present) +# N = expected to return non-zero +# F = expected to fail +# +# Second field (P/N/F only): +# Filename of expected output. If '.', output is not checked +# +# Rest: +# Command to execute. 
$tv is replaced with the path to test_view + +# String matches +P string1.out $tv -i 'filter=qname =~ "\.1" && cigar =~ "D"' ../ce#1000.sam +P string2.out $tv -i 'filter=rname=="CHROMOSOME_II"' ../ce#5b.sam +P string3.out $tv -i 'filter=rname=~"CHROMOSOME_II"' ../ce#5b.sam +P string4.out $tv -i 'filter=cigar=~"D"' ../ce#1000.sam +P string5.out $tv -i 'filter=seq =~ "(AT){2}"' ../ce#1000.sam +P string6.out $tv -i 'filter=library=="x"' ../xx#rg.sam +P string7.out $tv -i 'filter=library!="x"' ../xx#rg.sam + +# Integer ops +P int1.out $tv -i 'filter=pos % 23 == 11' ../ce#1000.sam | grep -E -cv '^@' +P int2.out $tv -i 'filter=qlen/(flag*mapq+pos)>5' ../ce#1000.sam | grep -E -cv '^@' + +# Aux tags +P int3.out $tv -i 'filter=[NM]>=10 || [MD]=~"A.*A.*A"' -t4 ../ce#1000.sam | grep -E -cv '^@' + +# Functions. +P func1.out $tv -i 'filter=length(seq) != qlen' ../ce#5b.sam | grep -E -cv '^@' +P func2.out $tv -i 'filter=min(qual) >= 20' ../ce#1000.sam | grep -E -cv '^@' +P func3.out $tv -i 'filter=max(qual) <= 20' ../ce#1000.sam | grep -E -cv '^@' +P func4.out $tv -i 'filter=avg(qual) >= 20 && avg(qual) <= 30' ../ce#1000.sam | grep -E -cv '^@' +P func5.out $tv -i 'filter=sclen>=20' ../realn02.sam | grep -E -v '^@' +P func6.out $tv -i 'filter=rlen<50' ../realn02.sam | grep -E -v '^@' +P func7.out $tv -i 'filter=qlen>100' ../realn02.sam | grep -E -v '^@' +P func8.out $tv -i 'filter=hclen>=4' ../c1#clip.sam | grep -E -v '^@' diff --git a/test/sam_filter/func1.out b/test/sam_filter/func1.out new file mode 100644 index 000000000..d00491fd7 --- /dev/null +++ b/test/sam_filter/func1.out @@ -0,0 +1 @@ +1 diff --git a/test/sam_filter/func2.out b/test/sam_filter/func2.out new file mode 100644 index 000000000..d81cc0710 --- /dev/null +++ b/test/sam_filter/func2.out @@ -0,0 +1 @@ +42 diff --git a/test/sam_filter/func3.out b/test/sam_filter/func3.out new file mode 100644 index 000000000..0cfbf0888 --- /dev/null +++ b/test/sam_filter/func3.out @@ -0,0 +1 @@ +2 diff --git 
a/test/sam_filter/func4.out b/test/sam_filter/func4.out new file mode 100644 index 000000000..103a99dd8 --- /dev/null +++ b/test/sam_filter/func4.out @@ -0,0 +1 @@ +604 diff --git a/test/sam_filter/func5.out b/test/sam_filter/func5.out new file mode 100644 index 000000000..6c2e2bc64 --- /dev/null +++ b/test/sam_filter/func5.out @@ -0,0 +1,5 @@ +ERR013140.3521432 99 17 1 29 22S86M = 226 313 AGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGACAATTGCCTTGTCCCTGCTGAATGTGCTCTGGGGTCTCTGGGGTCTCA @AEDGBHIIIIIFJGIKHGHIJJJEJKHJKJKGKLLIFHKLLCJJIDEFFHKHEHHJIIIDJEEEJEIKGJIHCGKHFKFE9BBDIAJAHF4?DE@I:DD48(86D=> MD:Z:86 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR156632.12704932 163 17 1 29 36S64M = 195 293 TGGAGAAGGGGACAAGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGACAATTGCCTTGTCCCTGCTGAATGTG BFAFGFEIGFEFHHEIDKJGHHHJIIE=@KKGGKJGIBLLMFKMDIIHJKKHFELLLKFIHMHIHHIHLKJFCHFJIJAID=JHKFGHJIHKKCH:@HD? MD:Z:64 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR156632.9601178 99 17 1 29 62S38M = 279 377 CTATGACAGGGAGGTCATGTGCAGGCTGGAGAAGGGGACAAGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGA DEEEIIHHKIJILKHLHIKEKHHMKLKKJGKKKKLKLFIHEKIKL=KLJLKIILHKMH9LJJJJLHLHJJKJJKMLKJD>MJKLEHIGHIH=FFCHF>BE MD:Z:38 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR013140.13475139 99 17 2401 60 88M20S = 2680 386 AAATACAAAAAACAACTAGCCAGGCGTGGTGGTGCACACCTGTAGTCCCAGCTACTCAGGAGGCTGAGGGGGAAGGACTGCTTGAGCCCAGGCGTTTGAGGCTGCTGT @CEBEEIHHHICFJIFKGHIKJHII>DBC:CE>A8C>C>7DBA=BEDDB4=9;:@=;@D@@=B@E.3?972<>6@8=>?1$0:95%5%*1=8;0%4<228% X0:i:1 X1:i:0 XC:i:88 MD:Z:88 RG:Z:rg AM:i:37 NM:i:0 SM:i:37 MQ:i:60 XT:A:U +ERR013140.23480670 133 17 3771 0 35M73S = 3771 0 TTCTCATCAATCCCTCATCTCTTATAACCATTTCGGTCCTTTCGGCCCTACAGCCACCTTGTTTATACTTGGTAAGACCCACACCACTCGCCAACTTACTCTACTCCC 8+7?5>09:),/%81,$,7<+?)+1+*+),3%5+)#%(4B%$&'%'/*@,)*%%&,%(/0%-&$$*$-,$3*.%/$:%$+.$*%&+.,.%%,%(%7(-.-',1*6%&$ XC:i:35 RG:Z:rg diff --git a/test/sam_filter/func6.out b/test/sam_filter/func6.out new file mode 100644 index 
000000000..de091ed96 --- /dev/null +++ b/test/sam_filter/func6.out @@ -0,0 +1,2 @@ +ERR156632.9601178 99 17 1 29 62S38M = 279 377 CTATGACAGGGAGGTCATGTGCAGGCTGGAGAAGGGGACAAGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGA DEEEIIHHKIJILKHLHIKEKHHMKLKKJGKKKKLKLFIHEKIKL=KLJLKIILHKMH9LJJJJLHLHJJKJJKMLKJD>MJKLEHIGHIH=FFCHF>BE MD:Z:38 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR013140.23480670 133 17 3771 0 35M73S = 3771 0 TTCTCATCAATCCCTCATCTCTTATAACCATTTCGGTCCTTTCGGCCCTACAGCCACCTTGTTTATACTTGGTAAGACCCACACCACTCGCCAACTTACTCTACTCCC 8+7?5>09:),/%81,$,7<+?)+1+*+),3%5+)#%(4B%$&'%'/*@,)*%%&,%(/0%-&$$*$-,$3*.%/$:%$+.$*%&+.,.%%,%(%7(-.-',1*6%&$ XC:i:35 RG:Z:rg diff --git a/test/sam_filter/func7.out b/test/sam_filter/func7.out new file mode 100644 index 000000000..1fe2500bf --- /dev/null +++ b/test/sam_filter/func7.out @@ -0,0 +1,3 @@ +ERR013140.3521432 99 17 1 29 22S86M = 226 313 AGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGACAATTGCCTTGTCCCTGCTGAATGTGCTCTGGGGTCTCTGGGGTCTCA @AEDGBHIIIIIFJGIKHGHIJJJEJKHJKJKGKLLIFHKLLCJJIDEFFHKHEHHJIIIDJEEEJEIKGJIHCGKHFKFE9BBDIAJAHF4?DE@I:DD48(86D=> MD:Z:86 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR013140.13475139 99 17 2401 60 88M20S = 2680 386 AAATACAAAAAACAACTAGCCAGGCGTGGTGGTGCACACCTGTAGTCCCAGCTACTCAGGAGGCTGAGGGGGAAGGACTGCTTGAGCCCAGGCGTTTGAGGCTGCTGT @CEBEEIHHHICFJIFKGHIKJHII>DBC:CE>A8C>C>7DBA=BEDDB4=9;:@=;@D@@=B@E.3?972<>6@8=>?1$0:95%5%*1=8;0%4<228% X0:i:1 X1:i:0 XC:i:88 MD:Z:88 RG:Z:rg AM:i:37 NM:i:0 SM:i:37 MQ:i:60 XT:A:U +ERR013140.23480670 133 17 3771 0 35M73S = 3771 0 TTCTCATCAATCCCTCATCTCTTATAACCATTTCGGTCCTTTCGGCCCTACAGCCACCTTGTTTATACTTGGTAAGACCCACACCACTCGCCAACTTACTCTACTCCC 8+7?5>09:),/%81,$,7<+?)+1+*+),3%5+)#%(4B%$&'%'/*@,)*%%&,%(/0%-&$$*$-,$3*.%/$:%$+.$*%&+.,.%%,%(%7(-.-',1*6%&$ XC:i:35 RG:Z:rg diff --git a/test/sam_filter/func8.out b/test/sam_filter/func8.out new file mode 100644 index 000000000..8553866ed --- /dev/null +++ b/test/sam_filter/func8.out @@ -0,0 +1 @@ +s0C 0 c1 3 0 2H6M2H * 0 0 CCGCGG 
****** diff --git a/test/sam_filter/int1.out b/test/sam_filter/int1.out new file mode 100644 index 000000000..6529ff889 --- /dev/null +++ b/test/sam_filter/int1.out @@ -0,0 +1 @@ +98 diff --git a/test/sam_filter/int2.out b/test/sam_filter/int2.out new file mode 100644 index 000000000..e522732c7 --- /dev/null +++ b/test/sam_filter/int2.out @@ -0,0 +1 @@ +38 diff --git a/test/sam_filter/int3.out b/test/sam_filter/int3.out new file mode 100644 index 000000000..ea70ce013 --- /dev/null +++ b/test/sam_filter/int3.out @@ -0,0 +1 @@ +72 diff --git a/test/sam_filter/string1.out b/test/sam_filter/string1.out new file mode 100644 index 000000000..7ba8527b2 --- /dev/null +++ b/test/sam_filter/string1.out @@ -0,0 +1,6 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +SRR065390.14978392 16 CHROMOSOME_I 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 diff --git a/test/sam_filter/string2.out b/test/sam_filter/string2.out new file mode 100644 index 000000000..be94b071e --- /dev/null +++ b/test/sam_filter/string2.out @@ -0,0 +1,6 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +II.14978392 16 CHROMOSOME_II 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC XG:i:1 XM:i:5 XN:i:0 XO:i:1 XS:i:-18 AS:i:-18 YT:Z:UU diff --git a/test/sam_filter/string3.out b/test/sam_filter/string3.out new file mode 100644 index 000000000..2424115de --- /dev/null +++ 
b/test/sam_filter/string3.out @@ -0,0 +1,7 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +II.14978392 16 CHROMOSOME_II 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC XG:i:1 XM:i:5 XN:i:0 XO:i:1 XS:i:-18 AS:i:-18 YT:Z:UU +III 16 CHROMOSOME_III 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC XG:i:1 XM:i:5 XN:i:0 XO:i:1 XS:i:-18 AS:i:-18 YT:Z:UU diff --git a/test/sam_filter/string4.out b/test/sam_filter/string4.out new file mode 100644 index 000000000..386300c53 --- /dev/null +++ b/test/sam_filter/string4.out @@ -0,0 +1,8 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +SRR065390.14978392 16 CHROMOSOME_I 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 +SRR065390.32874267 0 CHROMOSOME_I 75 1 13M1D87M * 0 0 CTAAGCCTAAGCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAG DCCCCCCCCCCCCCCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCC@CCCCCCCCCCCCCCA>/=;=9>:/5AA############# AS:i:-8 XS:i:-8 XN:i:0 XM:i:0 XO:i:1 XG:i:1 YT:Z:UU MD:Z:13^T87 NM:i:1 +SRR065390.723611 0 CHROMOSOME_I 155 1 5M1D95M * 0 0 AAGCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC 
CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCDCCBCABCDADBACDDDBABBDB?AB@@<>;B>B?DB?=@@?@: AS:i:-8 XS:i:-8 XN:i:0 XM:i:0 XO:i:1 XG:i:1 YT:Z:UU MD:Z:5^T95 NM:i:1 diff --git a/test/sam_filter/string5.out b/test/sam_filter/string5.out new file mode 100644 index 000000000..4be462117 --- /dev/null +++ b/test/sam_filter/string5.out @@ -0,0 +1,6 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +SRR065390.9154510 16 CHROMOSOME_I 56 0 100M * 0 0 TTCATATGGGCAGGGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTA ##############################@B;@?>>B1?BCBBC@>CDB>B@CA@CCAC=AA>>AC;CCACCCCCCC=CCCCCCCCCCCCBCCCCCCCC AS:i:-28 XS:i:-28 XN:i:0 XM:i:14 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0C0C0T1A0G0C0C0T0A0A0G0C0C0T85 NM:i:14 diff --git a/test/sam_filter/string6.out b/test/sam_filter/string6.out new file mode 100644 index 000000000..c2766bd73 --- /dev/null +++ b/test/sam_filter/string6.out @@ -0,0 +1,9 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:xx LN:20 AS:? SP:? UR:? M5:bbf4de6d8497a119dda6e074521643dc +@RG ID:x1 SM:x1 +@RG ID:x2 SM:x2 LB:x PG:foo:bar PI:1111 +@PG ID:emacs PN:emacs VN:23.1.1 +@CO also test +@CO other headers +b1 16 xx 1 1 10M * 0 0 AAAAAAAAAA ********** RG:Z:x2 +b2 16 xx 11 1 10M * 0 0 TTTTTTTTTT ********** RG:Z:x2 diff --git a/test/sam_filter/string7.out b/test/sam_filter/string7.out new file mode 100644 index 000000000..8efe886b3 --- /dev/null +++ b/test/sam_filter/string7.out @@ -0,0 +1,11 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:xx LN:20 AS:? SP:? UR:? 
M5:bbf4de6d8497a119dda6e074521643dc +@RG ID:x1 SM:x1 +@RG ID:x2 SM:x2 LB:x PG:foo:bar PI:1111 +@PG ID:emacs PN:emacs VN:23.1.1 +@CO also test +@CO other headers +a1 16 xx 1 1 10M * 0 0 AAAAAAAAAA ********** RG:Z:x1 +c1 16 xx 1 1 10M * 0 0 AAAAAAAAAA ********** +a2 16 xx 11 1 10M * 0 0 TTTTTTTTTT ********** RG:Z:x1 +c2 16 xx 11 1 10M * 0 0 TTTTTTTTTT ********** diff --git a/test/simple_test_driver.sh b/test/simple_test_driver.sh index 7e4a3bdc2..5bc020eb6 100644 --- a/test/simple_test_driver.sh +++ b/test/simple_test_driver.sh @@ -3,7 +3,8 @@ # # Copyright (C) 2017-2018 Genome Research Ltd. # -# Author: Robert Davies +# Author: James Bonfield +# Robert Davies # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -50,6 +51,7 @@ run_test() { else # Expected non-zero exit code and got it r="P" + rm -f _out.tmp _err.tmp fi elif [ "$p" = "N" ] then @@ -68,11 +70,12 @@ run_test() { # Output differed r="F" y="output" + rm -f _out.tmp2 fi else # Expected zero exit code and got it. r="P" - rm -f _out.tmp _out.tmp2 _err.tmp + rm -f _out.tmp _err.tmp fi if [ "$r" = "F" ] @@ -106,6 +109,7 @@ run_test() { ;; *) echo "XFAIL: $@" + rm -f _out.tmp _err.tmp nefail=`expr $nefail + 1` ;; esac diff --git a/test/tabix/vcf_file.bcf b/test/tabix/vcf_file.bcf index 75a64b38c..a4aafec47 100644 Binary files a/test/tabix/vcf_file.bcf and b/test/tabix/vcf_file.bcf differ diff --git a/test/tabix/vcf_file.vcf b/test/tabix/vcf_file.vcf index de0a7c7b6..d3cf30fc8 100644 --- a/test/tabix/vcf_file.vcf +++ b/test/tabix/vcf_file.vcf @@ -35,3 +35,4 @@ 2 3199812 . G GTT,GT 82.7 PASS AN=4;AC=2,2 GT:GQ:DP 1/2:322:26 1/2:322:26 3 3212016 . CTT C,CT 79 PASS AN=4;AC=2,2 GT:GQ:DP 1/2:91:26 1/2:91:26 4 3258448 . TACACACAC T . PASS AN=4;AC=2 GT:GQ:DP 0/1:325:31 0/1:325:31 +4 3258501 . 
C A,T,G,CA,CT,CG,CC,CAA,CAT,CAG,CAC,CTA,CTT,CTG,CTC,CGA,CGT,CGG,CGC,CCA,CCT,CCG,CCC,CAAA,CAAT,CAAG,CAAC,CATA,CATT,CATG,CATC,CAGA,CAGT,CAGG,CAGC,CACA,CACT,CACG,CACC,CTAA,CTAT,CTAG,CTAC,CTTA,CTTT,CTTG,CTTC,CTGA,CTGT,CTGG,CTGC,CTCA,CTCT,CTCG,CTCC,CGAA,CGAT,CGAG,CGAC,CGTA,CGTT,CGTG,CGTC,CGGA,CGGT,CGGG,CGGC,CGCA,CGCT,CGCG,CGCC,CCAA,CCAT,CCAG,CCAC,CCTA,CCTT,CCTG,CCTC,CCGA,CCGT,CCGG,CCGC,CCCA,CCCT,CCCG,CCCC,CAAAA,CAAAT,CAAAG,CAAAC,CAATA,CAATT,CAATG,CAATC,CAAGA,CAAGT,CAAGG,CAAGC,CAACA,CAACT,CAACG,CAACC,CATAA,CATAT,CATAG,CATAC,CATTA,CATTT,CATTG,CATTC,CATGA,CATGT,CATGG,CATGC,CATCA,CATCT,CATCG,CATCC,CAGAA,CAGAT,CAGAG,CAGAC,CAGTA,CAGTT,CAGTG,CAGTC,CAGGA,CAGGT,CAGGG,CAGGC,CAGCA,CAGCT,CAGCG,CAGCC,CACAA,CACAT,CACAG,CACAC,CACTA,CACTT,CACTG,CACTC,CACGA,CACGT,CACGG,CACGC,CACCA,CACCT,CACCG,CACCC,CTAAA,CTAAT,CTAAG,CTAAC,CTATA,CTATT,CTATG,CTATC,CTAGA,CTAGT,CTAGG,CTAGC,CTACA,CTACT,CTACG,CTACC,CTTAA,CTTAT,CTTAG,CTTAC,CTTTA,CTTTT,CTTTG,CTTTC,CTTGA,CTTGT,CTTGG,CTTGC,CTTCA,CTTCT,CTTCG,CTTCC,CTGAA,CTGAT,CTGAG,CTGAC,CTGTA,CTGTT,CTGTG,CTGTC,CTGGA,CTGGT,CTGGG,CTGGC,CTGCA,CTGCT,CTGCG,CTGCC,CTCAA,CTCAT,CTCAG,CTCAC,CTCTA,CTCTT,CTCTG,CTCTC,CTCGA,CTCGT,CTCGG,CTCGC,CTCCA,CTCCT,CTCCG,CTCCC,CGAAA,CGAAT,CGAAG,CGAAC,CGATA,CGATT,CGATG,CGATC,CGAGA,CGAGT,CGAGG,CGAGC,CGACA,CGACT,CGACG,CGACC,CGTAA,CGTAT,CGTAG,CGTAC,CGTTA,CGTTT,CGTTG,CGTTC,CGTGA,CGTGT,CGTGG,CGTGC,CGTCA,CGTCT,CGTCG,CGTCC,CGGAA,CGGAT,CGGAG,CGGAC,CGGTA,CGGTT,CGGTG,CGGTC,CGGGA,CGGGT,CGGGG,CGGGC,CGGCA,CGGCT,CGGCG,CGGCC,CGCAA,CGCAT,CGCAG,CGCAC,CGCTA,CGCTT,CGCTG,CGCTC,CGCGA,CGCGT,CGCGG,CGCGC,CGCCA,CGCCT,CGCCG,CGCCC,CCAAA,CCAAT,CCAAG,CCAAC,CCATA,CCATT,CCATG,CCATC,CCAGA,CCAGT,CCAGG,CCAGC,CCACA,CCACT,CCACG,CCACC,CCTAA,CCTAT,CCTAG,CCTAC,CCTTA,CCTTT,CCTTG,CCTTC,CCTGA,CCTGT 45 PASS AN=4;AC=2 GT 0/300 240/260 diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c index ee64afe41..80daf0423 100644 --- a/test/test-bcf-sr.c +++ b/test/test-bcf-sr.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017, 2020 Genome Research Ltd. 
+ Copyright (C) 2017, 2020, 2023 Genome Research Ltd. Author: Petr Danecek @@ -28,12 +28,17 @@ #include +#include #include #include #include #include +#include +#include #include "../htslib/synced_bcf_reader.h" +#include "../htslib/hts.h" +#include "../htslib/vcf.h" void error(const char *format, ...) { @@ -41,16 +46,78 @@ void error(const char *format, ...) va_start(ap, format); vfprintf(stderr, format, ap); va_end(ap); - exit(-1); + exit(EXIT_FAILURE); } -void usage(void) +void usage(int exit_code) { fprintf(stderr, "Usage: test-bcf-sr [OPTIONS] vcf-list.txt\n"); + fprintf(stderr, " test-bcf-sr [OPTIONS] -args file1.bcf [...]\n"); fprintf(stderr, "Options:\n"); + fprintf(stderr, " --args pass filenames directly in argument list\n"); + fprintf(stderr, " --no-index allow streaming\n"); + fprintf(stderr, " -o, --output output file (stdout if not set)\n"); + fprintf(stderr, " -O, --output-fmt fmt: vcf,bcf,summary\n"); fprintf(stderr, " -p, --pair logic: snps,indels,both,snps+ref,indels+ref,both+ref,exact,some,all\n"); + fprintf(stderr, " -r, --regions comma-separated list of regions\n"); + fprintf(stderr, " -t, --targets comma-separated list of targets\n"); fprintf(stderr, "\n"); - exit(-1); + exit(exit_code); +} + +void write_summary_format(bcf_srs_t *sr, FILE *out) +{ + int n, i, j; + while ((n = bcf_sr_next_line(sr)) > 0) { + for (i=0; inreaders; i++) + { + if ( !bcf_sr_has_line(sr,i) ) continue; + bcf1_t *rec = bcf_sr_get_line(sr, i); + if (!rec) error("bcf_sr_get_line() unexpectedly returned NULL\n"); + fprintf(out, "%s:%"PRIhts_pos, + bcf_seqname_safe(bcf_sr_get_header(sr,i),rec),rec->pos+1); + break; + } + + for (i=0; inreaders; i++) + { + fprintf(out, "\t"); + + if ( !bcf_sr_has_line(sr,i) ) + { + fprintf(out, "%s","-"); + continue; + } + + bcf1_t *rec = bcf_sr_get_line(sr, i); + if (!rec) error("bcf_sr_get_line() unexpectedly returned NULL\n"); + fprintf(out, "%s", rec->n_allele > 1 ? 
rec->d.allele[1] : "."); + for (j=2; jn_allele; j++) + { + fprintf(out, ",%s", rec->d.allele[j]); + } + } + fprintf(out, "\n"); + } +} + +void write_vcf_bcf_format(bcf_srs_t *sr, bcf_hdr_t *hdr, vcfFile *vcf_out, + const char *fmt_type) +{ + int i, n; + if (bcf_hdr_write(vcf_out, hdr) != 0) + error("Couldn't write %s header\n", fmt_type); + + while ((n = bcf_sr_next_line(sr)) > 0) { + for (i=0; inreaders; i++) + { + if ( !bcf_sr_has_line(sr,i) ) continue; + bcf1_t *rec = bcf_sr_get_line(sr, i); + if (!rec) error("bcf_sr_get_line() unexpectedly returned NULL\n"); + if (vcf_write(vcf_out, hdr, rec) < 0) + error("vcf_write() failed\n"); + } + } } int main(int argc, char *argv[]) @@ -58,16 +125,31 @@ int main(int argc, char *argv[]) static struct option loptions[] = { {"help",no_argument,NULL,'h'}, + {"output-fmt",required_argument,NULL,'O'}, {"pair",required_argument,NULL,'p'}, + {"regions",required_argument,NULL,'r'}, + {"targets",required_argument,NULL,'t'}, {"no-index",no_argument,NULL,1000}, + {"args",no_argument,NULL,1001}, {NULL,0,NULL,0} }; - int c, pair = 0, use_index = 1; - while ((c = getopt_long(argc, argv, "p:h", loptions, NULL)) >= 0) + int c, pair = 0, use_index = 1, use_fofn = 1; + enum htsExactFormat out_fmt = text_format; // for original pos + alleles + const char *out_fn = NULL, *regions = NULL, *targets = NULL; + while ((c = getopt_long(argc, argv, "o:O:p:r:t:h", loptions, NULL)) >= 0) { switch (c) { + case 'o': + out_fn = optarg; + break; + case 'O': + if (!strcasecmp(optarg, "vcf")) out_fmt = vcf; + else if (!strcasecmp(optarg, "bcf")) out_fmt = bcf; + else if (!strcasecmp(optarg, "summary")) out_fmt = text_format; + else error("Unknown output format \"%s\"\n", optarg); + break; case 'p': if ( !strcmp(optarg,"snps") ) pair |= BCF_SR_PAIR_SNPS; else if ( !strcmp(optarg,"snp+ref") ) pair |= BCF_SR_PAIR_SNPS|BCF_SR_PAIR_SNP_REF; @@ -83,68 +165,103 @@ int main(int argc, char *argv[]) else if ( !strcmp(optarg,"exact") ) pair = BCF_SR_PAIR_EXACT; else 
error("The --pair logic \"%s\" not recognised.\n", optarg); break; + case 'r': + regions = optarg; + break; + case 't': + targets = optarg; + break; case 1000: use_index = 0; break; - default: usage(); + case 1001: + use_fofn = 0; + break; + case 'h': + usage(EXIT_SUCCESS); + default: usage(EXIT_FAILURE); } } if ( !pair ) pair = BCF_SR_PAIR_EXACT; - if ( optind == argc ) usage(); + if ( optind == argc ) usage(EXIT_FAILURE); - int i, j, n, nvcf; - char **vcf = hts_readlist(argv[optind], 1, &nvcf); - if ( !vcf ) error("Could not parse %s\n", argv[optind]); + int i, nvcf; + char **vcfs = NULL; + if (use_fofn) { + vcfs = hts_readlist(argv[optind], 1, &nvcf); + if ( !vcfs ) error("Could not parse %s\n", argv[optind]); + } else { + vcfs = &argv[optind]; + nvcf = argc - optind; + } bcf_srs_t *sr = bcf_sr_init(); + if (!sr) error("bcf_sr_init() failed\n"); bcf_sr_set_opt(sr, BCF_SR_PAIR_LOGIC, pair); if (use_index) { bcf_sr_set_opt(sr, BCF_SR_REQUIRE_IDX); } else { bcf_sr_set_opt(sr, BCF_SR_ALLOW_NO_IDX); } - for (i=0; ierrnum)); - kstring_t str = {0,0,0}; - while ( (n=bcf_sr_next_line(sr)) ) + if (regions) { - for (i=0; inreaders; i++) + if (bcf_sr_set_regions(sr, regions, 0) != 0) + error("Failed to set regions\n"); + } + + if (targets) + { + if (bcf_sr_set_targets(sr, targets, 0, 0) != 0) + error("Failed to set targets\n"); + } + + for (i=0; ierrnum)); + + if (!sr->readers || sr->nreaders < 1) + error("No readers set, even though one was added\n"); + + if (out_fmt == text_format) { + FILE *out = stdout; + if (out_fn) { - if ( !bcf_sr_has_line(sr,i) ) continue; - bcf1_t *rec = bcf_sr_get_line(sr, i); - printf("%s:%"PRIhts_pos, bcf_seqname_safe(bcf_sr_get_header(sr,i),rec),rec->pos+1); - break; + out = fopen(out_fn, "w"); + if (!out) error("Couldn't open \"%s\" for writing: %s\n", + out_fn, strerror(errno)); } - - for (i=0; inreaders; i++) + write_summary_format(sr, out); + if (out_fn) { - printf("\t"); + if (fclose(out) != 0) + error("Error on closing %s : %s\n", + 
out_fn, strerror(errno)); + } + } else { + const char *fmt_type = out_fmt == vcf ? "VCF" : "BCF"; - if ( !bcf_sr_has_line(sr,i) ) - { - printf("%s","-"); - continue; - } + bcf_hdr_t *hdr = bcf_sr_get_header(sr, 0); + if (!hdr) error("%s output, but don't have a header\n", fmt_type); - str.l = 0; - bcf1_t *rec = bcf_sr_get_line(sr, i); - kputs(rec->n_allele > 1 ? rec->d.allele[1] : ".", &str); - for (j=2; jn_allele; j++) - { - kputc(',', &str); - kputs(rec->d.allele[j], &str); - } - printf("%s",str.s); - } - printf("\n"); + if (!out_fn) { out_fn = "-"; } + vcfFile *vcf_out = vcf_open(out_fn, out_fmt == vcf ? "w" : "wb"); + if (!vcf_out) error("Couldn't open \"%s\" for writing: %s\n", + out_fn, strerror(errno)); + write_vcf_bcf_format(sr, hdr, vcf_out, fmt_type); + if (vcf_close(vcf_out) != 0) + error("Error on closing \"%s\"\n", out_fn); } - free(str.s); + if (sr->errnum) error("Synced reader error: %s\n", + bcf_sr_strerror(sr->errnum)); + bcf_sr_destroy(sr); - for (i=0; i', "$$opts{tmp}/no_index_1.txt") - || error("$$opts{tmp}/no_index_1.txt : $!"); - print $fh "$vcfdir/merge.noidx.a.vcf\n"; - print $fh "$vcfdir/merge.noidx.b.vcf\n"; - print $fh "$vcfdir/merge.noidx.c.vcf\n"; - close($fh) || error("$$opts{tmp}/no_index_1.txt : $!"); - - my $cmd = "$FindBin::Bin/test-bcf-sr --no-index -p all $$opts{tmp}/no_index_1.txt > $$opts{tmp}/no_index_1.out 2> $$opts{tmp}/no_index_1.err"; - my ($ret) = _cmd($cmd); - if ($ret) { - error("The command failed [$ret]: $cmd\n"); - } - - if ($^O =~ /^msys/) { - cmd("diff --strip-trailing-cr $vcfdir/merge.noidx.abc.expected.out $$opts{tmp}/no_index_1.out"); - } else { - cmd("cmp $vcfdir/merge.noidx.abc.expected.out $$opts{tmp}/no_index_1.out"); - } - - # Check bad input detection - - my @bad_file_tests = (["out-of-order header", - ["merge.noidx.a.vcf", "merge.noidx.hdr_order.vcf"]], - ["out-of-order records", - ["merge.noidx.a.vcf", "merge.noidx.rec_order.vcf"]], - ["out-of-order records", - ["merge.noidx.rec_order.vcf", 
"merge.noidx.a.vcf"]]); - my $count = 2; - foreach my $test_params (@bad_file_tests) { - my ($badness, $inputs) = @$test_params; - open($fh, '>', "$$opts{tmp}/no_index_$count.txt") - || error("$$opts{tmp}/no_index_$count.txt : $!"); - foreach my $input (@$inputs) { - print $fh "$vcfdir/$input\n"; - } - close($fh) || error("$$opts{tmp}/no_index_$count.txt : $!"); - - $cmd = "$FindBin::Bin/test-bcf-sr --no-index -p all $$opts{tmp}/no_index_$count.txt > $$opts{tmp}/no_index_$count.out 2> $$opts{tmp}/no_index_$count.err"; - my ($ret) = _cmd($cmd); - if ($ret == 0) { - error("Failed to detect $badness: $cmd\n"); - } - $count++; - } -} diff --git a/test/test-bcf_set_variant_type.c b/test/test-bcf_set_variant_type.c new file mode 100644 index 000000000..e5092084e --- /dev/null +++ b/test/test-bcf_set_variant_type.c @@ -0,0 +1,135 @@ +/* test/test-bcf_set_variant_type.c -- bcf_set_variant_type test harness. + + Copyright (C) 2022 Genome Research Ltd. + + Author: Martin Pollard + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include + +#include + +#include "../htslib/hts.h" +#include "../vcf.c" + +void error(const char *format, ...) +{ + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + if (strrchr(format, '\n') == NULL) fputc('\n', stderr); + exit(-1); +} + +static void test_bcf_set_variant_type() +{ + // Test SNVs + bcf_variant_t var1; + bcf_set_variant_type("A", "T", &var1); + if ( var1.type != VCF_SNP) + { + error("A -> T was not detected as a SNP"); + } + + // Test INDEL + bcf_variant_t var2a; + bcf_set_variant_type("A", "AA", &var2a); + if ( var2a.type != (VCF_INDEL|VCF_INS) ) + { + error("A -> AA was not detected as an INDEL"); + } + bcf_variant_t var2b; + bcf_set_variant_type("AA", "A", &var2b); + if ( var2b.type != (VCF_INDEL|VCF_DEL) ) + { + error("AA -> A was not detected as a INDEL"); + } + + // Test breakends + bcf_variant_t var3a; + bcf_set_variant_type("N", "N]16:33625444]", &var3a); + if ( var3a.type != VCF_BND) + { + error("N]16:33625444] was not detected as a breakend"); + } + + bcf_variant_t var3b; + bcf_set_variant_type("N", "N[16:33625444[", &var3b); + if (var3b.type != VCF_BND) + { + error("N[16:33625444[ was not detected as a breakend"); + } + + bcf_variant_t var3c; + bcf_set_variant_type("N", "]16:33625444]N", &var3c); + if ( var3c.type != VCF_BND) + { + error("]16:33625444]N was not detected as a breakend"); + } + + bcf_variant_t var3d; + bcf_set_variant_type("N", "[16:33625444[N", &var3d); + if ( var3d.type != VCF_BND) + { + error("[16:33625444[N was not detected as a breakend"); + } + // Test special reference alleles + bcf_variant_t var4a; + bcf_set_variant_type("A", "", &var4a); + if ( var4a.type != VCF_REF) + { + error(" was not detected as a special 
reference allele"); + } + bcf_variant_t var4b; + bcf_set_variant_type("A", "<*>", &var4b); + if ( var4b.type != VCF_REF) + { + error("<*> was not detected as a special reference allele"); + } + // Test MNP + bcf_variant_t var5; + bcf_set_variant_type("AA", "TT", &var5); + if ( var5.type != VCF_MNP) + { + error("AA->TT was not detected as a MNP"); + } + // Test Overlapping allele + bcf_variant_t var6; + bcf_set_variant_type("A", "*", &var6); + if ( var6.type != VCF_OVERLAP) + { + error("A->* was not detected as an overlap"); + } + // Test . + bcf_variant_t var7; + bcf_set_variant_type("A", ".", &var7); + if ( var7.type != VCF_REF) + { + error("A->. was not detected as a special reference allele"); + } +} + +int main(int argc, char **argv) +{ + test_bcf_set_variant_type(); + return 0; +} + diff --git a/test/test-logging.pl b/test/test-logging.pl index 1040b0e47..2f22560b5 100755 --- a/test/test-logging.pl +++ b/test/test-logging.pl @@ -33,7 +33,7 @@ sub check_log_message my ($message, $filename, $line_num) = @_; $log_message_count++; - unless ($message =~ /^\"([A-Z]|%s)/) + unless ($message =~ /^\"([A-Z!-@]|%s)/) { print "$filename line $line_num:\n"; print "Log message should begin with a capital letter: $message.\n"; diff --git a/test/test-regidx.c b/test/test-regidx.c index 90e7244d1..4ba623734 100644 --- a/test/test-regidx.c +++ b/test/test-regidx.c @@ -2,7 +2,7 @@ gcc -g -Wall -O0 -I. -I../htslib/ -L../htslib regidx.c -o test-regidx test-regidx.c -lhts - Copyright (C) 2014,2016,2018, 2020 Genome Research Ltd. + Copyright (C) 2014,2016,2018, 2020, 2023 Genome Research Ltd. 
Author: Petr Danecek @@ -304,20 +304,20 @@ void test_explicit(char *tgt, char *qry, char *exp) regidx_destroy(idx); } -void create_line_bed(char *line, char *chr, int start, int end) +void create_line_bed(char *line, size_t size, char *chr, int start, int end) { - sprintf(line,"%s\t%d\t%d\n",chr,start-1,end); + snprintf(line,size,"%s\t%d\t%d\n",chr,start-1,end); } -void create_line_tab(char *line, char *chr, int start, int end) +void create_line_tab(char *line, size_t size, char *chr, int start, int end) { - sprintf(line,"%s\t%d\t%d\n",chr,start,end); + snprintf(line,size,"%s\t%d\t%d\n",chr,start,end); } -void create_line_reg(char *line, char *chr, int start, int end) +void create_line_reg(char *line, size_t size, char *chr, int start, int end) { - sprintf(line,"%s:%d-%d\n",chr,start,end); + snprintf(line,size,"%s:%d-%d\n",chr,start,end); } -typedef void (*set_line_f)(char *line, char *chr, int start, int end); +typedef void (*set_line_f)(char *line, size_t size, char *chr, int start, int end); void test(set_line_f set_line, regidx_parse_f parse) { @@ -329,17 +329,17 @@ void test(set_line_f set_line, regidx_parse_f parse) for (i=1; i @@ -24,7 +24,9 @@ DEALINGS IN THE SOFTWARE. */ #include +#include #include +#include #include "../htslib/hts.h" #include "../htslib/vcf.h" @@ -44,6 +46,65 @@ void error(const char *format, ...) #define STRINGIFY(x) #x #define check0(x) ((x) == 0 ? 
(void) 0 : error("Failed: %s", STRINGIFY(x))) +static int check_alleles(bcf1_t *rec, const char **alleles, int num) { + int i; + if (rec->n_allele != num) { + fprintf(stderr, "Wrong number of alleles - expected %d, got %d\n", + num, rec->n_allele); + return -1; + } + if (bcf_unpack(rec, BCF_UN_STR) != 0) + return -1; + for (i = 0; i < num; i++) { + if (0 != strcmp(alleles[i], rec->d.allele[i])) { + fprintf(stderr, + "Mismatch for allele %d : expected '%s' got '%s'\n", + i, alleles[i], rec->d.allele[i]); + return -1; + } + } + return 0; +} + +static void test_update_alleles(bcf_hdr_t *hdr, bcf1_t *rec) +{ + // Exercise bcf_update_alleles() a bit + const char *alleles1[2] = { "G", "A" }; + const char *alleles2[3] = { "C", "TGCA", "CATG" }; +#define rep10(x) x x x x x x x x x x + const char *alleles3[3] = { rep10("ATTCTAGATC"), "TGCA", + rep10("CTATTATCTCTAATGACATG") }; +#undef rep10 + const char *alleles4[3] = { alleles3[2], NULL, alleles3[0] }; + // Add some alleles + check0(bcf_update_alleles(hdr, rec, alleles1, 2)); + check0(check_alleles(rec, alleles1, 2)); + // Erase them + check0(bcf_update_alleles(hdr, rec, NULL, 0)); + check0(check_alleles(rec, NULL, 0)); + // Expand to three + check0(bcf_update_alleles(hdr, rec, alleles2, 3)); + check0(check_alleles(rec, alleles2, 3)); + // Now try some bigger ones (should force a realloc) + check0(bcf_update_alleles(hdr, rec, alleles3, 3)); + check0(check_alleles(rec, alleles3, 3)); + // Ensure it works even if one of the alleles points into the + // existing structure + alleles4[1] = rec->d.allele[1]; + check0(bcf_update_alleles(hdr, rec, alleles4, 3)); + alleles4[1] = alleles3[1]; // Will have been clobbered by the update + check0(check_alleles(rec, alleles4, 3)); + // Ensure it works when the alleles point into the existing data, + // rec->d.allele is used to define the input array and the + // order of the entries is changed. The result of this should + // be the same as alleles2. 
+ char *tmp = rec->d.allele[0] + strlen(rec->d.allele[0]) - 4; + rec->d.allele[0] = rec->d.allele[2] + strlen(rec->d.allele[2]) - 1; + rec->d.allele[2] = tmp; + check0(bcf_update_alleles(hdr, rec, (const char **) rec->d.allele, 3)); + check0(check_alleles(rec, alleles2, 3)); +} + void write_bcf(char *fname) { // Init @@ -64,6 +125,7 @@ void write_bcf(char *fname) check0(bcf_hdr_append(hdr, "##INFO=")); check0(bcf_hdr_append(hdr, "##FILTER=")); check0(bcf_hdr_append(hdr, "##unused=")); + check0(bcf_hdr_append(hdr, "##unused=")); check0(bcf_hdr_append(hdr, "##unused=unformatted text 1")); check0(bcf_hdr_append(hdr, "##unused=unformatted text 2")); check0(bcf_hdr_append(hdr, "##contig=")); @@ -112,10 +174,10 @@ void write_bcf(char *fname) // .. ID check0(bcf_update_id(hdr, rec, "rs6054257")); // .. REF and ALT + test_update_alleles(hdr, rec); const char *alleles[2] = { "G", "A" }; - check0(bcf_update_alleles(hdr, rec, alleles, 2)); - check0(bcf_update_alleles(hdr, rec, NULL, 0)); check0(bcf_update_alleles_str(hdr, rec, "G,A")); + check0(check_alleles(rec, alleles, 2)); // .. QUAL rec->qual = 29; // .. 
FILTER @@ -236,12 +298,42 @@ void bcf_to_vcf(char *fname) if (!out) error("Couldn't open \"%s\" : %s\n", gz_fname, strerror(errno)); bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr); - bcf_hdr_remove(hdr_out,BCF_HL_STR,"unused"); + if (!bcf_hdr_get_hrec(hdr_out, BCF_HL_STR,"ID","BB","unused")) + error("Missing header ##unused="); + bcf_hdr_remove(hdr_out,BCF_HL_STR,"BB"); + if (bcf_hdr_get_hrec(hdr_out, BCF_HL_STR,"ID","BB","unused")) + error("Got pointer to deleted header ##unused="); + + if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_GEN,"unused","unformatted text 1",NULL)) + error("Missing header ##unused=unformatted text 1"); bcf_hdr_remove(hdr_out,BCF_HL_GEN,"unused"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_GEN,"unused","unformatted text 1",NULL)) + error("Got pointer to deleted header ##unused=unformatted text 1"); + + if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_FLT,"ID","Flt",NULL)) + error("Missing header ##FILTER="); bcf_hdr_remove(hdr_out,BCF_HL_FLT,"Flt"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_FLT,"ID","Flt",NULL)) + error("Got pointer to deleted header ##FILTER="); + + if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_INFO,"ID","UI",NULL)) + error("Missing header ##INFO="); bcf_hdr_remove(hdr_out,BCF_HL_INFO,"UI"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_INFO,"ID","UI",NULL)) + error("Got pointer to deleted header ##INFO="); + + if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_FMT,"ID","UF",NULL)) + error("Missing header ##INFO="); bcf_hdr_remove(hdr_out,BCF_HL_FMT,"UF"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_FMT,"ID","UF",NULL)) + error("Got pointer to deleted header ##INFO="); + + if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_CTG,"ID","Unused",NULL)) + error("Missing header ##contig="); bcf_hdr_remove(hdr_out,BCF_HL_CTG,"Unused"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_FMT,"ID","Unused",NULL)) + error("Got pointer to header ##contig="); + if ( bcf_hdr_write(out, hdr_out)!=0 ) error("Failed to write to %s\n", fname); int r; while ((r = bcf_read1(fp, hdr, rec)) >= 0) diff --git a/test/test-vcf-sweep.c 
b/test/test-vcf-sweep.c index 4b8e3d70b..40ee4e417 100644 --- a/test/test-vcf-sweep.c +++ b/test/test-vcf-sweep.c @@ -63,7 +63,7 @@ int main(int argc, char **argv) { // get copy of the PL vectors nPLs = bcf_get_format_int32(hdr, rec, "PL", &PLs, &mPLs); - if ( !nPLs ) continue; // PL not present + if ( nPLs <= 0 ) continue; // PL not present // how many values are there per sample int nvals = nPLs / bcf_hdr_nsamples(hdr); @@ -91,7 +91,7 @@ int main(int argc, char **argv) while ( (rec = bcf_sweep_bwd(sw)) ) { nPLs = bcf_get_format_int32(hdr, rec, "PL", &PLs, &mPLs); - if ( !nPLs ) continue; + if ( nPLs <= 0 ) continue; int nvals = nPLs / bcf_hdr_nsamples(hdr); int32_t *ptr = PLs; for (i=0; i # @@ -32,6 +32,7 @@ use IO::Handle; my $opts = parse_params(); +srand($$opts{seed}); test_bgzip($opts, 0); test_bgzip($opts, 4); @@ -53,11 +54,15 @@ test_vcf_sweep($opts,out=>'test-vcf-sweep.out'); test_vcf_various($opts); test_bcf_sr_sort($opts); +test_bcf_sr_no_index($opts); +test_bcf_sr_range($opts); test_command($opts,cmd=>'test-bcf-translate -',out=>'test-bcf-translate.out'); test_convert_padded_header($opts); test_rebgzip($opts); test_logging($opts); +test_plugin_loading($opts); test_realn($opts); +test_bcf_set_variant_type($opts); print "\nNumber of tests:\n"; printf " total .. 
%d\n", $$opts{nok}+$$opts{nfailed}; @@ -78,6 +83,7 @@ sub error "Usage: test.pl [OPTIONS]\n", "Options:\n", " -r, --redo-outputs Recreate expected output files.\n", + " -s, --random-seed Initialise rand() with a different seed.\n", " -t, --temp-dir When given, temporary files will not be removed.\n", " -f, --fail-fast Fail-fast mode: exit as soon as a test fails.\n", " -h, -?, --help This help message.\n", @@ -103,12 +109,13 @@ sub safe_tempdir sub parse_params { - my $opts = { keep_files=>0, nok=>0, nfailed=>0 }; + my $opts = { keep_files=>0, nok=>0, nfailed=>0, seed=>42 }; my $help; Getopt::Long::Configure('bundling'); my $ret = GetOptions ( 't|temp-dir:s' => \$$opts{keep_files}, 'r|redo-outputs' => \$$opts{redo_outputs}, + 's|random-seed=i' => \$$opts{seed}, 'f|fail-fast' => \$$opts{fail_fast}, 'h|?|help' => \$help ); @@ -370,6 +377,8 @@ sub test_bgzip { my $index = "${compressed}.gzi"; my $test = sprintf('%s %2s threads', 'bgzip round-trip', $threads ? $threads : 'no'); + my $uncompressed1 = "$$opts{tmp}/ce.fa.$threads"; + my $uncompressed1_copy = "$$opts{tmp}/ce.fa.$threads.copy"; # Round-trip test print "$test: "; @@ -393,6 +402,30 @@ sub test_bgzip { } passed($opts,$test); + # Round-trip test of text in binary mode + $test = sprintf('%s %2s threads', 'bgzip text mode round-trip', + $threads ? $threads : 'no'); + print "$test: "; + $c = "$$opts{bin}/bgzip $at --binary -i -I '$index' < '$data' > '$compressed'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + $c = "$$opts{bin}/bgzip $at -d < '$compressed' > '$uncompressed'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + $c = "cmp '$data' '$uncompressed'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, $out ? $out : "'$data' '$uncompressed' differ"); + return; + } + passed($opts,$test); + # Extract from an offset $test = sprintf('%s %2s threads', 'bgzip -b', $threads ? 
$threads : 'no'); @@ -442,6 +475,50 @@ sub test_bgzip { return; } passed($opts,$test); + + # multi file test, expects compressed files from previous tests + # bgzip should return failure if both inputs not present + $test = sprintf('%s %2s threads', 'bgzip multifile', + $threads ? $threads : 'no'); + print "$test: "; + + #decompress and remove + $c = "$$opts{bin}/bgzip $at -d '$compressed' '$compressed_copy'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + #check both files present and matches or not + $c = "cmp '$data' '$uncompressed1'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, + $out ? $out : "'$data' '$uncompressed1' differ"); + return; + } + $c = "cmp '$data' '$uncompressed1_copy'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, + $out ? $out : "'$data' '$uncompressed1_copy' differ"); + return; + } + #compress and remove + $c = "$$opts{bin}/bgzip $at '$uncompressed1' '$uncompressed1_copy'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + #decompress again to ensure successful compression + $c = "$$opts{bin}/bgzip $at -d '$compressed' '$compressed_copy'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + passed($opts,$test); } my $test_view_failures; @@ -519,6 +596,32 @@ sub test_view my ($opts, $nthreads) = @_; my $tv_args = $nthreads ? 
"-\@$nthreads" : ""; + # Files appropriate for CRAM V3.1 and V4.0 testing + my %cram31 = ("auxf#values.sam" => 1, + "c1#pad3.sam" => 1, + "ce#5.sam" => 1, + "ce#1000.sam", => 1, + "ce#large_seq.sam", => 1, + "ce#supp.sam", => 1, + "xx#MD.sam", => 1, + "xx#blank.sam", => 1, + "xx#large_aux.sam", => 1, + "xx#pair.sam", => 1, + "xx#tlen.sam" => 1); + + # Files appropriate for CRAM multi-ref containers + my %cram_ms = ("ce#1000.sam" => 1, + "ce#5.sam" => 1, + "ce#5b.sam" => 1, + "ce#unmap.sam" => 1, + "ce#unmap1.sam" => 1, + "ce#unmap2.sam" => 1, + "xx#blank.sam" => 1, + "xx#minimal.sam" => 1, + "xx#tlen.sam" => 1, + "xx#tlen2.sam" => 1, + "xx#triplet.sam" => 1); + foreach my $sam (glob("*#*.sam")) { my ($base, $ref) = ($sam =~ /((.*)#.*)\.sam/); $ref .= ".fa"; @@ -535,9 +638,11 @@ sub test_view $test_view_failures = 0; # SAM -> BAM -> SAM - testv $opts, "./test_view $tv_args -S -b $sam > $bam"; - testv $opts, "./test_view $tv_args $bam > $bam.sam_"; - testv $opts, "./compare_sam.pl $sam $bam.sam_"; + if ($sam eq "ce#1000.sam") { + testv $opts, "./test_view $tv_args -S -b $sam > $bam"; + testv $opts, "./test_view $tv_args $bam > $bam.sam_"; + testv $opts, "./compare_sam.pl $sam $bam.sam_"; + } # SAM -> BAMu -> SAM testv $opts, "./test_view $tv_args -S -l0 -b $sam > $bam"; @@ -549,37 +654,52 @@ sub test_view testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; - # BAM -> CRAM2 -> BAM -> SAM - $cram = "$bam.cram"; - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=2.1 $bam > $cram"; - testv $opts, "./test_view $tv_args -b -D $cram > $cram.bam"; - testv $opts, "./test_view $tv_args $cram.bam > $cram.bam.sam_"; - testv $opts, "./compare_sam.pl $md $sam $cram.bam.sam_"; - # SAM -> CRAM3u -> SAM - $cram = "$base.tmp.cram"; - testv $opts, "./test_view $tv_args -t $ref -S -l0 -C -o VERSION=3.0 $sam > $cram"; - testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; - testv $opts, "./compare_sam.pl $md $sam 
$cram.sam_"; + if ($sam eq "ce#1000.sam") { + $cram = "$base.tmp.cram"; + testv $opts, "./test_view $tv_args -t $ref -S -l0 -C -o VERSION=3.0 $sam > $cram"; + testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + } - # BAM -> CRAM3 -> BAM -> SAM + # BAM -> CRAM3 -> SAM $cram = "$bam.cram"; testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 $bam > $cram"; - testv $opts, "./test_view $tv_args -b -D $cram > $cram.bam"; - testv $opts, "./test_view $tv_args $cram.bam > $cram.bam.sam_"; + testv $opts, "./test_view $tv_args $cram > $cram.bam.sam_"; testv $opts, "./compare_sam.pl $md $sam $cram.bam.sam_"; - # CRAM3 -> CRAM2 - $cram = "$base.tmp.cram"; - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=2.1 $cram > $cram.cram"; + # CRAM3 -> CRAM3 + multi-slice + if (exists($cram_ms{$sam}) && $nthreads > 0) { + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 -o seqs_per_slice=7 -o slices_per_container=5 $cram > $cram.ms"; + testv $opts, "./test_view $tv_args $cram.ms > $cram.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + } - # CRAM2 -> CRAM3 - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 $cram.cram > $cram"; + if (exists($cram31{$sam}) && $nthreads > 0) { + ## Experimental CRAM 3.1 support. + # SAM -> CRAM31 -> SAM + my @p = $sam eq "ce#1000.sam" + ? 
(qw/fast normal small archive/) + : (qw/archive/); + foreach my $profile (@p) { + $cram = "$base.tmp.cram"; + testv $opts, "./test_view $tv_args -t $ref -S -l7 -C -o VERSION=3.1 -o $profile $sam > $cram"; + testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + } - # CRAM3 -> CRAM3 + multi-slice - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 -o seqs_per_slice=7 -o slices_per_container=5 $cram.cram > $cram"; - testv $opts, "./test_view $tv_args $cram > $cram.sam_"; - testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + ## Experimental CRAM 4.0 support. + # SAM -> CRAM40 -> SAM + @p = $sam eq "ce#large_seq.sam" || $sam eq "xx#large_aux.sam" + ? (qw/fast normal small archive/) + : (qw/archive/); + foreach my $profile (@p) { + $cram = "$base.tmp.cram"; + testv $opts, "./test_view $tv_args -t $ref -S -l7 -C -o VERSION=4.0 -o $profile $sam > $cram"; + testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + } + } # Java pre-made CRAM -> SAM my $jcram = "${base}_java.cram"; @@ -599,6 +719,21 @@ sub test_view } } + # embed_ref=2 mode + print "test_view testing embed_ref=2:\n"; + $test_view_failures = 0; + my $ersam = "ce#1000.sam"; + my $ercram = "ce#1000_er.tmp.cram"; + my $ersam2 = "${ercram}.sam"; + testv $opts, "./test_view $tv_args -C -p $ercram $ersam"; + testv $opts, "./test_view $tv_args -p $ersam2 $ercram"; + testv $opts, "./compare_sam.pl $ersam $ersam2"; + if ($test_view_failures == 0) { + passed($opts, "embed_ref=2 tests"); + } else { + failed($opts, "embed_ref=2 tests", "$test_view_failures subtests failed"); + } + # BAM and CRAM range queries on prebuilt BAM and CRAM # The cram file has @SQ UR: set to point to an invalid location to # force the reference to be reloaded from the one given on the @@ -762,6 +897,14 @@ sub test_index unlink("$$opts{tmp}/index.sam.gz.bai"); test_compare($opts,"$$opts{path}/test_index -b 
$$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.bai", "$$opts{path}/index.sam.gz.bai"); + # SAM DOS LINE ENDINGS (\r\n) + test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -z -m 14 -x $$opts{tmp}/index.sam.gz.csi $$opts{path}/index_dos.sam > $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.csi", "$$opts{path}/index.sam.gz.csi", gz=>1); + unlink("$$opts{tmp}/index.bam.bai"); + test_compare($opts,"$$opts{path}/test_index -c $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.csi", "$$opts{path}/index.sam.gz.csi", gz=>1); + test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -z -m 0 -x $$opts{tmp}/index.sam.gz.bai $$opts{path}/index_dos.sam > $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.bai", "$$opts{path}/index.sam.gz.bai"); + unlink("$$opts{tmp}/index.sam.gz.bai"); + test_compare($opts,"$$opts{path}/test_index -b $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.bai", "$$opts{path}/index.sam.gz.bai"); + # CRAM local $ENV{REF_PATH} = $$opts{m5_dir}; test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -C -x $$opts{tmp}/index.cram.crai $$opts{path}/index.sam > $$opts{tmp}/index.cram", "$$opts{tmp}/index.cram.crai", "$$opts{path}/index.cram.crai", gz=>1); @@ -788,6 +931,31 @@ sub test_index $wtmp =~ s/\//\\\\/g; } test_cmd($opts,out=>'tabix.out',cmd=>"$$opts{bin}/tabix $wtmp/index.vcf.gz##idx##$wtmp/index.vcf.gz.tbi 1:10000060-10000060"); + + cmd("$$opts{path}/test_view -b -p $$opts{tmp}/index2.bam -x $$opts{tmp}/index2.bam.bai $$opts{path}/index2.sam"); + for (my $tid = 1; $tid <= 2; $tid++) { + for (my $pos = 1; $pos <= 2; $pos++) { + # All queries should return exactly two sequences. + # The input data consists of mapped/unmapped and unmapped/mapped + # in both orders. + # Done verbatim as test_cmd cannot return $out for us to check. 
+ my $test = "$$opts{path}/test_view $$opts{tmp}/index2.bam $tid:${pos}000000-${pos}000000"; + print "test_index:\n\t$test\n"; + my ($ret, $out) = _cmd($test); + if ($ret ne 0) { + failed($opts, $test); + } else { + my $rnum = ($out =~ s/^[^@].*\n//gm); + if ($rnum ne 2) { + failed($opts, $test); + } else { + passed($opts, $test); + } + } + } + } + unlink("$$opts{tmp}/index2.bam"); + unlink("$$opts{tmp}/index2.bam.bai"); } sub test_bcf2vcf @@ -825,6 +993,13 @@ sub test_vcf_various cmd => "$$opts{bin}/htsfile -c $$opts{path}/noroundtrip.vcf"); test_cmd($opts, %args, out => "formatmissing-out.vcf", cmd => "$$opts{bin}/htsfile -c $$opts{path}/formatmissing.vcf"); + test_cmd($opts, %args, out => "vcf_meta_meta.vcf", + cmd => "$$opts{bin}/htsfile -c $$opts{path}/vcf_meta_meta.vcf"); + + # VCF file with contig IDX=1, simulating an edited BCF file + # See htslib issue 1534 + test_cmd($opts, %args, out => "modhdr.expected.vcf", + cmd => "$$opts{path}/test_view $$opts{path}/modhdr.vcf.gz chr22:1-2"); } sub write_multiblock_bgzf { @@ -895,7 +1070,7 @@ sub test_bcf_sr_sort my ($opts, %args) = @_; for (my $i=0; $i<10; $i++) { - my $seed = int(rand(time)); + my $seed = int(rand(100000000)); my $test = 'test-bcf-sr'; my $cmd = "$$opts{path}/test-bcf-sr.pl -t $$opts{tmp} -s $seed"; print "$test:\n"; @@ -906,6 +1081,113 @@ sub test_bcf_sr_sort } } +sub test_bcf_sr_no_index { + my ($opts) = @_; + + my $test = "test_bcf_sr_no_index"; + + my $vcfdir = "$$opts{path}/bcf-sr"; + + # Positive test + test_cmd($opts, out => "bcf-sr/merge.noidx.abc.expected.out", + cmd => "$$opts{path}/test-bcf-sr --no-index -p all --args $vcfdir/merge.noidx.a.vcf $vcfdir/merge.noidx.b.vcf $vcfdir/merge.noidx.c.vcf 2> $$opts{tmp}/no_index_1.err"); + + # Check bad input detection + + my @bad_file_tests = (["out-of-order header", + ["merge.noidx.a.vcf", "merge.noidx.hdr_order.vcf"]], + ["out-of-order records", + ["merge.noidx.a.vcf", "merge.noidx.rec_order.vcf"]], + ["out-of-order records", + 
["merge.noidx.rec_order.vcf", "merge.noidx.a.vcf"]]); + my $count = 2; + foreach my $test_params (@bad_file_tests) { + my ($badness, $inputs) = @$test_params; + my @ins = map { "$vcfdir/$_" } @$inputs; + + my $cmd = "$$opts{path}/test-bcf-sr --no-index -p all --args @ins > $$opts{tmp}/no_index_$count.out 2> $$opts{tmp}/no_index_$count.err"; + print "$test:\n\t$cmd (expected fail)\n"; + my ($ret) = _cmd($cmd); + if ($ret == 0) { + failed($opts, $test, "Failed to detect $badness: $cmd\n"); + } else { + passed($opts, $test); + } + $count++; + } +} + +sub test_bcf_sr_range { + my ($opts) = @_; + + my $test = "test_bcf_sr_range"; + + my $vcfdir = "$$opts{path}/bcf-sr"; + + my @tests = (['r', '1', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['r', '1:1-2', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['r', '1:1,1:2', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['r', '1:1-1', 'weird-chr-names.vcf', 'weird-chr-names.2.out'], + ['r', '{1:1}', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['r', '{1:1}:1-2', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['r', '{1:1}:1,{1:1}:2', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['r', '{1:1}:1-1', 'weird-chr-names.vcf', 'weird-chr-names.4.out'], + ['r', '{1:1-1}', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['r', '{1:1-1}:1-2', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['r', '{1:1-1}:1,{1:1-1}:2', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['r', '{1:1-1}:1-1', 'weird-chr-names.vcf', 'weird-chr-names.6.out'], + ['r', '{1:1-1}-2', 'weird-chr-names.vcf', undef], # Expected failure + ['t', '1', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['t', '1:1-2', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['t', '1:1,1:2', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['t', '1:1-1', 'weird-chr-names.vcf', 'weird-chr-names.2.out'], + ['t', '{1:1}', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['t', '{1:1}:1-2', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + 
['t', '{1:1}:1,{1:1}:2', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['t', '{1:1}:1-1', 'weird-chr-names.vcf', 'weird-chr-names.4.out'], + ['t', '{1:1-1}', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['t', '{1:1-1}:1-2', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['t', '{1:1-1}:1,{1:1-1}:2', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['t', '{1:1-1}:1-1', 'weird-chr-names.vcf', 'weird-chr-names.6.out'], + ['t', '{1:1-1}-2', 'weird-chr-names.vcf', undef] # Expected failure + ); + + my $count = 0; + my %converted; + foreach my $tst (@tests) { + my ($option, $range, $in, $exp_out) = @$tst; + $count++; + if (!$converted{$in}) { + my $cmd = "$$opts{path}/test_view -b -p $$opts{tmp}/$in.bcf -x $$opts{tmp}/$in.bcf.csi $vcfdir/$in"; + print "$test:\n\t$cmd\n"; + my ($ret) = _cmd($cmd); + if ($ret) { + failed($opts, $test); + $converted{$in} = 'fail'; + next; + } else { + passed($opts, $test); + $converted{$in} = "$$opts{tmp}/$in.bcf"; + } + } + next if ($converted{$in} eq 'fail'); + my $cmd = "$$opts{path}/test-bcf-sr -O vcf -o $$opts{tmp}/range_test_$count.out.vcf -$option '$range' --args $converted{$in}"; + if ($exp_out) { + test_compare($opts, $cmd, "$vcfdir/$exp_out", + "$$opts{tmp}/range_test_$count.out.vcf", + fix_newlines => 1); + } else { + print "$test:\n\t$cmd (expected fail)\n"; + my ($ret) = _cmd($cmd); + if ($ret) { + passed($opts, $test); + } else { + failed($opts, $test); + } + } + } +} + sub test_command { my ($opts, %args) = @_; @@ -928,6 +1210,26 @@ sub test_logging else { passed($opts,$test); } } +sub test_plugin_loading { + my ($opts) = @_; + + my $test = "test_plugin_loading"; + + unless (-e "$$opts{bin}/hfile_libcurl.so" || -e "$$opts{bin}/hfile_libcurl.bundle") { + print "$test: .. 
skipping\n\n"; + return; + } + + # Test that plugins can be loaded from an executable statically linked to libhts.a + my $url = "https://localhost:99999/invalid_port"; + my $cmd = "HTS_PATH=$$opts{bin} $$opts{path}/with-shlib.sh $$opts{bin}/htsfile $url"; + print "$test:\n\t$cmd\n"; + my ($ret, $out) = _cmd("$cmd 2>&1"); + if ($ret == 0) { failed($opts, $test, "successful exit status"); } + elsif ($out =~ /couldn't register/i || $out =~ /not supported/i) { failed($opts, $test, $out); } + else { passed($opts, $test); } +} + sub test_realn { my ($opts) = @_; @@ -952,4 +1254,21 @@ sub test_realn { # Revert quality values (using data in ZQ tags) test_cmd($opts, cmd => "$test_realn -f $$opts{path}/realn02.fa -i $$opts{path}/realn02_exp-a.sam -o -", out => "realn02_exp.sam"); + + # Make sure multiple matches are treated the same way as a single match of the same length. + test_cmd($opts, cmd => "$test_realn -f $$opts{path}/realn03.fa -e -i $$opts{path}/realn03.sam -o -", out => "realn03_exp.sam"); +} + +sub test_bcf_set_variant_type +{ + my ($opts) = @_; + my $test = 'test-bcf_set_variant_type'; + my $cmd = "$$opts{path}/test-bcf_set_variant_type"; + print "$test:\n"; + print "\t$cmd\n"; + my ($ret,$out) = _cmd($cmd); + if ( $ret ) { + print $out; + failed($opts,$test); + } else { passed($opts,$test); } } diff --git a/test/test_bgzf.c b/test/test_bgzf.c index 90ec167ac..6cb6db902 100644 --- a/test/test_bgzf.c +++ b/test/test_bgzf.c @@ -1,6 +1,6 @@ /* test/test_bgzf.c -- bgzf unit tests - Copyright (C) 2017, 2019 Genome Research Ltd + Copyright (C) 2017, 2019, 2022-2023 Genome Research Ltd Author: Robert Davies @@ -37,6 +37,7 @@ DEALINGS IN THE SOFTWARE. 
#include "../htslib/bgzf.h" #include "../htslib/hfile.h" +#include "../htslib/hts_log.h" #include "../hfile_internal.h" const char *bgzf_suffix = ".gz"; @@ -159,13 +160,19 @@ static BGZF * try_bgzf_hopen(const char *name, const char *mode, return bgz; } -static int try_bgzf_close(BGZF **bgz, const char *name, const char *func) { +static int try_bgzf_close(BGZF **bgz, const char *name, const char *func, int expected_fail) { BGZF *to_close = *bgz; *bgz = NULL; if (bgzf_close(to_close) != 0) { - fprintf(stderr, "%s : bgzf_close failed on %s : %s\n", - func, name, strerror(errno)); + if (!expected_fail) + fprintf(stderr, "%s : bgzf_close failed on %s%s%s\n", + func, name, + errno ? " : " : "", + errno ? strerror(errno) : ""); return -1; + } else if (expected_fail) { + fprintf(stderr, "%s : bgzf_close worked on %s, but expected failure\n", + func, name); } return 0; } @@ -398,6 +405,7 @@ static int test_read(Files *f) { ssize_t bg_got, f_got; unsigned char bg_buf[BUFSZ], f_buf[BUFSZ]; + errno = 0; bgz = try_bgzf_open(f->src_bgzf, "r", __func__); if (!bgz) return -1; @@ -414,7 +422,7 @@ static int test_read(Files *f) { } } while (bg_got > 0 && f_got > 0); - if (try_bgzf_close(&bgz, f->src_bgzf, __func__) != 0) return -1; + if (try_bgzf_close(&bgz, f->src_bgzf, __func__, 0) != 0) return -1; if (try_fseek_start(f->f_plain, f->src_plain, __func__) != 0) return -1; return 0; @@ -449,7 +457,7 @@ static int test_write_read(Files *f, const char *mode, Open_method method, bg_put = try_bgzf_write(bgz, f->text, f->ltext, f->tmp_bgzf, __func__); if (bg_put < 0) goto fail; - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; switch (method) { case USE_BGZF_DOPEN: @@ -491,7 +499,7 @@ static int test_write_read(Files *f, const char *mode, Open_method method, goto fail; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto 
fail; return 0; @@ -521,7 +529,7 @@ static int test_embed_eof(Files *f, const char *mode, int nthreads) { bg_put = try_bgzf_write(bgz, f->text, half, f->tmp_bgzf, __func__); if (bg_put < 0) goto fail; - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; // Write second half. Append mode, so an EOF block should be in the @@ -535,7 +543,7 @@ static int test_embed_eof(Files *f, const char *mode, int nthreads) { __func__); if (bg_put < 0) goto fail; - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; // Try reading pos = 0; @@ -564,7 +572,7 @@ static int test_embed_eof(Files *f, const char *mode, int nthreads) { goto fail; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; return 0; @@ -601,7 +609,7 @@ static int test_index_load_dump(Files *f) { } while (got_src > 0 && got_dest > 0); if (try_fclose(&fdest, f->tmp_idx, __func__) != 0) goto fail; - if (try_bgzf_close(&bgz, f->src_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->src_bgzf, __func__, 0) != 0) goto fail; return 0; @@ -624,7 +632,7 @@ static int test_check_EOF(char *name, int expected) { return -1; } - return try_bgzf_close(&bgz, name, __func__); + return try_bgzf_close(&bgz, name, __func__, 0); } static int test_index_useek_getc(Files *f, const char *mode, @@ -651,7 +659,7 @@ static int test_index_useek_getc(Files *f, const char *mode, } } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); if (!bgz) goto fail; @@ -710,7 +718,7 @@ static int test_index_useek_getc(Files *f, const char *mode, } } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 
0) != 0) goto fail; return 0; @@ -741,7 +749,7 @@ static int test_tell_seek_getc(Files *f, const char *mode, if (bg_put < 0) goto fail; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); if (!bgz) goto fail; @@ -811,7 +819,7 @@ static int test_tell_seek_getc(Files *f, const char *mode, } } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; return 0; @@ -841,7 +849,7 @@ static int test_tell_read(Files *f, const char *mode) { if (bg_put < 0) goto fail; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); if (!bgz) goto fail; @@ -859,7 +867,7 @@ static int test_tell_read(Files *f, const char *mode) { } } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; free(bg_buf); return 0; @@ -885,11 +893,13 @@ static int test_bgzf_getline(Files *f, const char *mode, int nthreads) { bg_put = try_bgzf_write(bgz, f->text, f->ltext, f->tmp_bgzf, __func__); if (bg_put < 0) goto fail; - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); if (!bgz) goto fail; + if (nthreads > 0 && try_bgzf_mt(bgz, nthreads, __func__) != 0) goto fail; + for (pos = 0; pos < f->ltext; ) { const char *end = strchr(text + pos, '\n'); size_t l = end ? 
end - (text + pos) : f->ltext - pos; @@ -909,12 +919,13 @@ static int test_bgzf_getline(Files *f, const char *mode, int nthreads) { "Got : %.*s\n", __func__, f->tmp_bgzf, (int) l, (char *) f->text + pos, (int) str.l, str.s); + goto fail; } pos += l + 1; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; free(ks_release(&str)); return 0; @@ -924,6 +935,98 @@ static int test_bgzf_getline(Files *f, const char *mode, int nthreads) { return -1; } +static int test_bgzf_getline_on_truncated_file(Files *f, const char *mode, int nthreads) { + BGZF* bgz = NULL; + ssize_t bg_put; + size_t pos; + kstring_t str = { 0, 0, NULL }; + const char *text = (const char *) f->text; + + // Turn off bgzf errors as they're expected. + enum htsLogLevel lvl = hts_get_log_level(); + hts_set_log_level(HTS_LOG_OFF); + + bgz = try_bgzf_open(f->tmp_bgzf, mode, __func__); + if (!bgz) goto fail; + + if (nthreads > 0 && try_bgzf_mt(bgz, nthreads, __func__) != 0) goto fail; + + const char *text_line2 = strchr(text, '\n') + 1; + bg_put = try_bgzf_write(bgz, text, text_line2 - text, f->tmp_bgzf, __func__); + if (bg_put < 0) goto fail; + if (bgzf_flush(bgz) < 0) goto fail; + int64_t block2_start = bgz->block_address; + + const char *text_line3 = strchr(text_line2, '\n') + 1; + bg_put = try_bgzf_write(bgz, text_line2, text_line3 - text_line2, f->tmp_bgzf, __func__); + if (bg_put < 0) goto fail; + if (bgzf_flush(bgz) < 0) goto fail; + int64_t block3_start = bgz->block_address; + + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; + + int64_t newsize; + for(newsize = block3_start - 1; newsize > block2_start; newsize--) { + //fprintf(stderr, "test_bgzf_getline_on_truncated_file : size truncated to %" PRId64 " with threads %d\n", newsize, nthreads); + + if (truncate(f->tmp_bgzf, newsize) != 0) goto fail; + + bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); + if (!bgz) goto fail; + + if (nthreads > 0 && 
try_bgzf_mt(bgz, nthreads, __func__) != 0) goto fail; + + for (pos = 0; pos < f->ltext; ) { + const char *end = strchr(text + pos, '\n'); + size_t l = end ? end - (text + pos) : f->ltext - pos; + + int res = bgzf_getline(bgz, '\n', &str); + if (res < -1) { + // ok, we expect error from truncated file + break; + } else if (res == -1) { + // truncated file should never return EOF since we do not truncate at block boundary + fprintf(stderr, "%s : %s from bgzf_getline on %s\n", + __func__, "Unexpected EOF", + f->tmp_bgzf); + goto fail; + } + + if (str.l != l || memcmp(text + pos, str.s, l) != 0) { + fprintf(stderr, + "%s : Unexpected data from bgzf_getline on %s\n" + "Expected : %.*s\n" + "Got : %.*s\n", + __func__, f->tmp_bgzf, (int) l, (char *) f->text + pos, + (int) str.l, str.s); + goto fail; + } + pos += l + 1; + } + + // verify error is persistent + int k; + for(k = 0; k < 3; k++) { + int res = bgzf_getline(bgz, '\n', &str); + if (res > -2) { + fprintf(stderr, "%s : unexpected bgzf_getline result %d\n", __func__, res); + goto fail; + } + } + // closing a stream with error returns error + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 1) == 0) goto fail; + } + free(ks_release(&str)); + hts_set_log_level(lvl); + return 0; + + fail: + hts_set_log_level(lvl); + if (bgz) bgzf_close(bgz); + free(ks_release(&str)); + return -1; +} + int main(int argc, char **argv) { Files f = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0 }; int retval = EXIT_FAILURE; @@ -1000,6 +1103,10 @@ int main(int argc, char **argv) { if (test_bgzf_getline(&f, "w", 1) != 0) goto out; if (test_bgzf_getline(&f, "w", 2) != 0) goto out; + if (test_bgzf_getline_on_truncated_file(&f, "w", 0) != 0) goto out; + if (test_bgzf_getline_on_truncated_file(&f, "w", 1) != 0) goto out; + if (test_bgzf_getline_on_truncated_file(&f, "w", 2) != 0) goto out; + retval = EXIT_SUCCESS; out: diff --git a/test/test_expr.c b/test/test_expr.c new file mode 100644 index 000000000..ecd1232e4 --- /dev/null +++ 
b/test/test_expr.c @@ -0,0 +1,366 @@ +/* test-expr.c -- Testing: filter expression parsing and processing. + + Copyright (C) 2020, 2022 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#include + +#include +#include +#include "../htslib/hts_expr.h" + +int lookup(void *data, char *str, char **end, hts_expr_val_t *res) { + int foo = 15551; // my favourite palindromic prime + int a = 1; + int b = 2; + int c = 3; + res->is_str = 0; + if (strncmp(str, "foo", 3) == 0) { + *end = str+3; + res->d = foo; + } else if (*str == 'a') { + *end = str+1; + res->d = a; + } else if (*str == 'b') { + *end = str+1; + res->d = b; + } else if (*str == 'c') { + *end = str+1; + res->d = c; + } else if (strncmp(str, "magic", 5) == 0) { + // non-empty string + *end = str+5; + res->is_str = 1; + kputs("plugh", ks_clear(&res->s)); + } else if (strncmp(str, "empty-but-true", 14) == 0) { + // empty string + *end = str+14; + res->is_true = 1; + res->is_str = 1; + kputs("", ks_clear(&res->s)); + } else if (strncmp(str, "empty", 5) == 0) { + // empty string + *end = str+5; + res->is_str = 1; + kputs("", ks_clear(&res->s)); + } else if (strncmp(str, "zero-but-true", 13) == 0) { + *end = str+13; + res->d = 0; + res->is_true = 1; + } else if (strncmp(str, "null-but-true", 13) == 0) { + *end = str+13; + hts_expr_val_undef(res); + res->is_true = 1; + } else if (strncmp(str, "null", 4) == 0) { + // null string (eg aux:Z tag is absent) + *end = str+4; + hts_expr_val_undef(res); + } else if (strncmp(str, "nan", 3) == 0) { + // sqrt(-1), 0/0 and similar + // Semantically the same operations as null. + *end = str+3; + hts_expr_val_undef(res); + + } else { + return -1; + } + + return 0; +} + +typedef struct { + int truth_val; + double dval; + char *sval; + char *str; +} test_ev; + +static inline int strcmpnull(const char *a, const char *b) { + if (!a && !b) return 0; + if (!a && b) return -1; + if (a && !b) return 1; + return strcmp(a, b); +} + +// Compare NAN as equal, for testing we returned the correct values +static inline int cmpfloat(double d1, double d2) { + // If needs be, can use DBL_EPSILON in comparisons here. 
+ return d1 == d2 || (isnan(d1) && isnan(d2)); +} + +int test(void) { + // These are all valid expressions that should work + test_ev tests[] = { + { 1, 1, NULL, "1"}, + { 1, 1, NULL, "+1"}, + { 1, -1, NULL, "-1"}, + { 0, 0, NULL, "!7"}, + { 1, 1, NULL, "!0"}, + { 1, 1, NULL, "!(!7)"}, + { 1, 1, NULL, "!!7"}, + + { 1, 5, NULL, "2+3"}, + { 1, -1, NULL, "2+-3"}, + { 1, 6, NULL, "1+2+3"}, + { 1, 1, NULL, "-2+3"}, + { 0, NAN, NULL, "1+null" }, + { 0, NAN, NULL, "null-1" }, + { 0, NAN, NULL, "-null" }, + + { 1, 6, NULL, "2*3"}, + { 1, 6, NULL, "1*2*3"}, + { 0, 0, NULL, "2*0"}, + + { 1, 7, NULL, "(7)"}, + { 1, 7, NULL, "((7))"}, + { 1, 21, NULL, "(1+2)*(3+4)"}, + { 1, 14, NULL, "(4*5)-(-2*-3)"}, + + { 0, NAN, NULL, "2*null"}, + { 0, NAN, NULL, "null/2"}, + { 0, NAN, NULL, "0/0"}, + + { 1, 1, NULL, "(1+2)*3==9"}, + { 1, 1, NULL, "(1+2)*3!=8"}, + { 0, 0, NULL, "(1+2)*3!=9"}, + { 0, 0, NULL, "(1+2)*3==8"}, + + { 0, 0, NULL, "1>2"}, + { 1, 1, NULL, "1<2"}, + { 0, 0, NULL, "3<3"}, + { 0, 0, NULL, "3>3"}, + { 1, 1, NULL, "9<=9"}, + { 1, 1, NULL, "9>=9"}, + { 1, 1, NULL, "2*4==8"}, + { 1, 1, NULL, "16==0x10"}, + { 1, 1, NULL, "15<0x10"}, + { 1, 1, NULL, "17>0x10"}, + { 0, 0, NULL, "2*4!=8"}, + { 1, 1, NULL, "4+2<3+4"}, + { 0, 0, NULL, "4*2<3+4"}, + { 1, 8, NULL, "4*(2<3)+4"}, // boolean; 4*(1)+4 + + { 1, 1, NULL, "(1<2) == (3>2)"}, + { 1, 1, NULL, "1<2 == 3>2"}, + + { 0, NAN, NULL, "null <= 0" }, + { 0, NAN, NULL, "null >= 0" }, + { 0, NAN, NULL, "null < 0" }, + { 0, NAN, NULL, "null > 0" }, + { 0, NAN, NULL, "null == null" }, + { 0, NAN, NULL, "null != null" }, + { 0, NAN, NULL, "null < 10" }, + { 0, NAN, NULL, "10 > null" }, + + { 1, 1, NULL, "2 && 1"}, + { 0, 0, NULL, "2 && 0"}, + { 0, 0, NULL, "0 && 2"}, + { 1, 1, NULL, "2 || 1"}, + { 1, 1, NULL, "2 || 0"}, + { 1, 1, NULL, "0 || 2"}, + { 1, 1, NULL, "1 || 2 && 3"}, + { 1, 1, NULL, "2 && 3 || 1"}, + { 1, 1, NULL, "0 && 3 || 2"}, + { 0, 0, NULL, "0 && 3 || 0"}, + { 0, 0, NULL, " 5 - 5 && 1"}, + { 0, 0, NULL, "+5 - 5 && 1"}, + 
{ 0, 0, NULL, "null && 1"}, // null && x == null + { 0, 0, NULL, "1 && null"}, + { 1, 1, NULL, "!null && 1"}, + { 1, 1, NULL, "1 && !null"}, + { 1, 1, NULL, "1 && null-but-true"}, + { 0, 0, NULL, "null || 0"}, // null || 0 == null + { 0, 0, NULL, "0 || null"}, + { 1, 1, NULL, "!null || 0"}, + { 1, 1, NULL, "0 || !null"}, + { 1, 1, NULL, "0 || null-but-true"}, + { 1, 1, NULL, "null || 1"}, // null || 1 == 1 + { 1, 1, NULL, "1 || null"}, + + { 1, 1, NULL, "3 & 1"}, + { 1, 2, NULL, "3 & 2"}, + { 1, 3, NULL, "1 | 2"}, + { 1, 3, NULL, "1 | 3"}, + { 1, 7, NULL, "1 | 6"}, + { 1, 2, NULL, "1 ^ 3"}, + { 0, NAN, NULL, "1 | null"}, + { 0, NAN, NULL, "null | 1"}, + { 0, NAN, NULL, "1 & null"}, + { 0, NAN, NULL, "null & 1"}, + { 0, NAN, NULL, "0 ^ null"}, + { 0, NAN, NULL, "null ^ 0"}, + { 0, NAN, NULL, "1 ^ null"}, + { 0, NAN, NULL, "null ^ 1"}, + + { 1, 1, NULL, "(1^0)&(4^3)"}, + { 1, 2, NULL, "1 ^(0&4)^ 3"}, + { 1, 2, NULL, "1 ^ 0&4 ^ 3"}, // precedence, & before ^ + + { 1, 6, NULL, "(1|0)^(4|3)"}, + { 1, 7, NULL, "1 |(0^4)| 3"}, + { 1, 7, NULL, "1 | 0^4 | 3"}, // precedence, ^ before | + + { 1, 1, NULL, "4 & 2 || 1"}, + { 1, 1, NULL, "(4 & 2) || 1"}, + { 0, 0, NULL, "4 & (2 || 1)"}, + { 1, 1, NULL, "1 || 4 & 2"}, + { 1, 1, NULL, "1 || (4 & 2)"}, + { 0, 0, NULL, "(1 || 4) & 2"}, + + { 1, 1, NULL, " (2*3)&7 > 4"}, + { 0, 0, NULL, " (2*3)&(7 > 4)"}, // C precedence equiv + { 1, 1, NULL, "((2*3)&7) > 4"}, // Python precedence equiv + { 1, 1, NULL, "((2*3)&7) > 4 && 2*2 <= 4"}, + + { 1, 1, "plugh", "magic"}, + { 1, 1, "", "empty"}, + { 1, 1, NULL, "magic == \"plugh\""}, + { 1, 1, NULL, "magic != \"xyzzy\""}, + + { 1, 1, NULL, "\"abc\" < \"def\""}, + { 1, 1, NULL, "\"abc\" <= \"abc\""}, + { 0, 0, NULL, "\"abc\" < \"ab\""}, + { 0, 0, NULL, "\"abc\" <= \"ab\""}, + + { 0, 0, NULL, "\"abc\" > \"def\""}, + { 1, 1, NULL, "\"abc\" >= \"abc\""}, + { 1, 1, NULL, "\"abc\" > \"ab\""}, + { 1, 1, NULL, "\"abc\" >= \"ab\""}, + + { 0, NAN, NULL, "null == \"x\"" }, + { 0, NAN, NULL, "null != 
\"x\"" }, + { 0, NAN, NULL, "null < \"x\"" }, + { 0, NAN, NULL, "null > \"x\"" }, + + { 1, 1, NULL, "\"abbc\" =~ \"^a+b+c+$\""}, + { 0, 0, NULL, "\"aBBc\" =~ \"^a+b+c+$\""}, + { 1, 1, NULL, "\"aBBc\" !~ \"^a+b+c+$\""}, + { 1, 1, NULL, "\"xyzzy plugh abracadabra\" =~ magic"}, + + { 1, 1, "", "empty-but-true" }, + { 0, 0, NULL, "!empty-but-true" }, + { 1, 1, NULL, "!!empty-but-true" }, + { 1, 1, NULL, "1 && empty-but-true && 1" }, + { 0, 0, NULL, "1 && empty-but-true && 0" }, + + { 0, NAN, NULL, "null" }, + { 1, 1, NULL, "!null" }, + { 0, 0, NULL, "!!null", }, + { 0, 0, NULL, "!\"foo\"" }, + { 1, 1, NULL, "!!\"foo\"" }, + + { 1, NAN, NULL, "null-but-true" }, + { 0, 0, NULL, "!null-but-true" }, + { 1, 1, NULL, "!!null-but-true" }, + { 1, 0, NULL, "zero-but-true" }, + { 0, 0, NULL, "!zero-but-true" }, + { 1, 1, NULL, "!!zero-but-true" }, + + { 1, log(2), NULL, "log(2)"}, + { 1, exp(9), NULL, "exp(9)"}, + { 1, 9, NULL, "log(exp(9))"}, + { 1, 8, NULL, "pow(2,3)"}, + { 1, 3, NULL, "sqrt(9)"}, + { 0, NAN, NULL, "sqrt(-9)"}, + + { 1, 2, NULL, "default(2,3)"}, + { 1, 3, NULL, "default(null,3)"}, + { 0, 0, NULL, "default(null,0)"}, + { 1, NAN, NULL, "default(null-but-true,0)"}, + { 1, NAN, NULL, "default(null-but-true,null)"}, + { 1, NAN, NULL, "default(null,null-but-true)"}, + + { 1, 1, NULL, "exists(\"foo\")"}, + { 1, 1, NULL, "exists(12)"}, + { 1, 1, NULL, "exists(\"\")"}, + { 1, 1, NULL, "exists(0)"}, + { 0, 0, NULL, "exists(null)"}, + { 1, 1, NULL, "exists(null-but-true)"}, + }; + + int i, res = 0; + hts_expr_val_t r = HTS_EXPR_VAL_INIT; + for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) { + hts_filter_t *filt = hts_filter_init(tests[i].str); + if (!filt) + return 1; + if (hts_filter_eval2(filt, NULL, lookup, &r)) { + fprintf(stderr, "Failed to parse filter string %s\n", + tests[i].str); + res = 1; + hts_filter_free(filt); + continue; + } + + if (!hts_expr_val_exists(&r)) { + if (r.is_true != tests[i].truth_val || + !cmpfloat(r.d, tests[i].dval)) { + fprintf(stderr, 
+ "Failed test: \"%s\" == \"%f\", got %s, \"%s\", %f\n", + tests[i].str, tests[i].dval, + r.is_true ? "true" : "false", r.s.s, r.d); + res = 1; + } + } else if (r.is_str && (strcmpnull(r.s.s, tests[i].sval) != 0 + || !cmpfloat(r.d, tests[i].dval) + || r.is_true != tests[i].truth_val)) { + fprintf(stderr, + "Failed test: \"%s\" == \"%s\", got %s, \"%s\", %f\n", + tests[i].str, tests[i].sval, + r.is_true ? "true" : "false", r.s.s, r.d); + res = 1; + } else if (!r.is_str && (!cmpfloat(r.d, tests[i].dval) + || r.is_true != tests[i].truth_val)) { + fprintf(stderr, "Failed test: %s == %f, got %s, %f\n", + tests[i].str, tests[i].dval, + r.is_true ? "true" : "false", r.d); + res = 1; + } + + hts_expr_val_free(&r); + hts_filter_free(filt); + } + + return res; +} + +int main(int argc, char **argv) { + if (argc > 1) { + hts_expr_val_t v = HTS_EXPR_VAL_INIT; + hts_filter_t *filt = hts_filter_init(argv[1]); + if (hts_filter_eval2(filt, NULL, lookup, &v)) + return 1; + + printf("%s\t", v.is_true ? "true":"false"); + + if (v.is_str) + puts(v.s.s); + else + printf("%g\n", v.d); + + hts_expr_val_free(&v); + hts_filter_free(filt); + return 0; + } + + return test(); +} diff --git a/test/test_faidx.c b/test/test_faidx.c new file mode 100644 index 000000000..566149071 --- /dev/null +++ b/test/test_faidx.c @@ -0,0 +1,516 @@ +/* test/test_fadix.c -- Test faidx interfaces + + Copyright (C) 2022 Genome Research Ltd. 
+ + Author: Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#include + +#include +#include +#include + +#include "../htslib/faidx.h" + +int file_compare(const char *file1, const char *file2) { + FILE *f1 = NULL; + FILE *f2 = NULL; + unsigned int lno = 1; + size_t got1, got2, i; + char buf1[1024], buf2[1024]; + int ret = -1; + + f1 = fopen(file1, "rb"); + if (!f1) { + perror(file1); + goto out; + } + f2 = fopen(file2, "rb"); + if (!f2) { + perror(file2); + goto out; + } + + do { + got1 = fread(buf1, 1, sizeof(buf1), f1); + got2 = fread(buf2, 1, sizeof(buf2), f2); + + for (i = 0; i < got1 && i < got2 && buf1[i] == buf2[i]; i++) + lno += (buf1[i] == '\n'); + if (i < got1 || i < got2) { + fprintf(stderr, "%s and %s differ at line %u\n", + file1, file2, lno); + goto out; + } + } while (got1 > 0 && got2 > 0); + + if (ferror(f1)) { + perror(file1); + goto out; + } + if (ferror(f2)) { + perror(file2); + goto out; + } + + if (got1 > 0 || got2 > 0) { + fprintf(stderr, "EOF on %s at line %u\n", + got1 ? file2 : file1, lno); + goto out; + } + + ret = 0; + out: + if (f1) fclose(f1); + if (f2) fclose(f2); + return ret; +} + +faidx_t * load_index(const char *fn, const char *fnfai, const char *fngzi, + int flags, enum fai_format_options format) { + faidx_t *fai = fai_load3_format(fn, fnfai, fngzi, flags, format); + if (!fai) { + fprintf(stderr, "Failed: fai_load3(%s, %s, %s, %d, %d)\n", + fn, fnfai ? fnfai : "NULL", fngzi ? 
fngzi : "NULL", flags, + (int) format); + return NULL; + } + return fai; +} + +int do_retrieval(const char *fn, const char *fnfai, const char *fngzi, + int flags, enum fai_format_options format, const char *fnout, + const char *interface, int nreg, char **regions) { + int i, use_64bit = 1, use_parse_reg = 0, use_adjust_reg = 0; + faidx_t *fai = NULL; + FILE *out = stdout; + + if (interface) { + if (strcmp(interface, "fai_fetch") == 0) { + use_64bit = 0; + } else if (strcmp(interface, "faidx_fetch_seq") == 0) { + use_64bit = 0; + use_parse_reg = 1; + } else if (strcmp(interface, "faidx_fetch_seq64") == 0 + || strcmp(interface, "fai_parse_region") == 0) { + use_parse_reg = 1; + } else if (strcmp(interface, "fai_adjust_region") == 0) { + use_parse_reg = 1; + use_adjust_reg = 1; + } + } + + if (fnout) { + out = fopen(fnout, "wb"); + if (!out) { + perror(fnout); + return -1; + } + } + + fai = load_index(fn, fnfai, fngzi, flags, format); + if (!fai) + goto fail; + + for (i = 0; i < nreg; i++) { + hts_pos_t len = 0, pos, beg = 0, end = 0; + int tid = 0; + char *seq = NULL; + size_t l; + + if (use_parse_reg) { + const char *e = fai_parse_region(fai, regions[i], + &tid, &beg, &end, 0); + if (e == NULL) { + fprintf(stderr, "Failed: " + "fai_parse_region(fai, %s, &tid, &beg, &end, 0)\n", + regions[i]); + goto fail; + } + if (use_adjust_reg) { + hts_pos_t orig_beg = beg, orig_end = end; + int r = fai_adjust_region(fai, tid, &beg, &end); + if (r < 0 + || (((r & 1) != 0) ^ (beg != orig_beg)) + || (((r & 2) != 0) ^ (end != orig_end))) { + fprintf(stderr, "Failed: fai_adjust_region(fai, %d, " + "%"PRIhts_pos", %"PRIhts_pos") returned %d\n" + "After: beg = %"PRIhts_pos" end = %"PRIhts_pos"\n", + tid, orig_beg, orig_end, r, beg, end); + goto fail; + } + } + if (use_64bit) { + seq = faidx_fetch_seq64(fai, faidx_iseq(fai, tid), + beg, end - 1, &len); + } else { + int ilen = 0; + seq = faidx_fetch_seq(fai, faidx_iseq(fai, tid), + beg, end - 1, &ilen); + len = ilen; + } + if (!seq) { + 
fprintf(stderr, "Failed: faidx_fetch_seq%s(fai, %s, " + "%"PRIhts_pos", %"PRIhts_pos", &len)\n", + use_64bit ? "64" : "", faidx_iseq(fai, tid), beg, end); + goto fail; + } + } else { + if (use_64bit) { + seq = fai_fetch64(fai, regions[i], &len); + } else { + int ilen = 0; + seq = fai_fetch(fai, regions[i], &ilen); + len = ilen; + } + if (!seq) { + fprintf(stderr, "Failed: fai_fetch%s(fai, %s, &len)\n", + use_64bit ? "64" : "", regions[i]); + goto fail; + } + } + + l = strlen(seq); + fprintf(out, "%c%s length: %"PRIhts_pos"\n", + format == FAI_FASTQ ? '@' : '>', regions[i], len); + for (pos = 0; pos < l; pos += 50) { + fprintf(out, "%.*s\n", 50, seq + pos); + } + free(seq); + if (format == FAI_FASTQ) { + hts_pos_t qual_len = 0; + char *qual; + if (use_parse_reg) { + if (use_64bit) { + qual = faidx_fetch_qual64(fai, faidx_iseq(fai, tid), + beg, end - 1, &qual_len); + } else { + int ilen = 0; + qual = faidx_fetch_qual(fai, faidx_iseq(fai, tid), + beg, end - 1, &ilen); + qual_len = ilen; + } + } else { + if (use_64bit) { + qual = fai_fetchqual64(fai, regions[i], &qual_len); + } else { + int ilen = 0; + qual = fai_fetchqual(fai, regions[i], &ilen); + qual_len = ilen; + } + if (!qual) { + fprintf(stderr, "Failed: fai_fetchqual64(fai, %s, &len)\n", + regions[i]); + goto fail; + } + } + if (qual_len != len) { + fprintf(stderr, + "Sequence and quality lengths differ for %s %s\n", + fn, regions[i]); + free(qual); + goto fail; + } + fprintf(out, "+\n"); + l = strlen(qual); + for (pos = 0; pos < l; pos+=50) { + fprintf(out, "%.*s\n", 50, qual + pos); + } + free(qual); + } + } + + fai_destroy(fai); + + if (fnout) { + if (fclose(out) != 0) { + perror(fnout); + return -1; + } + } + return 0; + + fail: + if (fai) + fai_destroy(fai); + if (fnout) + fclose(out); + + return -1; +} + +int test_fai_line_length(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *reg) { + hts_pos_t found_len; + faidx_t *fai = NULL; + 
+ fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + found_len = fai_line_length(fai, reg); + fai_destroy(fai); + if (expected) { + long long exp_len = strtoll(expected, NULL, 10); + if (found_len != exp_len) { + fprintf(stderr, "Unexpected result %"PRIhts_pos" from " + "fai_line_length, expected %s\n", found_len, expected); + return -1; + } + } else { + printf("%"PRIhts_pos"\n", found_len); + } + return 0; +} + +int test_faidx_has_seq(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *seq) { + int res; + faidx_t *fai = NULL; + + fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + res = faidx_has_seq(fai, seq); + fai_destroy(fai); + if (expected) { + long exp_res = strtol(expected, NULL, 10); + if (res != exp_res) { + fprintf(stderr, "Unexpected result %d from faidx_has_seq(%s) " + "expected %s\n", res, seq, expected); + return -1; + } + } else { + printf("%d\n", res); + } + return 0; +} + +int test_faidx_iseq(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *index) { + const char *found_name = NULL; + int idx = atoi(index); + faidx_t *fai = NULL; + + fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + found_name = faidx_iseq(fai, idx); + + if (expected) { + if (!found_name || strcmp(found_name, expected) != 0) { + fprintf(stderr, "Unexpected result %s from faidx_iseq(fai, %d), " + "expected %s\n", found_name ? found_name : "(null)", + idx, expected); + fai_destroy(fai); + return -1; + } + } else { + printf("%s\n", found_name ? 
found_name : "(null)"); + } + + fai_destroy(fai); + return 0; +} + +int test_faidx_seq_len(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *seq) { + int found_len; + faidx_t *fai = NULL; + + fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + found_len = faidx_seq_len(fai, seq); + fai_destroy(fai); + + if (expected) { + int exp_len = atoi(expected); + if (found_len != exp_len) { + fprintf(stderr, "Unexpected result %d from faidx_seq_len(fai, %s) " + "expected %s\n", found_len, seq, expected); + return -1; + } + } else { + printf("%d\n", found_len); + } + + return 0; +} + +int test_faidx_seq_len64(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *seq) { + hts_pos_t found_len; + faidx_t *fai = NULL; + + fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + found_len = faidx_seq_len(fai, seq); + fai_destroy(fai); + + if (expected) { + long long exp_len = strtoll(expected, NULL, 10); + if (found_len != exp_len) { + fprintf(stderr, "Unexpected result %"PRIhts_pos + " from fai_seq_len64(fai, %s) expected %s\n", + found_len, seq, expected); + return -1; + } + } else { + printf("%"PRIhts_pos"\n", found_len); + } + + return 0; +} + +void usage(FILE *out, const char *arg0) { + fprintf(out, + "Usage: %s [-c] -i fasta/q [-f fai_file] [-g gzi_file] [-e expected_fai]\n" + " %s [-cQ] -i fasta/q [-f fai_file] [-g gzi_file] [region]\n" + " %s -t FUNC -i fasta/q [-f fai_file] [-g gzi_file] [-e expected] \n" + " %s -h\n", + arg0, arg0, arg0, arg0); +} + +void help(FILE *out, const char *arg0) { + usage(out, arg0); + fprintf(out, + "Options:\n" + " -i FILE Input file\n" + " -f FILE Fasta/q index file name\n" + " -g FILE Bgzip index file name\n" + " -o FILE Output file name\n" + " -e FILE|STR Expected output\n" + " -c Set FAI_CREATE flag\n" + " -Q Output fastq format\n" + " -t FUNC Test 
function\n" + " -h Print this help\n" + "\n" + "Expected output is compared to the FAI file in indexing mode;" + " the output file\n" + "in retrieval mode; " + "expected output for various -t function tests.\n" + "\n" + "Unit tests (-t option):\n" + " fai_line_length, faidx_has_seq, faidx_iseq, faidx_seq_len, faidx_seq_len64\n" + "In retrieval mode, -t can change the functions used to fetch data:\n" + " fai_fetch, fai_fetch64, faidx_fetch_seq, faidx_fetch_seq64,\n" + " fai_parse_region, fai_adjust_region\n" + "\n"); +} + +int main(int argc, char **argv) { + int opt; + const char *fn = NULL; + const char *fnout = NULL; + const char *fnfai = NULL; + const char *fngzi = NULL; + const char *expected = NULL; + const char *func = ""; + int flags = 0; + enum fai_format_options format = FAI_FASTA; + int res; + + while ((opt = getopt(argc, argv, "i:f:g:o:e:t:cQh")) > 0) { + switch (opt) { + case 'i': + fn = optarg; + break; + case 'f': + fnfai = optarg; + break; + case 'g': + fngzi = optarg; + break; + case 'o': + fnout = optarg; + break; + case 'e': + expected = optarg; + break; + case 'c': + flags |= FAI_CREATE; + break; + case 'Q': + format = FAI_FASTQ; + break; + case 't': + func = optarg; + break; + case 'h': + help(stdout, argv[0]); + return EXIT_SUCCESS; + default: + usage(stderr, argv[0]); + return EXIT_FAILURE; + } + } + + if (!fn) { + usage(stderr, argv[0]); + return EXIT_FAILURE; + } + + if (optind == argc) { + // Index building mode + res = fai_build3(fn, fnfai, fngzi); + if (res) { + fprintf(stderr, "Failed: fai_build3(%s, %s, %s)\n", + fn, fnfai ? fnfai : "NULL", fngzi ? 
fngzi : "NULL"); + } else if (expected) { + res = file_compare(fnfai, expected); + } + } else { + if (strcmp(func, "fai_line_length") == 0) { + res = test_fai_line_length(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else if (strcmp(func, "faidx_has_seq") == 0) { + res = test_faidx_has_seq(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else if (strcmp(func, "faidx_iseq") == 0) { + res = test_faidx_iseq(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else if (strcmp(func, "faidx_seq_len") == 0) { + res = test_faidx_seq_len(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else if (strcmp(func, "faidx_seq_len64") == 0) { + res = test_faidx_seq_len64(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else { + res = do_retrieval(fn, fnfai, fngzi, flags, format, fnout, + func, argc - optind, &argv[optind]); + if (res == 0 && fnout && expected) { + res = file_compare(fnout, expected); + } + } + } + return res == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/test/test_index.c b/test/test_index.c index 0740427ab..cc90f51fe 100644 --- a/test/test_index.c +++ b/test/test_index.c @@ -24,12 +24,12 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include +#include #include "../htslib/sam.h" #include "../htslib/vcf.h" -void usage(FILE *fp) { +void HTS_NORETURN usage(FILE *fp) { fprintf(fp, "Usage: test_index [opts] in.{sam.gz,bam,cram}|in.{vcf.gz,bcf}\n\n"); fprintf(fp, " -b Use BAI index (BAM, SAM)\n"); fprintf(fp, " -c Use CSI index (BAM, SAM, VCF, BCF)\n"); diff --git a/test/test_introspection.c b/test/test_introspection.c new file mode 100644 index 000000000..843e45eb8 --- /dev/null +++ b/test/test_introspection.c @@ -0,0 +1,87 @@ +/* test/test_introspection.c -- demonstration of introspection function usage + + Copyright (C) 2020-2021 Genome Research Ltd. 
+ + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#include +#include + +#include "../htslib/hts.h" +#include "../htslib/hfile.h" + +int main(void) { + printf("Version string: %s\n", hts_version()); + printf("Version number: %d\n", HTS_VERSION); + printf("\nhtscodecs version: %s\n", + hts_test_feature(HTS_FEATURE_HTSCODECS)); + + printf("\nCC: %s\n", hts_test_feature(HTS_FEATURE_CC)); + printf("CPPFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CPPFLAGS)); + printf("CFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CFLAGS)); + printf("LDFLAGS: %s\n", hts_test_feature(HTS_FEATURE_LDFLAGS)); + + unsigned int feat = hts_features(); + printf("\nFeature number: 0x%x\n", feat); + if (feat & HTS_FEATURE_CONFIGURE) + printf(" HTS_FEATURE_CONFIGURE\n"); + if (feat & HTS_FEATURE_PLUGINS) + printf(" HTS_FEATURE_PLUGINS\n"); + if (feat & HTS_FEATURE_LIBCURL) + printf(" HTS_FEATURE_LIBCURL\n"); + if (feat & HTS_FEATURE_S3) + printf(" HTS_FEATURE_S3\n"); + if (feat & HTS_FEATURE_GCS) + printf(" HTS_FEATURE_GCS\n"); + if (feat & HTS_FEATURE_LIBDEFLATE) + printf(" HTS_FEATURE_LIBDEFLATE\n"); + if (feat & HTS_FEATURE_LZMA) + printf(" HTS_FEATURE_LZMA\n"); + if (feat & HTS_FEATURE_BZIP2) + printf(" HTS_FEATURE_BZIP2\n"); + if (feat & HTS_FEATURE_HTSCODECS) + printf(" HTS_FEATURE_HTSCODECS\n"); + + printf("\nFeature string: %s\n", hts_feature_string()); + + + // Plugins and schemes + printf("\nPlugins present:\n"); + const char *plugins[100]; + int np = 100, i, j; + + if (hfile_list_plugins(plugins, &np) < 0) + return 1; + + for (i = 0; i < np; i++) { + const char *sc_list[100]; + int nschemes = 100; + if (hfile_list_schemes(plugins[i], sc_list, &nschemes) < 0) + return 1; + + printf(" %s:\n", plugins[i]); + for (j = 0; j < nschemes; j++) + printf("\t%s\n", sc_list[j]); + puts(""); + } + + return 0; +} diff --git a/test/test_kstring.c b/test/test_kstring.c index 5923ba2a7..ee913a2e3 100644 --- a/test/test_kstring.c +++ b/test/test_kstring.c @@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include -#include +#include #include "../htslib/kstring.h" @@ -261,6 +261,118 @@ static int test_kputw(int64_t start, int64_t end) { return 0; } +// callback used by test_kgetline +static char *mock_fgets(char *str, int num, void *p) { + int *mock_state = (int*)p; + (*mock_state)++; + switch (*mock_state) { + case 1: + case 4: + case 7: + // a few characters, no endline + strcpy(str, "ABCD"); + break; + case 2: + case 3: + // \n endline + strcpy(str, "\n"); + break; + case 5: + case 6: + // \r\n endline + strcpy(str, "\r\n"); + break; + default: + // eof + return 0; + } + + return str; +} + +static int test_kgetline() { + kstring_t s = KS_INITIALIZE; + int mock_state = 0; + + // normal line, \n terminated, called with non-empty s + kputs("_", &s); + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("_ABCD", s.s) || 5 != s.l) return -1; + s.l = 0; + // empty line, \n terminated + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("", s.s) || 0 != s.l) return -1; + s.l = 0; + // normal line, \r\n terminated + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("ABCD", s.s) || 4 != s.l) return -1; + s.l = 0; + // empty line, \r\n terminated + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("", s.s) || 0 != s.l) return -1; + s.l = 0; + // line terminated by EOF + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("ABCD", s.s) || 4 != s.l) return -1; + s.l = 0; + // EOF + if (EOF != kgetline(&s, mock_fgets, &mock_state) || 0 != s.l) return -1; + + ks_free(&s); + return EXIT_SUCCESS; +} + +// callback used by test_kgetline2 +static ssize_t mock_fgets2(char *str, size_t num, void *p) { + int *mock_state = (int*)p; + (*mock_state)++; + switch (*mock_state) { + case 1: + case 4: + case 7: + // a few characters, no endline + strcpy(str, "ABCD"); + break; + case 2: + case 3: + // \n endline + strcpy(str, "\n"); + break; + case 5: + case 6: + // \r\n endline + strcpy(str, "\r\n"); + break; + 
default: + // eof + return 0; + } + + return strlen(str); +} + +static int test_kgetline2() { + kstring_t s = KS_INITIALIZE; + int mock_state = 0; + + // normal line, \n terminated, called with non-empty s + kputs("_", &s); + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("_ABCD", s.s) || 5 != s.l) return -1; + s.l = 0; + // empty line, \n terminated + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("", s.s) || 0 != s.l) return -1; + s.l = 0; + // normal line, \r\n terminated + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("ABCD", s.s) || 4 != s.l) return -1; + s.l = 0; + // empty line, \r\n terminated + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("", s.s) || 0 != s.l) return -1; + s.l = 0; + // line terminated by EOF + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("ABCD", s.s) || 4 != s.l) return -1; + s.l = 0; + // EOF + if (EOF != kgetline2(&s, mock_fgets2, &mock_state) || 0 != s.l) return -1; + + ks_free(&s); + return EXIT_SUCCESS; +} + int main(int argc, char **argv) { int opt, res = EXIT_SUCCESS; int64_t start = 0; @@ -301,5 +413,11 @@ int main(int argc, char **argv) { if (!test || strcmp(test, "kputw") == 0) if (test_kputw(start, end) != 0) res = EXIT_FAILURE; + if (!test || strcmp(test, "kgetline") == 0) + if (test_kgetline() != 0) res = EXIT_FAILURE; + + if (!test || strcmp(test, "kgetline2") == 0) + if (test_kgetline2() != 0) res = EXIT_FAILURE; + return res; } diff --git a/test/test_mod.c b/test/test_mod.c new file mode 100644 index 000000000..d8a53f3de --- /dev/null +++ b/test/test_mod.c @@ -0,0 +1,229 @@ +/* test/test_mod.c -- testing of base modification functions + + Copyright (C) 2020-2021, 2023 Genome Research Ltd. 
+ + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +/* +This tests multiple APIs. The simplest is to parse the MM/ML tags with +bam_parse_basemod and then call bam_mods_at_next_pos once for each base in +the bam sequence to check for modifications. + +Ie: + + hts_base_mod_state *m = hts_base_mod_state_alloc(); + bam_parse_basemod(b, m); // b=bam1_t pointer + hts_base_mod mods[5]; + for (i = 0; i < b->core.l_qseq; i++) { + n = bam_mods_at_next_pos(b, m, mods, 5); + for (j = 0; j < n && j < 5; j++) { + // Report 'n'th mod at seq pos 'i'. + // mods[j].modified_base holds the base mod itself, with + // mods[j].canonical_base, mods[j].strand and mods[j].qual + // also present in hts_base_mod struct. + // ... 
+ } + } + hts_base_mod_state_free(m); + +The extended mode has the same loop above, but calls bam_mods_query_type +to return additional meta-data including the strand, canonical base and +whether the base modification is recorded implicitly or explicitly: + + int ret = bam_mods_query_type(m, mods[j].modified_base, + &m_strand, &m_implicit, + &m_canonical); + +Looping over every base in the sequence is not particularly efficient +however unless this fits your natural processing order. The alternative +is to call bam_next_basemod to iterate only over modified locations: + + hts_base_mod_state *m = hts_base_mod_state_alloc(); + bam_parse_basemod(b, m); // b=bam1_t pointer + hts_base_mod mods[5]; + while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) { + for (j = 0; j < n && j < 5; j++) { + // Report 'n'th mod at sequence position 'pos' + } + } + hts_base_mod_state_free(m); + +*/ + +#include +#include + +#include "../htslib/sam.h" + +static char *code(int id) { + static char code[20]; + if (id > 0) { + code[0] = id; + code[1] = 0; + } else { + snprintf(code, sizeof(code), "(%d)", -id); + } + + return code; +} + +int main(int argc, char **argv) { + int extended = 0; + uint32_t flags = 0; + + if (argc > 1 && strcmp(argv[1], "-x") == 0) { + extended = 1; + argv++; + argc--; + } + + if (argc > 2 && strcmp(argv[1], "-f") == 0) { + flags = atoi(argv[2]); + argv+=2; + argc-=2; + } + + if (argc < 2) + return 1; + + samFile *in = sam_open(argv[1], "r"); + if (!in) + return 1; + + bam1_t *b = bam_init1(); + sam_hdr_t *h = sam_hdr_read(in); + hts_base_mod_state *m = hts_base_mod_state_alloc(); + if (!h || !b || !m) + goto err; + + int r; + while ((r = sam_read1(in, h, b)) >= 0) { + if (bam_parse_basemod2(b, m, flags) < 0) { + fprintf(stderr, "Failed to parse MM/ML aux tags\n"); + goto err; + } + + // per-base iterator + int i, j, n; + hts_base_mod mods[5]; + for (i = 0; i < b->core.l_qseq; i++) { + char sp = '\t'; + n = bam_mods_at_next_pos(b, m, mods, 5); + printf("%d\t%c", i, 
seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); + for (j = 0; j < n && j < 5; j++) { + char qstr[10]; + if (mods[j].qual == HTS_MOD_UNCHECKED) + qstr[0] = '#', qstr[1] = 0; + else if (mods[j].qual == HTS_MOD_UNKNOWN) + qstr[0] = '.', qstr[1] = 0; + else + snprintf(qstr, 10, "%d", mods[j].qual); + + if (extended) { + int m_strand, m_implicit; + char m_canonical; + int ret = bam_mods_query_type(m, mods[j].modified_base, + &m_strand, &m_implicit, + &m_canonical); + if (ret < 0 || + m_canonical != mods[j].canonical_base || + m_strand != mods[j].strand) + goto err; + printf("%c%c%c%s%c%s", + sp, mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + "?."[m_implicit], + qstr); + } else { + printf("%c%c%c%s%s", + sp, mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + qstr); + } + sp = ' '; + } + putchar('\n'); + } + + puts("---"); + + bam_parse_basemod2(b, m, flags); + + // List possible mod choices. + int *all_mods; + int all_mods_n = 0; + all_mods = bam_mods_recorded(m, &all_mods_n); + printf("Present:"); + for (i = 0; i < all_mods_n; i++) { + int m_strand, m_implicit; + char m_canonical; + bam_mods_queryi(m, i, &m_strand, &m_implicit, &m_canonical); + printf(all_mods[i] > 0 ? 
" %c" : " #%d", all_mods[i]); + putchar("?."[m_implicit]); + } + putchar('\n'); + + int pos; + while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) { + char sp = '\t'; + printf("%d\t%c", pos, + seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); + for (j = 0; j < n && j < 5; j++) { + char qstr[10]; + if (mods[j].qual == HTS_MOD_UNCHECKED) + qstr[0] = '#', qstr[1] = 0; + else if (mods[j].qual == HTS_MOD_UNKNOWN) + qstr[0] = '.', qstr[1] = 0; + else + snprintf(qstr, 10, "%d", mods[j].qual); + + printf("%c%c%c%s%s", + sp, mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + qstr); + sp = ' '; + } + putchar('\n'); + } + + if (n < 0) + goto err; + + puts("\n===\n"); + } + fflush(stdout); + if (sam_close(in) != 0 || r < -1) + goto err; + + bam_destroy1(b); + sam_hdr_destroy(h); + hts_base_mod_state_free(m); + return 0; + + err: + bam_destroy1(b); + sam_hdr_destroy(h); + hts_base_mod_state_free(m); + return 1; +} diff --git a/test/test_realn.c b/test/test_realn.c index b96ef9d59..3f511704b 100644 --- a/test/test_realn.c +++ b/test/test_realn.c @@ -28,8 +28,8 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include #include +#include #include "../htslib/sam.h" #include "../htslib/hts.h" diff --git a/test/test_str2int.c b/test/test_str2int.c index 6345fdc4c..70e799169 100644 --- a/test/test_str2int.c +++ b/test/test_str2int.c @@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include +#include #include "../textutils_internal.h" diff --git a/test/test_time_funcs.c b/test/test_time_funcs.c new file mode 100644 index 000000000..0e0512988 --- /dev/null +++ b/test/test_time_funcs.c @@ -0,0 +1,125 @@ +/* test_time_compat.c -- Test time functions + + Copyright (C) 2022 Genome Research Ltd. 
+ + Author: Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../hts_time_funcs.h" + +int test_normalised(time_t start, time_t end, time_t incr) { + time_t i, j; + struct tm *utc; + + for (i = start; i < end; i += incr) { + utc = gmtime(&i); + j = hts_time_gm(utc); + if (i != j) { + fprintf(stderr, + "hts_time_gm() failed, got %"PRId64" expected %"PRId64"\n", + (int64_t) j, (int64_t) i); + return 1; + } + } + return 0; +} + +int test_specific(int year, int mon, int mday, int hour, int min, int sec, + time_t expected) { + struct tm utc = { sec, min, hour, mday, mon - 1, year - 1900, 0, 0, 0 }; + time_t res = hts_time_gm(&utc); + if (res != expected) { + fprintf(stderr, + "hts_time_gm() failed for %4d/%02d/%02d %02d:%02d:%02d :" + " got %"PRId64" expected %"PRId64"\n", + year, mon, mday, hour, min, sec, + (int64_t) res, (int64_t) expected); + return 1; + } + return 0; +} + +int main(int argc, char **argv) { + int res = 0; + + if (test_normalised(0, INT_MAX - 1000, 1000) != 0) + return EXIT_FAILURE; + if (sizeof(time_t) >= 8) { + if (test_normalised(INT_MAX - 1000, + (time_t)((int64_t) INT_MAX * 2), 1000) != 0) + return EXIT_FAILURE; + } + + // 2022-06-14 12:32:10 + res |= test_specific(2022, 6, 14, 12, 32, 10, 1655209930); + // 2022-06-14 12:32:10 + res |= test_specific(1993, 9, 10514, 12, 32, 10, 1655209930); + // 2020-02-28 12:00:00 + res |= test_specific(2020, 2, 28, 12, 0, 0, 1582891200); + // 2020-02-29 12:00:00 + res |= test_specific(2020, 2, 29, 12, 0, 0, 1582977600); + // 2020-03-01 12:00:00 + res |= test_specific(2020, 2, 30, 12, 0, 0, 1583064000); + // 2020-02-29 12:00:00 + res |= test_specific(2020, 3, 0, 12, 0, 0, 1582977600); + // 2020-02-01 12:00:00 + res |= test_specific(2019, 14, 1, 12, 0, 0, 1580558400); + // 2020-03-01 12:00:00 + res |= test_specific(2019, 15, 1, 12, 0, 0, 1583064000); + // 2021-03-01 12:00:00 + res |= test_specific(2019, 27, 1, 12, 0, 0, 1614600000); + // 2024-02-01 12:00:00 + res |= 
test_specific(2019, 62, 1, 12, 0, 0, 1706788800); + // 2024-03-01 12:00:00 + res |= test_specific(2019, 63, 1, 12, 0, 0, 1709294400); + // 2020-12-31 23:59:59 + res |= test_specific(2021, 0, 31, 23, 59, 59, 1609459199); + // 2020-03-01 12:00:00 + res |= test_specific(2021, -9, 1, 12, 0, 0, 1583064000); + // 2020-02-01 12:00:00 + res |= test_specific(2021, -10, 1, 12, 0, 0, 1580558400); + // 2019-02-01 12:00:00 + res |= test_specific(2021, -22, 1, 12, 0, 0, 1549022400); + // 1970-01-01 00:00:00 + res |= test_specific(1970, 1, 1, 0, 0, 0, 0); + // 2038-01-19 03:14:07 + res |= test_specific(1970, 1, 1, 0, 0, INT_MAX, INT_MAX); + // 2038-01-19 03:14:07 + res |= test_specific(2038, 1, 19, 3, 14, 7, INT_MAX); + if (sizeof(time_t) < 8) { + // 2038-01-19 03:14:08 + res |= test_specific(2038, 1, 19, 3, 14, 8, (time_t) -1); + } else { + // 2038-01-19 03:14:08 + res |= test_specific(2038, 1, 19, 3, 14, 8, + (time_t)((int64_t) INT_MAX + 1)); + } + + return res == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} diff --git a/test/test_view.c b/test/test_view.c index 604ec42c6..c899ff995 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -1,7 +1,7 @@ /* test/test_view.c -- simple view tool, purely for use in a test harness. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013-2019 Genome Research Ltd. + Copyright (C) 2013-2020 Genome Research Ltd. Author: Heng Li @@ -25,11 +25,11 @@ DEALINGS IN THE SOFTWARE. 
*/ #include +#include #include #include #include #include -#include #include #include "../cram/cram.h" @@ -57,7 +57,9 @@ enum test_op { READ_CRAM = 4, WRITE_CRAM = 8, WRITE_UNCOMPRESSED = 16, - WRITE_COMPRESSED = 32, // eg vcf.gz, sam.gz + WRITE_COMPRESSED = 32, // eg vcf.gz, sam.gz, fastq.gz + WRITE_FASTQ = 64, + WRITE_FASTA = 128, }; int sam_loop(int argc, char **argv, int optind, struct opts *opts, htsFile *in, htsFile *out) { @@ -223,7 +225,8 @@ int vcf_loop(int argc, char **argv, int optind, struct opts *opts, htsFile *in, hts_itr_t *iter; if ((iter = bcf_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); - continue; + exit_code = 1; + break; } while ((r = bcf_itr_next(in, iter, b)) >= 0) { if (!opts->benchmark && bcf_write1(out, h, b) < 0) { @@ -295,7 +298,7 @@ int main(int argc, char *argv[]) opts.index = NULL; opts.min_shift = 0; - while ((c = getopt(argc, argv, "DSIt:i:bzCul:o:N:BZ:@:Mx:m:p:v")) >= 0) { + while ((c = getopt(argc, argv, "DSIt:i:bzCfFul:o:N:BZ:@:Mx:m:p:v")) >= 0) { switch (c) { case 'D': opts.flag |= READ_CRAM; break; case 'S': opts.flag |= READ_COMPRESSED; break; @@ -305,6 +308,8 @@ int main(int argc, char *argv[]) case 'b': opts.flag |= WRITE_BINARY_COMP; break; case 'z': opts.flag |= WRITE_COMPRESSED; break; case 'C': opts.flag |= WRITE_CRAM; break; + case 'f': opts.flag |= WRITE_FASTQ; break; + case 'F': opts.flag |= WRITE_FASTA; break; case 'u': opts.flag |= WRITE_UNCOMPRESSED; break; // eg u-BAM not SAM case 'l': opts.clevel = atoi(optarg); break; case 'o': if (hts_opt_add(&out_opts, optarg)) return 1; break; @@ -329,8 +334,9 @@ int main(int argc, char *argv[]) fprintf(stderr, "-i: option=value: set an option for CRAM input\n"); fprintf(stderr, "\n"); fprintf(stderr, "-b: write binary compressed BCF, BAM, FAI (mode 'b')\n"); - fprintf(stderr, "-z: write text compressed VCF.gz, SAM.gz (mode 'z')\n"); + fprintf(stderr, "-z: write text compressed VCF.gz, SAM.gz or FASTQ.gz (mode 
'z')\n"); fprintf(stderr, "-C: write CRAM format (mode 'c')\n"); + fprintf(stderr, "-f: write FASTQ format (mode 'f')\n"); fprintf(stderr, "-l 0-9: set zlib compression level\n"); fprintf(stderr, "-o option=value: set an option for CRAM output\n"); fprintf(stderr, "-N: num_reads: limit the output to the first num_reads reads\n"); @@ -357,11 +363,14 @@ int main(int argc, char *argv[]) } strcpy(modew, "w"); - if (opts.clevel >= 0 && opts.clevel <= 9) sprintf(modew + 1, "%d", opts.clevel); + if (opts.clevel >= 0 && opts.clevel <= 9) + snprintf(modew + 1, sizeof(modew) - 1, "%d", opts.clevel); if (opts.flag & WRITE_CRAM) strcat(modew, "c"); else if (opts.flag & WRITE_BINARY_COMP) strcat(modew, "b"); else if (opts.flag & WRITE_COMPRESSED) strcat(modew, "z"); else if (opts.flag & WRITE_UNCOMPRESSED) strcat(modew, "bu"); + if (opts.flag & WRITE_FASTQ) strcat(modew, "f"); + else if (opts.flag & WRITE_FASTA) strcat(modew, "F"); out = hts_open(out_fn, modew); if (out == NULL) { fprintf(stderr, "Error opening standard output\n"); @@ -422,5 +431,10 @@ int main(int argc, char *argv[]) if (p.pool) hts_tpool_destroy(p.pool); + if (fclose(stdout) != 0 && errno != EBADF) { + fprintf(stderr, "Error closing standard output.\n"); + exit_code = EXIT_FAILURE; + } + return exit_code; } diff --git a/test/vcf_meta_meta.vcf b/test/vcf_meta_meta.vcf new file mode 100644 index 000000000..43aba73f7 --- /dev/null +++ b/test/vcf_meta_meta.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##META= +##META= +##META= +##META= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 123 . TC T . . . diff --git a/textutils.c b/textutils.c index 53a3b252d..0cc2af818 100644 --- a/textutils.c +++ b/textutils.c @@ -453,7 +453,7 @@ const char * hts_strprint(char *buf, size_t buflen, char quote, const char *s, size_t len) { const char *slim = (len < SIZE_MAX)? &s[len] : NULL; - char *t = buf; + char *t = buf, *bufend = buf + buflen; size_t qlen = quote? 
1 : 0; if (quote) *t++ = quote; @@ -482,7 +482,7 @@ hts_strprint(char *buf, size_t buflen, char quote, const char *s, size_t len) } if (clen == 4) { - sprintf(t, "\\x%02X", (unsigned char) c); + snprintf(t, bufend - t, "\\x%02X", (unsigned char) c); t += clen; } else { diff --git a/textutils_internal.h b/textutils_internal.h index 7e8628658..1ad096494 100644 --- a/textutils_internal.h +++ b/textutils_internal.h @@ -65,9 +65,11 @@ typedef struct hts_json_token hts_json_token; /// Allocate an empty JSON token structure, for use with hts_json_* functions /** @return An empty token on success; NULL on failure */ +HTSLIB_EXPORT hts_json_token *hts_json_alloc_token(void); /// Free a JSON token +HTSLIB_EXPORT void hts_json_free_token(hts_json_token *token); /// Accessor function to get JSON token type @@ -85,6 +87,7 @@ as follows: - `!` other errors (e.g. out of memory) - `\0` terminator at end of input */ +HTSLIB_EXPORT char hts_json_token_type(hts_json_token *token); /// Accessor function to get JSON token in string form @@ -98,6 +101,7 @@ will point at the kstring_t buffer passed as the third parameter to hts_json_fnext(). In that case, the value will only be valid until the next call to hts_json_fnext(). */ +HTSLIB_EXPORT char *hts_json_token_str(hts_json_token *token); /// Read one JSON token from a string @@ -111,6 +115,7 @@ is modified by having token-terminating characters overwritten as NULs. The `state` argument records the current position within `str` after each `hts_json_snext()` call, and should be set to 0 before the first call. */ +HTSLIB_EXPORT char hts_json_snext(char *str, size_t *state, hts_json_token *token); /// Read and discard a complete JSON value from a string @@ -123,6 +128,7 @@ char hts_json_snext(char *str, size_t *state, hts_json_token *token); Skips a complete JSON value, which may be a single token or an entire object or array. 
*/ +HTSLIB_EXPORT char hts_json_sskip_value(char *str, size_t *state, char type); struct hFILE; @@ -137,6 +143,7 @@ The `kstr` buffer is used to store the string value of the token read, so `token->str` is only valid until the next time `hts_json_fnext()` is called with the same `kstr` argument. */ +HTSLIB_EXPORT char hts_json_fnext(struct hFILE *fp, hts_json_token *token, kstring_t *kstr); /// Read and discard a complete JSON value from a file @@ -148,6 +155,7 @@ char hts_json_fnext(struct hFILE *fp, hts_json_token *token, kstring_t *kstr); Skips a complete JSON value, which may be a single token or an entire object or array. */ +HTSLIB_EXPORT char hts_json_fskip_value(struct hFILE *fp, char type); // The functions operate on ints such as are returned by fgetc(), @@ -355,7 +363,7 @@ static inline double hts_str2dbl(const char *in, char **end, int *failed) { case '0': if (v[1] != 'x' && v[1] != 'X') break; - // else fall through (hex number) + // else fall through - hex number default: // Non numbers, like NaN, Inf diff --git a/thread_pool.c b/thread_pool.c index 2bc427d51..252a9d24c 100644 --- a/thread_pool.c +++ b/thread_pool.c @@ -42,6 +42,10 @@ DEALINGS IN THE SOFTWARE. */ #include "thread_pool_internal.h" #include "htslib/hts_log.h" +// Minimum stack size for threads. Required for some rANS codecs +// that use over 2Mbytes of stack for encoder / decoder state +#define HTS_MIN_THREAD_STACK (3 * 1024 * 1024) + static void hts_tpool_process_detach_locked(hts_tpool *p, hts_tpool_process *q); @@ -525,7 +529,7 @@ static void *tpool_worker(void *arg) { // room to put the result. 
//if (q && q->input_head && !hts_tpool_process_output_full(q)) { if (q && q->input_head - && q->qsize - q->n_output > p->tsize - p->nwaiting + && q->qsize - q->n_output > q->n_processing && !q->shutdown) { work_to_do = 1; break; @@ -716,6 +720,9 @@ static void wake_next_worker(hts_tpool_process *q, int locked) { */ hts_tpool *hts_tpool_init(int n) { int t_idx = 0; + size_t stack_size = 0; + pthread_attr_t pattr; + int pattr_init_done = 0; hts_tpool *p = malloc(sizeof(*p)); if (!p) return NULL; @@ -748,18 +755,32 @@ hts_tpool *hts_tpool_init(int n) { pthread_mutex_lock(&p->pool_m); + // Ensure new threads have a reasonably large stack. On some platforms, + // for example MacOS which defaults to 512Kb, this is not big enough + // for some of the rANS codecs. + + if (pthread_attr_init(&pattr) < 0) + goto cleanup; + pattr_init_done = 1; + if (pthread_attr_getstacksize(&pattr, &stack_size) < 0) + goto cleanup; + if (stack_size < HTS_MIN_THREAD_STACK) { + if (pthread_attr_setstacksize(&pattr, HTS_MIN_THREAD_STACK) < 0) + goto cleanup; + } + for (t_idx = 0; t_idx < n; t_idx++) { hts_tpool_worker *w = &p->t[t_idx]; p->t_stack[t_idx] = 0; w->p = p; w->idx = t_idx; pthread_cond_init(&w->pending_c, NULL); - if (0 != pthread_create(&w->tid, NULL, tpool_worker, w)) { + if (0 != pthread_create(&w->tid, &pattr, tpool_worker, w)) goto cleanup; - } } pthread_mutex_unlock(&p->pool_m); + pthread_attr_destroy(&pattr); return p; @@ -778,6 +799,8 @@ hts_tpool *hts_tpool_init(int n) { pthread_cond_destroy(&p->t[j].pending_c); } pthread_mutex_destroy(&p->pool_m); + if (pattr_init_done) + pthread_attr_destroy(&pattr); free(p->t_stack); free(p->t); free(p); @@ -926,7 +949,9 @@ int hts_tpool_process_flush(hts_tpool_process *q) { pthread_cond_signal(&p->t[i].pending_c); // Ensure there is room for the final sprint. - // Shouldn't be possible to get here, but just in case. 
+ // Ideally we shouldn't get here, but the "q->qsize - q->n_output > + // n_processing" check in tpool_worker means we can trigger a + // deadlock there. This negates that possibility. if (q->qsize < q->n_output + q->n_input + q->n_processing) q->qsize = q->n_output + q->n_input + q->n_processing; diff --git a/vcf.c b/vcf.c index 89ee90c20..e6275ddad 100644 --- a/vcf.c +++ b/vcf.c @@ -1,7 +1,7 @@ /* vcf.c -- VCF/BCF API functions. Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2020 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. Portions copyright (C) 2014 Intel Corporation. Author: Heng Li @@ -37,6 +37,10 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +#include "fuzz_settings.h" +#endif + #include "htslib/vcf.h" #include "htslib/bgzf.h" #include "htslib/tbx.h" @@ -46,11 +50,35 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/khash_str2int.h" #include "htslib/kstring.h" #include "htslib/sam.h" - #include "htslib/khash.h" + +#if 0 +// This helps on Intel a bit, often 6-7% faster VCF parsing. +// Conversely sometimes harms AMD Zen4 as ~9% slower. +// Possibly related to IPC differences. However for now it's just a +// curiosity we ignore and stick with the simpler code. +// +// Left here as a hint for future explorers. 
+static inline int xstreq(const char *a, const char *b) { + while (*a && *a == *b) + a++, b++; + return *a == *b; +} + +#define KHASH_MAP_INIT_XSTR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq) + +KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t) +#else KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) +#endif + typedef khash_t(vdict) vdict_t; +KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*) +typedef khash_t(hdict) hdict_t; + + #include "htslib/kseq.h" HTSLIB_EXPORT uint32_t bcf_float_missing = 0x7F800001; @@ -79,6 +107,22 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N #define BCF_IS_64BIT (1<<30) +// Opaque structure with auxilary data which allows to extend bcf_hdr_t without breaking ABI. +// Note that this preserving API and ABI requires that the first element is vdict_t struct +// rather than a pointer, as user programs may (and in some cases do) access the dictionary +// directly as (vdict_t*)hdr->dict. +typedef struct +{ + vdict_t dict; // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT + hdict_t *gen; // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields +} +bcf_hdr_aux_t; + +static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr) +{ + return (bcf_hdr_aux_t *)hdr->dict[0]; +} + static char *find_chrom_header_line(char *s) { char *nl; @@ -93,9 +137,6 @@ static char *find_chrom_header_line(char *s) static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len) { - if ( !s ) return 0; - if (len == 0) len = strlen(s); - const char *ss = s; while ( *ss && isspace_c(*ss) && ss - s < len) ss++; if ( !*ss || ss - s == len) @@ -129,7 +170,7 @@ static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len) kh_val(d, k) = bcf_idinfo_def; kh_val(d, k).id = n; } else { - hts_log_error("Duplicated sample name '%s'", s); + hts_log_error("Duplicated sample name '%s'", sdup); free(sdup); return -1; } @@ -140,24 
+181,41 @@ static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len) int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s) { - return bcf_hdr_add_sample_len(h, s, 0); + if (!s) { + // Allowed for backwards-compatibility, calling with s == NULL + // used to trigger bcf_hdr_sync(h); + return 0; + } + return bcf_hdr_add_sample_len(h, s, strlen(s)); } -int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str) +int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str) { - int ret = 0; - int i = 0; - const char *p, *q; - // add samples - for (p = q = str;; ++q) { - if (*q > '\n') continue; - if (++i > 9) { - if ( bcf_hdr_add_sample_len(h, p, q - p) < 0 ) ret = -1; - } - if (*q == 0 || *q == '\n' || ret < 0) break; - p = q + 1; + const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; + if ( strncmp(str,mandatory,strlen(mandatory)) ) + { + hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str); + return -1; + } + + const char *beg = str + strlen(mandatory), *end; + if ( !*beg || *beg=='\n' ) return 0; + if ( strncmp(beg,"\tFORMAT\t",8) ) + { + hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str); + return -1; } + beg += 8; + int ret = 0; + while ( *beg ) + { + end = beg; + while ( *end && *end!='\t' && *end!='\n' ) end++; + if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1; + if ( !*end || *end=='\n' || ret<0 ) break; + beg = end + 1; + } return ret; } @@ -364,6 +422,126 @@ int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key) return -1; } +static void bcf_hrec_set_type(bcf_hrec_t *hrec) +{ + if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG; + else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO; + else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT; + else if ( !strcmp(hrec->key, "FORMAT") ) 
hrec->type = BCF_HL_FMT; + else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR; + else hrec->type = BCF_HL_GEN; +} + + +/** + The arrays were generated with + + valid_ctg: + perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48 + + valid_tag: + perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48 +*/ +static const uint8_t valid_ctg[256] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +static const uint8_t valid_tag[256] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/** + bcf_hrec_check() - check the validity of structured header lines + + Returns 0 on success or negative value on error. + + Currently the return status is not checked by the caller + and only a warning is printed on stderr. This should be improved + to propagate the error all the way up to the caller and let it + decide what to do: throw an error or proceed anyway. + */ +static int bcf_hrec_check(bcf_hrec_t *hrec) +{ + int i; + bcf_hrec_set_type(hrec); + + if ( hrec->type==BCF_HL_CTG ) + { + i = bcf_hrec_find_key(hrec,"ID"); + if ( i<0 ) goto err_missing_id; + char *val = hrec->vals[i]; + if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg; + while ( *(++val) ) + if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg; + return 0; + } + if ( hrec->type==BCF_HL_INFO ) + { + i = bcf_hrec_find_key(hrec,"ID"); + if ( i<0 ) goto err_missing_id; + char *val = hrec->vals[i]; + if ( !strcmp(val,"1000G") ) return 0; + if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag; + while ( *(++val) ) + if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag; + return 0; + } + if ( hrec->type==BCF_HL_FMT ) + { + i = bcf_hrec_find_key(hrec,"ID"); + if ( i<0 ) goto err_missing_id; + char *val = hrec->vals[i]; + if ( val[0]=='.' 
|| (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag; + while ( *(++val) ) + if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag; + return 0; + } + return 0; + + err_missing_id: + hts_log_warning("Missing ID attribute in one or more header lines"); + return -1; + + err_invalid_ctg: + hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]); + return -1; + + err_invalid_tag: + hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]); + return -1; +} + static inline int is_escaped(const char *min, const char *str) { int n = 0; @@ -390,6 +568,7 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) if (!hrec->key) goto fail; memcpy(hrec->key,p,n); hrec->key[n] = 0; + hrec->type = -1; p = ++q; if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579 @@ -426,11 +605,24 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail; p = ++q; while ( *q && *q==' ' ) { p++; q++; } - int quoted = *p=='"' ? 
1 : 0; - if ( quoted ) p++, q++; + + int quoted = 0; + char ending = '\0'; + switch (*p) { + case '"': + quoted = 1; + ending = '"'; + p++; + break; + case '[': + quoted = 1; + ending = ']'; + break; + } + if ( quoted ) q++; while ( *q && *q != '\n' ) { - if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; } + if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; } else { if ( *q=='<' ) nopen++; @@ -441,12 +633,31 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) q++; } const char *r = q; + if (quoted && ending == ']') { + if (*q == ending) { + r++; + q++; + quoted = 0; + } else { + char buffer[320]; + hts_log_error("Missing ']' in header line %s", + hts_strprint(buffer, sizeof(buffer), '"', + line, q-line)); + goto fail; + } + } while ( r > p && r[-1] == ' ' ) r--; if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0) goto fail; - if ( quoted && *q=='"' ) q++; - if ( *q=='>' ) { nopen--; q++; } + if ( quoted && *q==ending ) q++; + if ( *q=='>' ) + { + if (nopen) nopen--; // this can happen with nested angle brackets <> + q++; + } } + if ( nopen ) + hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]); // Skip to end of line int nonspace = 0; @@ -496,6 +707,11 @@ static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_i } new_n = idinfo->id >= hdr->n[dict_type] ? 
idinfo->id+1 : hdr->n[dict_type]; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // hts_resize() can attempt to allocate up to 2 * requested items + if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t))) + return -1; +#endif if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type], &hdr->id[dict_type], HTS_RESIZE_CLEAR)) { return -1; @@ -517,10 +733,11 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) khint_t k; char *str = NULL; - if ( !strcmp(hrec->key, "contig") ) + bcf_hrec_set_type(hrec); + + if ( hrec->type==BCF_HL_CTG ) { hts_pos_t len = 0; - hrec->type = BCF_HL_CTG; // Get the contig ID ($str) and length ($j) i = bcf_hrec_find_key(hrec,"length"); @@ -585,11 +802,8 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) return 1; } - if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO; - else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT; - else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT; - else if ( hrec->nkeys>0 ) { hrec->type = BCF_HL_STR; return 1; } - else return 0; + if ( hrec->type==BCF_HL_STR ) return 1; + if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0; // INFO/FILTER/FORMAT char *id = NULL; @@ -646,6 +860,12 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) *hrec->key == 'I' ? 
"An" : "A", hrec->key); var = BCF_VL_VAR; } + if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) ) + { + hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id); + var = BCF_VL_FIXED; + num = 0; + } } uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 | (var & 0xf) << 8 | @@ -695,12 +915,109 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) return 1; } +static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) +{ + if (hrec->type == BCF_HL_FLT || + hrec->type == BCF_HL_INFO || + hrec->type == BCF_HL_FMT || + hrec->type == BCF_HL_CTG) { + int id = bcf_hrec_find_key(hrec, "ID"); + if (id < 0 || !hrec->vals[id]) + return; + vdict_t *dict = (hrec->type == BCF_HL_CTG + ? (vdict_t*)hdr->dict[BCF_DT_CTG] + : (vdict_t*)hdr->dict[BCF_DT_ID]); + khint_t k = kh_get(vdict, dict, hrec->vals[id]); + if (k != kh_end(dict)) + kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ? 0 : hrec->type] = NULL; + } +} + +static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec) +{ + kstring_t str = KS_INITIALIZE; + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + khint_t k; + int id; + + switch (hrec->type) { + case BCF_HL_GEN: + if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0) + str.l = 0; + break; + case BCF_HL_STR: + id = bcf_hrec_find_key(hrec, "ID"); + if (id < 0) + return; + if (!hrec->vals[id] || + ksprintf(&str, "##%s=", hrec->key, hrec->vals[id]) < 0) + str.l = 0; + break; + default: + return; + } + if (str.l) { + k = kh_get(hdict, aux->gen, str.s); + } else { + // Couldn't get a string for some reason, so try the hard way... 
+ for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) { + if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec) + break; + } + } + if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) { + kh_val(aux->gen, k) = NULL; + free((char *) kh_key(aux->gen, k)); + kh_key(aux->gen, k) = NULL; + kh_del(hdict, aux->gen, k); + } + free(str.s); +} + +int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp) +{ + // currently only for bcf_hdr_set_version + assert( hrec->type==BCF_HL_GEN ); + int ret; + khint_t k; + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + for (k=kh_begin(aux->gen); kgen); k++) + { + if ( !kh_exist(aux->gen,k) ) continue; + if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue; + break; + } + assert( kgen) ); // something went wrong, should never happen + free((char*)kh_key(aux->gen,k)); + kh_del(hdict,aux->gen,k); + kstring_t str = {0,0,0}; + if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 ) + { + free(str.s); + return -1; + } + k = kh_put(hdict, aux->gen, str.s, &ret); + if ( ret<0 ) + { + free(str.s); + return -1; + } + free(hrec->value); + hrec->value = strdup(tmp->value); + if ( !hrec->value ) return -1; + return 0; +} + int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) { + kstring_t str = {0,0,0}; + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + int res; if ( !hrec ) return 0; - hrec->type = BCF_HL_GEN; + bcf_hrec_check(hrec); // todo: check return status and propagate errors up + res = bcf_hdr_register_hrec(hdr,hrec); if (res < 0) return -1; if ( !res ) @@ -713,16 +1030,35 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) } // Is one of the generic fields and already present? 
- int i; - for (i=0; inhrec; i++) + if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 ) + { + free(str.s); + return -1; + } + khint_t k = kh_get(hdict, aux->gen, str.s); + if ( k != kh_end(aux->gen) ) + { + // duplicate record + bcf_hrec_destroy(hrec); + free(str.s); + return 0; + } + } + + int i; + if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 ) + { + if ( ksprintf(&str, "##%s=", hrec->key,hrec->vals[i]) < 0 ) { - if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue; - if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hrec->key,"fileformat") ) break; - if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hdr->hrec[i]->value,hrec->value) ) break; + free(str.s); + return -1; } - if ( inhrec ) + khint_t k = kh_get(hdict, aux->gen, str.s); + if ( k != kh_end(aux->gen) ) { + // duplicate record bcf_hrec_destroy(hrec); + free(str.s); return 0; } } @@ -730,8 +1066,24 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) // New record, needs to be added int n = hdr->nhrec + 1; bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*)); - if (!new_hrec) return -1; + if (!new_hrec) { + free(str.s); + bcf_hdr_unregister_hrec(hdr, hrec); + return -1; + } hdr->hrec = new_hrec; + + if ( str.s ) + { + khint_t k = kh_put(hdict, aux->gen, str.s, &res); + if ( res<0 ) + { + free(str.s); + return -1; + } + kh_val(aux->gen,k) = hrec; + } + hdr->hrec[hdr->nhrec] = hrec; hdr->dirty = 1; hdr->nhrec = n; @@ -739,27 +1091,47 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) return hrec->type==BCF_HL_GEN ? 0 : 1; } -/* - * Note that while querying of FLT,INFO,FMT,CTG lines is fast (the keys are hashed), - * the STR,GEN lines are searched for linearly in a linked list of all header lines. - * This may become a problem for VCFs with huge headers, we might need to build a - * dictionary for these lines as well. 
- */ bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class) { int i; if ( type==BCF_HL_GEN ) { + // e.g. ##fileformat=VCFv4.2 + // ##source=GenomicsDBImport + // ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364 + if ( value ) + { + kstring_t str = {0,0,0}; + ksprintf(&str, "##%s=%s", key,value); + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + khint_t k = kh_get(hdict, aux->gen, str.s); + free(str.s); + if ( k == kh_end(aux->gen) ) return NULL; + return kh_val(aux->gen, k); + } for (i=0; inhrec; i++) { if ( hdr->hrec[i]->type!=type ) continue; if ( strcmp(hdr->hrec[i]->key,key) ) continue; - if ( !value || !strcmp(hdr->hrec[i]->value,value) ) return hdr->hrec[i]; + return hdr->hrec[i]; } return NULL; } else if ( type==BCF_HL_STR ) { + // e.g. ##GATKCommandLine= + // ##ALT= + if (!str_class) return NULL; + if ( !strcmp("ID",key) ) + { + kstring_t str = {0,0,0}; + ksprintf(&str, "##%s=<%s=%s>",str_class,key,value); + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + khint_t k = kh_get(hdict, aux->gen, str.s); + free(str.s); + if ( k == kh_end(aux->gen) ) return NULL; + return kh_val(aux->gen, k); + } for (i=0; inhrec; i++) { if ( hdr->hrec[i]->type!=type ) continue; @@ -823,6 +1195,11 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) // Parse the whole header do { while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) { + if(len < 0) + { + done = -1; + break; + } if (bcf_hdr_add_hrec(hdr, hrec) < 0) { bcf_hrec_destroy(hrec); return -1; @@ -840,14 +1217,18 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) // of the next one). p += len; continue; - } + } + if(done < 0) + break; + + // Next should be the sample line. If not, it was a malformed // header, in which case print a warning and skip (many VCF // operations do not really care about a few malformed lines). // In the future we may want to add a strict mode that errors in // this case. 
- if ( strncmp("#CHROM\tPOS",p,10) != 0 ) { + if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) { char *eol = strchr(p, '\n'); if (*p != '\0') { char buffer[320]; @@ -896,22 +1277,13 @@ void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) bcf_hrec_t *hrec; if ( !key ) { + // no key, remove all entries of this type while ( inhrec ) { if ( hdr->hrec[i]->type!=type ) { i++; continue; } hrec = hdr->hrec[i]; - - if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG ) - { - int j = bcf_hrec_find_key(hdr->hrec[i], "ID"); - if ( j>=0 ) - { - vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID]; - khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[j]); - kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL; - } - } - + bcf_hdr_unregister_hrec(hdr, hrec); + bcf_hdr_remove_from_hdict(hdr, hrec); hdr->dirty = 1; hdr->nhrec--; if ( i < hdr->nhrec ) @@ -953,6 +1325,7 @@ void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) } if ( i==hdr->nhrec ) return; hrec = hdr->hrec[i]; + bcf_hdr_remove_from_hdict(hdr, hrec); } hdr->nhrec--; @@ -1011,14 +1384,19 @@ int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) { int len; kstring_t str = {0,0,0}; - ksprintf(&str,"##fileformat=%s", version); + if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1; hrec = bcf_hdr_parse_line(hdr, str.s, &len); free(str.s); } else { - free(hrec->value); - hrec->value = strdup(version); + bcf_hrec_t *tmp = bcf_hrec_dup(hrec); + if ( !tmp ) return -1; + free(tmp->value); + tmp->value = strdup(version); + if ( !tmp->value ) return -1; + bcf_hdr_update_hrec(hdr, hrec, tmp); + bcf_hrec_destroy(tmp); } hdr->dirty = 1; return 0; // FIXME: check for errs in this function (return < 0 if so) @@ -1030,8 +1408,20 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) bcf_hdr_t *h; h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t)); if (!h) return NULL; - for (i = 0; i < 3; ++i) + for (i = 0; i < 3; ++i) { if ((h->dict[i] = 
kh_init(vdict)) == NULL) goto fail; + // Supersize the hash to make collisions very unlikely + static int dsize[3] = {16384,16384,2048}; // info, contig, format + if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail; + } + + bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t)); + if ( !aux ) goto fail; + if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; } + aux->dict = *((vdict_t*)h->dict[0]); + free(h->dict[0]); + h->dict[0] = aux; + if ( strchr(mode,'w') ) { bcf_hdr_append(h, "##fileformat=VCFv4.2"); @@ -1057,6 +1447,13 @@ void bcf_hdr_destroy(bcf_hdr_t *h) if (d == 0) continue; for (k = kh_begin(d); k != kh_end(d); ++k) if (kh_exist(d, k)) free((char*)kh_key(d, k)); + if ( i==0 ) + { + bcf_hdr_aux_t *aux = get_hdr_aux(h); + for (k=kh_begin(aux->gen); kgen); k++) + if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k)); + kh_destroy(hdict, aux->gen); + } kh_destroy(vdict, d); free(h->id[i]); } @@ -1110,6 +1507,9 @@ bcf_hdr_t *bcf_hdr_read(htsFile *hfp) if (bgzf_read(fp, buf, 4) != 4) goto fail; hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24); if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; } +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (hlen > FUZZ_ALLOC_LIMIT) { errno = ENOMEM; goto fail; } +#endif htxt = (char*)malloc(hlen + 1); if (!htxt) goto fail; if (bgzf_read(fp, htxt, hlen) != hlen) goto fail; @@ -1236,11 +1636,16 @@ static inline int bcf_read1_core(BGZF *fp, bcf1_t *v) shared_len = le_to_u32(x); if (shared_len < 24) return -2; shared_len -= 24; // to exclude six 32-bit integers - if (ks_resize(&v->shared, shared_len) != 0) return -2; indiv_len = le_to_u32(x + 4); - if (ks_resize(&v->indiv, indiv_len) != 0) return -2; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // ks_resize() normally allocates 1.5 * requested size to allow for growth + if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2; +#endif + if (ks_resize(&v->shared, shared_len ? 
shared_len : 1) != 0) return -2; + if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2; v->rid = le_to_i32(x + 8); v->pos = le_to_u32(x + 12); + if ( v->pos==UINT32_MAX ) v->pos = -1; // this is for telomere coordinate, e.g. MT:0 v->rlen = le_to_i32(x + 16); v->qual = le_to_float(x + 20); v->n_info = le_to_u16(x + 24); @@ -1368,6 +1773,12 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { ptr += bytes; // Check REF and ALT + if (rec->n_allele < 1) { + hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele", + bcf_seqname_safe(hdr,rec), rec->pos+1); + err |= BCF_ERR_TAG_UNDEF; + } + reports = 0; for (i = 0; i < rec->n_allele; i++) { if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared; @@ -1420,7 +1831,8 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { err |= BCF_ERR_TAG_UNDEF; } if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared; - if (((1 << type) & is_valid_type) == 0) { + if (((1 << type) & is_valid_type) == 0 + || (type == BCF_BT_NULL && num > 0)) { if (!reports++ || hts_verbose >= HTS_LOG_DEBUG) hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type)); err |= BCF_ERR_TAG_INVALID; @@ -1444,7 +1856,8 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { err |= BCF_ERR_TAG_UNDEF; } if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv; - if (((1 << type) & is_valid_type) == 0) { + if (((1 << type) & is_valid_type) == 0 + || (type == BCF_BT_NULL && num > 0)) { bcf_record_check_err(hdr, rec, "type", &reports, type); err |= BCF_ERR_TAG_INVALID; } @@ -1736,6 +2149,7 @@ static int bcf1_sync(bcf1_t *line) return 0; } + bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src) { bcf1_sync(src); @@ -1793,7 +2207,8 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) // header. 
At this point, the header must have been printed, // proceeding would lead to a broken BCF file. Errors must be checked // and cleared by the caller before we can proceed. - hts_log_error("Unchecked error (%d) at %s:%"PRIhts_pos, v->errcode, bcf_seqname_safe(h,v), v->pos+1); + char errdescription[1024] = ""; + hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1); return -1; } bcf1_sync(v); // check if the BCF record was modified @@ -1910,7 +2325,7 @@ bcf_hdr_t *vcf_hdr_read(htsFile *fp) if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error; // check tabix index, are all contigs listed in the header? add the missing ones - idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL); + idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL); if ( idx ) { int i, n, need_sync = 0; @@ -2030,20 +2445,44 @@ char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len) const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n) { vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; - int tid, m = kh_size(d); + int i, tid, m = kh_size(d); const char **names = (const char**) calloc(m,sizeof(const char*)); + if ( !names ) + { + hts_log_error("Failed to allocate memory"); + *n = 0; + return NULL; + } khint_t k; for (k=kh_begin(d); k= m ) + { + // This can happen after a contig has been removed from BCF header via bcf_hdr_remove() + if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 ) + { + hts_log_error("Failed to allocate memory"); + *n = 0; + free(names); + return NULL; + } + m = tid + 1; + } names[tid] = kh_key(d,k); } - // sanity check: there should be no gaps - for (tid=0; tidformat.compression!=no_compression ) + if ( fp->format.compression!=no_compression ) { ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l); - else + if (bgzf_flush(fp->fp.bgzf) != 0) return -1; + } else { ret = hwrite(fp->fp.hfile, htxt.s, htxt.l); + } free(htxt.s); 
return ret<0 ? -1 : 0; } @@ -2072,25 +2513,64 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) { int32_t max = INT32_MIN, min = INT32_MAX; int i; - if (n <= 0) bcf_enc_size(s, 0, BCF_BT_NULL); - else if (n == 1) bcf_enc_int1(s, a[0]); - else { + if (n <= 0) { + return bcf_enc_size(s, 0, BCF_BT_NULL); + } else if (n == 1) { + return bcf_enc_int1(s, a[0]); + } else { if (wsize <= 0) wsize = n; - for (i = 0; i < n; ++i) { - if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue; + + // Equivalent to: + // for (i = 0; i < n; ++i) { + // if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) + // continue; + // if (max < a[i]) max = a[i]; + // if (min > a[i]) min = a[i]; + // } + int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN}; + int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX}; + for (i = 0; i < (n&~3); i+=4) { + // bcf_int32_missing == INT32_MIN and + // bcf_int32_vector_end == INT32_MIN+1. + // We skip these, but can mostly avoid explicit checking + if (max4[0] < a[i+0]) max4[0] = a[i+0]; + if (max4[1] < a[i+1]) max4[1] = a[i+1]; + if (max4[2] < a[i+2]) max4[2] = a[i+2]; + if (max4[3] < a[i+3]) max4[3] = a[i+3]; + if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0]; + if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1]; + if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2]; + if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3]; + } + min = min4[0]; + if (min > min4[1]) min = min4[1]; + if (min > min4[2]) min = min4[2]; + if (min > min4[3]) min = min4[3]; + max = max4[0]; + if (max < max4[1]) max = max4[1]; + if (max < max4[2]) max = max4[2]; + if (max < max4[3]) max = max4[3]; + for (; i < n; ++i) { if (max < a[i]) max = a[i]; - if (min > a[i]) min = a[i]; + if (min > a[i] && a[i] > INT32_MIN+1) min = a[i]; } + if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) { - bcf_enc_size(s, wsize, BCF_BT_INT8); - for (i = 0; i < n; ++i) - if ( 
a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s); - else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s); - else kputc(a[i], s); + if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 || + ks_resize(s, s->l + n) < 0) + return -1; + uint8_t *p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i, p++) { + if ( a[i]==bcf_int32_vector_end ) *p = bcf_int8_vector_end; + else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing; + else *p = a[i]; + } + s->l += n; } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) { uint8_t *p; - bcf_enc_size(s, wsize, BCF_BT_INT16); - ks_resize(s, s->l + n * sizeof(int16_t)); + if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 || + ks_resize(s, s->l + n * sizeof(int16_t)) < 0) + return -1; p = (uint8_t *) s->s + s->l; for (i = 0; i < n; ++i) { @@ -2104,8 +2584,9 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) s->l += n * sizeof(int16_t); } else { uint8_t *p; - bcf_enc_size(s, wsize, BCF_BT_INT32); - ks_resize(s, s->l + n * sizeof(int32_t)); + if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 || + ks_resize(s, s->l + n * sizeof(int32_t)) < 0) + return -1; p = (uint8_t *) s->s + s->l; for (i = 0; i < n; ++i) { i32_to_le(a[i], p); @@ -2115,7 +2596,7 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) } } - return 0; // FIXME: check for errs in this function + return 0; } #ifdef VCF_ALLOW_INT64 @@ -2138,41 +2619,151 @@ static int bcf_enc_long1(kstring_t *s, int64_t x) { } #endif -static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) { - uint8_t *p; - size_t i; - size_t bytes = n * sizeof(float); - - if (bytes / sizeof(float) != n) return -1; - if (ks_resize(s, s->l + bytes) < 0) return -1; - - p = (uint8_t *) s->s + s->l; - for (i = 0; i < n; i++) { - float_to_le(a[i], p); - p += sizeof(float); - } - s->l += bytes; - - return 0; -} - -int bcf_enc_vfloat(kstring_t *s, int n, float *a) +int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize) { - assert(n >= 0); 
- bcf_enc_size(s, n, BCF_BT_FLOAT); - serialize_float_array(s, n, a); - return 0; // FIXME: check for errs in this function -} + int64_t max = INT64_MIN, min = INT64_MAX; + int i; + if (n <= 0) { + return bcf_enc_size(s, 0, BCF_BT_NULL); + } else if (n == 1) { + return bcf_enc_long1(s, a[0]); + } else { + if (wsize <= 0) wsize = n; -int bcf_enc_vchar(kstring_t *s, int l, const char *a) -{ - bcf_enc_size(s, l, BCF_BT_CHAR); - kputsn(a, l, s); - return 0; // FIXME: check for errs in this function -} + // Equivalent to: + // for (i = 0; i < n; ++i) { + // if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) + // continue; + // if (max < a[i]) max = a[i]; + // if (min > a[i]) min = a[i]; + // } + int64_t max4[4] = {INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN}; + int64_t min4[4] = {INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX}; + for (i = 0; i < (n&~3); i+=4) { + // bcf_int32_missing == INT32_MIN and + // bcf_int32_vector_end == INT32_MIN+1. + // We skip these, but can mostly avoid explicit checking + if (max4[0] < a[i+0]) max4[0] = a[i+0]; + if (max4[1] < a[i+1]) max4[1] = a[i+1]; + if (max4[2] < a[i+2]) max4[2] = a[i+2]; + if (max4[3] < a[i+3]) max4[3] = a[i+3]; + if (min4[0] > a[i+0] && a[i+0] > INT64_MIN+1) min4[0] = a[i+0]; + if (min4[1] > a[i+1] && a[i+1] > INT64_MIN+1) min4[1] = a[i+1]; + if (min4[2] > a[i+2] && a[i+2] > INT64_MIN+1) min4[2] = a[i+2]; + if (min4[3] > a[i+3] && a[i+3] > INT64_MIN+1) min4[3] = a[i+3]; + } + min = min4[0]; + if (min > min4[1]) min = min4[1]; + if (min > min4[2]) min = min4[2]; + if (min > min4[3]) min = min4[3]; + max = max4[0]; + if (max < max4[1]) max = max4[1]; + if (max < max4[2]) max = max4[2]; + if (max < max4[3]) max = max4[3]; + for (; i < n; ++i) { + if (max < a[i]) max = a[i]; + if (min > a[i] && a[i] > INT64_MIN+1) min = a[i]; + } -int bcf_fmt_array(kstring_t *s, int n, int type, void *data) -{ + if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) { + if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 || + ks_resize(s, 
s->l + n) < 0) + return -1; + uint8_t *p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i, p++) { + if ( a[i]==bcf_int64_vector_end ) *p = bcf_int8_vector_end; + else if ( a[i]==bcf_int64_missing ) *p = bcf_int8_missing; + else *p = a[i]; + } + s->l += n; + } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) { + uint8_t *p; + if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 || + ks_resize(s, s->l + n * sizeof(int16_t)) < 0) + return -1; + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) + { + int16_t x; + if ( a[i]==bcf_int64_vector_end ) x = bcf_int16_vector_end; + else if ( a[i]==bcf_int64_missing ) x = bcf_int16_missing; + else x = a[i]; + i16_to_le(x, p); + p += sizeof(int16_t); + } + s->l += n * sizeof(int16_t); + } else if(max <= BCF_MAX_BT_INT32 && min >= BCF_MIN_BT_INT32){ + uint8_t *p; + if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 || + ks_resize(s, s->l + n * sizeof(int32_t)) < 0) + return -1; + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) { + int32_t x; + if ( a[i]==bcf_int64_vector_end ) x = bcf_int32_vector_end; + else if ( a[i]==bcf_int64_missing ) x = bcf_int32_missing; + else x = a[i]; + i32_to_le(x, p); + p += sizeof(int32_t); + } + s->l += n * sizeof(int32_t); + } + #ifdef VCF_ALLOW_INT64 + else { + uint8_t *p; + if(bcf_enc_size(s, wsize, BCF_BT_INT64) < 0 || ks_resize(s, s->l + n * sizeof(int64_t)) < 0) + return -1; + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) { + int64_t x = a[i]; + i64_to_le(x, p); + p += sizeof(int64_t); + } + s->l += n * sizeof(int64_t); + } +#else + return -1; +#endif + } + + return 0; +} + +static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) { + uint8_t *p; + size_t i; + size_t bytes = n * sizeof(float); + + if (bytes / sizeof(float) != n) return -1; + if (ks_resize(s, s->l + bytes) < 0) return -1; + + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; i++) { + float_to_le(a[i], p); + p += sizeof(float); + } + s->l += bytes; + + return 0; +} + +int 
bcf_enc_vfloat(kstring_t *s, int n, float *a) +{ + assert(n >= 0); + bcf_enc_size(s, n, BCF_BT_FLOAT); + serialize_float_array(s, n, a); + return 0; // FIXME: check for errs in this function +} + +int bcf_enc_vchar(kstring_t *s, int l, const char *a) +{ + bcf_enc_size(s, l, BCF_BT_CHAR); + kputsn(a, l, s); + return 0; // FIXME: check for errs in this function +} + +int bcf_fmt_array(kstring_t *s, int n, int type, void *data) +{ int j = 0; uint32_t e = 0; if (n == 0) { @@ -2204,6 +2795,7 @@ int bcf_fmt_array(kstring_t *s, int n, int type, void *data) case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, v==bcf_int8_missing, v==bcf_int8_vector_end, kputw(v, s)); break; case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break; case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break; + case BCF_BT_INT64: BRANCH(int64_t, le_to_i64, v==bcf_int64_missing, v==bcf_int64_vector_end, kputll(v, s)); break; case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break; default: hts_log_error("Unexpected type %d", type); exit(1); break; } @@ -2225,13 +2817,36 @@ uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr) ********************/ typedef struct { - int key, max_m, size, offset; - uint32_t is_gt:1, max_g:31; - uint32_t max_l; - uint32_t y; - uint8_t *buf; + int key; // Key for h->id[BCF_DT_ID][key] vdict + int max_m; // number of elements in field array (ie commas) + int size; // field size (max_l or max_g*4 if is_gt) + int offset; // offset of buf into h->mem + uint32_t is_gt:1, // is genotype + max_g:31; // maximum number of genotypes + uint32_t max_l; // length of field + uint32_t y; // h->id[0][fmt[j].key].val->info[BCF_HL_FMT] + uint8_t *buf; // Pointer into h->mem } fmt_aux_t; +// fmt_aux_t field notes: +// max_* are biggest sizes of the various FORMAT fields across all samples. 
+// We use these after pivoting the data to ensure easy random access +// of a specific sample. +// +// max_m is only used for type BCF_HT_REAL or BCF_HT_INT +// max_g is only used for is_gt == 1 (will be BCF_HT_STR) +// max_l is only used for is_gt == 0 (will be BCF_HT_STR) +// +// These are computed in vcf_parse_format_max3 and used in +// vcf_parse_format_alloc4 to get the size. +// +// size is computed from max_g, max_l, max_m and is_gt. Once computed +// the max values are never accessed again. +// +// In theory all 4 vars could be coalesced into a single variable, but this +// significantly harms speed (even if done via a union). It's about 25-30% +// slower. + static inline int align_mem(kstring_t *s) { int e = 0; @@ -2242,23 +2857,12 @@ static inline int align_mem(kstring_t *s) return e == 0 ? 0 : -1; } -// p,q is the start and the end of the FORMAT field #define MAX_N_FMT 255 /* Limited by size of bcf1_t n_fmt field */ -static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) -{ - if ( !bcf_hdr_nsamples(h) ) return 0; - - static int extreme_val_warned = 0; - char *r, *t; - int j, l, m, g, overflow = 0; - khint_t k; - ks_tokaux_t aux1; - vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; - kstring_t *mem = (kstring_t*)&h->mem; - fmt_aux_t fmt[MAX_N_FMT]; - mem->l = 0; - char *end = s->s + s->l; +// detect FORMAT "." +static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + const char *p, const char *q) { + const char *end = s->s + s->l; if ( q>=end ) { hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1); @@ -2270,10 +2874,20 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "." 
{ v->n_sample = bcf_hdr_nsamples(h); - return 0; + return 1; } - // get format information from the dictionary + return 0; +} + +// get format information from the dictionary +static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + const char *p, const char *q, fmt_aux_t *fmt) { + const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; + char *t; + int j; + ks_tokaux_t aux1; + for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { if (j >= MAX_N_FMT) { v->errcode |= BCF_ERR_LIMITS; @@ -2283,7 +2897,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } *(char*)aux1.p = 0; - k = kh_get(vdict, d, t); + khint_t k = kh_get(vdict, d, t); if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) { if ( t[0]=='.' && t[1]==0 ) { @@ -2302,7 +2916,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); k = kh_get(vdict, d, t); - v->errcode = BCF_ERR_TAG_UNDEF; + v->errcode |= BCF_ERR_TAG_UNDEF; if (res || k == kh_end(d)) { hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_TAG_INVALID; @@ -2311,14 +2925,22 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0; fmt[j].key = kh_val(d, k).id; - fmt[j].is_gt = !strcmp(t, "GT"); + fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]); fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT]; v->n_fmt++; } - // compute max + return 0; +} + +// compute max +static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q, fmt_aux_t *fmt) { int n_sample_ori = -1; - r = q + 1; // r: position in the format string - l = 0, m = g = 1, v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles + char *r = q + 1; // r: position in the format string + int l = 0, m = 1, g = 1, j; 
+ v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles + const char *end = s->s + s->l; + while ( rmax_m < m) f->max_m = m; if (f->max_l < l) f->max_l = l; if (f->is_gt && f->max_g < g) f->max_g = g; @@ -2368,7 +3008,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p break; } if ( r>=end ) break; - r++; l++; + r++; } end_for: v->n_sample++; @@ -2376,20 +3016,30 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p r++; } - // allocate memory for arrays + return 0; +} + +// allocate memory for arrays +static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + const char *p, const char *q, + fmt_aux_t *fmt) { + kstring_t *mem = (kstring_t*)&h->mem; + + int j; for (j = 0; j < v->n_fmt; ++j) { fmt_aux_t *f = &fmt[j]; if ( !f->max_m ) f->max_m = 1; // omitted trailing format field + if ((f->y>>4&0xf) == BCF_HT_STR) { f->size = f->is_gt? f->max_g << 2 : f->max_l; } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) { f->size = f->max_m << 2; - } else - { + } else { hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_TAG_INVALID; return -1; } + if (align_mem(mem) < 0) { hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; @@ -2413,11 +3063,27 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } mem->l += v->n_sample * f->size; } - for (j = 0; j < v->n_fmt; ++j) - fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset; - // fill the sample fields; at beginning of the loop, t points to the first char of a format - n_sample_ori = -1; - t = q + 1; m = 0; // m: sample id + + { + int j; + for (j = 0; j < v->n_fmt; ++j) + fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset; + } + + return 0; +} + +// Fill the sample fields +static int 
vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + const char *p, const char *q, fmt_aux_t *fmt) { + static int extreme_val_warned = 0; + int n_sample_ori = -1; + // At beginning of the loop t points to the first char of a format + const char *t = q + 1; + int m = 0; // m: sample id + const int nsamples = bcf_hdr_nsamples(h); + + const char *end = s->s + s->l; while ( ty>>4&0xf; if (!z->buf) { hts_log_error("Memory allocation failure for FORMAT field type %d at %s:%"PRIhts_pos, z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; return -1; } - if ((z->y>>4&0xf) == BCF_HT_STR) { - if (z->is_gt) { // genotypes + + if (htype == BCF_HT_STR) { + int l; + if (z->is_gt) { + // Genotypes. + // ([|/])+... where is [0-9]+ or ".". int32_t is_phased = 0; uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m); uint32_t unreadable = 0; uint32_t max = 0; - overflow = 0; + int overflow = 0; for (l = 0;; ++t) { if (*t == '.') { ++t, x[l++] = is_phased; } else { - char *tt = t; - uint32_t val = hts_str2uint(t, &t, sizeof(val) * CHAR_MAX - 2, &overflow); - unreadable |= tt == t; + const char *tt = t; + uint32_t val; + // Or "v->n_allele < 10", but it doesn't + // seem to be any faster and this feels safer. 
+ if (*t >= '0' && *t <= '9' && + !(t[1] >= '0' && t[1] <= '9')) { + val = *t++ - '0'; + } else { + val = hts_str2uint(t, (char **)&t, + sizeof(val) * CHAR_MAX - 2, + &overflow); + unreadable |= tt == t; + } if (max < val) max = val; x[l++] = (val + 1) << 1 | is_phased; } @@ -2473,26 +3154,35 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p return -1; } if ( !l ) x[l++] = 0; // An empty field, insert missing value - for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; + for (; l < z->size>>2; ++l) + x[l] = bcf_int32_vector_end; + } else { + // Otherwise arbitrary strings char *x = (char*)z->buf + z->size * (size_t)m; - for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t; - for (; l < z->size; ++l) x[l] = 0; + for (l = 0; *t != ':' && *t; ++t) + x[l++] = *t; + if (z->size > l) + memset(&x[l], 0, (z->size-l) * sizeof(*x)); } - } else if ((z->y>>4&0xf) == BCF_HT_INT) { + + } else if (htype == BCF_HT_INT) { + // One or more integers in an array int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); + int l; for (l = 0;; ++t) { if (*t == '.') { x[l++] = bcf_int32_missing, ++t; // ++t to skip "." 
} else { - overflow = 0; + int overflow = 0; char *te; long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow); if ( te==t || overflow || tmp_valBCF_MAX_BT_INT32 ) { if ( !extreme_val_warned ) { - hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1); + hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos, + h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1); extreme_val_warned = 1; } tmp_val = bcf_int32_missing; @@ -2502,15 +3192,20 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } if (*t != ',') break; } - if ( !l ) x[l++] = bcf_int32_missing; - for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; - } else if ((z->y>>4&0xf) == BCF_HT_REAL) { + if ( !l ) + x[l++] = bcf_int32_missing; + for (; l < z->size>>2; ++l) + x[l] = bcf_int32_vector_end; + + } else if (htype == BCF_HT_REAL) { + // One of more floating point values in an array float *x = (float*)(z->buf + z->size * (size_t)m); + int l; for (l = 0;; ++t) { if (*t == '.' && !isdigit_c(t[1])) { bcf_float_set_missing(x[l++]), ++t; // ++t to skip "." 
} else { - overflow = 0; + int overflow = 0; char *te; float tmp_val = hts_str2dbl(t, &te, &overflow); if ( (te==t || overflow) && !extreme_val_warned ) @@ -2523,10 +3218,13 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } if (*t != ',') break; } - if ( !l ) bcf_float_set_missing(x[l++]); // An empty field, insert missing value - for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]); + if ( !l ) + // An empty field, insert missing value + bcf_float_set_missing(x[l++]); + for (; l < z->size>>2; ++l) + bcf_float_set_vector_end(x[l]); } else { - hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1); + hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_TAG_INVALID; return -1; } @@ -2547,23 +3245,28 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } } - for (; j < v->n_fmt; ++j) { // fill end-of-vector values + // fill end-of-vector values + for (; j < v->n_fmt; ++j) { fmt_aux_t *z = &fmt[j]; - if ((z->y>>4&0xf) == BCF_HT_STR) { + const int htype = z->y>>4&0xf; + int l; + if (htype == BCF_HT_STR) { if (z->is_gt) { int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); if (z->size) x[0] = bcf_int32_missing; for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; } else { char *x = (char*)z->buf + z->size * (size_t)m; - if ( z->size ) x[0] = '.'; - for (l = 1; l < z->size; ++l) x[l] = 0; + if ( z->size ) { + x[0] = '.'; + memset(&x[1], 0, (z->size-1) * sizeof(*x)); + } } - } else if ((z->y>>4&0xf) == BCF_HT_INT) { + } else if (htype == BCF_HT_INT) { int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); x[0] = bcf_int32_missing; for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; - } else if ((z->y>>4&0xf) == BCF_HT_REAL) { + } else if (htype == BCF_HT_REAL) { float *x = (float*)(z->buf + z->size * (size_t)m); bcf_float_set_missing(x[0]); for (l = 1; 
l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]); @@ -2573,7 +3276,12 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p m++; t++; } - // write individual genotype information + return 0; +} + +// write individual genotype information +static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + const char *p, const char *q, fmt_aux_t *fmt) { kstring_t *str = &v->indiv; int i; if (v->n_sample > 0) { @@ -2597,6 +3305,11 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } } + return 0; +} + +// validity checking +static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) { if ( v->n_sample!=bcf_hdr_nsamples(h) ) { hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)", @@ -2617,6 +3330,65 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p return 0; } +// p,q is the start and the end of the FORMAT field +static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q) +{ + if ( !bcf_hdr_nsamples(h) ) return 0; + kstring_t *mem = (kstring_t*)&h->mem; + mem->l = 0; + + fmt_aux_t fmt[MAX_N_FMT]; + + // detect FORMAT "." + int ret; // +ve = ok, -ve = err + if ((ret = vcf_parse_format_empty1(s, h, v, p, q))) + return ret ? 0 : -1; + + // get format information from the dictionary + if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0) + return -1; + + // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is + // stored as per-type arrays AAA... BBB... CCC... This is basically + // a data rotation or pivot. + + // The size of elements in the array grow to their maximum needed, + // permitting fast random access. This means however we have to first + // scan the whole FORMAT line to find the maximum of each type, and + // then scan it again to find the store the data. + // We break this down into compute-max, allocate, fill-out-buffers + + // TODO: ? 
+ // The alternative would be to pivot on the first pass, with fixed + // size entries for numerics and concatenated strings otherwise, also + // tracking maximum sizes. Then on a second pass we reallocate and + // copy the data again to a uniformly sized array. Two passes through + // memory, but without doubling string parsing. + + // compute max + if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0) + return -1; + + // allocate memory for arrays + if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0) + return -1; + + // fill the sample fields; at beginning of the loop + if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0) + return -1; + + // write individual genotype information + if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0) + return -1; + + // validity checking + if (vcf_parse_format_check7(h, v) < 0) + return -1; + + return 0; +} + static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) { // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has // been already printed, but will enable tools like vcfcheck to proceed. 
@@ -2697,24 +3469,24 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p char *r, *key; khint_t k; vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; - int32_t *a_val = NULL; - + int64_t *a_val = NULL; v->n_info = 0; if (*(q-1) == ';') *(q-1) = 0; for (r = key = p;; ++r) { int c; char *val, *end; - if (*r != ';' && *r != '=' && *r != 0) continue; + while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++; if (v->n_info == UINT16_MAX) { hts_log_error("Too many INFO entries at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; - return -1; + goto fail; } - val = end = 0; + val = end = NULL; c = *r; *r = 0; if (c == '=') { val = r + 1; + for (end = val; *end != ';' && *end != 0; ++end); c = *end; *end = 0; } else end = r; @@ -2732,11 +3504,11 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p if (res < 0) bcf_hrec_destroy(hrec); if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); k = kh_get(vdict, d, key); - v->errcode = BCF_ERR_TAG_UNDEF; + v->errcode |= BCF_ERR_TAG_UNDEF; if (res || k == kh_end(d)) { hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_TAG_INVALID; - return -1; + goto fail; } } uint32_t y = kh_val(d, k).info[BCF_HL_INFO]; @@ -2753,75 +3525,44 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p if (*t == ',') ++n_val; // Check both int and float size in one step for simplicity if (n_val > max_n_val) { - int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val)); + int64_t *a_tmp = (int64_t *)realloc(a_val, n_val * sizeof(*a_val)); if (!a_tmp) { hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; // No appropriate code? 
- return -1; + goto fail; } a_val = a_tmp; max_n_val = n_val; } - if ((y>>4&0xf) == BCF_HT_INT) { + if (((y >> 4 & 0xf) == BCF_HT_INT) || + ((y >> 4 & 0xf) == BCF_HT_LONG)) { i = 0, t = val; int64_t val1; - int is_int64 = 0; -#ifdef VCF_ALLOW_INT64 - if ( n_val==1 ) - { - overflow = 0; - long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow); - if ( te==val ) tmp_val = bcf_int32_missing; - else if ( overflow || tmp_valBCF_MAX_BT_INT64 ) - { - if ( !extreme_int_warned ) - { - hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1); - extreme_int_warned = 1; - } - tmp_val = bcf_int32_missing; - } - else - is_int64 = 1; - val1 = tmp_val; - t = te; - i = 1; // this is just to avoid adding another nested block... - } -#endif - for (; i < n_val; ++i, ++t) - { + for (; i < n_val; ++i, ++t) { overflow = 0; - long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow); - if ( te==t ) tmp_val = bcf_int32_missing; - else if ( overflow || tmp_valBCF_MAX_BT_INT32 ) - { - if ( !extreme_int_warned ) - { - hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1); + long long int tmp_val = hts_str2int( + t, &te, sizeof(tmp_val) * CHAR_BIT, &overflow); + if (te == t) + tmp_val = bcf_int64_missing; + else if (overflow || tmp_val < BCF_MIN_BT_INT64 || + tmp_val > BCF_MAX_BT_INT64) { + if (!extreme_int_warned) { + hts_log_warning( + "Extreme INFO/%s value encountered and set to " + "missing at %s:%" PRIhts_pos, + key, bcf_seqname_safe(h, v), v->pos + 1); extreme_int_warned = 1; } - tmp_val = bcf_int32_missing; + tmp_val = bcf_int64_missing; } a_val[i] = tmp_val; - for (t = te; *t && *t != ','; t++); - } - if (n_val == 1) { -#ifdef VCF_ALLOW_INT64 - if ( is_int64 ) - { - v->unpacked |= BCF_IS_64BIT; - bcf_enc_long1(str, val1); - } - else - bcf_enc_int1(str, (int32_t)val1); -#else - val1 = a_val[0]; - 
bcf_enc_int1(str, (int32_t)val1); -#endif - } else { - bcf_enc_vint(str, n_val, a_val, -1); + for (t = te; *t && *t != ','; t++) + ; } - if (n_val==1 && (val1!=bcf_int32_missing || is_int64) && strcmp(key, "END") == 0) + v->unpacked |= BCF_IS_64BIT; + bcf_enc_vlong(str, n_val, a_val, -1); + val1 = a_val[0]; + if (n_val==1 && val1!=bcf_int64_missing && memcmp(key, "END", 4) == 0)//memcmp instead of strcmp { if ( val1 <= v->pos ) { @@ -2854,16 +3595,26 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p free(a_val); return 0; + + fail: + free(a_val); + return -1; } int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) { - int i = 0, ret = -2, overflow = 0; + int ret = -2, overflow = 0; char *p, *q, *r, *t; kstring_t *str; khint_t k; ks_tokaux_t aux; +//#define NOT_DOT(p) strcmp((p), ".") +//#define NOT_DOT(p) (!(*p == '.' && !p[1])) +//#define NOT_DOT(p) ((*p) != '.' || (p)[1]) +//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2)) +#define NOT_DOT(p) (memcmp(p, ".\0", 2)) + if (!s || !h || !v || !(s->s)) return ret; @@ -2873,86 +3624,136 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) bcf_clear1(v); str = &v->shared; memset(&aux, 0, sizeof(ks_tokaux_t)); - for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) { - q = (char*)aux.p; - *q = 0; - if (i == 0) { // CHROM - vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; - k = kh_get(vdict, d, p); - if (k == kh_end(d)) - { - hts_log_warning("Contig '%s' is not defined in the header. 
(Quick workaround: index the file with tabix.)", p); - v->errcode = BCF_ERR_CTG_UNDEF; - if ((k = fix_chromosome(h, d, p)) == kh_end(d)) { - hts_log_error("Could not add dummy header for contig '%s'", p); - v->errcode |= BCF_ERR_CTG_INVALID; + + // CHROM + if (!(p = kstrtok(s->s, "\t", &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; + k = kh_get(vdict, d, p); + if (k == kh_end(d)) { + hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p); + v->errcode = BCF_ERR_CTG_UNDEF; + if ((k = fix_chromosome(h, d, p)) == kh_end(d)) { + hts_log_error("Could not add dummy header for contig '%s'", p); + v->errcode |= BCF_ERR_CTG_INVALID; + goto err; + } + } + v->rid = kh_val(d, k).id; + + // POS + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + overflow = 0; + char *tmp = p; + v->pos = hts_str2uint(p, &p, 63, &overflow); + if (overflow) { + hts_log_error("Position value '%s' is too large", tmp); + goto err; + } else if ( *p ) { + hts_log_error("Could not parse the position '%s'", tmp); + goto err; + } else { + v->pos -= 1; + } + if (v->pos >= INT32_MAX) + v->unpacked |= BCF_IS_64BIT; + + // ID + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p); + else bcf_enc_size(str, 0, BCF_BT_CHAR); + + // REF + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + bcf_enc_vchar(str, q - p, p); + v->n_allele = 1, v->rlen = q - p; + + // ALT + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + if (NOT_DOT(p)) { + for (r = t = p;; ++r) { + if (*r == ',' || *r == 0) { + if (v->n_allele == UINT16_MAX) { + hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos, + bcf_seqname_safe(h,v), v->pos+1); + v->errcode |= BCF_ERR_LIMITS; goto err; } + bcf_enc_vchar(str, r - t, t); + t = r + 1; + ++v->n_allele; } - v->rid = kh_val(d, k).id; - } else if (i == 1) { // 
POS - overflow = 0; - v->pos = hts_str2uint(p, &p, 63, &overflow); - if (overflow) { - hts_log_error("Position value '%s' is too large", p); - goto err; - } else { - v->pos -= 1; - } - if (v->pos >= INT32_MAX) - v->unpacked |= BCF_IS_64BIT; - } else if (i == 2) { // ID - if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p); - else bcf_enc_size(str, 0, BCF_BT_CHAR); - } else if (i == 3) { // REF - bcf_enc_vchar(str, q - p, p); - v->n_allele = 1, v->rlen = q - p; - } else if (i == 4) { // ALT - if (strcmp(p, ".")) { - for (r = t = p;; ++r) { - if (*r == ',' || *r == 0) { - if (v->n_allele == UINT16_MAX) { - hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos, - bcf_seqname_safe(h,v), v->pos+1); - v->errcode |= BCF_ERR_LIMITS; - goto err; - } - bcf_enc_vchar(str, r - t, t); - t = r + 1; - ++v->n_allele; - } - if (r == q) break; - } - } - } else if (i == 5) { // QUAL - if (strcmp(p, ".")) v->qual = atof(p); - else bcf_float_set_missing(v->qual); - if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR - } else if (i == 6) { // FILTER - if (strcmp(p, ".")) { - if (vcf_parse_filter(str, h, v, p, q)) goto err; - } else bcf_enc_vint(str, 0, 0, -1); - if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT - } else if (i == 7) { // INFO - if (strcmp(p, ".")) { - if (vcf_parse_info(str, h, v, p, q)) goto err; - } - if ( v->max_unpack && !(v->max_unpack>>3) ) goto end; - } else if (i == 8) {// FORMAT - return vcf_parse_format(s, h, v, p, q) == 0 ? 
0 : -2; + if (r == q) break; } } - end: - ret = 0; + // QUAL + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; - err: - return ret; -} + if (NOT_DOT(p)) v->qual = atof(p); + else bcf_float_set_missing(v->qual); + if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR -int vcf_open_mode(char *mode, const char *fn, const char *format) -{ - if (format == NULL) { + // FILTER + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + if (NOT_DOT(p)) { + if (vcf_parse_filter(str, h, v, p, q)) { + goto err; + } + } else bcf_enc_vint(str, 0, 0, -1); + if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT + + // INFO + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + if (NOT_DOT(p)) { + if (vcf_parse_info(str, h, v, p, q)) { + goto err; + } + } + if ( v->max_unpack && !(v->max_unpack>>3) ) goto end; + + // FORMAT; optional + p = kstrtok(0, 0, &aux); + if (p) { + *(q = (char*)aux.p) = 0; + + return vcf_parse_format(s, h, v, p, q) == 0 ? 
0 : -2; + } else { + return 0; + } + + end: + ret = 0; + + err: + return ret; +} + +int vcf_open_mode(char *mode, const char *fn, const char *format) +{ + if (format == NULL) { // Try to pick a format based on the filename extension char extension[HTS_MAX_EXT_LEN]; if (find_file_extension(fn, extension) < 0) return -1; @@ -3223,14 +4024,19 @@ int vcf_write_line(htsFile *fp, kstring_t *line) int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) { - int ret; + ssize_t ret; fp->line.l = 0; if (vcf_format1(h, v, &fp->line) != 0) return -1; - if ( fp->format.compression!=no_compression ) + if ( fp->format.compression!=no_compression ) { + if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) + return -1; + if (fp->idx) + hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf)); ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); - else + } else { ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); + } if (fp->idx) { int tid; @@ -3441,185 +4247,611 @@ int bcf_idx_init(htsFile *fp, bcf_hdr_t *h, int min_shift, const char *fnidx) { return 0; } -// Finishes an index. Call after the last record has been written. -// Returns 0 on success, <0 on failure. -// -// NB: same format as SAM/BAM as it uses bgzf. -int bcf_idx_save(htsFile *fp) { - return sam_idx_save(fp); -} +// Finishes an index. Call after the last record has been written. +// Returns 0 on success, <0 on failure. +// +// NB: same format as SAM/BAM as it uses bgzf. +int bcf_idx_save(htsFile *fp) { + return sam_idx_save(fp); +} + +/***************** + *** Utilities *** + *****************/ + +int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) +{ + int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res; + for (i=0; inhrec; i++) + { + if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value ) + { + int j; + for (j=0; jhrec[j]->type!=BCF_HL_GEN ) continue; + + // Checking only the key part of generic lines, otherwise + // the VCFs are too verbose. 
Should we perhaps add a flag + // to bcf_hdr_combine() and make this optional? + if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break; + } + if ( j>=ndst_ori ) { + res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if (res < 0) return -1; + need_sync += res; + } + } + else if ( src->hrec[i]->type==BCF_HL_STR ) + { + // NB: we are ignoring fields without ID + int j = bcf_hrec_find_key(src->hrec[i],"ID"); + if ( j>=0 ) + { + bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key); + if ( !rec ) { + res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if (res < 0) return -1; + need_sync += res; + } + } + } + else + { + int j = bcf_hrec_find_key(src->hrec[i],"ID"); + assert( j>=0 ); // this should always be true for valid VCFs + + bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL); + if ( !rec ) { + res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if (res < 0) return -1; + need_sync += res; + } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT ) + { + // Check that both records are of the same type. The bcf_hdr_id2length + // macro cannot be used here because dst header is not synced yet. 
+ vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID]; + vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID]; + khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]); + khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]); + if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) ) + { + hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths", + src->hrec[i]->vals[0]); + ret |= 1; + } + if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) ) + { + hts_log_warning("Trying to combine \"%s\" tag definitions of different types", + src->hrec[i]->vals[0]); + ret |= 1; + } + } + } + } + if ( need_sync ) { + if (bcf_hdr_sync(dst) < 0) return -1; + } + return ret; +} + +bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) +{ + if ( !dst ) + { + // this will effectively strip existing IDX attributes from src to become dst + dst = bcf_hdr_init("r"); + kstring_t htxt = {0,0,0}; + if (bcf_hdr_format(src, 0, &htxt) < 0) { + free(htxt.s); + return NULL; + } + if ( bcf_hdr_parse(dst, htxt.s) < 0 ) { + bcf_hdr_destroy(dst); + dst = NULL; + } + free(htxt.s); + return dst; + } + + int i, ndst_ori = dst->nhrec, need_sync = 0, res; + for (i=0; inhrec; i++) + { + if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value ) + { + int j; + for (j=0; jhrec[j]->type!=BCF_HL_GEN ) continue; + + // Checking only the key part of generic lines, otherwise + // the VCFs are too verbose. Should we perhaps add a flag + // to bcf_hdr_combine() and make this optional? 
+ if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break; + } + if ( j>=ndst_ori ) { + res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if (res < 0) return NULL; + need_sync += res; + } + } + else if ( src->hrec[i]->type==BCF_HL_STR ) + { + // NB: we are ignoring fields without ID + int j = bcf_hrec_find_key(src->hrec[i],"ID"); + if ( j>=0 ) + { + bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key); + if ( !rec ) { + res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if (res < 0) return NULL; + need_sync += res; + } + } + } + else + { + int j = bcf_hrec_find_key(src->hrec[i],"ID"); + assert( j>=0 ); // this should always be true for valid VCFs + + bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL); + if ( !rec ) { + res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); + if (res < 0) return NULL; + need_sync += res; + } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT ) + { + // Check that both records are of the same type. The bcf_hdr_id2length + // macro cannot be used here because dst header is not synced yet. 
+ vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID]; + vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID]; + khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]); + khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]); + if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) ) + { + hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths", + src->hrec[i]->vals[0]); + } + if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) ) + { + hts_log_warning("Trying to combine \"%s\" tag definitions of different types", + src->hrec[i]->vals[0]); + } + } + } + } + if ( need_sync ) { + if (bcf_hdr_sync(dst) < 0) return NULL; + } + return dst; +} +typedef union { + uint32_t i; + float f; +} if_pair; + +bcf_hdr_t *vcf_hdr_read_required_sample_line(htsFile *fp, const uint8_t is_sample_line_required) +{ + kstring_t txt, *s = &fp->line; + int ret; + bcf_hdr_t *h; + tbx_t *idx = NULL; + const char **names = NULL; + h = bcf_hdr_init("r"); + if (!h) { + hts_log_error("Failed to allocate bcf header"); + return NULL; + } + txt.l = txt.m = 0; txt.s = 0; + while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) { + int e = 0; + if (s->l == 0) continue; + if (s->s[0] != '#') { + hts_log_error("No sample line"); + goto error; + } + if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here + kstring_t tmp = { 0, 0, NULL }; + hFILE *f = hopen(fp->fn_aux, "r"); + if (f == NULL) { + hts_log_error("Couldn't open \"%s\"", fp->fn_aux); + goto error; + } + while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) { + char *tab = strchr(tmp.s, '\t'); + if (tab == NULL) continue; + e |= (kputs("##contig=\n", 2, &txt) < 0); + } + free(tmp.s); + if (hclose(f) != 0) { + hts_log_error("Error on closing %s", fp->fn_aux); + goto error; + } + if (e) goto error; + } + if (kputsn(s->s, s->l, &txt) < 0) goto error; + if (kputc('\n', &txt) < 0) goto error; + if (s->s[1] != '#') 
break; + } + if ( ret < -1 ) goto error; + if ( !txt.s ) + { + hts_log_error("Could not read the header"); + goto error; + } + size_t hdr_length = 0ull; + if ( bcf_hdr_parse_required_sample_line(h, txt.s, &hdr_length, is_sample_line_required) < 0 ) goto error; + + // check tabix index, are all contigs listed in the header? add the missing ones + idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL); + if ( idx ) + { + int i, n, need_sync = 0; + names = tbx_seqnames(idx, &n); + if (!names) goto error; + for (i=0; ierrcode ) + { + // vcf_parse1() encountered a new contig or tag, undeclared in the + // header. At this point, the header must have been printed, + // proceeding would lead to a broken BCF file. Errors must be checked + // and cleared by the caller before we can proceed. + hts_log_error("Unchecked error (%d)", v->errcode); + return -1; + } + bcf1_sync(v); // check if the BCF record was modified + if(is_bcf) + { + if((offset+8*sizeof(int)+v->shared.l+v->indiv.l) <= capacity) + { + //First 8 integers represent various lengths + if_pair* x = (if_pair*)(buffer+offset); + x[0].i = v->shared.l + 24; // to include six 32-bit integers + x[1].i = v->indiv.l; + x[2].i = v->rid; + x[3].i = v->pos; + x[4].i = v->rlen; + x[5].f = v->qual; + x[6].i = (uint32_t)v->n_allele<<16 | v->n_info; + x[7].i = (uint32_t)v->n_fmt<<24 | v->n_sample; + offset += 8*sizeof(int); + memcpy(buffer+offset, v->shared.s, v->shared.l); + offset += v->shared.l; + memcpy(buffer+offset, v->indiv.s, v->indiv.l); + offset += v->indiv.l; + } + } + else + { + tmp->l = 0; + int status = vcf_format(hdr, v, tmp); + assert(status == 0); + if((offset+tmp->l) <= capacity) + { + memcpy(buffer+offset, tmp->s, tmp->l); + offset += tmp->l; + } + } + return offset; +} + +bcf_hdr_t *bcf_hdr_read_required_sample_line(htsFile *hfp, const uint8_t is_sample_line_required) +{ + if (hfp->format.format == vcf) + return vcf_hdr_read_required_sample_line(hfp, is_sample_line_required); + if 
(hfp->format.format != bcf) { + hts_log_error("Input is not detected as bcf or vcf format"); + return NULL; + } + + assert(hfp->is_bgzf); + + BGZF *fp = hfp->fp.bgzf; + uint8_t magic[5]; + bcf_hdr_t *h; + h = bcf_hdr_init("r"); + if (!h) { + hts_log_error("Failed to allocate bcf header"); + return NULL; + } + if (bgzf_read(fp, magic, 5) != 5) + { + hts_log_error("Failed to read the header (reading BCF in text mode?)"); + bcf_hdr_destroy(h); + return NULL; + } + if (strncmp((char*)magic, "BCF\2\2", 5) != 0) + { + if (!strncmp((char*)magic, "BCF", 3)) + hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported"); + else + hts_log_error("Invalid BCF2 magic string"); + bcf_hdr_destroy(h); + return NULL; + } + uint8_t buf[4]; + size_t hlen; + char *htxt = NULL; + if (bgzf_read(fp, buf, 4) != 4) goto fail; + hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24); + if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; } +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (hlen > FUZZ_ALLOC_LIMIT) { errno = ENOMEM; goto fail; } +#endif + htxt = (char*)malloc(hlen + 1); + if (!htxt) goto fail; + if (bgzf_read(fp, htxt, hlen) != hlen) goto fail; + htxt[hlen] = '\0'; // Ensure htxt is terminated + size_t hdr_length = 0ull; + bcf_hdr_parse_required_sample_line(h, htxt, &hdr_length, is_sample_line_required); // FIXME: Does this return anything meaningful? 
+ free(htxt); + return h; + fail: + hts_log_error("Failed to read BCF header"); + free(htxt); + bcf_hdr_destroy(h); + return NULL; +} + + +int bcf_hdr_parse_required_sample_line(bcf_hdr_t *hdr, char *htxt, size_t* hdr_length, + const uint8_t is_sample_line_required) +{ + int len, done = 0; + char *p = htxt; + int return_val = 0; + + // Check sanity: "fileformat" string must come as first + bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len); + if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") ) + hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?"); + + if (bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } + + // The filter PASS must appear first in the dictionary + hrec = bcf_hdr_parse_line(hdr,"##FILTER=",&len); + if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } + + // Parse the whole header + do { + while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) { + if(len < 0) + { + return_val = -1; + done = -1; + break; + } + if (bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } + p += len; + } + assert(hrec == NULL); + if (len < 0) { + // len < 0 indicates out-of-memory, or similar error + hts_log_error("Could not parse header line: %s", strerror(errno)); + return -1; + } else if (len > 0) { + // Bad header line. bcf_hdr_parse_line() will have logged it. + // Skip and try again on the next line (p + len will be the start + // of the next one). + p += len; + continue; + } + + if(done < 0) + break; + + // Next should be the sample line. If not, it was a malformed + // header, in which case print a warning and skip (many VCF + // operations do not really care about a few malformed lines). + // In the future we may want to add a strict mode that errors in + // this case. 
+ if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) { + char *eol = strchr(p, '\n'); + if (*p != '\0') { + char buffer[320]; + hts_log_warning("Could not parse header line: %s", + hts_strprint(buffer, sizeof(buffer), + '"', p, + eol ? (eol - p) : SIZE_MAX)); + } + if (eol) { + p = eol + 1; // Try from the next line. + } else { + done = -1; // No more lines left, give up. + } + } else { + done = 1; // Sample line found + } + } while (!done); + + size_t sample_line_length = 0; + if (done < 0) { + if(is_sample_line_required) + { + // No sample line is fatal. + hts_log_error("Could not parse the header, sample line not found"); + return -1; + } + } + else + { + if(return_val >= 0) + return_val = bcf_hdr_parse_sample_line(hdr,p); + } + (*hdr_length) = ((size_t)(p - htxt)) + sample_line_length; + if(return_val >= 0) + return_val = bcf_hdr_sync(hdr); + if(return_val >= 0) + bcf_hdr_check_sanity(hdr); + return return_val; +} + -/***************** - *** Utilities *** - *****************/ -int bcf_hdr_combine(bcf_hdr_t *dst, const bcf_hdr_t *src) + + +size_t bcf_hdr_serialize(bcf_hdr_t* h, uint8_t* buffer, size_t offset, const size_t capacity, const uint8_t is_bcf, const uint8_t keep_idx_fields) { - int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res; - for (i=0; inhrec; i++) + if (!h) { + errno = EINVAL; + return offset; + } + if ( h->dirty ) { + if (bcf_hdr_sync(h) < 0) return offset; + } + + kstring_t htxt = {0,0,0}; + bcf_hdr_format(h, (is_bcf & keep_idx_fields), &htxt); + uint32_t hlen = htxt.l; + if(is_bcf) { - if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value ) - { - int j; - for (j=0; jhrec[j]->type!=BCF_HL_GEN ) continue; + kputc('\0', &htxt); // include the \0 byte + ++hlen; - // Checking only the key part of generic lines, otherwise - // the VCFs are too verbose. Should we perhaps add a flag - // to bcf_hdr_combine() and make this optional? 
- if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break; - } - if ( j>=ndst_ori ) { - res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); - if (res < 0) return -1; - need_sync += res; - } - } - else if ( src->hrec[i]->type==BCF_HL_STR ) + if((offset+5+sizeof(int)+hlen) <= capacity) { - // NB: we are ignoring fields without ID - int j = bcf_hrec_find_key(src->hrec[i],"ID"); - if ( j>=0 ) - { - bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key); - if ( !rec ) { - res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); - if (res < 0) return -1; - need_sync += res; - } - } + if(!keep_idx_fields) //htsjdk cannot deal with 2.2 header + memcpy(buffer+offset, "BCF\2\1", 5); + else + memcpy(buffer+offset, "BCF\2\2", 5); + offset += 5; + memcpy(buffer+offset, &hlen, sizeof(int)); + offset += sizeof(int); + memcpy(buffer+offset, htxt.s, hlen); + offset += hlen; } - else + } + else + { + if(offset+hlen <= capacity) { - int j = bcf_hrec_find_key(src->hrec[i],"ID"); - assert( j>=0 ); // this should always be true for valid VCFs - - bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL); - if ( !rec ) { - res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); - if (res < 0) return -1; - need_sync += res; - } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT ) - { - // Check that both records are of the same type. The bcf_hdr_id2length - // macro cannot be used here because dst header is not synced yet. 
- vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID]; - vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID]; - khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]); - khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]); - if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) ) - { - hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths", - src->hrec[i]->vals[0]); - ret |= 1; - } - if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) ) - { - hts_log_warning("Trying to combine \"%s\" tag definitions of different types", - src->hrec[i]->vals[0]); - ret |= 1; - } - } + memcpy(buffer+offset, htxt.s, hlen); + offset += hlen; } } - if ( need_sync ) { - if (bcf_hdr_sync(dst) < 0) return -1; - } - return ret; + free(htxt.s); + return offset; } -bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) +size_t bcf_hdr_deserialize(bcf_hdr_t* h, const uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf) { - if ( !dst ) + size_t hdr_length = 0ull; + size_t curr_offset = offset; + if(is_bcf) { - // this will effectively strip existing IDX attributes from src to become dst - dst = bcf_hdr_init("r"); - kstring_t htxt = {0,0,0}; - if (bcf_hdr_format(src, 0, &htxt) < 0) { - free(htxt.s); - return NULL; - } - if ( bcf_hdr_parse(dst, htxt.s) < 0 ) { - bcf_hdr_destroy(dst); - dst = NULL; + //magic string + hdr length + if(curr_offset+BCF_HEADER_MAGIC_STRING_LENGTH+sizeof(int) > capacity) + return offset; + const char* buffer_magic_string = (const char*)(buffer+curr_offset); + if(strncmp(buffer_magic_string, BCF_V_2_2_HEADER_MAGIC_STRING, BCF_HEADER_MAGIC_STRING_LENGTH) != 0 + && strncmp(buffer_magic_string, BCF_V_2_1_HEADER_MAGIC_STRING, BCF_HEADER_MAGIC_STRING_LENGTH) != 0) + { + fprintf(stderr,"[%s:%d %s] invalid BCF2 magic string: only BCFv2.2 and BCFv2.1 are supported.\n", __FILE__,__LINE__,__FUNCTION__); + return 
offset; } - free(htxt.s); - return dst; - } + curr_offset += BCF_HEADER_MAGIC_STRING_LENGTH; + //Header length + memcpy(&hdr_length, buffer+curr_offset, sizeof(int)); + curr_offset += sizeof(int); + if(curr_offset+hdr_length > capacity) + return offset; + } + return bcf_hdr_parse(h, (char*)(buffer+curr_offset)); +} - int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res; - for (i=0; inhrec; i++) +size_t bcf_deserialize(bcf1_t* v, uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf, const bcf_hdr_t* hdr) +{ + if(is_bcf) { - if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value ) - { - int j; - for (j=0; jhrec[j]->type!=BCF_HL_GEN ) continue; - - // Checking only the key part of generic lines, otherwise - // the VCFs are too verbose. Should we perhaps add a flag - // to bcf_hdr_combine() and make this optional? - if ( !strcmp(src->hrec[i]->key,dst->hrec[j]->key) ) break; - } - if ( j>=ndst_ori ) { - res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); - if (res < 0) return NULL; - need_sync += res; - } - } - else if ( src->hrec[i]->type==BCF_HL_STR ) + bcf_clear(v); + size_t curr_offset = offset; + if(curr_offset+8*sizeof(uint32_t) >= capacity) + return offset; + const if_pair* x = (if_pair*)(buffer+curr_offset); + size_t shared_length = x[0].i-6*sizeof(int); + size_t indiv_length = x[1].i; + if(curr_offset+8*sizeof(uint32_t)+shared_length+indiv_length > capacity) + return offset; + ks_resize(&v->shared, shared_length); + ks_resize(&v->indiv, indiv_length); + v->rid = x[2].i; + v->pos = x[3].i; + v->rlen = x[4].i; + v->qual = x[5].f; + v->n_allele = (x[6].i)>>16; v->n_info = (x[6].i)&0xffff; + v->n_fmt = (x[7].i)>>24; v->n_sample = (x[7].i)&0xffffff; + v->shared.l = shared_length, v->indiv.l = indiv_length; + // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4 + if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0; + curr_offset += 8*sizeof(uint32_t); + + 
memcpy(v->shared.s, buffer+curr_offset, shared_length); + curr_offset += shared_length; + + memcpy(v->indiv.s, buffer+curr_offset, indiv_length); + curr_offset += indiv_length; + return curr_offset; + } + else + { + kstring_t tmp; + assert(offset < capacity); + tmp.s = (char*)(buffer+offset); + size_t max_length = capacity-offset; + size_t line_length = max_length; + //See if newline exists + char* line_end_ptr = (char*)(memchr(tmp.s, '\n', max_length)); + if(line_end_ptr) { - // NB: we are ignoring fields without ID - int j = bcf_hrec_find_key(src->hrec[i],"ID"); - if ( j>=0 ) - { - bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], src->hrec[i]->key); - if ( !rec ) { - res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); - if (res < 0) return NULL; - need_sync += res; - } - } + line_length = ((size_t)(line_end_ptr - tmp.s)); + *line_end_ptr = 0; //replace '\n' with null byte, vcf_parse doesn't like '\n' } + tmp.l = line_length; + tmp.m = max_length; + int status = vcf_parse(&tmp, hdr, v); + //vcf parsed successfully + if(status == 0) + return offset + line_length + (line_end_ptr ? 1u : 0u); //for the \n character else - { - int j = bcf_hrec_find_key(src->hrec[i],"ID"); - assert( j>=0 ); // this should always be true for valid VCFs - - bcf_hrec_t *rec = bcf_hdr_get_hrec(dst, src->hrec[i]->type, "ID", src->hrec[i]->vals[j], NULL); - if ( !rec ) { - res = bcf_hdr_add_hrec(dst, bcf_hrec_dup(src->hrec[i])); - if (res < 0) return NULL; - need_sync += res; - } else if ( src->hrec[i]->type==BCF_HL_INFO || src->hrec[i]->type==BCF_HL_FMT ) - { - // Check that both records are of the same type. The bcf_hdr_id2length - // macro cannot be used here because dst header is not synced yet.
- vdict_t *d_src = (vdict_t*)src->dict[BCF_DT_ID]; - vdict_t *d_dst = (vdict_t*)dst->dict[BCF_DT_ID]; - khint_t k_src = kh_get(vdict, d_src, src->hrec[i]->vals[0]); - khint_t k_dst = kh_get(vdict, d_dst, src->hrec[i]->vals[0]); - if ( (kh_val(d_src,k_src).info[rec->type]>>8 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>8 & 0xf) ) - { - hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths", - src->hrec[i]->vals[0]); - ret |= 1; - } - if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) ) - { - hts_log_warning("Trying to combine \"%s\" tag definitions of different types", - src->hrec[i]->vals[0]); - ret |= 1; - } - } - } - } - if ( need_sync ) { - if (bcf_hdr_sync(dst) < 0) return NULL; + return offset; } - return dst; } int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line) @@ -3627,7 +4859,8 @@ int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line) int i; if ( line->errcode ) { - hts_log_error("Unchecked error (%d) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_seqname_safe(src_hdr,line), line->pos+1); + char errordescription[1024] = ""; + hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)), bcf_seqname_safe(src_hdr,line), line->pos+1); exit(1); } if ( src_hdr->ntransl==-1 ) return 0; // no need to translate, all tags have the same id @@ -3988,25 +5221,33 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t if ( alt[0]=='<' ) { if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant + if( strncmp(alt, "", 9) == 0) { var->n = 0; var->type = VCF_NON_REF; return; } if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; } var->type = 
VCF_OTHER; return; } + // Catch "joined before" breakend case + if ( alt[0]==']' || alt[0] == '[' ) + { + var->type = VCF_BND; return; + } + + // Iterate through alt characters that match the reference const char *r = ref, *a = alt; while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; } // unfortunately, matching REF,ALT case is not guaranteed if ( *a && !*r ) { - if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } + if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend while ( *a ) a++; - var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return; + var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return; } else if ( *r && !*a ) { while ( *r ) r++; - var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return; + var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return; } else if ( !*r && !*a ) { @@ -4021,13 +5262,13 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t { if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; } var->n = -(re-r); - if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL; return; } + if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; } var->type = VCF_OTHER; return; } else if ( re==r ) { var->n = ae-a; - if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL; return; } + if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; } var->type = VCF_OTHER; return; } @@ -4043,7 +5284,10 @@ static int bcf_set_variant_types(bcf1_t *b) bcf_dec_t *d = &b->d; if ( d->n_var < b->n_allele ) { - d->var = (bcf_variant_t *) realloc(d->var, sizeof(bcf_variant_t)*b->n_allele); + bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*b->n_allele); + if (!new_var) + return -1; + d->var = new_var; d->n_var = b->n_allele; } int i; @@ -4059,15 +5303,80 @@ static int bcf_set_variant_types(bcf1_t *b) return 0; } +// bcf_get_variant_type/bcf_get_variant_types should only return the following, +// 
to be compatible with callers that are not expecting newer values +// like VCF_INS, VCF_DEL. The full set is available from the newer +// vcf_has_variant_type* interfaces. +#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_NON_REF) int bcf_get_variant_types(bcf1_t *rec) { - if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); - return rec->d.var_type; + if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) { + hts_log_error("Couldn't get variant types: %s", strerror(errno)); + exit(1); // Due to legacy API having no way to report failures + } + } + return rec->d.var_type & ORIG_VAR_TYPES; } + int bcf_get_variant_type(bcf1_t *rec, int ith_allele) { - if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); - return rec->d.var[ith_allele].type; + if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) { + hts_log_error("Couldn't get variant types: %s", strerror(errno)); + exit(1); // Due to legacy API having no way to report failures + } + } + if (ith_allele < 0 || ith_allele >= rec->n_allele) { + hts_log_error("Requested allele outside valid range"); + exit(1); + } + return rec->d.var[ith_allele].type & ORIG_VAR_TYPES; +} +#undef ORIG_VAR_TYPES + +int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask) +{ + if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) return -1; + } + if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1; + if (bitmask == VCF_REF) { // VCF_REF is 0, so handled as a special case + return rec->d.var[ith_allele].type == VCF_REF; + } + return bitmask & rec->d.var[ith_allele].type; +} + +int bcf_variant_length(bcf1_t *rec, int ith_allele) +{ + if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing; + } + if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing; + return rec->d.var[ith_allele].n; +} + +int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask, + enum bcf_variant_match mode) +{ + 
if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) return -1; + } + uint32_t type = rec->d.var_type; + if ( mode==bcf_match_overlap ) return bitmask & type; + + // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may + // ask for say `VCF_INS` or `VCF_INDEL` only + if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL; + else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL); + + if ( mode==bcf_match_subset ) + { + if ( ~bitmask & type ) return 0; + else return bitmask & type; + } + // mode == bcf_match_exact + return type==bitmask ? type : 0; } int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) @@ -4138,11 +5447,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v #ifdef VCF_ALLOW_INT64 else if ( type==BCF_HT_LONG ) { - if (n != 1) { - hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1); - abort(); - } - bcf_enc_long1(&str, *(int64_t *) values); + bcf_enc_vlong(&str, n, (const int64_t*)values, -1); } #endif else @@ -4155,7 +5460,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v if ( inf ) { // Is it big enough to accommodate new block? 
- if ( str.l <= inf->vptr_len + inf->vptr_off ) + if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off ) { if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF; uint8_t *ptr = inf->vptr - inf->vptr_off; @@ -4312,7 +5617,7 @@ int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const else { // The tag is already present, check if it is big enough to accommodate the new block - if ( str.l <= fmt->p_len + fmt->p_off ) + if ( fmt->p && str.l <= fmt->p_len + fmt->p_off ) { // good, the block is big enough if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1; @@ -4433,28 +5738,64 @@ static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nal int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals) { if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR); - kstring_t tmp = {0,0,0}; char *free_old = NULL; - - // If the supplied alleles are not pointers to line->d.als, the existing block can be reused. + char buffer[256]; + size_t used = 0; + + // The pointers in alleles may point into the existing line->d.als memory, + // so care needs to be taken not to clobber them while updating. Usually + // they will be short so we can copy through an intermediate buffer. + // If they're longer, or won't fit in the existing allocation we + // can allocate a new buffer to write into. Note that in either case + // pointers to line->d.als memory in alleles may not be valid when we've + // finished. int i; - for (i=0; i=line->d.als && alleles[i]d.als+line->d.m_als ) break; - if ( i==nals ) - { - // all alleles point elsewhere, reuse the existing block - tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als; - } - else + size_t avail = line->d.m_als < sizeof(buffer) ? 
line->d.m_als : sizeof(buffer); + for (i=0; id.m_als) // Don't shrink the buffer + needed = line->d.m_als; + if (needed > INT_MAX) { + hts_log_error("REF + alleles too long to fit in a BCF record"); + return -1; + } + new_als = malloc(needed); + if (!new_als) + return -1; free_old = line->d.als; + line->d.als = new_als; + line->d.m_als = needed; + } - for (i=0; id.m_als); + memcpy(line->d.als, buffer, used); } - line->d.als = tmp.s; line->d.m_als = tmp.m; - free(free_old); + + // Add in any remaining entries - if this happens we will always be + // writing to a newly-allocated buffer. + for (; i < nals; i++) { + size_t sz = strlen(alleles[i]) + 1; + memcpy(line->d.als + used, alleles[i], sz); + used += sz; + } + + if (free_old) + free(free_old); return _bcf1_sync_alleles(hdr,line,nals); } @@ -4560,7 +5901,12 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi { int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header - if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2; // expected different type + if((type & 0xff) == BCF_HT_LONG) { + const int ht_type_in_hdr = bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id); + if(ht_type_in_hdr != BCF_HT_INT && ht_type_in_hdr != BCF_HT_LONG) return -2; // expected different type + } + else + if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2; // expected different type if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); @@ -4633,6 +5979,14 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi } else { BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; } + case BCF_BT_INT64: + if (type == BCF_HT_LONG) { + BRANCH(int64_t, le_to_i64, p==bcf_int64_missing, p==bcf_int64_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); + } else { + 
hts_log_error("Trying to get 32-bit int data from a field which contains 64 bit values"); + return -2; + } + break; case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break; default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2; } @@ -4742,9 +6096,102 @@ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, v case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; + case BCF_BT_INT64: BRANCH(int64_t, le_to_i64, p==bcf_int64_missing, p==bcf_int64_vector_end, *tmp=bcf_int64_missing, *tmp=bcf_int64_vector_end, *tmp=p, int64_t); break; case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break; default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1); } #undef BRANCH return nsmpl*fmt->n; } + +//error description structure definition +typedef struct err_desc { + int errorcode; + const char *description; +}err_desc; + +// error descriptions +static const err_desc errdesc_bcf[] = { + { BCF_ERR_CTG_UNDEF, "Contig not defined in header"}, + { BCF_ERR_TAG_UNDEF, "Tag not defined in header" }, + { BCF_ERR_NCOLS, "Incorrect number of columns" }, + { BCF_ERR_LIMITS, "Limits reached" }, + { BCF_ERR_CHAR, "Invalid character" }, + { 
BCF_ERR_CTG_INVALID, "Invalid contig" }, + { BCF_ERR_TAG_INVALID, "Invalid tag" }, +}; + +/// append given description to buffer based on available size and add ... when not enough space + /** @param buffer buffer to which description to be appended + @param offset offset at which to be appended + @param maxbuffer maximum size of the buffer + @param description the description to be appended +on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site +on success returns 0 + */ +static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) { + + if (!description || !buffer || !offset || (maxbuffer < 4)) + return -1; + + size_t rembuffer = maxbuffer - *offset; + if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) { //add description with optionally required ',' + *offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description); + } else { //not enough space for description, put ... + size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset; + snprintf(buffer + tmppos, 4, "..."); //ignore offset update + return -1; + } + return 0; +} + +//get description for given error code. return NULL on error +const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) { + size_t usedup = 0; + int ret = 0; + int idx; + + if (!buffer || maxbuffer < 4) + return NULL; //invalid / insufficient buffer + + if (!errorcode) { + buffer[0] = '\0'; //no error, set null + return buffer; + } + + for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) { + if (errorcode & errdesc_bcf[idx].errorcode) { //error is set, add description + ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description); + if (ret < 0) + break; //not enough space, ... 
added, no need to continue + + errorcode &= ~errdesc_bcf[idx].errorcode; //reset the error + } + } + + if (errorcode && (ret >= 0)) { //undescribed error is present in error code and had enough buffer, try to add unknown error as well + add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error"); + } + return buffer; +} + +uint64_t bcf_hdr_id2contig_length(const bcf_hdr_t* hdr, const int id) +{ + bcf_hrec_t* hrec = bcf_hdr_id2hrec(hdr, BCF_DT_CTG, 0, id); + int i = 0; + for(i=0;inkeys;++i) + if(strcmp(hrec->keys[i], "length") == 0) + return strtoull(hrec->vals[i], 0, 10); + return 0; +} + +void bcf_set_end_point_from_info(const bcf_hdr_t* hdr, bcf1_t* line) +{ + bcf_unpack(line, BCF_UN_INFO); + bcf_info_t* info = bcf_get_info(hdr, line, "END"); + if(info) + line->m_end_point = info->v1.i - 1; //END value is 1 based, line->pos is 0 based, change to 0 based + else //no END tag, end is same as pos if not deletion, else depends on rlen + line->m_end_point = line->pos + line->rlen - 1; +} + diff --git a/vcfutils.c b/vcfutils.c index aff5cdd55..890c50a16 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -1,6 +1,6 @@ /* vcfutils.c -- allele-related utility functions. - Copyright (C) 2012-2018, 2020 Genome Research Ltd. + Copyright (C) 2012-2018, 2020-2022 Genome Research Ltd. Author: Petr Danecek @@ -53,6 +53,17 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) } if ( an>=0 && ac_ptr ) { + if ( ac_len != line->n_allele - 1 ) + { + static int warned = 0; + if ( !warned ) + { + hts_log_warning("Incorrect number of AC fields at %s:%"PRIhts_pos". 
(This message is printed only once.)\n", + header->id[BCF_DT_CTG][line->rid].key, line->pos+1); + warned = 1; + } + return 0; + } int nac = 0; #define BRANCH_INT(type_t, convert) { \ for (i=0; in_allele, sizeof(int)); uint8_t *dat = NULL; + bcf_unpack(line, BCF_UN_ALL); + // create map of indexes from old to new ALT numbering and modify ALT kstring_t str = {0,0,0}; kputs(line->d.allele[0], &str); @@ -496,8 +509,8 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb } // Update GT fields, the allele indexes might have changed - for (i=1; in_allele; i++) if ( map[i]!=i ) break; - if ( in_allele ) + for (i=1; ipos+1, al, nR_ori, map[al]); goto err; } - ptr[j] = (map[al]+1)<<1 | (ptr[j]&1); + // if an allele other than the reference is mapped to 0, it has been removed, + // so translate it to 'missing', while preserving the phasing bit + ptr[j] = ((al>0 && !map[al]) ? bcf_gt_missing : (map[al]+1)<<1) | (ptr[j]&1); } ptr += nret; } @@ -732,7 +747,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nnew = nR_new; } - #define BRANCH(type_t,is_vector_end) \ + #define BRANCH(type_t,is_vector_end,set_missing) \ { \ for (j=0; jn_sample; j++) \ { \ @@ -742,7 +757,12 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int k_src, k_dst = 0; \ for (k_src=0; k_src # -# Copyright (C) 2017-2018 Genome Research Ltd. +# Copyright (C) 2017-2018, 2021 Genome Research Ltd. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal @@ -24,13 +24,18 @@ # DEALINGS IN THE SOFTWARE. 
# Master version, for use in tarballs or non-git source copies -VERSION=1.10.2 +VERSION=1.18 # If we have a git clone, then check against the current tag -if [ -e .git ] +srcdir=${0%/version.sh} +if [ -e $srcdir/.git ] then # If we ever get to 10.x this will need to be more liberal - VERSION=`git describe --match '[0-9].[0-9]*' --dirty` + v=`cd $srcdir && git describe --always --match '[0-9].[0-9]*' --dirty` + case $v in + [0-9]*.[0-9]*) VERSION="$v" ;; + [0-9a-f][0-9a-f]*) VERSION="$VERSION-1-g$v" ;; + esac fi # Numeric version is for use in .dylib or .so libraries @@ -43,7 +48,7 @@ then v1=`expr "$VERSION" : '\([0-9]*\)'` v2=`expr "$VERSION" : '[0-9]*.\([0-9]*\)'` v3=`expr "$VERSION" : '[0-9]*.[0-9]*.\([0-9]*\)'` - if [ -z "`expr "$VERSION" : '^\([0-9.]*\)$'`" ] + if [ -z "`expr "$VERSION" : '\([0-9.]*\)$'`" ] then VERSION="$v1.$v2.255" else