From d50a3a531c597a24644c6878fc0068d0d1cc0a0d Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Mon, 14 Sep 2020 20:49:22 +0100 Subject: [PATCH 001/488] News update Sept 2020. --- NEWS | 167 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 167 insertions(+) diff --git a/NEWS b/NEWS index 290183012..1299753dc 100644 --- a/NEWS +++ b/NEWS @@ -1,15 +1,182 @@ + + Noteworthy changes in release a.b ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Features and Updates +-------------------- + +* Support added for remote reference files. fai_path() can take a remote + reference file and will return the corresponding index file. Remote indexes + can be handle by refs_load_fai(). UR tags in @SQ lines can now be set to + remote URIs. + * Added tabix --separate-regions option, which lets the user group the output by the number of corresponding target regions supplied in the command line. +* Added tabix --cache option to set a BGZF block cache size. Most beneficial + when the -R option is used and the same blocks need to be re-read multiple + times. + * New method vcf_open_mode() changes the opening mode of a variant call file, based on its file extension. Similar to sam_open_mode(). +* VCF parser improvements to for greater code clarity and speed. + +* Support for simultaneous reading of unindexed VCF/BCF files. Input files must + have the chromosomes in the same order as each other and be consistent with + the order sequences in the header. + * hts_srand48() now seeds the same POSIX-standard sequences of pseudo-random numbers regardless of platform, including on OpenBSD where plain srand48() produces a different cryptographically-strong non-deterministic sequence. + +* Iterator now works with 64 bit positions. + +* Alignments which consume no reference bases are now considered to have + length 1. This would make such alignments cover 1 reference position in the + same manner as alignments that are unmapped or have no CIGAR strings. 
These + alignments can now be returned by iterator based queries. Thanks to John + Marshall. (Fixes samtools/#1240, see also hts-specs#521). + +* Speed up bcf_check_record(), giving a 15% speed increase on an uncompressed + BCF file using gcc9. + +* Added a bam_set_seqi() function to modify a single base in the BAM structure. + Companion function to bam_seqi(). + +* Improved SAM writing speed by around 30%. + +* SAM threading improvements in error handling and memory management. + +* Added sam_format_aux1() which returns a BAM aux tag to SAM format string. + +* Improved bam_aux_update_str() by no longer requiring NUL-terminated strings, + reducing the number of memmove() calls needed and adding more tag size tests. + + +CRAM improvements +----------------- + +* Delay CRAM crc32 checks to block decompression step. CRC is not validated + until the data needs to be used. With other changes this leads to a + 20x speed up in indexing and other sub-query based actions. + +* CRAM now handles the transition from mapped to unmapped data in a better way. + +* CRAM can now use libdeflate. + +* Harden rANS decoding against malicious or random input. (#510, #511, #512 and + #513) + +* CRAM fix for MD tag generation with "b" FC. + +* Fixed the multi-threaded CRAM multi-region iterator inefficiency regression + by taking a different approach. It now works more like the single iterator + and does not preemptively decode the next container unless it will be used. + (#1061) + +* Set CRAM default quality in lossy quality modes. If lossy quality is enabled + and 'B', 'q' or 'Q' features are used, CRAM starts off with QUAL being all 255 + (as per BAM spec and "*" quality) and then modifies individual qualities as + dictated by the specific features. + + However that then produces ASCII quality (q=-1) for the unmodified bases. + Instead ASCII quality "?" (q=30) is used, as per htsjdk. Quality 255 is still + used for sequences with no modifications at all. 
+ + +Build changes +------------- + +These are compiler, configuration and makefile based changes. + +* Fixes to pass cppcheck-1.90 warnings. (#995) + +* Unify kroundup macros and fix gcc warnings. + +* Ensure HTSlib prefers its own headers when being compiled. Thanks to + John Marshall. (#347) + +* Link plugins against libhts.so/.dylib and fix dynamically unloading HTSlib. + Thanks to John Marshall. + +* Define HTSLIB_EXPORT without using a helper macro. Thanks to John Marshall. + +* Fix dirty default build by including latest pkg.m4 instead of using + aclocal.m4. Thanks to Damien Zammit. + +* Make kvsprintf format checking platform configurable. + +* Add struct tags to htslib/*.h public typedefs. This makes it possible to + to forward declare htsFile without including htslib/hts.h. Thanks to + John Marshall. (#1106) + +* Have realloc and free in the same layer of kstring. This prevents compilers + using different malloc() and free() implementations. (#1114) + + +Bug fixes +--------- + +* Fixed missing return parameter value in idx_test_and_fetch(). Thanks to + Lilian Janin. + +* Fixed nuances between gzip vs bgzf compressed files. [fuzz] + +* Added and/or fixed byte swapping code for big-endian platforms. Thanks + to Jun Aruga, John Marshall, Michael R Crusoe and Gianfranco Costamagna + for their help. (#119 and #355) + +* In sam_hdr_create(), free newly allocated SN strings when encountering an + error. [fuzz] + +* Prevent double free in case of idx_test_and_fetch() failure. (#1033) + +* Prevent negative rlen on erroneous INFO/END when reading BCF. (bcftools/#1154) + +* In the header, link a new PG line only to valid chains. (samtools #1235) + +* Also in the header, when calling sam_hdr_update_line(), update target arrays + only when name or length is changed. (#1007) + +* Fixed buffer overflow in hts_md5_update(). [fuzz] + +* Fix mpileup regression between 1.9 and 1.10. + +* kputs() now checks for null pointer in source string. 
[fuzz] + +* Error message typo "Number=R" instead of "Number=G" fixed in + bcf_remove_allele_set(). Thanks to Ilya Vorontsov. + +* Fixes potential bcf_update_alleles() crash on 0 alleles. Thanks to + John Marshall. (#994) + +* Fixed hfile_libcurl breakage when using libcurl 7.69.1 or later. Thanks to + John Marshall for tracking down the exact libcurl change that caused the + incompatibility. (samtools/#1254 and samtools/#1284) + +* Download index files atomically in idx_test_and_fetch(). This prevents + corruption when running parallel jobs on S3 files. Thanks to John Marshall. + (samtools/#1242). + +* The pileup constructor now called with its own copy of bam1_t struct. + +* Verify that the BCF input is within the length of its assigned type. [fuzz] + +* Fixed possible error in code_sort() on negative CRAM Huffman code length. + +* Fixed possible undefined shift in cram_byte_array_stop_decode_init(). + +* Fix interval end comparison in CRAM iterator method. (samtools/#1173) + +* Fix integer overflow in cram_read_slice. [fuzz] + +* Fix parameter passed to cram_name2id. (#977) + +* Fix memory leak on failure in cram_decode_slice(). [fuzz] + +* Fixed incomplete decode to encode conversion in CRAM. (samtools/#1276) Noteworthy changes in release 1.10.2 (19th December 2019) From 1748a2dd34e5ff4b4800fcf04dbd5860d8c30f22 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 16 Sep 2020 10:10:46 +0100 Subject: [PATCH 002/488] More NEWS updates --- NEWS | 270 ++++++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 183 insertions(+), 87 deletions(-) diff --git a/NEWS b/NEWS index 1299753dc..32cee4051 100644 --- a/NEWS +++ b/NEWS @@ -8,73 +8,127 @@ Features and Updates * Support added for remote reference files. fai_path() can take a remote reference file and will return the corresponding index file. Remote indexes - can be handle by refs_load_fai(). UR tags in @SQ lines can now be set to - remote URIs. + can be handled by refs_load_fai(). 
UR tags in @SQ lines can now be set to + remote URIs. (#1017) * Added tabix --separate-regions option, which lets the user group the output by the number of corresponding target regions supplied in the command line. + (#1108) * Added tabix --cache option to set a BGZF block cache size. Most beneficial when the -R option is used and the same blocks need to be re-read multiple - times. + times. (#1053) -* New method vcf_open_mode() changes the opening mode of a variant call file, - based on its file extension. Similar to sam_open_mode(). +* Improved error checking in tabix and added a --verbosity option so + it is possible to change the amount of logging when it runs. (#1040) -* VCF parser improvements to for greater code clarity and speed. +* A note about the maximum chromosome length usable with TBI indexes has been + added to the tabix manual page. Thanks to John Marshall. (#1070) -* Support for simultaneous reading of unindexed VCF/BCF files. Input files must - have the chromosomes in the same order as each other and be consistent with - the order sequences in the header. +* New method vcf_open_mode() changes the opening mode of a variant file + based on its file extension. Similar to sam_open_mode(). (#1096) + +* The VCF parser has been made faster and easier to maintain. (#1057) + +* bcf_record_check() has been made faster, giving a 15% speed increase when + reading an uncompressed BCF file. (#1130) + +* The VCF parser now recognises the "<NON_REF>" symbolic allele produced + by GATK. (#1045) + +* Support has been added for simultaneous reading of unindexed VCF/BCF files + when using the synced_bcf_reader interface. Input files must have the + chromosomes in the same order as each other and be consistent with the order + of sequences in the header. (#1089) + +* The VCF and BCF readers will now attempt to fix up invalid INFO/END tags + where the stored END value is less than POS, resulting in an apparently + negative record length. 
Such files have been generated by programs which + used END incorrectly, and by broken lift-over processes that failed to + update any END tags present. (#1021; fixed samtools/bcftools#1154) + +* The htsFile interface can now detect the crypt4gh encrypted format (see + https://samtools.github.io/hts-specs/crypt4gh.pdf). If HTSlib is + built with external plug-in support, and the hfile_crypt4gh plug-in is + present, the file will be passed to it for decryption. The plug-in + can be obtained from https://github.com/samtools/htslib-crypt4gh. (#1046) * hts_srand48() now seeds the same POSIX-standard sequences of pseudo-random numbers regardless of platform, including on OpenBSD where plain srand48() produces a different cryptographically-strong non-deterministic sequence. - -* Iterator now works with 64 bit positions. + Thanks to John Marshall. (#1002) + +* Iterators now work with 64 bit positions. (#1018) + +* Improved the speed of range queries when using BAI indexes by + making better use of the linear index data included in the file. + The best improvement is on low-coverage data. (#1031) * Alignments which consume no reference bases are now considered to have length 1. This would make such alignments cover 1 reference position in the same manner as alignments that are unmapped or have no CIGAR strings. These alignments can now be returned by iterator based queries. Thanks to John - Marshall. (Fixes samtools/#1240, see also hts-specs#521). + Marshall. (#1063; fixed samtools/samtools#1240, see also + samtools/hts-specs#521). + +* A bam_set_seqi() function to modify a single base in the BAM structure + has been added. This is a companion function to bam_seqi(). (#1022) -* Speed up bcf_check_record(), giving a 15% speed increase on an uncompressed - BCF file using gcc9. +* Writing SAM format is around 30% faster. (#1035) -* Added a bam_set_seqi() function to modify a single base in the BAM structure. - Companion function to bam_seqi(). 
+* Added sam_format_aux1() which converts a BAM aux tag to a SAM format string. + (#1134) -* Improved SAM writing speed by around 30%. +* bam_aux_update_str() no longer requires NUL-terminated strings. It + is also now possible to create tags containing part of a longer string. + (#1088) -* SAM threading improvements in error handling and memory management. +* It is now possible to use external plug-ins in language bindings that + dynamically load HTSlib. Note that a side-effect of this change is that + some plug-ins now link against libhts.so, which means that have to be able + to find the shared library when they are started up. Thanks to + John Marshall. (#1072) -* Added sam_format_aux1() which returns a BAM aux tag to SAM format string. +* bgzf_close(), and therefore hts_close(), will now return non-zero when + closing a bgzf handle on which errors have been detected. (Part of #1117) -* Improved bam_aux_update_str() by no longer requiring NUL-terminated strings, - reducing the number of memmove() calls needed and adding more tag size tests. +* Added a special case to the kt_fisher_exact() test for when the table + probability is too small to be represented in a double. This fixes a + bug where it would, for some inputs, fail to correctly determine which + side of the distribution the table was on resulting in swapped p-values + being returned for the left- and right-tailed tests. The two-tailed + test value was not affected by this problem. (#1126) +* Improved error diagnostics in the CRAM decoder (#1042), BGZF (#1049), + the VCF and BCF readers (#1059), and the SAM parser (#1073) + +* ks_resize() now allocates 1.5 times the requested size when it needs + to expand a kstring instead of rounding up to the next power of two. + This has been done mainly to make the inlined function smaller, but it + also reduces the overhead of storing data in kstrings at the expense of + possibly needing a few more reallocations. 
(#1129) CRAM improvements ----------------- -* Delay CRAM crc32 checks to block decompression step. CRC is not validated - until the data needs to be used. With other changes this leads to a - 20x speed up in indexing and other sub-query based actions. - -* CRAM now handles the transition from mapped to unmapped data in a better way. +* Delay CRAM crc32 checks until the data actually needs to be used. With + other changes this leads to a 20x speed up in indexing and other sub-query + based actions. (#988) -* CRAM can now use libdeflate. +* CRAM now handles the transition from mapped to unmapped data in a better + way, improving compression of the unmapped data. (#961) -* Harden rANS decoding against malicious or random input. (#510, #511, #512 and - #513) +* CRAM can now use libdeflate. (#961) -* CRAM fix for MD tag generation with "b" FC. +* Fixed bug in MD tag generation with "b" read feature codes, causing the + numbers in the tag to be too large. Note that HTSlib never uses this + feature code so it is unlikely that this bug would be seen on real data. + The problem was found when testing against hand-crafted CRAM files. (#1086) -* Fixed the multi-threaded CRAM multi-region iterator inefficiency regression - by taking a different approach. It now works more like the single iterator +* Fixed a regression where the CRAM multi-region iterator became much less + efficient when using threads. It now works more like the single iterator and does not preemptively decode the next container unless it will be used. - (#1061) + (#1061) * Set CRAM default quality in lossy quality modes. If lossy quality is enabled and 'B', 'q' or 'Q' features are used, CRAM starts off with QUAL being all 255 @@ -83,101 +137,143 @@ CRAM improvements However that then produces ASCII quality (q=-1) for the unmodified bases. Instead ASCII quality "?" (q=30) is used, as per htsjdk. Quality 255 is still - used for sequences with no modifications at all. 
+ used for sequences with no modifications at all. (#1094) Build changes ------------- -These are compiler, configuration and makefile based changes. +These are compiler, configuration and makefile based changes. -* Fixes to pass cppcheck-1.90 warnings. (#995) +* make all now builds htslib_static.mk and htslib-uninstalled.pc. Thanks to + John Marshall. (#1011) -* Unify kroundup macros and fix gcc warnings. +* Various cppcheck-1.90 warnings have been fixed. (#995, #1011) -* Ensure HTSlib prefers its own headers when being compiled. Thanks to - John Marshall. (#347) +* HTSlib now prefers its own headers when being compiled, fixing build + failures on machines that already had a system-installed HTSlib. Thanks to + John Marshall. (#1078; fixed #347) -* Link plugins against libhts.so/.dylib and fix dynamically unloading HTSlib. - Thanks to John Marshall. +* Define HTSLIB_EXPORT without using a helper macro to reduce the length of + compiler diagnostics that mention exported functions. Thanks to + John Marshall. (#1029) -* Define HTSLIB_EXPORT without using a helper macro. Thanks to John Marshall. +* Fix dirty default build by including latest pkg.m4 instead of using + aclocal.m4. Thanks to Damien Zammit. (#1091) -* Fix dirty default build by including latest pkg.m4 instead of using - aclocal.m4. Thanks to Damien Zammit. +* struct tags have been added to htslib/*.h public typedefs. This makes it + possible to forward declare htsFile without including htslib/hts.h. Thanks + to John Marshall. (#1106) -* Make kvsprintf format checking platform configurable. - -* Add struct tags to htslib/*.h public typedefs. This makes it possible to - to forward declare htsFile without including htslib/hts.h. Thanks to - John Marshall. (#1106) - -* Have realloc and free in the same layer of kstring. This prevents compilers - using different malloc() and free() implementations. 
(#1114) +* Fixed compiler warnings emitted by the latest gcc and clang releases + when compiling HTSlib, along with some -Wextra warnings in the public + include files. Thanks to John Marshall. (#1066, #1063, #1083) +* A large number of spelling mistakes have been fixed. Thanks to + John Marshall. (#1137) Bug fixes --------- +* Fixed hfile_libcurl breakage when using libcurl 7.69.1 or later. Thanks to + John Marshall for tracking down the exact libcurl change that caused the + incompatibility. (#1105; fixed samtools/samtools#1254 and + samtools/samtools#1284) + +* Fixed overflows kroundup32() and kroundup_size_t() which caused them to + return zero when rounding up values where the most significant bit was + set. When this happens they now return the highest value that can + be stored (#1044). All of the kroundup macro definitions have also been + gathered together into a unified implementation (#1051). + * Fixed missing return parameter value in idx_test_and_fetch(). Thanks to - Lilian Janin. - -* Fixed nuances between gzip vs bgzf compressed files. [fuzz] + Lilian Janin. (#1014) + +* Fixed crashes due to inconsistent selection between bgzf and plain (hfile) + interfaces when reading files. [fuzz] (#1019) * Added and/or fixed byte swapping code for big-endian platforms. Thanks to Jun Aruga, John Marshall, Michael R Crusoe and Gianfranco Costamagna - for their help. (#119 and #355) - + for their help. (#1023; fixed #119 and #355) + +* Fixed a problem with multi-threaded on-the-fly indexes which would + occasionally write virtual offsets pointing at the end of a BGZF block. + Attempting to read from such an offset caused EOF to be incorrectly + reported. These offsets are now handled correctly, and the indexer + has been updated to avoid generating them. (#1028; fixed + samtools/samtools#1197) + * In sam_hdr_create(), free newly allocated SN strings when encountering an - error. [fuzz] - -* Prevent double free in case of idx_test_and_fetch() failure. 
(#1033) + error. [fuzz] (#1034) -* Prevent negative rlen on erroneous INFO/END when reading BCF. (bcftools/#1154) +* Prevent double free in case of idx_test_and_fetch() failure. Thanks to + @fanwayne for the bug report. (#1047; fixed #1033) -* In the header, link a new PG line only to valid chains. (samtools #1235) +* In the header, link a new PG line only to valid chains. Prevents an + explosive growth of PG lines on headers where PG lines are already present + but not linked together correctly. (#1062; fixed samtools/samtools#1235) * Also in the header, when calling sam_hdr_update_line(), update target arrays - only when name or length is changed. (#1007) - -* Fixed buffer overflow in hts_md5_update(). [fuzz] + only when the name or length is changed. (#1007) + +* Fixed buffer overflows in CRAM MD5 calculation triggered by + files with invalid compression headers, or files with embedded + references that were one byte too short. [fuzz] (#1024, #1068) -* Fix mpileup regression between 1.9 and 1.10. +* Fix mpileup regression between 1.9 and 1.10 where overlap detection + was incorrectly skipped on reads where RNEXT, PNEXT and TLEN were + set to the "unavailable" values ("*", 0, 0 in SAM). (#1097) -* kputs() now checks for null pointer in source string. [fuzz] +* kputs() now checks for null pointer in source string. [fuzz] (#1087) -* Error message typo "Number=R" instead of "Number=G" fixed in - bcf_remove_allele_set(). Thanks to Ilya Vorontsov. - -* Fixes potential bcf_update_alleles() crash on 0 alleles. Thanks to +* Fix potential bcf_update_alleles() crash on 0 alleles. Thanks to John Marshall. (#994) -* Fixed hfile_libcurl breakage when using libcurl 7.69.1 or later. Thanks to - John Marshall for tracking down the exact libcurl change that caused the - incompatibility. (samtools/#1254 and samtools/#1284) +* Added bcf_unpack() calls to some bcf_update functions to fix a bug + where updates made after a call to bcf_dup() could be lost. 
(#1032; + fixed #1030) -* Download index files atomically in idx_test_and_fetch(). This prevents - corruption when running parallel jobs on S3 files. Thanks to John Marshall. - (samtools/#1242). - -* The pileup constructor now called with its own copy of bam1_t struct. +* Error message typo "Number=R" instead of "Number=G" fixed in + bcf_remove_allele_set(). Thanks to Ilya Vorontsov. (#1100) -* Verify that the BCF input is within the length of its assigned type. [fuzz] +* Fixed crashes that could occur in BCF files that use IDX= header annotations + to create a sparse set of CHROM, FILTER or FORMAT indexes, and + include records that use one of the missing index values. [fuzz] (#1092) + +* Fixed potential integer overflows in the VCF parser and ensured that + the total length of FORMAT fields cannot go over 2Gbytes. [fuzz] (#1044, + #1104) + +* Download index files atomically in idx_test_and_fetch(). This prevents + corruption when running parallel jobs on S3 files. Thanks to John Marshall. + (#1112; samtools/samtools#1242). -* Fixed possible error in code_sort() on negative CRAM Huffman code length. +* The pileup constructor callback is now given the copy of the bam1_t struct + made by pileup instead of the original one passed to bam_plp_push(). This + makes it the same as the one passed to the destructor and ensures that + cached data, for example the location of an aux tag, will remain valid. + (#1127) -* Fixed possible undefined shift in cram_byte_array_stop_decode_init(). +* Fixed possible error in code_sort() on negative CRAM Huffman code + length. (#1008) -* Fix interval end comparison in CRAM iterator method. (samtools/#1173) +* Fixed possible undefined shift in cram_byte_array_stop_decode_init(). (#1009) -* Fix integer overflow in cram_read_slice. [fuzz] +* Fixed a bug where range queries to the end of a given reference + would return incorrect results on CRAM files. (#1016; + fixed samtools/samtools#1173) -* Fix parameter passed to cram_name2id. 
(#977) +* Fixed an integer overflow in cram_read_slice. [fuzz] (#1026) -* Fix memory leak on failure in cram_decode_slice(). [fuzz] +* Fixed a memory leak on failure in cram_decode_slice(). [fuzz] (#1054) -* Fixed incomplete decode to encode conversion in CRAM. (samtools/#1276) +* Fixed a regression which caused cram_transcode_rg() to fail, resulting + in a crash in "samtools cat" on CRAM files. (#1093; + fixed samtools/samtools#1276) +* Fixed an undersized string reallocation in the threaded SAM reader which + caused it to crash when reading SAM files with very long lines. Numerous + memory allocation checks have also been added. (#1117) Noteworthy changes in release 1.10.2 (19th December 2019) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 75e2017bb097cb47722ac8796de10036d21c9625 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 17 Sep 2020 09:43:36 +0100 Subject: [PATCH 003/488] Minor NEWS updates Reword tabix --separate-regions description. Fix PR#1094 text -- the original has in the description (edit the PR comment to see it). Credit @lczech. Trivial formatting fixes. Spelling mistakes PR surely too trivial to mention :-). --- NEWS | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/NEWS b/NEWS index 32cee4051..ede94289b 100644 --- a/NEWS +++ b/NEWS @@ -11,9 +11,9 @@ Features and Updates can be handled by refs_load_fai(). UR tags in @SQ lines can now be set to remote URIs. (#1017) -* Added tabix --separate-regions option, which lets the user group the output by - the number of corresponding target regions supplied in the command line. - (#1108) +* Added tabix --separate-regions option, which adds header comment lines + separating different regions' output records when multiple target regions + are supplied on the command line. (#1108) * Added tabix --cache option to set a BGZF block cache size. 
Most beneficial when the -R option is used and the same blocks need to be re-read multiple @@ -56,7 +56,7 @@ Features and Updates * hts_srand48() now seeds the same POSIX-standard sequences of pseudo-random numbers regardless of platform, including on OpenBSD where plain srand48() produces a different cryptographically-strong non-deterministic sequence. - Thanks to John Marshall. (#1002) + Thanks to John Marshall. (#1002) * Iterators now work with 64 bit positions. (#1018) @@ -65,10 +65,10 @@ Features and Updates The best improvement is on low-coverage data. (#1031) * Alignments which consume no reference bases are now considered to have - length 1. This would make such alignments cover 1 reference position in the - same manner as alignments that are unmapped or have no CIGAR strings. These - alignments can now be returned by iterator based queries. Thanks to John - Marshall. (#1063; fixed samtools/samtools#1240, see also + length 1. This would make such alignments cover 1 reference position in + the same manner as alignments that are unmapped or have no CIGAR strings. + These alignments can now be returned by iterator-based queries. Thanks + to John Marshall. (#1063; fixed samtools/samtools#1240, see also samtools/hts-specs#521). * A bam_set_seqi() function to modify a single base in the BAM structure @@ -100,7 +100,7 @@ Features and Updates test value was not affected by this problem. (#1126) * Improved error diagnostics in the CRAM decoder (#1042), BGZF (#1049), - the VCF and BCF readers (#1059), and the SAM parser (#1073) + the VCF and BCF readers (#1059), and the SAM parser (#1073). * ks_resize() now allocates 1.5 times the requested size when it needs to expand a kstring instead of rounding up to the next power of two. @@ -135,9 +135,9 @@ CRAM improvements (as per BAM spec and "*" quality) and then modifies individual qualities as dictated by the specific features. - However that then produces ASCII quality (q=-1) for the unmodified bases. 
- Instead ASCII quality "?" (q=30) is used, as per htsjdk. Quality 255 is still - used for sequences with no modifications at all. (#1094) + However that then produces ASCII quality " " (space, q=-1) for the unmodified + bases. Instead ASCII quality "?" (q=30) is used, as per HTSJDK. Quality 255 + is still used for sequences with no modifications at all. (#1094) Build changes @@ -145,8 +145,8 @@ Build changes These are compiler, configuration and makefile based changes. -* make all now builds htslib_static.mk and htslib-uninstalled.pc. Thanks to - John Marshall. (#1011) +* `make all` now also builds htslib_static.mk and htslib-uninstalled.pc. + Thanks to John Marshall. (#1011) * Various cppcheck-1.90 warnings have been fixed. (#995, #1011) @@ -161,17 +161,14 @@ These are compiler, configuration and makefile based changes. * Fix dirty default build by including latest pkg.m4 instead of using aclocal.m4. Thanks to Damien Zammit. (#1091) -* struct tags have been added to htslib/*.h public typedefs. This makes it +* Struct tags have been added to htslib/*.h public typedefs. This makes it possible to forward declare htsFile without including htslib/hts.h. Thanks - to John Marshall. (#1106) + to Lucas Czech and John Marshall. (#1115; fixed #1106) * Fixed compiler warnings emitted by the latest gcc and clang releases when compiling HTSlib, along with some -Wextra warnings in the public include files. Thanks to John Marshall. (#1066, #1063, #1083) -* A large number of spelling mistakes have been fixed. Thanks to - John Marshall. (#1137) - Bug fixes --------- @@ -263,7 +260,7 @@ Bug fixes would return incorrect results on CRAM files. (#1016; fixed samtools/samtools#1173) -* Fixed an integer overflow in cram_read_slice. [fuzz] (#1026) +* Fixed an integer overflow in cram_read_slice(). [fuzz] (#1026) * Fixed a memory leak on failure in cram_decode_slice(). [fuzz] (#1054) @@ -275,6 +272,7 @@ Bug fixes caused it to crash when reading SAM files with very long lines. 
Numerous memory allocation checks have also been added. (#1117) + + Noteworthy changes in release 1.10.2 (19th December 2019) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 80161279b755d83df6e0dbc52942f625df414452 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 18 Sep 2020 11:44:04 +0100 Subject: [PATCH 004/488] Add back assert.h to vcf.h. This undoes a change in #1060. It's a philosophical point. On one hand we have an inclusion of a spurious file that we do not utilise ourselves. On the other hand we have an API breakage where old software (in this case bcftools 1.10) attempting to compile against current htslib no longer succeeds because we've changed which symbols we declare. This is unclean, but I think backwards compatibility trumps tidiness in this case. --- htslib/vcf.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/htslib/vcf.h b/htslib/vcf.h index fb5106e17..659ddc7c5 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -41,6 +41,9 @@ DEALINGS IN THE SOFTWARE. */ #include "hts_defs.h" #include "hts_endian.h" +/* Included only for backwards compatibility with e.g. bcftools 1.10 */ +#include <assert.h> + #ifdef __cplusplus extern "C" { #endif From cd0a84b490e910c085b987e3026f724be98349bf Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 22 Sep 2020 13:14:15 +0100 Subject: [PATCH 005/488] More minor NEWS fixes --- NEWS | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/NEWS b/NEWS index ede94289b..e82ddff6f 100644 --- a/NEWS +++ b/NEWS @@ -85,12 +85,12 @@ Features and Updates * It is now possible to use external plug-ins in language bindings that dynamically load HTSlib. Note that a side-effect of this change is that - some plug-ins now link against libhts.so, which means that have to be able - to find the shared library when they are started up. Thanks to + some plug-ins now link against libhts.so, which means that they have to be + able to find the shared library when they are started up. Thanks to John Marshall. 
(#1072) * bgzf_close(), and therefore hts_close(), will now return non-zero when - closing a bgzf handle on which errors have been detected. (Part of #1117) + closing a BGZF handle on which errors have been detected. (Part of #1117) * Added a special case to the kt_fisher_exact() test for when the table probability is too small to be represented in a double. This fixes a @@ -186,7 +186,7 @@ Bug fixes * Fixed missing return parameter value in idx_test_and_fetch(). Thanks to Lilian Janin. (#1014) -* Fixed crashes due to inconsistent selection between bgzf and plain (hfile) +* Fixed crashes due to inconsistent selection between BGZF and plain (hFILE) interfaces when reading files. [fuzz] (#1019) * Added and/or fixed byte swapping code for big-endian platforms. Thanks From 3c1ea37046e701afa1dffd07ea72e10b9b9eb8e2 Mon Sep 17 00:00:00 2001 From: Andrew Patterson Date: Thu, 24 Sep 2020 15:58:49 +1000 Subject: [PATCH 006/488] Update hts.c Include unistd to guarantee access to R_OK definition --- hts.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hts.c b/hts.c index 4552b5a80..d0ca8a5f1 100644 --- a/hts.c +++ b/hts.c @@ -31,6 +31,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include <unistd.h> #include #include #include From 450c91271ecf688150c4f35d75c4a8d0381c848d Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 25 Sep 2020 17:16:43 +0100 Subject: [PATCH 007/488] Include and directly where needed Similarly to the previous commit, sam.c needs <unistd.h> explicitly for environments in which it doesn't get it via or similar. Both htslib/sam.h and test/test-vcf-api.c use errno but get it only semi-accidentally via htslib/kstring.h. Include <errno.h> explicitly, and similarly <string.h> for strerror(). Various test/*.c programs that use getopt() need include only <unistd.h> rather than <getopt.h>, which is needed only for getopt_long(). 
--- htslib/sam.h | 1 + sam.c | 1 + test/plugins-dlhts.c | 2 +- test/test-vcf-api.c | 2 ++ test/test_index.c | 2 +- test/test_kstring.c | 2 +- test/test_realn.c | 2 +- test/test_str2int.c | 2 +- test/test_view.c | 1 - 9 files changed, 9 insertions(+), 6 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index f3b684291..9e595ae12 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE. */ #ifndef HTSLIB_SAM_H #define HTSLIB_SAM_H +#include #include #include "hts.h" #include "hts_endian.h" diff --git a/sam.c b/sam.c index 02cd6092e..7a58e361e 100644 --- a/sam.c +++ b/sam.c @@ -35,6 +35,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include +#include // Suppress deprecation message for cigar_tab, which we initialise #include "htslib/hts_defs.h" diff --git a/test/plugins-dlhts.c b/test/plugins-dlhts.c index 0e4638d1f..aa98ef3f5 100644 --- a/test/plugins-dlhts.c +++ b/test/plugins-dlhts.c @@ -37,9 +37,9 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include #include #include +#include #ifndef EPROTONOSUPPORT #define EPROTONOSUPPORT ENOSYS diff --git a/test/test-vcf-api.c b/test/test-vcf-api.c index 65912e3eb..87bce4aab 100644 --- a/test/test-vcf-api.c +++ b/test/test-vcf-api.c @@ -24,7 +24,9 @@ DEALINGS IN THE SOFTWARE. */ #include +#include #include +#include #include "../htslib/hts.h" #include "../htslib/vcf.h" diff --git a/test/test_index.c b/test/test_index.c index 0740427ab..402879666 100644 --- a/test/test_index.c +++ b/test/test_index.c @@ -24,7 +24,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include +#include #include "../htslib/sam.h" #include "../htslib/vcf.h" diff --git a/test/test_kstring.c b/test/test_kstring.c index 5923ba2a7..8dcce6b5b 100644 --- a/test/test_kstring.c +++ b/test/test_kstring.c @@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include -#include +#include #include "../htslib/kstring.h" diff --git a/test/test_realn.c b/test/test_realn.c index b96ef9d59..3f511704b 100644 --- a/test/test_realn.c +++ b/test/test_realn.c @@ -28,8 +28,8 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include #include +#include #include "../htslib/sam.h" #include "../htslib/hts.h" diff --git a/test/test_str2int.c b/test/test_str2int.c index 6345fdc4c..70e799169 100644 --- a/test/test_str2int.c +++ b/test/test_str2int.c @@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include +#include #include "../textutils_internal.h" diff --git a/test/test_view.c b/test/test_view.c index 604ec42c6..30e604610 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -29,7 +29,6 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include #include #include "../cram/cram.h" From 30d82d6bdd6ab2bfc00d81d325a678dd3b3b5e48 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Wed, 30 Sep 2020 09:37:45 +0100 Subject: [PATCH 008/488] Variable name used in dlsym corrected. The hfile irods plugin depends on a constructed symbol name which was no longer being used. --- plugin.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugin.c b/plugin.c index dbbe03eab..d5c1981ca 100644 --- a/plugin.c +++ b/plugin.c @@ -153,7 +153,7 @@ plugin_void_func *load_plugin(void **pluginp, const char *filename, const char * const char *basename = slash? slash+1 : filename; kputsn(basename, strcspn(basename, ".-+"), &symbolg); - *(void **) &sym = dlsym(lib, symbol); + *(void **) &sym = dlsym(lib, symbolg.s); free(symbolg.s); if (sym == NULL) goto error; } From 14a83590dd5944ba6b5b5f7cbd85237623bd40a1 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 6 Oct 2020 10:13:30 +0100 Subject: [PATCH 009/488] Replace the indexing range check error message. (#1151) This code detects when the end position is too large for the current index. 
It then computes an adjusted n_lvls parameter and tells the user to switch to CSI with this parameter. That has numerous failings though. 1. The only user-adjustable parameter is min-shift, not levels, so the advice is impossible to follow. 2. CSI auto-scales based on SQ headers, so we don't need to be explicit anyway (assuming we currently have a BAI index that doesn't fit). 3. If we already were using CSI index on SAM/BAM or VCF/BCF and if we did have the configuration options to do what it asks, it wouldn't fix the problem. Given CSI auto-scales, if we see this message it's because the alignments don't match the headers; either the header has been replaced by something inappropriate or there are bugs in the aligner that has emitted rogue POS fields. There is no fix to the index that actually fixes the data; it's simply brushing the problem under the carpet. --- hts.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/hts.c b/hts.c index d0ca8a5f1..afa71987f 100644 --- a/hts.c +++ b/hts.c @@ -1865,26 +1865,16 @@ int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) int64_t maxpos = (int64_t) 1 << (idx->min_shift + idx->n_lvls * 3); if (tid < 0 || (beg <= maxpos && end <= maxpos)) return 0; - int64_t max = end > beg ? end : beg, s = 1 << 14; - int n_lvls = 0; - while (max > s) { - n_lvls++; - s <<= 3; - } if (idx->fmt == HTS_FMT_CSI) { - hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" cannot be stored in a csi index " - "with min_shift = %d, n_lvls = %d. Try using " - "min_shift = 14, n_lvls >= %d", - beg, end, - idx->min_shift, idx->n_lvls, - n_lvls); + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos + " cannot be stored in a csi index. " + "Please check headers match the data", + beg, end); } else { - hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" cannot be stored in a %s index. 
" - "Try using a csi index with min_shift = 14, " - "n_lvls >= %d", - beg, end, idx_format_name(idx->fmt), - n_lvls); + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos + " cannot be stored in a %s index. Try using a csi index", + beg, end, idx_format_name(idx->fmt)); } errno = ERANGE; return -1; From ff495a966fd59e29e3d3d176c6d854b346dc6e0e Mon Sep 17 00:00:00 2001 From: Anders Kaplan Date: Tue, 29 Sep 2020 19:49:23 +0200 Subject: [PATCH 010/488] Bug fix: handling of CRLF line terminators in sam_parse_worker(). --- sam.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/sam.c b/sam.c index 7a58e361e..321787089 100644 --- a/sam.c +++ b/sam.c @@ -2508,9 +2508,17 @@ static void *sam_parse_worker(void *arg) { // However this is an API change so for now we copy. char *nl = strchr(cp, '\n'); - if (!nl) nl = cp_end; - if (*nl) *nl++ = '\0'; - kstring_t ks = {nl-cp, gl->alloc, cp}; + char *line_end; + if (nl) { + line_end = nl; + if (line_end > cp && *(line_end - 1) == '\r') + line_end--; + nl++; + } else { + nl = line_end = cp_end; + } + *line_end = '\0'; + kstring_t ks = { line_end - cp, gl->alloc, cp }; if (sam_parse1(&ks, fd->h, &b[i]) < 0) { sam_state_err(fd, errno ? errno : EIO); cleanup_sp_lines(gl); From ed198a55b6a7e6dbcfe53b22f7cc8b3c21086301 Mon Sep 17 00:00:00 2001 From: Anders Kaplan Date: Tue, 29 Sep 2020 19:50:44 +0200 Subject: [PATCH 011/488] Added test cases for kgetline() and kgetline2(). 
--- test/test_kstring.c | 118 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 118 insertions(+) diff --git a/test/test_kstring.c b/test/test_kstring.c index 8dcce6b5b..ee913a2e3 100644 --- a/test/test_kstring.c +++ b/test/test_kstring.c @@ -261,6 +261,118 @@ static int test_kputw(int64_t start, int64_t end) { return 0; } +// callback used by test_kgetline +static char *mock_fgets(char *str, int num, void *p) { + int *mock_state = (int*)p; + (*mock_state)++; + switch (*mock_state) { + case 1: + case 4: + case 7: + // a few characters, no endline + strcpy(str, "ABCD"); + break; + case 2: + case 3: + // \n endline + strcpy(str, "\n"); + break; + case 5: + case 6: + // \r\n endline + strcpy(str, "\r\n"); + break; + default: + // eof + return 0; + } + + return str; +} + +static int test_kgetline() { + kstring_t s = KS_INITIALIZE; + int mock_state = 0; + + // normal line, \n terminated, called with non-empty s + kputs("_", &s); + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("_ABCD", s.s) || 5 != s.l) return -1; + s.l = 0; + // empty line, \n terminated + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("", s.s) || 0 != s.l) return -1; + s.l = 0; + // normal line, \r\n terminated + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("ABCD", s.s) || 4 != s.l) return -1; + s.l = 0; + // empty line, \r\n terminated + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("", s.s) || 0 != s.l) return -1; + s.l = 0; + // line terminated by EOF + if (0 != kgetline(&s, mock_fgets, &mock_state) || 0 != strcmp("ABCD", s.s) || 4 != s.l) return -1; + s.l = 0; + // EOF + if (EOF != kgetline(&s, mock_fgets, &mock_state) || 0 != s.l) return -1; + + ks_free(&s); + return EXIT_SUCCESS; +} + +// callback used by test_kgetline2 +static ssize_t mock_fgets2(char *str, size_t num, void *p) { + int *mock_state = (int*)p; + (*mock_state)++; + switch (*mock_state) { + case 1: + case 4: + case 7: + // a few characters, no endline + 
strcpy(str, "ABCD"); + break; + case 2: + case 3: + // \n endline + strcpy(str, "\n"); + break; + case 5: + case 6: + // \r\n endline + strcpy(str, "\r\n"); + break; + default: + // eof + return 0; + } + + return strlen(str); +} + +static int test_kgetline2() { + kstring_t s = KS_INITIALIZE; + int mock_state = 0; + + // normal line, \n terminated, called with non-empty s + kputs("_", &s); + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("_ABCD", s.s) || 5 != s.l) return -1; + s.l = 0; + // empty line, \n terminated + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("", s.s) || 0 != s.l) return -1; + s.l = 0; + // normal line, \r\n terminated + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("ABCD", s.s) || 4 != s.l) return -1; + s.l = 0; + // empty line, \r\n terminated + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("", s.s) || 0 != s.l) return -1; + s.l = 0; + // line terminated by EOF + if (0 != kgetline2(&s, mock_fgets2, &mock_state) || 0 != strcmp("ABCD", s.s) || 4 != s.l) return -1; + s.l = 0; + // EOF + if (EOF != kgetline2(&s, mock_fgets2, &mock_state) || 0 != s.l) return -1; + + ks_free(&s); + return EXIT_SUCCESS; +} + int main(int argc, char **argv) { int opt, res = EXIT_SUCCESS; int64_t start = 0; @@ -301,5 +413,11 @@ int main(int argc, char **argv) { if (!test || strcmp(test, "kputw") == 0) if (test_kputw(start, end) != 0) res = EXIT_FAILURE; + if (!test || strcmp(test, "kgetline") == 0) + if (test_kgetline() != 0) res = EXIT_FAILURE; + + if (!test || strcmp(test, "kgetline2") == 0) + if (test_kgetline2() != 0) res = EXIT_FAILURE; + return res; } From 81eca5e695e957ef6844d619b44246abe91d9eef Mon Sep 17 00:00:00 2001 From: Anders Kaplan Date: Tue, 29 Sep 2020 19:51:25 +0200 Subject: [PATCH 012/488] Improved documentation of kgetline() and kgetline2() in the header file. 
--- htslib/kstring.h | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/htslib/kstring.h b/htslib/kstring.h index ed9bdeeaa..150757ca6 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -109,18 +109,21 @@ extern "C" { HTSLIB_EXPORT char *kstrtok(const char *str, const char *sep, ks_tokaux_t *aux); - /* kgetline() uses the supplied fgets()-like function to read a "\n"- - * or "\r\n"-terminated line from fp. The line read is appended to the - * kstring without its terminator and 0 is returned; EOF is returned at - * EOF or on error (determined by querying fp, as per fgets()). */ - typedef char *kgets_func(char *, int, void *); + /* kgetline() uses the supplied fgets()-like function to read a "\n"- + * or "\r\n"-terminated line from fp. The line read is appended to the + * kstring without its terminator and 0 is returned; EOF is returned at + * EOF or on error (determined by querying fp, as per fgets()). */ + typedef char *kgets_func(char *, int, void *); HTSLIB_EXPORT - int kgetline(kstring_t *s, kgets_func *fgets, void *fp); + int kgetline(kstring_t *s, kgets_func *fgets_fn, void *fp); - // This matches the signature of hgetln(), apart from the last pointer - typedef ssize_t kgets_func2(char *, size_t, void *); + /* kgetline2() uses the supplied hgetln()-like function to read a "\n"- + * or "\r\n"-terminated line from fp. The line read is appended to the + * ksring without its terminator and 0 is returned; EOF is returned at + * EOF or on error (determined by querying fp, as per fgets()). */ + typedef ssize_t kgets_func2(char *, size_t, void *); HTSLIB_EXPORT - int kgetline2(kstring_t *s, kgets_func2 *fgets, void *fp); + int kgetline2(kstring_t *s, kgets_func2 *fgets_fn, void *fp); #ifdef __cplusplus } From 2264113e5df1946210828e45d29c605915bd3733 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Wed, 7 Oct 2020 13:00:00 +0100 Subject: [PATCH 013/488] Added new test and data for CRLF line endings. 
--- .gitattributes | 4 + test/index_dos.sam | 190 +++++++++++++++++++++++++++++++++++++++++++++ test/test.pl | 8 ++ 3 files changed, 202 insertions(+) create mode 100644 test/index_dos.sam diff --git a/.gitattributes b/.gitattributes index efd67e585..e46cc5cb4 100644 --- a/.gitattributes +++ b/.gitattributes @@ -16,3 +16,7 @@ README.md export-ignore # line separators on Windows machines. It causes the index files to become out # of sync with the fasta files. *.fa* -text + +# Remove the text attribute from index_dos.sam, so that the line separators +# for the test file don't get converted into Unix format. +test/index_dos.sam -text diff --git a/test/index_dos.sam b/test/index_dos.sam new file mode 100644 index 000000000..b006aa77d --- /dev/null +++ b/test/index_dos.sam @@ -0,0 +1,190 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:CHROMOSOME_I LN:1009800 M5:8ede36131e0dbf3417807e48f77f3ebd +@SQ SN:CHROMOSOME_II LN:5000 M5:8e7993f7a93158587ee897d7287948ec +@SQ SN:CHROMOSOME_III LN:5000 M5:3adcb065e1cf74fafdbba1e8c352b323 +@SQ SN:CHROMOSOME_IV LN:5000 M5:251af66a69ee589c9f3757340ec2de6f +@SQ SN:CHROMOSOME_V LN:5000 M5:cf200a65fb754836dcc56b24b3170ee8 +@SQ SN:CHROMOSOME_X LN:5000 M5:6f9368fd2192c89c613718399d2d31fc +@SQ SN:CHROMOSOME_MtDNA LN:5000 M5:cd05857ece6411f40257a565ccfe15bb +@PG ID:bowtie2 PN:bowtie2 VN:2.0.0-beta5 +SRR065390.17240207 16 CHROMOSOME_I 999901 42 100M * 0 0 ATGTTTACAGGACTTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAG CACAC?CBBAA@?@?BADDBBDBBAB>DDDBBDDABBBCCADDDDDCBCBCCCDBDDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.15493040 0 CHROMOSOME_I 999912 42 100M * 0 0 ACTTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDBCCBDBCCBDDA@>DC?5@?@@??:><<>8>39<37 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.6144221 0 CHROMOSOME_I 
999914 42 100M * 0 0 TTCAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCDCCCCBDCDDBBDDBDBDD@BBB@DBABDB AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.8057275 0 CHROMOSOME_I 999916 42 100M * 0 0 CAAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTT CCCCCCCBCCC@CCCCCCCCCCC>BBB>BB?4CCCCCC;>====ACCCA@CCCBBCCBC;>@==>BBBBA?<;@<@######################## AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24679913 16 CHROMOSOME_I 999917 42 100M * 0 0 AAGCAGAGGATTTTTCGATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTT ==56>??>AB?>D>?A?DBDABBB=BDBDACDBBCCDBBBBDDCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.25513175 0 CHROMOSOME_I 999934 42 100M * 0 0 ATGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCBC@CADCDDAABA=B?=A=B.>AA?AADA########################## AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17492782 0 CHROMOSOME_I 999935 42 100M * 0 0 TGATTGCCAAAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCDCDCCDCCBDCDDBDDBDD@BBBBBBACBBAB=AB>BBBAB>?BA@CAAA? 
AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17146364 16 CHROMOSOME_I 999942 42 100M * 0 0 CAAAAATTGTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAG #######@/A@@<:BBBBB>ABBDADC@=DDBDDDCDCCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-3 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:8T91 YT:Z:UU +SRR065390.14459471 16 CHROMOSOME_I 999944 42 100M * 0 0 AAAATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGC @@@@=B@CCCBAABACCC@DCCCCCDCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.6968616 16 CHROMOSOME_I 999947 42 100M * 0 0 ATTTTGGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTA BDB>B@DDDD@DDDDBCACB@DCBCCACCCCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.9052825 16 CHROMOSOME_I 999952 42 100M * 0 0 GGAACTTTTATAGGCTTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACC ?B;DABDABDDBDDADCCCD@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.22926164 0 CHROMOSOME_I 999967 42 100M * 0 0 TTAAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBCCBCCCCCCCCDCCDCDDDDCCDACDCADBDDBBCBCBCCABBA@BABABCBABC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.27108093 16 CHROMOSOME_I 999969 42 100M * 0 0 AAGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACN ##########AAAAA388333-533')''+AA8AAAAAAAAAA8AAAAAA67788AAAA888887AAA5AAAAAAAAAAAA8AAAAAAAA+*++)))))! 
AS:i:-1 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:99C0 YT:Z:UU +SRR065390.19145675 0 CHROMOSOME_I 999970 42 100M * 0 0 AGCTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCBCCCCCCCCCCADCBDBBCBBBBBDCBABBBABAABB??DDAACCAACC>AC?C?= AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.22660118 16 CHROMOSOME_I 999972 42 100M * 0 0 CTTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCC B9ABABDB>DBBBD8CBDCDBCDBCDBCBCCBCCCCCCCCCCCCCCC>CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1589310 0 CHROMOSOME_I 999973 42 100M * 0 0 NTATGGTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCG !++((22221AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA7A8AAAAAAAA8AAAAAAAAAAAAA7A7AA768655 AS:i:-1 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:0T99 YT:Z:UU +SRR065390.32984687 0 CHROMOSOME_I 999978 42 100M * 0 0 GTTATGTTTAGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCT CCCCCCCCCCCBCCCCCCCCCCCCCCCCC@CC@CCCBCCCCCCBDACDCC>@B@CDBADB@BCBD@B=BBB@BD>C@BBCBACAABAB;D9<4:<66 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.28347129 16 CHROMOSOME_I 999978 42 100M * 0 0 GTTATGTTTAGGCGTAGGCTTAGACATACGCTTAGGTTTCGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCT ##############################################@B?BB@A@ABBBDABD@DDBBB@@B;C@BACBC@CC@CCCCCBCCCCCCCCCCC AS:i:-10 XN:i:0 XM:i:5 XO:i:0 XG:i:0 NM:i:5 MD:Z:23G1T2G2G7A60 YT:Z:UU +SRR065390.17964692 16 CHROMOSOME_I 999984 42 100M * 0 0 TTTGGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAA #####@<@=<53.830;>.?A5@@?ABAAADBDBC<@CB@D@BCB@CBCDCDBBDC=C@C@CAAC@C@ACCCCCCCCCCCCCCCCCCCCC AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:3A96 YT:Z:UU +SRR065390.16701032 0 
CHROMOSOME_I 999987 42 100M * 0 0 AGGCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCCCCACCCCCCCCCCDCBCCCCCCDCCBAA@BBBBBC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24060716 16 CHROMOSOME_I 999989 42 100M * 0 0 GCGTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAA @8>68BD?B??B@DB>ABB?BA@A=ADBCC@?AA@CCBBCBCCDBCDCCBCBCCC@CCCCBCCCCCCCACCCCCCCCACCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24907628 16 CHROMOSOME_I 999989 42 100M * 0 0 GCGTAGGCTGAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAA ################################BDDBB?BB>?>BADABBBDBDBABDBDC;?>9=C?B>CC@CCCCDCCCCCCCCCCCCCCCCCCCCCCC AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:9T90 YT:Z:UU +SRR065390.21366278 16 CHROMOSOME_I 999991 42 100M * 0 0 GTAGGCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAAAG ######?9>A09=@?=>BBDBBBB8B>DBCDCCDCBCBCBDCCC@CCCCCCCBCCCCCCC@@CCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.27662957 0 CHROMOSOME_I 999995 42 100M * 0 0 GCTTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAAAGAGTA CCCCCCCCCCCCCCCCCCC@ACCCCCCCCCCCCCCADCCCBC?CDDDDAC=BA?@B@DBDB>?>>D?#################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.29477959 0 CHROMOSOME_I 999997 42 100M * 0 0 TTAGGCTTAGGCGTAGGTTTAGGCTTTGGCTTAGGCCTATGCTAGGCCTAGTACCATAATACTATTCTTACCGCCGCGCCTGATCAAACCAAAGAGTAGG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=AB?DAB@3=@8@=@?@ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.13030274 16 CHROMOSOME_I 1000208 42 100M * 0 0 TCAATTAAACTGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTC 
955576>0@BBBBBBDBBD?DABDDDDCD@DCDDCCDCDDCACBACCCCCCBCCCCCCCCCCCCCCBCCCCCCCCCCBBCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.18054898 16 CHROMOSOME_I 1000209 42 100M * 0 0 CAATTAAACTGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCC CAC@CAA?BC?D??BCABB8=>@@?#### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.26866653 16 CHROMOSOME_I 1000217 42 100M * 0 0 CTGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACC ###########??????4D;AA?AAD?A>>?CABCBABBBBAA@AD>ADAAC@CCCCBCCBCCC?CCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCBCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.23714265 0 CHROMOSOME_I 1000218 0 78M2I20M * 0 0 TGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGAGATCGGAAGAGCGGTTCAGCAGGAA CCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCADDCCBBBBBDBBBB+=7=0?==>A#################### AS:i:-48 XN:i:0 XM:i:16 XO:i:1 XG:i:2 NM:i:18 MD:Z:75T0T1T0T0G0T2T0T0T3C0T0T0T0T0T1C0 YT:Z:UU +SRR065390.20744360 16 CHROMOSOME_I 1000218 42 100M * 0 0 TGGACTACGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCG #####@ABBBBDBD@BA@DCDBABBBBBDA>@CBBDBBAD=BBDCBACBCCCCCCCBCBCCCCACCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCBCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.3611567 16 CHROMOSOME_I 1000225 42 100M * 0 0 CGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTT #####@<2@=BBBBAC=DBBB@BBACBBBB=C;BBCCBACC@CCACCCCBCCCCCCBCCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.10053218 16 CHROMOSOME_I 1000225 42 100M * 0 0 CGACAATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTT @@=@6AA=AAC?CAC>BB>?A>>CBB@@CBAD>CC;>C@BC>A################################################# AS:i:-8 XN:i:0 XM:i:4 XO:i:0 
XG:i:0 NM:i:4 MD:Z:66A7A14C2A7 YT:Z:UU +SRR065390.21951837 0 CHROMOSOME_I 1000229 42 100M * 0 0 AATTATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCBCDCACCCCCCBCCB>AACCC@1/?@?CCC@@BABCB=?@@+:A?B###### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.21381202 0 CHROMOSOME_I 1000232 40 100M * 0 0 TATTGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGGTTTTTTTAGTTTTTTCTTTTTTCCCAATTTTTTTGGATA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCDCDC?=8@';4@AA############################################# AS:i:-16 XN:i:0 XM:i:8 XO:i:0 XG:i:0 NM:i:8 MD:Z:60A5G8C6A2G7A0G1A3 YT:Z:UU +SRR065390.22184926 16 CHROMOSOME_I 1000235 42 100M * 0 0 TGGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCT ??CAACCBAADD?DBB?@>BBB;BABBBBB@>CCCDBCDBACCCCAACACACCACCC@@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17603173 0 CHROMOSOME_I 1000236 42 100M * 0 0 GGGTTCAAACATTTGAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTG CCCCCCCCCCCCCCCCCCCCCCCCACCCCCCCCDCCCCCCCCB>CAB@ACCC################################################ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17587471 16 CHROMOSOME_I 1000250 42 100M * 0 0 GAAAATTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTT 10?8;;?;AA??:AA@BBBBB?BDDDDDBCDA>@DDDCCCDACCCDDCCDCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.635026 0 CHROMOSOME_I 1000255 42 100M * 0 0 TTTTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGA CCCCCCCCCCCCCCBCCCCCCCBD@CCCCB0:>8:=BBBBC6:=7@>?B?B43/+2>@@/@########## AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.33333470 0 CHROMOSOME_I 1000257 42 100M * 0 0 
TTTTGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTTTGTTT CCCCBCCCCCCC?CCC?CCCCDBCADCCCCCA@@:;CCCC?7.)8;>???-3>>;A?3?6;/2;>?A:24775=4B<@@<4)+75:70(4@>::)9,B>BB?BBD:>BADDD=ABBBDDDBD@DBCCCDCCDBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:0T99 YT:Z:UU +SRR065390.18670433 0 CHROMOSOME_I 1000260 40 100M * 0 0 TGGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTTTGAATATCTGGGGATTTTTCGTTTTTTTTTTTTT CCCCCCCCCCCCC>CCCCBBC4A@ACCC8@;5/8;A?A/6,>==AAC6<@################################################## AS:i:-14 XN:i:0 XM:i:7 XO:i:0 XG:i:0 NM:i:7 MD:Z:65A0G11A4C7C1G0A5 YT:Z:UU +SRR065390.5800524 0 CHROMOSOME_I 1000261 42 100M * 0 0 GGCCGACGTCGGAACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTTCCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTT CCCCCCCCCCCCCCCCCDDC*/,0/??/<<508BAA@@BCBCAC?BAADBCD@@@CBCCBA9CCCACCCCCCCCCDCCCCCC?CCCCCCCCCCCCCCCBCCCCCCCCCCCCCCCCBBBCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1793614 16 CHROMOSOME_I 1000274 42 100M * 0 0 ACGTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTT A:CAADB=DBDD@CBACC>@CACCCCCCCDCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCCBBBCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.20107270 0 CHROMOSOME_I 1000276 42 100M * 0 0 GTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAG CCCCCCCCCCCCCCCCCCCCCCC@@CCCCCCCCCCCCCCCCADDCCCCCDCC?ACACDCCCCC@CCCDCD@BCDCBB3>B@BCCC@@9=3BB?@B@>85; AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.8268806 16 CHROMOSOME_I 1000276 42 100M * 0 0 GTCTCACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAG ##########D?:BBA>;BBABBAABBBBBDDB>DDDDBDCDDCDCDDCCCDCCCDCCCCDCCCCCCCCCCCCCCCCC@BBCCCCCCCCBBBCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.19264263 16 
CHROMOSOME_I 1000280 42 100M * 0 0 CACTTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCC ##BB?>CBABBB?:BBBBABABABB@DBCBBDAABDCCCCCCBCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCCBBBCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.18391831 0 CHROMOSOME_I 1000283 42 100M * 0 0 TTTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCC CCCCCCCCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC?ACCCCCCCBCC@CC8BBCCCCCB@>A>CCCDDC@@@DBBBC?:CCDBAC;CDDDDCBDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCC@BBCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.24029537 16 CHROMOSOME_I 1000284 42 100M * 0 0 TTTTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCA DB>B8BB<9;?>ABDDAADB@DD@C@BBAABBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBBBCCCCCCCCBBBCCCCCCCCCCCCCDCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.28630205 0 CHROMOSOME_I 1000286 42 100M * 0 0 TTCCTGATTTTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCACG CCCCCCCCCCCCC@BCCCCCCCCCCCCCCCCCDBCCCCCDDBBBCBCDCDB@=?BBBBDBBABBBBBB@@CBBDB>>>A>BCBCCB:;:>=<9:@A#### AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:98A1 YT:Z:UU +SRR065390.15799530 0 CHROMOSOME_I 1000295 42 100M * 0 0 TTTGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCAAGCCTAAGCCT CCCCCCCCCCCCCCCCCCCCCCAACCCCCCCCCACCBCBCCCC?B@CCCCB@93=@B5>BB>>3/77:7:B>CDBDDB@>;B>BBBBDACAAB@D@<9<9<7 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.22494349 0 CHROMOSOME_I 1000297 42 100M * 0 0 TGTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCAAGCCTAAGCCTAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACDCCCCCD@@CCDCDCBBDCDDDBADDDDCD>B;@>DAABBB@>5A>BDBB?6??@D?9@####### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.12445253 0 
CHROMOSOME_I 1000298 42 100M * 0 0 GTAGTTTTTCCTTTTTACCGAATTTTTAGGAATATCTGGGAATTTCTCGTTTTCTGATTTTTTTTCAATTGTTTTTAGATCCCCCAAGCCTAAGCATAAC CCCCCCCCCCCCCCCCCCCCACCCCCCCCC@DCCCCCCCC?BACCBC@CBDCCACB?BBBCDC@@;4BCBABDC@B56?B@96=4A>BAB;;5;:@19A;@;;;6?BBBBB3BBB??@@@>@BBB;@AA@9@AA9BABBBAA@@AABAABAB@BB:;??>:?DBAB?BBDDBBABB;ACBDB?BBB@CCCBDD@CD@CCDBCDDDCACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.27194079 0 CHROMOSOME_II 2920 42 100M * 0 0 CTAATTTTCAGAGAGACTGAAAGAGTTTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAA CCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCC=BBBCB?BBBA?BBBDB?>BB=CBCCAACAC;DAB=ACAC?##### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.21775125 16 CHROMOSOME_II 2934 42 100M * 0 0 AACTGAAAGAGTTTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACC #####ABA?=<<=5=@BBA?=@>:A:7.44?B?8B@@>BBB=@B?ADBBBCBBACBD9CBD?A9?=A?.AABADDABBB@BABDDBACBBCCDCBCCDCCCCDCCCCDCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.3790175 16 CHROMOSOME_II 2944 42 100M * 0 0 GTTTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACCGCGGTAAGTG 8BDD@:=7)/>B>ABBB?BB?>?DB@B:BBB?BBADDC@BDCDDCDBCDCCCBADCCCCCCCBCCCCCCCCCCCCCCCCCCCDCCCCDCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.4091455 0 CHROMOSOME_II 2946 42 100M * 0 0 TTAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACCGCGGTAAGTGTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC?CCCCCCCCCCDCCCCBCCCDACBCDCACC@C@CA@CBAAD=BBAADD06@##### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.8676436 0 CHROMOSOME_II 2947 42 100M * 0 0 TAAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCCCCGCGGTCCGTGTGC <:>:>/000/:<<:BAB?>8A?A;:A873;3?>?>A>>A8B############################################# AS:i:-8 XN:i:0 
XM:i:4 XO:i:0 XG:i:0 NM:i:4 MD:Z:84A7A0A5T0 YT:Z:UU +SRR065390.28734084 0 CHROMOSOME_II 2948 42 100M * 0 0 AAAAGTTCTACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACTCCACCGCGGTAAGTGTGTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCBCBCDCBCCCBBDDDCADABADBBABB:BB=D?B<@B@>CA?CA>BACADAA########### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.10526869 0 CHROMOSOME_II 2956 40 100M * 0 0 TACCGACCACATCGAACCTACTCAAGCTAATAGAGTATGGACAATTGTGAACGGAGAGGTTCAATGGAAGACGCCACCGCGGGGAGGGGGGTTGTTTTAT CCCCCCCCCCCCCCCCCCCCADCBBDDDDDDDBBB8BA@B>6<:>9=789=0>D>AA<@<8B>1>A9>;@5=@8C:48;*AAA=<>9>9>>:>>AB?D>BBDBCBDBBCCBABBB>@CDCCBCDCAACCCCCACCCCCCCCBCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.10879394 16 CHROMOSOME_V 938 42 100M * 0 0 TATGTTTTTCTTGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCA B;B:B>@B?>@>7BBDABADADBBCBDCCBACBCCBBB@CCCCCBCCACACCCCCC>CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1520161 0 CHROMOSOME_V 941 42 100M * 0 0 GTTTTTCTTGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATG CCCCCCCBBBCCCCCCCCCCCCCCCCCCCCACCCCCCDCC@CCCCCCCCCCCCCCCCCCACCCBCCAD=D@BC?C?C?C##################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.17468019 16 CHROMOSOME_V 943 42 100M * 0 0 TTTTCTTGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAA >ABBBABBDDDB=DBCD?DDBDBDADDADDBDCCCCCCC=CCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.12403970 0 CHROMOSOME_V 949 42 100M * 0 0 TGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCC@CBBCACBC@?144:>><@@DAB?:=9@<>/>9?;=927= AS:i:0 XN:i:0 XM:i:0 XO:i:0 
XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.16193993 16 CHROMOSOME_V 949 42 100M * 0 0 TGAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTC ;;/67AAC@ADCCDBCDCCCCCCD@CCCCCCCC@CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCDC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.30032741 16 CHROMOSOME_V 950 42 100M * 0 0 GAAAATGTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCA AACBBAB?BB>BABBCDBBDABDBADDDDBDDBBADDDDBACCDCBDDDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.32455256 0 CHROMOSOME_V 956 42 100M * 0 0 GTTATCAACACTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGGTATAATACAGCGACTCAATGAAAAAATCAAAAAAA CCCCACCCCCBB=?ABB?BBA?BAABBBBBB@BBABBBBBBBBBBBBBAA@BBBBBBB>B######################################## AS:i:-4 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:64T25C9 YT:Z:UU +SRR065390.15571530 16 CHROMOSOME_V 966 42 100M * 0 0 CTGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTT B;:B;B?D?@?BBBB5-=<:@@AA@BBA>BBADBBDDDDCDCDCDBBDCCCDCCCCCCCCCDCCCCCDCCCCCCCCCCCACCBBBCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.9595122 0 CHROMOSOME_V 967 42 100M * 0 0 TGATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCC?@:AAA>C@CBB@@>?B=A?BBBBBCB>@/@>=>=>BB# AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.3600239 16 CHROMOSOME_V 969 42 100M * 0 0 ATAATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCA DB>DBBBBA@AAB?DCA@CB@ABB@BB=AAAA>@==>>6/>:>5:688/85A?AAA>>657==BBB<;;;9>>8>>BBBB> AS:i:-6 XN:i:0 XM:i:2 XO:i:0 XG:i:0 NM:i:2 MD:Z:28T25T45 YT:Z:UU +SRR065390.31266674 0 CHROMOSOME_V 971 42 100M * 0 0 
AATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTTACTTTGCACG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCADDCCBC@CBC5<5<7?:83;+471/0<4=8;??BBD(.94;9?@?################ AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:89G10 YT:Z:UU +SRR065390.23187971 16 CHROMOSOME_V 972 42 100M * 0 0 ATCTGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGC 647:0BBB?B==@?@@BDBDBBBDDDBDDBDBDDDCBCCCCBBCCCCCDCCBCCCCCCCCCCCBBBCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.28661392 16 CHROMOSOME_V 975 42 100M * 0 0 TGAAAAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTAT ACCACAA5BDABAA>BDBDBDCBCBA@DBDB>DBBBBBAABDBDBDDBCCCCDCCCCCDCCCCCCCCCCCCCCBBBCCCCCCCCCCDCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.1859967 0 CHROMOSOME_V 979 42 100M * 0 0 AAATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGCAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCA>=>C<@@;:@A@A=53@?AB::?@CCACC=B/<;53;7BB:>B=::=A@?@?ACCC>C@CCCCCB:/&-7735@B7B>B?;@@CC@35A@@CCBC@######################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.12435485 16 CHROMOSOME_V 981 42 100M * 0 0 ATTATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAA B<=BB>B@>>BBBD@>?DABBBBBDDDDDDDDCADCDCCDCCCDCDBCCCCCDCCCCCCCCCCCCCCBBBCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.7485987 0 CHROMOSOME_V 983 42 100M * 0 0 TATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGGTAAAA C@?C@CCCCCCCCCCCC@CCCCC@?C8CCC@BC?@CC############################################################### AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:94T5 YT:Z:UU +SRR065390.17264189 0 CHROMOSOME_V 983 42 100M * 0 0 
TATAATTTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACCCTATGGTTAAAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=C=/////=?5=;:@8???AA############################ AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:87G12 YT:Z:UU +SRR065390.6356855 0 CHROMOSOME_V 986 42 100M * 0 0 AATTTAAAACTTAAACGAAGCTAAAATGTGGCTGGTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAAAA CCBCCBCCCCCCCCCC@CC?@CCCCC@CCC>A=@.88/45+()/.=>2==BBCB659?9?'))10;9??############################### AS:i:-7 XN:i:0 XM:i:3 XO:i:0 XG:i:0 NM:i:3 MD:Z:34T63T0G0 YT:Z:UU +SRR065390.20107175 0 CHROMOSOME_V 989 42 100M * 0 0 TTAAAACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAAT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC@C@?./..):3872A=@=A<=:;=B>B>>87777@>&@9A@@@8:@>88 AS:i:-2 XN:i:0 XM:i:1 XO:i:0 XG:i:0 NM:i:1 MD:Z:87T12 YT:Z:UU +SRR065390.6431660 16 CHROMOSOME_V 994 42 100M * 0 0 ACTTAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATT AACBABABDC@@ADABBDDCDCDBCDDDCCDCDBCACCCBCCDCCCCCCCCBCC@@@CCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.4439503 16 CHROMOSOME_V 997 42 100M * 0 0 TAAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTT ###########B>:AAAAA@C@=;937<ACCC8@@@AABCC>@+/662BBBC?B>BBB?BBBB#################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.11492188 0 CHROMOSOME_V 998 42 100M * 0 0 AAACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTTG CCCCCCCCCCCCCCCCDCCCCC>A@AAAAAACA??B@@BBD>BACACC08;;AAACB==/*/1//:=@99BBABA@;<@;<:9>>B??>B??:?6B??B9 AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.9605367 0 CHROMOSOME_V 999 42 100M * 0 0 AACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTTGG 
CCCCCCCCCCCCCCCDCCCCC@C<>>A9<4=9>=B###################### AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.29302896 0 CHROMOSOME_V 1000 42 100M * 0 0 ACGAAGCTAAAATGTGGCTGTTATAATACAGCGACTCAATGAAAAACTCAAAAAAAAGTTGACTTTGCACGCTATGGTTAAAAATGAATGAATTCTTGGT CCCCCCCCCCCCCCDCCCCC6?:??AABCCCC8?C@BCCCC@@5;><9>>>B>>AB=<)6=4:):9>>@@################ AS:i:0 XN:i:0 XM:i:0 XO:i:0 XG:i:0 NM:i:0 MD:Z:100 YT:Z:UU +SRR065390.13754 4 * 0 0 * * 0 0 TCGCTGCTGTGATGTTGCGTTTTTATCAGCACAAAGGCGGTCAGGCCGAGGCCTATTTTTTCCGGATCCAGCAGGGCGACTTTGCCGATAAGGATACCGT CCCCCCCCCCCCCCCCCCCCCCCCCBBCCCCCCC@CCCCCDCCCCCCCDCCCC@ACCCC@>>CCD?>>>@@@ YT:Z:UU +SRR065390.13765 4 * 0 0 * * 0 0 CGTGGTCGTGCCGGTTACAAGCCTGCCGTGAAAAGCCGTTTCAGTAAGTCAGCCAATAGCAAATTCTCCCATACTATCGCTTTTGCCTGATCCTGAACTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCDCCACCCCCCCC@CCCCCCCCCACCCBB@?CBDABDDADB<=ABBB@B@BB@ YT:Z:UU +SRR065390.13778 4 * 0 0 * * 0 0 TTTTATACCAACAAAAAACGGAAAGCAGATAACCCAGCAGCCCGAGTAACAGTATCCGGGCATCCAGGCCAAAAGCTAACAGAGCCGCGATAAAATCCCA CCCCCCCCCCCCCCBBBBBCCCCCCCBCCCCCCCCCBCCCCCCCCCDCCCCCACCCCCCCCCDCCCCDCBCA@AC>@=@CC?B>CBBCC>=?8A8=?>66 YT:Z:UU +SRR065390.13779 4 * 0 0 * * 0 0 ATAATGGACAACTTTAATGGCAATCACTAAATCAACTCCGGCACCATTAACCGGTGGGACGTTATGGTGCGTCACTATTGCATTGTCATTAGCGACATTT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCC=CC?BCCBCCACBABCCDCDBDADBBCDCBD>DBBD==BAA:>5<> YT:Z:UU +SRR065390.13802 4 * 0 0 * * 0 0 AAGGCGTTTATTATATACACTCGCATGGCTTTTCTTCTGAAAATGTAGAATAATTGAGTAATTTTTAAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAG ??B:??????BBB>B99;;;>B>>>:BBBBB;;B=70///0-/01/BA>BABB>B>BBB@BB>>ABABA>BA>6BB88183,<8;<4>:@><>A>> YT:Z:UU +SRR065390.13808 4 * 0 0 * * 0 0 GTTTGCTGACTGGCCAGCCAGCTCAAGGCATCAAAAGCGTCTTTGAGAGGGAAAGGAATATCGATAACCCGAATGCCGGGTGGCGCCGGAATCTCTTGCG CCCCBCCCBCCC@CCC@CCCAA:A=BB?BBBB@>B>A#################### YT:Z:UU +SRR065390.13853 4 * 0 0 * * 0 0 CTGGTACGTCACCACACGCCGCGATGGCGTCATCCACCGACTTCACCCACGTTACGCGATCGTCCGTACCCGGGTGACCGTTGGGGATAATATTTTTGCG 
#################################################################################################### YT:Z:UU +SRR065390.13861 4 * 0 0 * * 0 0 TTCAGAAACTGGATGAACAGTGCGCAGCCATCTGCAAATATGAATTAGTTCAAGTCACTCAAAAGCTATTTATTTGAATGGAAGAAATTTTTGAACTATA CCCCCCCCCCCBCCB@@CCCBCCBCCCCCCCCCCCCC?CCCC@CCC@C@CCC@CACCBCC?BBBC@C7CBCBCB@@ABCCBBBC=BABCCBBBBAB@@CA YT:Z:UU +SRR065390.13907 4 * 0 0 * * 0 0 CATTACCATTCAGTTGTATTGTTTGCGCACCAGAAAAATGAGACTGCACAGAATAAATTATACTGACCAGAAATTGTAAAATTCGTATATTCTTATTCAT 8998;9:;9;>9:9>?BABBAAA2A@@@@@>:3'3A################################################################ YT:Z:UU +SRR065390.13946 4 * 0 0 * * 0 0 TTTCCTCGAGTTCTTGATGAAATGGTCCATTATTTGTCAACCATTTATTTTTCCATATTTTTTCCAGGTAAGGCATGAATTCTGCAAGTTCCGGCAAAGA CCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCBBBBCACCCCCCB@BCCCCBC?CCCCACCBCBCCCCC@BBDCDDBCA4@@A YT:Z:UU +SRR065390.13956 4 * 0 0 * * 0 0 CGGCGCAACAATACTCAGCAGTTAATTGCAAAGGTATCGCACACCATTAAAAGCATTAAGCCGGGAGTCGAATTTGGTGTTAGCCCGGCAGGCGTGTGGC CCCCCCCCCCCBCCCCCCCCCDCCCCCCCCCCCC>CCCCCCCCCCCDCCCCBDCCDDCBDCC@?@BA@B@B>BBABAABB6?BB>B@?B??2?=+>->60 YT:Z:UU +SRR065390.13964 4 * 0 0 * * 0 0 NTTGAGGTGCTCCAGTGGCTTCTGTTTCTATCAGCTGTCCCTCCTGTTCAGCTACTGACGGGGGGGTGCGCAACGGCAAAAGCACCGCCGGGCATCAGCG !))))++++*AAAAA8AAAA################################################################################ YT:Z:UU +SRR065390.13969 4 * 0 0 * * 0 0 CGGGCGATAGTCAAAAACTTATTTTCACAATTTTCGGCTAGGGAGTATATTTACAGTTAATTTGCGATGTGTTAGATCGGAAGAGCGGTTCAGCAGGAAT CCCCCCCCCCCCCCCCCCCCCCCCBBCCCCCCCCCCCCCCCCCCCACCCCCCCCCCDCCCACCCCCDCCACBCCCCCCBCBBCDCBCC?BCBBCBCBC;A YT:Z:UU +SRR065390.13978 4 * 0 0 * * 0 0 AGACGGTAACTTTCAATTTGCACCCATGATTAAATTTTATGTTGATTAAAATAGAAGCAAAAATCATTACATTACACTACAAAATACGCCGAAATGTTAA CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBACCCDCBCCCABCADCCAABC? 
YT:Z:UU +SRR065390.13985 4 * 0 0 * * 0 0 TAACCAAAAACTGGATTATGCAAATAACTAAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATATCGTATGCCGTCTTCTGCTTGAAAAAAAAA CCCCCCCCCCCCCC?CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCBCCDCDCCD@CBD5>@=:=><9A@3>=B?BB>CBACB?BBA YT:Z:UU +SRR065390.14000 4 * 0 0 * * 0 0 TAGGTGAGAAAAGCGTTATTGGTCCGGTATACCTGCGAAGCGACAAAGCAATAAGGCAACAATGGCAGGTAATGCTGCTCAAAAAAGCGTTTACTGATCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCACCCCCCBADCCCB?@B>B@BADAAABBD@C5;B9?:?;ACABAB YT:Z:UU +SRR065390.14032 4 * 0 0 * * 0 0 GAAGGTCCAAGTGCCTTGAAGATAGAAAATTATAGCATTTCTCTTTAATTTCAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATATCGCATGA CCCACCCCCC>AAAACBCCCCCCCBCC?CCCCCCCCCCCCCCCCCCCCCCCB@CCCCCCBAC@CDCA/@B<;8=?@B>BC>?>?BB=:A########### YT:Z:UU +SRR065390.14061 4 * 0 0 * * 0 0 TGAAGCCGACAATTTGAGGCCAAACATCTTACATTCGACAGTAAATATTTGGGGATTAAGACTTATGTTAGATCGGAAGAGCGGTTCAGCAGGAATGCCG CCCCCCCCCCCCCCCCCCCCCCCCCCC=CCCCCCCCCCADCBCCCC=CCCCCCCBCCC=CBCCCCCCCABCCCCCCBACBC@CCBB;@B;?A@A@=?99A YT:Z:UU +SRR065390.14072 4 * 0 0 * * 0 0 TGAGTGAGGCTCAGGATTTTGAGTGAGGCTCAGGATTATGAGTGAGGTTGAAGAATTTGAGAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGAG @B=@@BB@B@<@BB>BB>>@BB@==2;:;8BBBBBB@B@@@:@?1B@B@B@3@@@>3;@;@<@?>;@B@@##### YT:Z:UU +SRR065390.14100 4 * 0 0 * * 0 0 AAGCCTGAGGGATAATTTTCGTCAAATTAAGGCAATTGCCGAGTGTTTCATCCCTGGCAAGCAGAACGGCTTTTTCGTTATTTATATCGGGAGAATTTAT CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCBCDCDCDCCCCDBBCDADBBBABDDBBBBBBBBDCBCD?BABB>B>AB>BCABAA>C YT:Z:UU +SRR065390.14105 4 * 0 0 * * 0 0 AAATTGTCCCCAAATAAAACAATTCCAGTGATCTTCCGATTCTAGGTGCCAAATAACCCAAATAGTCACTGCATTAGTTTTTATCTCACTTTTCTCCCCC #################################################################################################### YT:Z:UU +SRR065390.14107 4 * 0 0 * * 0 0 TGAAATTTCAAGAAAAATGTTAATTACCACCGTATTAAAAAAAAAAAACTTAAAATCAAAGATCGGAAAAGGGGTCAGGCAGGAATGCCAAAACCGACAC CCCCBCCCCCCCBCB>>>ACCCBCACCC?CCCCCCCCCCC?B########################################################## YT:Z:UU +SRR065390.14137 4 * 0 0 * * 0 0 
CTGTGGCGTTTTTATCAAATTGGCAGAGCCACGTTCAGAGCTGAAAAAGCCACAGAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTTGGA BCCC@CCCCC?CCC>>CCBCCC?>C@CCCB=6?AA>=>3?>@?@86;86.@A@==378::68829>B9B############################### YT:Z:UU +SRR065390.14141 4 * 0 0 * * 0 0 GGTCACCAATCATAAGAGGAACAGCGACTGCACCTGCGTACATGACAAGGACGTGTTGCAGACCGAGTATGATCAGCTTTCCTGGTGATAGTATGCGCTC AAA@A?AA8:>A######################################################################################## YT:Z:UU +SRR065390.14162 4 * 0 0 * * 0 0 ATACTTCACCGGATGGTGGAATTAACGAAAACAACAACTGGTGTCACATCCCGCAGGCAAAAGAGGCAGCGGCTAACTAAGCGGCCTGCTGACTTTCTCG CCCCCCCCCCCCCCCC@CCCCCCCCCCCCCCCCBBCCCCCCDCDCCCCCCCCCC?C?CCCCCACD@CAD@AB<>@CB;6B#################### YT:Z:UU +SRR065390.14168 4 * 0 0 * * 0 0 TCGAGGGTGAGGGCGTCTGCCAGATCGGAAGAGCGGGTCAGCAGGAATGCCGAGACCGATATCGGATGCCGTCTCCTGCTGGACAAAAAATGAGAATGGG AACC@0@>@6:<>??>?BBBBB?+B6BBB>B?B=:?BBB=BBBBB>B######################################## YT:Z:UU +SRR065390.14173 4 * 0 0 * * 0 0 AAGAAACTCAACAAACCGGACTTGCAGGTGAAACTGATTCCGATTACCTCACAAAACCGTATTCCACTGCTGCAAAACGGCACTTTCGATTTTGAATGTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC=@CCCCCCCCAA?ABC@CCA=CCCABDCCAABDD?BB@BBA YT:Z:UU +SRR065390.14178 4 * 0 0 * * 0 0 GCGCTTTGTTTACCTGATACGGAATTTCGTGGACGATAATGGTTTCACGACCGGTTTTGGCGTCAACTTCCACTTCTGCGCGAGCGCGGATATACACCTT BCCCCCCCCC@@CCCCCCCCCCCCCCCCCCA=AA>AAA=ACCBCCCCC?CCCCAB@ACCC?A?<CB?=CAB9B@BA################### YT:Z:UU +SRR065390.14182 4 * 0 0 * * 0 0 ATTTACTCTAATGTTCTGAAAAATAATTTACTCTAATGTTCTGCCAAATAATTTACTCTAACGTTCTGCCAAATAATTTACTCTAATGTTCTGCCAAATA CCCCCCCCCBCCCCCCCBCC@CCB@@@BCCCCCCC@CCCCCCBBCCCCCCBCCCC@CCC?CCC>>CCBCCCCA@CCCC;CBCCBDCCB@CCBCAACB@BB;B?B0B=8??9>??BB>B?@?B>A>A########## YT:Z:UU +SRR065390.14197 4 * 0 0 * * 0 0 GTACCTCGCCGTTGTTCTCGACCTGTTCGCAAGAAAACCAGTGGGCTGGGCCATGTCGTTCTCGCCGGACAGCAGGCTCACCATGCAAGCGCTGGAAATG CCCCCCCCCCCCCDCBB=B@?BB@BBBBBB@@@B@B==BBB9B@@@@B@=BBB@BBB=@BBABBB@@@BB<@BA@BBB=B;B?BBACA YT:Z:UU +SRR065390.14284 4 * 0 0 * * 0 0 
CGGTGCATGATGCGGATTCCAGGAATCAACGTACAGCGTCGGGCTAAACCAGAACCAGCCAATAATGCACAGACCGACGACCGGAATAATAACCCCCCAC BCACCCCCCCCBCCBCCCCCCCBCC@CCCCCCCAC@CBACCACCCCC@CBCCCCCCCA8CC?A@9@AB@9CACC8=81B@CC9CCCCCCC,<8??CBC@BB?@C@ACBCB################################################## YT:Z:UU +SRR065390.14312 4 * 0 0 * * 0 0 ACAGTAACATTCAACGTTAAATATGTTAATAAGACGTTGCATTATTGTCCTGAAGTTGAAGATAGCAGGTATGGCGGTTGGATAGCACGGCGTTGGTTTA CCCCCCCCCCCCCCCCCCCCBCCCCCCBCCCCCCCCCCCCCCCCCCCBCCCCCDCCCCCCCCCACCBC@?CACBC######################### YT:Z:UU +SRR065390.14331 4 * 0 0 * * 0 0 GAATAATGAAGATGATGCGACGCGTCTGGCGCGTTTGAACGAACGCTTTAAACGCGAAGGTAAACCGGAGTTGAAGAAGCTGGATGATCTACCTAAAGAT CCCCCCCCCCCCCCBCCBCCCCCCBCCCC@CCCB@CCCCACCDDCBCC?CAC@B@DABA?BAB@@@?C?C@BC?9A::>=@@C;?############### YT:Z:UU +SRR065390.14335 4 * 0 0 * * 0 0 TCCATTTGATGAACCTGAAGTTTAAGTATTGACTTGAGAGGAAAAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCT CCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCBCCC@BBA>CCCCCCCCBBCCBC=CBCCCB################### YT:Z:UU +SRR065390.14342 4 * 0 0 * * 0 0 AAGTTCATGAATTAAAGCCGACTCAAACACTCTGTTTAAAAACTGGATAGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGAGATCGTAGGCCGTC 0000079;9;AAAA?;;;>9>3>9BB8BBBB@############################ YT:Z:UU +SRR065390.14359 4 * 0 0 * * 0 0 GCATCAGTACGATAAAACGCGTACCGAACTACTGAATGATGTCGCAGGGGCGCTGGCTCTTGATGACAAACTCGGACGTAGCACCAATCAACTTTCCGGC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCBABACCBB@BBBBBB>BDBBDBBB>B@@@>>?BCBAC?CBC?> YT:Z:UU +SRR065390.14364 4 * 0 0 * * 0 0 GGTCGCCGATCCGATTTGCACTTTAACCACTTTCGGTAAAGAAACCGTTGTTAGTGAAAGCGAAAAACGCACAACGACCACTGATGACCCGCTACAGGTG CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC@@CCCCCCCCCBCCCCCCCCCACCDACCBCACA@CACCAA=B=CBC=ACAAAC=)>? 
YT:Z:UU +SRR065390.14392 4 * 0 0 * * 0 0 GTTATCCTTTTCCGTGATATGTGCGGTACTGCAGCGTATGCCGGCAAGGGTTGCAAACGGTGGTAGTGTGCAGGTTGACTGTTGGTCGGATTCCTCCACC CCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCCCCDCCCCCCCCCBCCCA@CCB@AACB?CB?BA=A8@BACB?:===@C@-A6==?@@<@@AA##### YT:Z:UU +SRR065390.14393 4 * 0 0 * * 0 0 AGAAATTTACTGGCTCGCCGCAGCCAACTCCTCTTCTGACACCCCGGTAAAGCGCATGATGTCTGTAAGAGGGGCCCCGGATTCAAGCATTATTTTGGCT CCCCCCCCCCCCCCCBA9::<4A>AAAA:?A#################################### YT:Z:UU +SRR065390.14434 4 * 0 0 * * 0 0 GGTAGATTCCCATAAAAATCGCCAGCGGAATGGTGAACGCAACGGTATACGTTCCCCACGGGCTATGAGTCAGGGCTTTCACCACGATCATCGCCAGTAC DCACCCBCCCCCCCC>CBBCCCCCCCCCCCCCC?CCCCCCCCCCCACACCC@BCCCCBCD=ABB@BCBD?@@B6BC8B@B>BABCBB@AB=@2C###### YT:Z:UU diff --git a/test/test.pl b/test/test.pl index 2c4a1b1f4..1fb6112bd 100755 --- a/test/test.pl +++ b/test/test.pl @@ -762,6 +762,14 @@ sub test_index unlink("$$opts{tmp}/index.sam.gz.bai"); test_compare($opts,"$$opts{path}/test_index -b $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.bai", "$$opts{path}/index.sam.gz.bai"); + # SAM DOS LINE ENDINGS (\r\n) + test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -z -m 14 -x $$opts{tmp}/index.sam.gz.csi $$opts{path}/index_dos.sam > $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.csi", "$$opts{path}/index.sam.gz.csi", gz=>1); + unlink("$$opts{tmp}/index.bam.bai"); + test_compare($opts,"$$opts{path}/test_index -c $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.csi", "$$opts{path}/index.sam.gz.csi", gz=>1); + test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -z -m 0 -x $$opts{tmp}/index.sam.gz.bai $$opts{path}/index_dos.sam > $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.bai", "$$opts{path}/index.sam.gz.bai"); + unlink("$$opts{tmp}/index.sam.gz.bai"); + test_compare($opts,"$$opts{path}/test_index -b $$opts{tmp}/index.sam.gz", "$$opts{tmp}/index.sam.gz.bai", "$$opts{path}/index.sam.gz.bai"); + # CRAM local $ENV{REF_PATH} = $$opts{m5_dir}; test_compare($opts,"$$opts{path}/test_view $nthreads -l 0 -C 
-x $$opts{tmp}/index.cram.crai $$opts{path}/index.sam > $$opts{tmp}/index.cram", "$$opts{tmp}/index.cram.crai", "$$opts{path}/index.cram.crai", gz=>1); From 49059d3cbb1ac145fadbafed18bce2a8eb5dc0d9 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Wed, 7 Oct 2020 14:53:44 +0100 Subject: [PATCH 014/488] Adjust the min offset of a bin when unmapped placed reads detected. --- hts.c | 66 +++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 58 insertions(+), 8 deletions(-) diff --git a/hts.c b/hts.c index afa71987f..278ecd36a 100644 --- a/hts.c +++ b/hts.c @@ -2552,6 +2552,7 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t bidx_t *bidx; uint64_t min_off, max_off; hts_itr_t *iter; + uint32_t unmapped = 0, rel_off; // It's possible to call this function with NULL idx iff // tid is one of the special values HTS_IDX_REST or HTS_IDX_NONE @@ -2585,13 +2586,20 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t return NULL; } + k = kh_get(bin, bidx, META_BIN(idx)); + if (k != kh_end(bidx)) + unmapped = kh_val(bidx, k).list[1].v; + else + unmapped = 1; + iter->tid = tid, iter->beg = beg, iter->end = end; iter->i = -1; iter->readrec = readrec; if ( !kh_size(bidx) ) { iter->finished = 1; return iter; } + rel_off = beg>>idx->min_shift; // compute min_off - bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift); + bin = hts_bin_first(idx->n_lvls) + rel_off; do { int first; k = kh_get(bin, bidx, bin); @@ -2602,10 +2610,28 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t } while (bin); if (bin == 0) k = kh_get(bin, bidx, bin); min_off = k != kh_end(bidx)? 
kh_val(bidx, k).loff : 0; + // min_off can be calculated more accurately if the + // linear index is available if (idx->lidx[tid].offset - && beg>>idx->min_shift < idx->lidx[tid].n - && min_off < idx->lidx[tid].offset[beg>>idx->min_shift]) - min_off = idx->lidx[tid].offset[beg>>idx->min_shift]; + && rel_off < idx->lidx[tid].n) { + if (min_off < idx->lidx[tid].offset[rel_off]) + min_off = idx->lidx[tid].offset[rel_off]; + if (unmapped) { + int tmp_off; + for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) { + if (idx->lidx[tid].offset[tmp_off] < min_off) { + min_off = idx->lidx[tid].offset[tmp_off]; + break; + } + } + + if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).list[0].u || tmp_off < 0)) + min_off = kh_val(bidx, k).list[0].u; + } + } else if (unmapped) { //CSI index + if (k != kh_end(bidx)) + min_off = kh_val(bidx, k).list[0].u; + } // compute max_off: a virtual offset from a bin to the right of end bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; @@ -2688,6 +2714,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) int tid; hts_pos_t beg, end; hts_reglist_t *curr_reg; + uint32_t unmapped = 0, rel_off; if (!idx || !iter || !iter->multi) return -1; @@ -2720,6 +2747,12 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL || !kh_size(bidx)) continue; + k = kh_get(bin, bidx, META_BIN(idx)); + if (k != kh_end(bidx)) + unmapped = kh_val(bidx, k).list[1].v; + else + unmapped = 1; + for(j=0; jcount; j++) { hts_pair32_t *curr_intv = &curr_reg->intervals[j]; if (curr_intv->end < curr_intv->beg) @@ -2727,12 +2760,13 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) beg = curr_intv->beg; end = curr_intv->end; + rel_off = beg>>idx->min_shift; /* Compute 'min_off' by searching the lowest level bin containing 'beg'. If the computed bin is not in the index, try the next bin to the left, belonging to the same parent. 
If it is the first sibling bin, try the parent bin. */ - bin = hts_bin_first(idx->n_lvls) + (beg>>idx->min_shift); + bin = hts_bin_first(idx->n_lvls) + rel_off; do { int first; k = kh_get(bin, bidx, bin); @@ -2747,9 +2781,25 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) // min_off can be calculated more accurately if the // linear index is available if (idx->lidx[tid].offset - && beg>>idx->min_shift < idx->lidx[tid].n - && min_off < idx->lidx[tid].offset[beg>>idx->min_shift]) - min_off = idx->lidx[tid].offset[beg>>idx->min_shift]; + && rel_off < idx->lidx[tid].n) { + if (min_off < idx->lidx[tid].offset[rel_off]) + min_off = idx->lidx[tid].offset[rel_off]; + if (unmapped) { + int tmp_off; + for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) { + if (idx->lidx[tid].offset[tmp_off] < min_off) { + min_off = idx->lidx[tid].offset[tmp_off]; + break; + } + } + + if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).list[0].u || tmp_off < 0)) + min_off = kh_val(bidx, k).list[0].u; + } + } else if (unmapped) { //CSI index + if (k != kh_end(bidx)) + min_off = kh_val(bidx, k).list[0].u; + } // compute max_off: a virtual offset from a bin to the right of end bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; From c9175183c42382f1030503e88ca7e60cb9c08536 Mon Sep 17 00:00:00 2001 From: Anders Kaplan Date: Fri, 23 Oct 2020 15:43:26 +0200 Subject: [PATCH 015/488] Add bam_set1() function and unit tests (PR #1159) --- htslib/sam.h | 31 +++++ sam.c | 122 +++++++++++++++++++ test/sam.c | 322 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 475 insertions(+) diff --git a/htslib/sam.h b/htslib/sam.h index 9e595ae12..8fcf819bf 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1003,6 +1003,37 @@ bam1_t *bam_copy1(bam1_t *bdst, const bam1_t *bsrc) HTS_RESULT_USED; HTSLIB_EXPORT bam1_t *bam_dup1(const bam1_t *bsrc); +/// Sets all components of an alignment structure +/** + @param bam Target alignment structure. 
Must be initialized by a call to bam_init1(). + The data field will be reallocated automatically as needed. + @param l_qname Length of the query name. If set to 0, the placeholder query name "*" will be used. + @param qname Query name, may be NULL if l_qname = 0 + @param flag Bitwise flag, a combination of the BAM_F* constants. + @param tid Chromosome ID, defined by sam_hdr_t (a.k.a. RNAME). + @param pos 0-based leftmost coordinate. + @param mapq Mapping quality. + @param n_cigar Number of CIGAR operations. + @param cigar CIGAR data, may be NULL if n_cigar = 0. + @param mtid Chromosome ID of next read in template, defined by sam_hdr_t (a.k.a. RNEXT). + @param mpos 0-based leftmost coordinate of next read in template (a.k.a. PNEXT). + @param isize Observed template length ("insert size") (a.k.a. TLEN). + @param l_seq Length of the query sequence (read) and sequence quality string. + @param seq Sequence, may be NULL if l_seq = 0. + @param qual Sequence quality, may be NULL. + @param l_aux Length to be reserved for auxiliary field data, may be 0. + + @return >= 0 on success (number of bytes written to bam->data), negative (with errno set) on failure. 
+*/ +HTSLIB_EXPORT +int bam_set1(bam1_t *bam, + size_t l_qname, const char *qname, + uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq, + size_t n_cigar, const uint32_t *cigar, + int32_t mtid, hts_pos_t mpos, hts_pos_t isize, + size_t l_seq, const char *seq, const char *qual, + size_t l_aux); + /// Calculate query length from CIGAR data /** @param n_cigar Number of items in @p cigar diff --git a/sam.c b/sam.c index 321787089..49b1a0f3f 100644 --- a/sam.c +++ b/sam.c @@ -485,6 +485,128 @@ static void bam_cigar2rqlens(int n_cigar, const uint32_t *cigar, } } +static int subtract_check_underflow(size_t length, size_t *limit) +{ + if (length <= *limit) { + *limit -= length; + return 0; + } + + return -1; +} + +int bam_set1(bam1_t *bam, + size_t l_qname, const char *qname, + uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq, + size_t n_cigar, const uint32_t *cigar, + int32_t mtid, hts_pos_t mpos, hts_pos_t isize, + size_t l_seq, const char *seq, const char *qual, + size_t l_aux) +{ + // use a default qname "*" if none is provided + if (l_qname == 0) { + l_qname = 1; + qname = "*"; + } + + // note: the qname is stored nul terminated and padded as described in the + // documentation for the bam1_t struct. + size_t qname_nuls = 4 - l_qname % 4; + + // the alignment length, needed for bam_reg2bin(), is calculated as in bam_endpos(). + // can't use bam_endpos() directly as some fields not yet set up. 
+ hts_pos_t rlen = 0, qlen = 0; + if (!(flag & BAM_FUNMAP)) { + bam_cigar2rqlens((int)n_cigar, cigar, &rlen, &qlen); + } + if (rlen == 0) { + rlen = 1; + } + + // validate parameters + if (l_qname > 254) { + hts_log_error("Query name too long"); + errno = EINVAL; + return -1; + } + if (HTS_POS_MAX - rlen <= pos) { + hts_log_error("Read ends beyond highest supported position"); + errno = EINVAL; + return -1; + } + if (!(flag & BAM_FUNMAP) && l_seq > 0 && n_cigar == 0) { + hts_log_error("Mapped query must have a CIGAR"); + errno = EINVAL; + return -1; + } + if (!(flag & BAM_FUNMAP) && l_seq > 0 && l_seq != qlen) { + hts_log_error("CIGAR and query sequence are of different length"); + errno = EINVAL; + return -1; + } + + size_t limit = INT32_MAX; + int u = subtract_check_underflow(l_qname + qname_nuls, &limit); + u += subtract_check_underflow(n_cigar * 4, &limit); + u += subtract_check_underflow((l_seq + 1) / 2, &limit); + u += subtract_check_underflow(l_seq, &limit); + u += subtract_check_underflow(l_aux, &limit); + if (u != 0) { + hts_log_error("Size overflow"); + errno = EINVAL; + return -1; + } + + // re-allocate the data buffer as needed. 
+ size_t data_len = l_qname + qname_nuls + n_cigar * 4 + (l_seq + 1) / 2 + l_seq; + if (realloc_bam_data(bam, data_len + l_aux) < 0) { + return -1; + } + + bam->l_data = (int)data_len; + bam->core.pos = pos; + bam->core.tid = tid; + bam->core.bin = bam_reg2bin(pos, pos + rlen); + bam->core.qual = mapq; + bam->core.l_extranul = (uint8_t)(qname_nuls - 1); + bam->core.flag = flag; + bam->core.l_qname = (uint16_t)(l_qname + qname_nuls); + bam->core.n_cigar = (uint32_t)n_cigar; + bam->core.l_qseq = (int32_t)l_seq; + bam->core.mtid = mtid; + bam->core.mpos = mpos; + bam->core.isize = isize; + + uint8_t *cp = bam->data; + strncpy((char *)cp, qname, l_qname); + int i; + for (i = 0; i < qname_nuls; i++) { + cp[l_qname + i] = '\0'; + } + cp += l_qname + qname_nuls; + + if (n_cigar > 0) { + memcpy(cp, cigar, n_cigar * 4); + } + cp += n_cigar * 4; + + for (i = 0; i + 1 < l_seq; i += 2) { + *cp++ = (seq_nt16_table[(unsigned char)seq[i]] << 4) | seq_nt16_table[(unsigned char)seq[i + 1]]; + } + for (; i < l_seq; i++) { + *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4; + } + + if (qual) { + memcpy(cp, qual, l_seq); + } + else { + memset(cp, '\xff', l_seq); + } + + return (int)data_len; +} + hts_pos_t bam_cigar2qlen(int n_cigar, const uint32_t *cigar) { int k; diff --git a/test/sam.c b/test/sam.c index 1c65c4556..8a6001840 100644 --- a/test/sam.c +++ b/test/sam.c @@ -69,6 +69,11 @@ static void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) fail(const char *fmt, ...) 
status = EXIT_FAILURE; } +#define VERIFY(test, message) if (!(test)) { \ + fail("%s: %s", __func__, (message)); \ + goto cleanup; \ +} + uint8_t *check_bam_aux_get(const bam1_t *aln, const char *tag, char type) { uint8_t *p = bam_aux_get(aln, tag); @@ -1837,6 +1842,312 @@ static void test_mempolicy(void) } } +static void test_bam_set1_minimal() +{ + int r; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, 0, NULL, -1, 0, 0, 0, NULL, NULL, 0); + // expected number of bytes written is qname: 4, cigar: 0, sequence: 0, qual: 0, aux: 0. + VERIFY(r == 4, "call to bam_set1() failed or did not write the correct number of bytes."); + + VERIFY(bam->core.l_qname == 4, "l_qname should include terminating null and be padded to the nearest 32-bit boundary."); + VERIFY(bam->core.l_extranul == 2, "l_extranul not set correctly"); + VERIFY(strcmp(bam_get_qname(bam), "*") == 0, "qname not set correctly."); + + VERIFY(bam->core.pos == 0, "pos not set correctly."); + VERIFY(bam->core.tid == -1, "tid not set correctly."); + VERIFY(bam->core.bin == hts_reg2bin(0, 1, 14, 5), "bin not set correctly."); + VERIFY(bam->core.qual == 0xff, "mapq not set correctly."); + VERIFY(bam->core.flag == BAM_FUNMAP, "flag not set correctly."); + VERIFY(bam->core.n_cigar == 0, "n_cigar not set correctly."); + VERIFY(bam->core.mtid == -1, "mtid not set correctly."); + VERIFY(bam->core.mpos == 0, "mpos not set correctly."); + VERIFY(bam->core.isize == 0, "isize not set correctly."); + VERIFY(bam->core.l_qseq == 0, "l_seq not set correctly."); + VERIFY(bam_get_l_aux(bam) == 0, "l_aux not set correctly."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_full() +{ + const char *qname = "!??AAA~~~~"; + const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH }; + const char *seq = "TGGACTACGA"; + const 
char *qual = "DBBBB+=7=0"; + + int r, i; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + r = bam_set1(bam, strlen(qname), qname, + BAM_FREVERSE, 1, 1000, 42, + sizeof(cigar) / 4, cigar, 2, 2000, 3000, + strlen(seq), seq, qual, 64); + // expected number of bytes written is qname: 12, cigar: 12, sequence: 5, qual: 10, aux: 0. + VERIFY(r == 39, "call to bam_set1() failed or did not write the correct number of bytes."); + + VERIFY(bam->core.l_qname == 12, "l_qname should include terminating null and be padded to the nearest 32-bit boundary."); + VERIFY(bam->core.l_extranul == 1, "l_extranul not set correctly"); + VERIFY(strcmp(bam_get_qname(bam), qname) == 0, "qname not set correctly."); + + VERIFY(bam->core.n_cigar == sizeof(cigar) / 4, "n_cigar not set correctly."); + VERIFY(memcmp(bam_get_cigar(bam), cigar, sizeof(cigar)) == 0, "cigar not set correctly."); + + VERIFY(bam->core.l_qseq == strlen(seq), "l_seq not set correctly."); + for (i = 0; i < strlen(seq); i++) { + VERIFY(bam_seqi(bam_get_seq(bam), i) == seq_nt16_table[(uint8_t)seq[i]], "seq not set correctly."); + } + VERIFY(memcmp(bam_get_qual(bam), qual, strlen(seq)) == 0, "qual not set correctly."); + + VERIFY(bam->core.pos == 1000, "pos not set correctly."); + VERIFY(bam->core.tid == 1, "tid not set correctly."); + VERIFY(bam->core.bin == hts_reg2bin(1000, 1010, 14, 5), "bin not set correctly."); + VERIFY(bam->core.qual == 42, "mapq not set correctly."); + VERIFY(bam->core.flag == BAM_FREVERSE, "flag not set correctly."); + VERIFY(bam->core.mtid == 2, "mtid not set correctly."); + VERIFY(bam->core.mpos == 2000, "mpos not set correctly."); + VERIFY(bam->core.isize == 3000, "isize not set correctly."); + VERIFY(bam_get_l_aux(bam) == 0, "l_aux not set correctly."); + VERIFY(bam->m_data - bam->l_data >= 64, "not enough memory allocated for aux data."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_even_and_odd_seq_len() 
+{ + const char *seq_even = "TGGACTACGA"; + const char *seq_odd = "TGGACTACGAC"; + + int r, i; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, 0, 0, 0, 0, NULL, 0, 0, 0, + strlen(seq_even), seq_even, NULL, 0); + VERIFY(r >= 0, "call to bam_set1() failed."); + VERIFY(bam->core.l_qseq == strlen(seq_even), "l_seq not set correctly."); + for (i = 0; i < strlen(seq_even); i++) { + VERIFY(bam_seqi(bam_get_seq(bam), i) == seq_nt16_table[(uint8_t)seq_even[i]], "seq not set correctly."); + } + + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, 0, 0, 0, 0, NULL, 0, 0, 0, + strlen(seq_odd), seq_odd, NULL, 0); + VERIFY(r >= 0, "call to bam_set1() failed."); + VERIFY(bam->core.l_qseq == strlen(seq_odd), "l_seq not set correctly."); + for (i = 0; i < strlen(seq_odd); i++) { + VERIFY(bam_seqi(bam_get_seq(bam), i) == seq_nt16_table[(uint8_t)seq_odd[i]], "seq not set correctly."); + } + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_with_seq_but_no_qual() +{ + const char *seq = "TGGACTACGA"; + + int r, i; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + r = bam_set1(bam, 0, NULL, + BAM_FUNMAP, 0, 0, 0, + 0, NULL, 0, 0, 0, + strlen(seq), seq, NULL, 0); + VERIFY(r >= 0, "call to bam_set1() failed."); + VERIFY(bam->core.l_qseq == strlen(seq), "l_seq not set correctly."); + for (i = 0; i < strlen(seq); i++) { + VERIFY(bam_seqi(bam_get_seq(bam), i) == seq_nt16_table[(uint8_t)seq[i]], "seq not set correctly."); + VERIFY(bam_get_qual(bam)[i] == 0xff, "qual not set correctly"); + } + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_validate_qname() +{ + int r; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + // qname too long + const char too_long[255] = { 'A' }; + r = bam_set1(bam, sizeof(too_long), too_long, BAM_FUNMAP, -1, 0, 
0xff, 0, NULL, -1, 0, 0, 0, NULL, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_validate_seq() +{ + int r; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + // seq too long + const char *sequence = "C"; + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, 0, NULL, -1, 0, 0, (size_t)INT32_MAX + 1, sequence, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_validate_cigar() +{ + const uint32_t cigar[] = { 20 << BAM_CIGAR_SHIFT | BAM_CMATCH }; + const char *seq = "TGGACTACGA"; + + int r; + bam1_t *bam = NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + // mapped query must have a CIGAR + r = bam_set1(bam, 0, NULL, 0, -1, 0, 0xff, 0, NULL, -1, 0, 0, strlen(seq), seq, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + + // pos + ref len from CIGAR should be <= HTS_POS_MAX + r = bam_set1(bam, 0, NULL, 0, -1, HTS_POS_MAX - 10, 0xff, sizeof(cigar) / 4, cigar, -1, 0, 0, 0, NULL, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + + // query len from CIGAR should match the sequence length + r = bam_set1(bam, 0, NULL, 0, -1, 0, 0xff, sizeof(cigar) / 4, cigar, -1, 0, 0, strlen(seq), seq, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_validate_size_limits() +{ + const uint32_t cigar[] = { 20 << BAM_CIGAR_SHIFT | BAM_CMATCH }; + const char *seq = "TGGACTACGA"; + + int r; + bam1_t *bam = 
NULL; + bam = bam_init1(); + VERIFY(bam != NULL, "failed to initialize BAM struct."); + + // very long sequence. each base counts for 1/2 byte of sequence data and + // 1 byte of sequence quality data. the sum of all components may not exceed + // INT32_MAX, which is the maximum possible value that can be stored in l_data. + // In this case the 4 bytes of qname will cause it to overflow. + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, 0, NULL, -1, 0, 0, 2 * (size_t)INT32_MAX / 3, seq, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + + // very long CIGAR + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, (size_t)INT32_MAX / 4, cigar, -1, 0, 0, 0, NULL, NULL, 0); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + + // very long aux + r = bam_set1(bam, 0, NULL, BAM_FUNMAP, -1, 0, 0xff, 0, NULL, -1, 0, 0, 0, NULL, NULL, INT32_MAX); + VERIFY(r < 0, "call to bam_set1() should have failed."); + VERIFY(errno == EINVAL, "errno should be set."); + +cleanup: + if (bam != NULL) bam_destroy1(bam); +} + +static void test_bam_set1_write_and_read_back() +{ + const char *qname = "q1"; + const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH }; + const char *seq = "TGGACTACGA"; + const char *qual = "DBBBB+=7=0"; + + int r; + htsFile *writer = NULL, *reader = NULL; + sam_hdr_t *w_header = NULL, *r_header = NULL; + bam1_t *w_bam = NULL, *r_bam = NULL; + kstring_t ks = KS_INITIALIZE; + + // open file for writing + writer = hts_open("test_bam_set1_write_and_read_back.bam", "wb"); + VERIFY(writer != NULL, "failed to open bam file for writing."); + + // write header + w_header = bam_hdr_init(); + VERIFY(w_header != NULL, "failed to initialize bam header."); + r = sam_hdr_add_line(w_header, "SQ", "SN", "t1", "LN", "5000", NULL); + VERIFY(r == 0, "failed to add SQ header 
line."); + r = sam_hdr_write(writer, w_header); + VERIFY(r == 0, "failed to write bam header."); + + // write alignments + w_bam = bam_init1(); + VERIFY(w_bam != NULL, "failed to initialize BAM struct."); + r = bam_set1(w_bam, strlen(qname), qname, + BAM_FREVERSE, 0, 1000, 42, + sizeof(cigar) / 4, cigar, 0, 2000, 3000, + strlen(seq), seq, qual, 64); + VERIFY(r >= 0, "call to bam_set1() failed."); + r = sam_write1(writer, w_header, w_bam); + VERIFY(r >= 0, "failed to write alignment."); + bam_destroy1(w_bam); + + // close file + r = hts_close(writer); + VERIFY(r == 0, "failed to close bam file for writing."); + sam_hdr_destroy(w_header); + + // open file for reading + reader = hts_open("test_bam_set1_write_and_read_back.bam", "rb"); + VERIFY(reader != NULL, "failed to open bam file for reading."); + + // read header + r_header = sam_hdr_read(reader); + VERIFY(r_header != NULL, "failed to read bam header."); + r = sam_hdr_find_tag_id(r_header, "SQ", NULL, NULL, "SN", &ks); + VERIFY(r == 0, "failed to read SQ/SN value"); + VERIFY(strcmp(ks_c_str(&ks), "t1") == 0, "expected reference sequence name in the header == 't1'"); + VERIFY(r_header->n_targets == 1, "expected number of reference sequences == 1"); + VERIFY(strcmp(r_header->target_name[0], "t1") == 0, "expected reference sequence name == 't1'"); + VERIFY(r_header->target_len[0] == 5000, "expected reference sequence length == 5000"); + + // read alignments + r_bam = bam_init1(); + VERIFY(r_bam != NULL, "failed to initialize BAM struct."); + r = sam_read1(reader, r_header, r_bam); + VERIFY(r >= 0, "failed to read alignment."); + VERIFY(strcmp(bam_get_qname(r_bam), qname) == 0, "qname does not match."); + VERIFY(r_bam->core.n_cigar == sizeof(cigar) / 4, "cigar length does not match."); + VERIFY(memcmp(bam_get_cigar(r_bam), cigar, sizeof(cigar)) == 0, "cigar data does not match."); + VERIFY(r_bam->core.l_qseq == strlen(seq), "sequence length does not match."); + + r = sam_read1(reader, r_header, r_bam); + VERIFY(r < 
0, "expected no more alignments."); + bam_destroy1(r_bam); + + // close file + r = hts_close(reader); + VERIFY(r == 0, "failed to close bam file for reading."); + sam_hdr_destroy(r_header); + +cleanup: + ks_free(&ks); +} + int main(int argc, char **argv) { int i; @@ -1864,5 +2175,16 @@ int main(int argc, char **argv) set_qname(); for (i = 1; i < argc; i++) faidx1(argv[i]); + hts_set_log_level(HTS_LOG_OFF); + test_bam_set1_minimal(); + test_bam_set1_full(); + test_bam_set1_even_and_odd_seq_len(); + test_bam_set1_with_seq_but_no_qual(); + test_bam_set1_validate_qname(); + test_bam_set1_validate_seq(); + test_bam_set1_validate_cigar(); + test_bam_set1_validate_size_limits(); + test_bam_set1_write_and_read_back(); + return status; } From 90f27abcb669373e8af7cfeca22951109e0e102b Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 23 Oct 2020 15:21:51 +0100 Subject: [PATCH 016/488] Use temporary filename for bam_set1() test file The bam_set1() tests create a temporary BAM file. Rename it to match test/*.tmp.* so that it is automatically git-ignored and cleaned. 
--- test/sam.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/sam.c b/test/sam.c index 8a6001840..af1dc34da 100644 --- a/test/sam.c +++ b/test/sam.c @@ -2075,6 +2075,7 @@ static void test_bam_set1_write_and_read_back() const uint32_t cigar[] = { 6 << BAM_CIGAR_SHIFT | BAM_CMATCH, 2 << BAM_CIGAR_SHIFT | BAM_CINS, 2 << BAM_CIGAR_SHIFT | BAM_CMATCH }; const char *seq = "TGGACTACGA"; const char *qual = "DBBBB+=7=0"; + const char *temp_fname = "test/test_bam_set1_write_and_read_back.tmp.bam"; int r; htsFile *writer = NULL, *reader = NULL; @@ -2083,7 +2084,7 @@ static void test_bam_set1_write_and_read_back() kstring_t ks = KS_INITIALIZE; // open file for writing - writer = hts_open("test_bam_set1_write_and_read_back.bam", "wb"); + writer = hts_open(temp_fname, "wb"); VERIFY(writer != NULL, "failed to open bam file for writing."); // write header @@ -2112,7 +2113,7 @@ static void test_bam_set1_write_and_read_back() sam_hdr_destroy(w_header); // open file for reading - reader = hts_open("test_bam_set1_write_and_read_back.bam", "rb"); + reader = hts_open(temp_fname, "rb"); VERIFY(reader != NULL, "failed to open bam file for reading."); // read header From ca25304f29a5f94198c186b3a47eceab18416f63 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 2 Nov 2020 16:02:24 +0000 Subject: [PATCH 017/488] Update Windows image version in .appveyor.yml Mainly to fix pacman, which broke following changes to msys2 packaging (see https://www.msys2.org/news/#2020-06-29-new-packagers) Prior to this we were using the default image (Visual Studio 2015) which has not had the necessary updates to make pacman work. 
--- .appveyor.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.appveyor.yml b/.appveyor.yml index 14145cf1b..6f94944bc 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -18,6 +18,9 @@ skip_tags: true # - docs/* # - '**/*.html' +# Appveyor Windows images are based on Visual studio version +image: Visual Studio 2019 + # We use Mingw/Msys, so use pacman for installs install: - set HOME=. From b211b4c04dccad6f0dc0b9865be7d0210f2a6957 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 2 Nov 2020 16:50:59 +0000 Subject: [PATCH 018/488] Fix a long-standing typo in hts_lrand48. The return type of the definition was wrong, causing callers using the function to get incorrect results in some cases, mainly depending on the platform and compiler options used. Note: this function is not used by htslib. It was only added for reasons of completeness when we were putting in hts_drand48, used by htslib/ksort.h. --- hts_os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hts_os.c b/hts_os.c index 359392173..c26700975 100644 --- a/hts_os.c +++ b/hts_os.c @@ -49,7 +49,7 @@ HTSLIB_EXPORT double hts_drand48(void) { return drand48(); } HTSLIB_EXPORT -double hts_lrand48(void) { return lrand48(); } +long hts_lrand48(void) { return lrand48(); } #endif // // On Windows when using the MSYS or Cygwin terminals, isatty fails From 21b677e52ea98219c30f8a0c6bd3d973210d361f Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 5 Nov 2020 14:52:48 +0000 Subject: [PATCH 019/488] Add Cirrus-CI integration (PR #1175) --- .cirrus.yml | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 166 insertions(+) create mode 100644 .cirrus.yml diff --git a/.cirrus.yml b/.cirrus.yml new file mode 100644 index 000000000..6b5ff0b98 --- /dev/null +++ b/.cirrus.yml @@ -0,0 +1,166 @@ +# Note we have a maximum of 16 CPUs available, so adjust our +# builds so we can start all concurrently without needing to schedule. 
+ +# Sadly though there is still a finite limit to macOS of one instance. +# Can we cull our Mac test to just one instance? + +timeout_in: 10m + +#-------------------------------------------------- +# Template: build libdeflate dependency + +libdeflate_template: &LIBDEFLATE + libdeflate_script: | + if test "x$USE_LIBDEFLATE" == "xyes"; then + pushd "$HOME" + git clone --depth 1 https://github.com/ebiggers/libdeflate.git + pushd libdeflate + make -j 4 CFLAGS='-fPIC -O3' libdeflate.a + popd + popd + fi + +#-------------------------------------------------- +# Template: compile and test + +compile_template: &COMPILE + compile_script: | + if test "x$USE_LIBDEFLATE" = "xyes"; then + CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate" --with-libdeflate' + else + CONFIG_OPTS='--without-libdeflate' + fi + if test "$USE_CONFIG" = "yes"; then + MAKE_OPTS= + autoreconf + eval ./configure --enable-plugins --enable-werror $CONFIG_OPTS CFLAGS=\"-g -O3 $CFLAGS\" || \ + ( cat config.log; false ) + else + MAKE_OPTS=-e + fi + if test "x$DO_MAINTAINER_CHECKS" = "xyes"; then + make maintainer-check + fi + make -j 4 $MAKE_OPTS + +test_template: &TEST + test_script: | + make test-shlib-exports + make test + + +#-------------------------------------------------- +# Task: linux builds. 
+ +# Debian + latest GCC +gcc_task: + name: debian-gcc + container: + image: gcc:latest + cpu: 2 + memory: 1G + + environment: + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + + matrix: + - environment: + DO_MAINTAINER_CHECKS: yes + USE_CONFIG: no + - environment: + USE_CONFIG: yes + CFLAGS: -std=c99 -pedantic + USE_LIBDEFLATE: yes + + << : *LIBDEFLATE + << : *COMPILE + << : *TEST + + +# Ubuntu + Clang +ubuntu_task: + name: ubuntu-clang + container: + #image: ubuntu:latest # use << : *LIBDEFLATE + image: ubuntu:devel + cpu: 2 + memory: 1G + + environment: + CC: clang + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + + matrix: + - environment: + USE_CONFIG: yes + - environment: + USE_CONFIG: yes + CFLAGS: -g -Wall -O3 -fsanitize=address + LDFLAGS: -fsanitize=address + USE_LIBDEFLATE: yes + + # NB: we could consider building a docker image with these + # preinstalled and specifying that instead, to speed up testing. + install_script: | + apt-get update + apt-get install -y --no-install-suggests --no-install-recommends \ + ca-certificates clang libc-dev make git autoconf automake \ + zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev \ + libdeflate-dev + + << : *COMPILE + << : *TEST + + +# CentOS +centos_task: + name: centos-gcc + container: + image: centos:latest + cpu: 2 + memory: 1G + + environment: + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + USE_CONFIG: yes + + # NB: we could consider building a docker image with these + # preinstalled and specifying that instead, to speed up testing. 
+ install_script: | + yum install -y autoconf automake make gcc perl-Data-Dumper zlib-devel \ + bzip2 bzip2-devel xz-devel curl-devel openssl-devel ncurses-devel \ + diffutils + + << : *COMPILE + << : *TEST + +#-------------------------------------------------- +# Task: macOS builds + +macosx_task: + name: macosx + clang + osx_instance: + image: catalina-base + + environment: + CC: clang + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + + matrix: + - environment: + USE_CONFIG: no + - environment: + USE_CONFIG: yes + USE_LIBDEFLATE: yes + + package_install_script: + - HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz + + << : *LIBDEFLATE + << : *COMPILE + << : *TEST + + From 0ad12d7593ada731e0ff20c8bcd027d31dfb98fa Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 28 Oct 2020 12:13:30 +0000 Subject: [PATCH 020/488] Fix assumption of pthread_t being a non-structure. Replaces #1154 See also #1153 It does this by the addition of a separate variable to do the boolean check on instead of fd->dispatcher itself. I deemed this easier to understand than overloading the interpretation of fd->h being set, plus I'm unsure of the potential failure case in sam_hdr_fill_hrecs where fd->h has been set but we errored before creating the threads. Also improves error recovery in case of pthread creation failure, avoiding a false pthread_join later. Thanks to John Marshall and Anders Kaplan for identifying that issue. 
--- sam.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sam.c b/sam.c index 49b1a0f3f..9127e1ea4 100644 --- a/sam.c +++ b/sam.c @@ -2377,6 +2377,7 @@ typedef struct SAM_state { pthread_mutex_t lines_m; hts_tpool_process *q; pthread_t dispatcher; + int dispatcher_set; sp_lines *lines; sp_bams *bams; @@ -2462,7 +2463,7 @@ int sam_state_destroy(htsFile *fp) { if (fd->q) hts_tpool_wake_dispatch(fd->q); // unstick the reader - if (!fp->is_write && fd->q && fd->dispatcher) { + if (!fp->is_write && fd->q && fd->dispatcher_set) { for (;;) { // Avoid deadlocks with dispatcher if (fd->command == SAM_CLOSE_DONE) @@ -2502,7 +2503,8 @@ int sam_state_destroy(htsFile *fp) { } // Wait for it to acknowledge - pthread_join(fd->dispatcher, NULL); + if (fd->dispatcher_set) + pthread_join(fd->dispatcher, NULL); if (!ret) ret = -fd->errcode; } @@ -3110,8 +3112,10 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) return -2; // We can only do this once we've got a header - if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read, fp) != 0) + if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read, + fp) != 0) return -2; + fd->dispatcher_set = 1; } if (fd->h != h) { @@ -3299,13 +3303,15 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) // destroy it later on and sam_hdr_destroy takes non-const. // // We do this because some tools do sam_hdr_destroy; sam_close - // while others do sam_close; sam_hdr_destroy. The former is an - // issue as we need the header still when flushing. + // while others do sam_close; sam_hdr_destroy. The former is + // an issue as we need the header still when flushing. 
fd->h = (sam_hdr_t *)h; fd->h->ref_count++; - if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write, fp) != 0) + if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_write, + fp) != 0) return -2; + fd->dispatcher_set = 1; } if (fd->h != h) { From b710781df0d9bb6f665756316653554fe32e5c68 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 4 Nov 2020 18:52:24 +0000 Subject: [PATCH 021/488] Fix memory leak on bad input in vcf_parse_info() Credit to OSS-Fuzz Fixes oss-fuzz 26968 --- vcf.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/vcf.c b/vcf.c index 89ee90c20..a1aa55525 100644 --- a/vcf.c +++ b/vcf.c @@ -2709,7 +2709,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p hts_log_error("Too many INFO entries at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; - return -1; + goto fail; } val = end = 0; c = *r; *r = 0; @@ -2736,7 +2736,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p if (res || k == kh_end(d)) { hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_TAG_INVALID; - return -1; + goto fail; } } uint32_t y = kh_val(d, k).info[BCF_HL_INFO]; @@ -2757,7 +2757,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p if (!a_tmp) { hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; // No appropriate code? 
- return -1; + goto fail; } a_val = a_tmp; max_n_val = n_val; @@ -2854,6 +2854,10 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p free(a_val); return 0; + + fail: + free(a_val); + return -1; } int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) From 7f8211cd76a59ba870566a430a69287ee657dd4e Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 23 Oct 2020 15:06:47 +0100 Subject: [PATCH 022/488] Prevent GCC -Wextra switch/case statement "fall through" warnings Add comments indicating intentional fall-throughs. Adjust comment in textutils_internal.h to match GCC's -Wimplicit-fallthrough[=3] pattern. Annotate several functions that call exit() as HTS_NORETURN so the compiler knows they won't actually fall through in any switch statements they appear in. (Clang understands only not-yet-standard attributes and annotations, not comment text, so there's little point in catering to its -Wimplicit- fallthrough warning, which isn't included in -Wall/-Wextra in Clang.) 
--- cram/cram_codecs.c | 28 ++++++++++++++-------------- cram/rANS_static.c | 9 ++++++--- hts.c | 5 +++-- tabix.c | 6 ++++-- test/test_index.c | 2 +- textutils_internal.h | 2 +- 6 files changed, 29 insertions(+), 23 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 9f339c723..80131b633 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -202,20 +202,20 @@ static inline unsigned int get_bits_MSB(cram_block *block, int nbits) { } switch(nbits) { -// case 15: GET_BIT_MSB(block, val); -// case 14: GET_BIT_MSB(block, val); -// case 13: GET_BIT_MSB(block, val); -// case 12: GET_BIT_MSB(block, val); -// case 11: GET_BIT_MSB(block, val); -// case 10: GET_BIT_MSB(block, val); -// case 9: GET_BIT_MSB(block, val); - case 8: GET_BIT_MSB(block, val); - case 7: GET_BIT_MSB(block, val); - case 6: GET_BIT_MSB(block, val); - case 5: GET_BIT_MSB(block, val); - case 4: GET_BIT_MSB(block, val); - case 3: GET_BIT_MSB(block, val); - case 2: GET_BIT_MSB(block, val); +// case 15: GET_BIT_MSB(block, val); // fall through +// case 14: GET_BIT_MSB(block, val); // fall through +// case 13: GET_BIT_MSB(block, val); // fall through +// case 12: GET_BIT_MSB(block, val); // fall through +// case 11: GET_BIT_MSB(block, val); // fall through +// case 10: GET_BIT_MSB(block, val); // fall through +// case 9: GET_BIT_MSB(block, val); // fall through + case 8: GET_BIT_MSB(block, val); // fall through + case 7: GET_BIT_MSB(block, val); // fall through + case 6: GET_BIT_MSB(block, val); // fall through + case 5: GET_BIT_MSB(block, val); // fall through + case 4: GET_BIT_MSB(block, val); // fall through + case 3: GET_BIT_MSB(block, val); // fall through + case 2: GET_BIT_MSB(block, val); // fall through case 1: GET_BIT_MSB(block, val); break; diff --git a/cram/rANS_static.c b/cram/rANS_static.c index b58fe7189..584f8b561 100644 --- a/cram/rANS_static.c +++ b/cram/rANS_static.c @@ -157,9 +157,9 @@ unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size, 
RansEncInit(&rans3); switch (i=(in_size&3)) { - case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]); - case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]); - case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]); + case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]); // fall through + case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]); // fall through + case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]); // fall through case 0: break; } @@ -348,10 +348,13 @@ unsigned char *rans_uncompress_O0(unsigned char *in, unsigned int in_size, switch(out_sz&3) { case 3: out_buf[out_end+2] = D.R[RansDecGet(&R[2], TF_SHIFT)]; + // fall through case 2: out_buf[out_end+1] = D.R[RansDecGet(&R[1], TF_SHIFT)]; + // fall through case 1: out_buf[out_end] = D.R[RansDecGet(&R[0], TF_SHIFT)]; + // fall through default: break; } diff --git a/hts.c b/hts.c index 278ecd36a..1ab89400e 100644 --- a/hts.c +++ b/hts.c @@ -784,8 +784,8 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { // NB: Doesn't support floats, eg 1.5g // TODO: extend hts_parse_decimal? See also samtools sort. switch (*endp) { - case 'g': case 'G': o->val.i *= 1024; - case 'm': case 'M': o->val.i *= 1024; + case 'g': case 'G': o->val.i *= 1024; // fall through + case 'm': case 'M': o->val.i *= 1024; // fall through case 'k': case 'K': o->val.i *= 1024; break; case '\0': break; default: @@ -2731,6 +2731,7 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) switch (tid) { case HTS_IDX_NONE: iter->finished = 1; + // fall through case HTS_IDX_START: case HTS_IDX_REST: iter->curr_off = t_off; diff --git a/tabix.c b/tabix.c index 01cbf801c..3013aa550 100644 --- a/tabix.c +++ b/tabix.c @@ -52,7 +52,8 @@ typedef struct } args_t; -HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) static void error(const char *format, ...) +static void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN +error(const char *format, ...) 
{ va_list ap; fflush(stdout); @@ -63,7 +64,8 @@ HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) static void error(const char *format, ...) exit(EXIT_FAILURE); } -HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) static void error_errno(const char *format, ...) +static void HTS_FORMAT(HTS_PRINTF_FMT, 1, 2) HTS_NORETURN +error_errno(const char *format, ...) { va_list ap; int eno = errno; diff --git a/test/test_index.c b/test/test_index.c index 402879666..cc90f51fe 100644 --- a/test/test_index.c +++ b/test/test_index.c @@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include "../htslib/sam.h" #include "../htslib/vcf.h" -void usage(FILE *fp) { +void HTS_NORETURN usage(FILE *fp) { fprintf(fp, "Usage: test_index [opts] in.{sam.gz,bam,cram}|in.{vcf.gz,bcf}\n\n"); fprintf(fp, " -b Use BAI index (BAM, SAM)\n"); fprintf(fp, " -c Use CSI index (BAM, SAM, VCF, BCF)\n"); diff --git a/textutils_internal.h b/textutils_internal.h index 7e8628658..4b120bdbc 100644 --- a/textutils_internal.h +++ b/textutils_internal.h @@ -355,7 +355,7 @@ static inline double hts_str2dbl(const char *in, char **end, int *failed) { case '0': if (v[1] != 'x' && v[1] != 'X') break; - // else fall through (hex number) + // else fall through - hex number default: // Non numbers, like NaN, Inf From 9a55e4e74829593722b1a08a5c2cc414a57e1a25 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Tue, 27 Oct 2020 13:45:12 +0000 Subject: [PATCH 023/488] Extract the CIGAR parsing logic into separate methods for each of the two most common use cases: independent uint32_t array and bam1_t record. Add documentation. --- htslib/sam.h | 23 ++++++++ sam.c | 125 ++++++++++++++++++++++++++++++++++++----- test/mpileup/mp_N2.sam | 2 +- 3 files changed, 134 insertions(+), 16 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index 8fcf819bf..d08646f13 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1103,6 +1103,29 @@ char *bam_flag2str(int flag); /** The string must be freed by the user */ HTSLIB_EXPORT int bam_set_qname(bam1_t *b, const char *qname); +/*! 
@function + @abstract Parse a CIGAR string into a uint32_t array + @param in [in] pointer to the source string + @param end [out] address of the pointer to the new end of the input string + can be NULL + @param a_cigar [out] address of the destination uint32_t buffer + @param a_mem [in/out] address of the allocated number of buffer elements + @return number of processed CIGAR operators; 0 if error + */ +HTSLIB_EXPORT +size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t *a_mem); + +/*! @function + @abstract Parse a CIGAR string into a bam1_t struct + @param in [in] pointer to the source string + @param end [out] address of the pointer to the new end of the input string + can be NULL + @param b [in/out] address of the destination bam1_t struct + @return number of processed CIGAR operators; 0 if error + */ +HTSLIB_EXPORT +size_t bam_parse_cigar(const char *in, char **end, bam1_t *b); + /************************* *** BAM/CRAM indexing *** *************************/ diff --git a/sam.c b/sam.c index 9127e1ea4..90180be66 100644 --- a/sam.c +++ b/sam.c @@ -2148,22 +2148,13 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) if (*p++ != '\t') goto err_ret; // cigar if (*p != '*') { - uint32_t *cigar; - size_t n_cigar = 0; - for (q = p; *p && *p != '\t'; ++p) - if (!isdigit_c(*p)) ++n_cigar; - if (*p++ != '\t') goto err_ret; - _parse_err(n_cigar == 0, "no CIGAR operations"); - _parse_err(n_cigar >= 2147483647, "too many CIGAR operations"); + uint32_t *cigar = NULL; + int old_l_data = b->l_data; + uint32_t n_cigar = bam_parse_cigar(p, &p, b); + if (!n_cigar || *p++ != '\t') goto err_ret; + cigar = (uint32_t *)(b->data + old_l_data); c->n_cigar = n_cigar; - _get_mem(uint32_t, &cigar, b, c->n_cigar * sizeof(uint32_t)); - for (i = 0; i < c->n_cigar; ++i) { - int op; - cigar[i] = hts_str2uint(q, &q, 28, &overflow)<flag&BAM_FUNMAP))? 
bam_cigar2rlen(c->n_cigar, cigar) : 1; if (cigreflen == 0) cigreflen = 1; @@ -2328,6 +2319,110 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) return -2; } +static uint32_t read_ncigar(const char *in) { + uint32_t n_cigar = 0; + char *q = (char *)in; + for (; *q && *q != '\t'; ++q) + if (!isdigit_c(*q)) ++n_cigar; + if (!n_cigar) { + hts_log_error("No CIGAR operations"); + return 0; + } + if (n_cigar >= 2147483647) { + hts_log_error("Too many CIGAR operations"); + return 0; + } + + return n_cigar; +} + +/*! @function + @abstract Parse a CIGAR string into preallocated a uint32_t array + @param in [in] pointer to the source string + @param a_cigar [out] address of the destination uint32_t buffer + @return number of processed input characters; 0 if error + */ +static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { + int i, overflow = 0; + char *p, *q = (char *)in; + for (i = 0; i < n_cigar; i++) { + uint32_t len; + int op; + p = q; + len = hts_str2uint(q, &q, 28, &overflow)< *a_mem) { + uint32_t *a_tmp = realloc(*a_cigar, n_cigar*sizeof(**a_cigar)); + if (a_tmp) { + *a_cigar = a_tmp; + *a_mem = n_cigar; + } else { + hts_log_error("Memory allocation error"); + return 0; + } + } + + if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return 0; + if (end) *end = (char *)in+diff; + + return n_cigar; +} + +size_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { + size_t n_cigar = 0; + int diff; + + if (!in || !b) { + hts_log_error("NULL pointer arguments"); + return 0; + } + if (end) *end = (char *)in; + + n_cigar = read_ncigar(in); + if (!n_cigar) return 0; + if (possibly_expand_bam_data(b, n_cigar * sizeof(uint32_t)) < 0) { + hts_log_error("Memory allocation error"); + return 0; + } + + if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return 0; + b->l_data += (n_cigar * sizeof(uint32_t)); + if (end) *end = (char *)in+diff; + + return n_cigar; +} + /* * 
----------------------------------------------------------------------------- * SAM threading diff --git a/test/mpileup/mp_N2.sam b/test/mpileup/mp_N2.sam index 292cfcb58..8ea072db9 100644 --- a/test/mpileup/mp_N2.sam +++ b/test/mpileup/mp_N2.sam @@ -40,7 +40,7 @@ @CO sD1 0 z 1 0 4M2I5D2I4M * 0 0 TAGCAATTAGGT ABCDEFGHIJKL sD2 0 z 1 0 4M1I1P5D1P1I4M * 0 0 TAGCATAGGT ABCDEHIJKL -sD3 0 z 1 0 4M1P1II5D1I1P4M * 0 0 TAGCATAGGT ABCDFGIJKL +sD3 0 z 1 0 4M1P1I5D1I1P4M * 0 0 TAGCATAGGT ABCDFGIJKL sN1 0 z 1 0 4M2I5N2I4M * 0 0 TAGCAATTAGGT ABCDEFGHIJKL sN2 0 z 1 0 4M1I1P5N1P1I4M * 0 0 TAGCATAGGT ABCDEHIJKL sN3 0 z 1 0 4M1P1I5N1I1P4M * 0 0 TAGCATAGGT ABCDFGIJKL From 51275bcbda6d1e0849d0e50d0edd13814d38ebd1 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Tue, 10 Nov 2020 12:38:36 +0000 Subject: [PATCH 024/488] Make bam_itr_next an alias for sam_itr_next. --- htslib/sam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/sam.h b/htslib/sam.h index d08646f13..1ccad8776 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1135,7 +1135,7 @@ size_t bam_parse_cigar(const char *in, char **end, bam1_t *b); #define bam_itr_destroy(iter) hts_itr_destroy(iter) #define bam_itr_queryi(idx, tid, beg, end) sam_itr_queryi(idx, tid, beg, end) #define bam_itr_querys(idx, hdr, region) sam_itr_querys(idx, hdr, region) -#define bam_itr_next(htsfp, itr, r) hts_itr_next((htsfp)->fp.bgzf, (itr), (r), 0) +#define bam_itr_next(htsfp, itr, r) sam_itr_next((htsfp), (itr), (r)) // Load/build .csi or .bai BAM index file. Does not work with CRAM. // It is recommended to use the sam_index_* functions below instead. From 8c7eccfc8670f4ea23122eb10f70fc491ccc805f Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Wed, 25 Nov 2020 16:07:33 +0000 Subject: [PATCH 025/488] Adjust parser for undefined CIGAR. 
--- htslib/sam.h | 11 ++++++----- sam.c | 30 +++++++++++++++++++----------- test/sam.c | 23 +++++++++++++++++++++++ 3 files changed, 48 insertions(+), 16 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index 1ccad8776..bfd7855a6 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#include #include "hts.h" #include "hts_endian.h" @@ -1108,12 +1109,12 @@ int bam_set_qname(bam1_t *b, const char *qname); @param in [in] pointer to the source string @param end [out] address of the pointer to the new end of the input string can be NULL - @param a_cigar [out] address of the destination uint32_t buffer + @param a_cigar [in/out] address of the destination uint32_t buffer @param a_mem [in/out] address of the allocated number of buffer elements - @return number of processed CIGAR operators; 0 if error + @return number of processed CIGAR operators; -1 on error */ HTSLIB_EXPORT -size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t *a_mem); +ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem); /*! 
@function @abstract Parse a CIGAR string into a bam1_t struct @@ -1121,10 +1122,10 @@ size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t @param end [out] address of the pointer to the new end of the input string can be NULL @param b [in/out] address of the destination bam1_t struct - @return number of processed CIGAR operators; 0 if error + @return number of processed CIGAR operators; -1 on error */ HTSLIB_EXPORT -size_t bam_parse_cigar(const char *in, char **end, bam1_t *b); +ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b); /************************* *** BAM/CRAM indexing *** diff --git a/sam.c b/sam.c index 90180be66..467b270c3 100644 --- a/sam.c +++ b/sam.c @@ -2150,8 +2150,8 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) if (*p != '*') { uint32_t *cigar = NULL; int old_l_data = b->l_data; - uint32_t n_cigar = bam_parse_cigar(p, &p, b); - if (!n_cigar || *p++ != '\t') goto err_ret; + int n_cigar = bam_parse_cigar(p, &p, b); + if (n_cigar < 1 || *p++ != '\t') goto err_ret; cigar = (uint32_t *)(b->data + old_l_data); c->n_cigar = n_cigar; @@ -2340,7 +2340,7 @@ static uint32_t read_ncigar(const char *in) { @abstract Parse a CIGAR string into preallocated a uint32_t array @param in [in] pointer to the source string @param a_cigar [out] address of the destination uint32_t buffer - @return number of processed input characters; 0 if error + @return number of processed input characters; 0 on error */ static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { int i, overflow = 0; @@ -2370,16 +2370,20 @@ static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { return q-in; } -size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t *a_mem) { +ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t *a_mem) { size_t n_cigar = 0; int diff; if (!in || !a_cigar || !a_mem) { hts_log_error("NULL pointer arguments"); - return 0; + return -1; 
} if (end) *end = (char *)in; + if (*in == '*') { + if (end) (*end)++; + return 0; + } n_cigar = read_ncigar(in); if (!n_cigar) return 0; if (n_cigar > *a_mem) { @@ -2389,34 +2393,38 @@ size_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, uint32_t *a_mem = n_cigar; } else { hts_log_error("Memory allocation error"); - return 0; + return -1; } } - if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return 0; + if (!(diff = parse_cigar(in, *a_cigar, n_cigar))) return -1; if (end) *end = (char *)in+diff; return n_cigar; } -size_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { +ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { size_t n_cigar = 0; int diff; if (!in || !b) { hts_log_error("NULL pointer arguments"); - return 0; + return -1; } if (end) *end = (char *)in; + if (*in == '*') { + if (end) (*end)++; + return 0; + } n_cigar = read_ncigar(in); if (!n_cigar) return 0; if (possibly_expand_bam_data(b, n_cigar * sizeof(uint32_t)) < 0) { hts_log_error("Memory allocation error"); - return 0; + return -1; } - if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return 0; + if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return -1; b->l_data += (n_cigar * sizeof(uint32_t)); if (end) *end = (char *)in+diff; diff --git a/test/sam.c b/test/sam.c index af1dc34da..b6f6c0e04 100644 --- a/test/sam.c +++ b/test/sam.c @@ -33,6 +33,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include // Suppress message for faidx_fetch_nseq(), which we're intentionally testing #include "../htslib/hts_defs.h" @@ -2149,6 +2150,27 @@ static void test_bam_set1_write_and_read_back() ks_free(&ks); } +static void test_cigar_api(void) +{ + uint32_t *buf = NULL; + char *cig = "*"; + char *end; + size_t m = 0; + int n; + n = sam_parse_cigar(cig, &end, &buf, &m); + VERIFY(n == 0 && m == 0 && (end-cig) == 1, "failed to parse undefined CIGAR"); + cig = "2M3X1I10M5D"; + n = sam_parse_cigar(cig, &end, &buf, &m); + VERIFY(n == 5 && m > 0 && (end-cig) == 11, "failed to parse CIGAR string: 2M3X1I10M5D"); + n = sam_parse_cigar("722M15D187217376188323783284M67I", NULL, &buf, &m); + VERIFY(n == -1, "failed to flag CIGAR string with long op length: 722M15D187217376188323783284M67I"); + n = sam_parse_cigar("53I722MD8X", NULL, &buf, &m); + VERIFY(n == -1, "failed to flag CIGAR string with no op length: 53I722MD8X"); + +cleanup: + free(buf); +} + int main(int argc, char **argv) { int i; @@ -2186,6 +2208,7 @@ int main(int argc, char **argv) test_bam_set1_validate_cigar(); test_bam_set1_validate_size_limits(); test_bam_set1_write_and_read_back(); + test_cigar_api(); return status; } From 2056490488b81169bf69beb5ac835cd22dc74a42 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 26 Nov 2020 10:26:03 +0000 Subject: [PATCH 026/488] Recode to use const char pointers (cherry picked from commit a12d4d447cd3ca089db05dcc35144818339a4911) --- sam.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/sam.c b/sam.c index 467b270c3..eb2712965 100644 --- a/sam.c +++ b/sam.c @@ -2319,9 +2319,8 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) return -2; } -static uint32_t read_ncigar(const char *in) { +static uint32_t read_ncigar(const char *q) { uint32_t n_cigar = 0; - char *q = (char *)in; for (; *q && *q != '\t'; ++q) if (!isdigit_c(*q)) ++n_cigar; if (!n_cigar) { @@ -2344,12 +2343,12 @@ static uint32_t 
read_ncigar(const char *in) { */ static int parse_cigar(const char *in, uint32_t *a_cigar, uint32_t n_cigar) { int i, overflow = 0; - char *p, *q = (char *)in; + const char *p = in; for (i = 0; i < n_cigar; i++) { uint32_t len; int op; - p = q; - len = hts_str2uint(q, &q, 28, &overflow)< Date: Tue, 1 Dec 2020 10:24:33 +0000 Subject: [PATCH 027/488] Fix the bgzf_idx_flush assertion. (PR #1168) The assumption is that we call bgzf_idx_flush once per outgoing multi-threaded block, and the block numbers will match. The assertion is to validate we're not indexing out of order. However, the test in the assertion was the wrong way round. Very long records (eg with huge aux tags or long seqs) can mean a single record spans multiple blocks and we inherently then skip blocks in bgzf_idx_push calls. What it actually needs to catch is cases where the blocks have been written out before the associated index entries have been added. Fixes samtools/samtools#1328 --- bgzf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bgzf.c b/bgzf.c index 09f18a643..ded3884e4 100644 --- a/bgzf.c +++ b/bgzf.c @@ -273,7 +273,7 @@ static int bgzf_idx_flush(BGZF *fp) { hts_idx_cache_entry *e = mt->idx_cache.e; int i; - assert(mt->idx_cache.nentries == 0 || mt->block_written >= e[0].block_number); + assert(mt->idx_cache.nentries == 0 || mt->block_written <= e[0].block_number); for (i = 0; i < mt->idx_cache.nentries && e[i].block_number == mt->block_written; i++) { if (hts_idx_push(mt->hts_idx, e[i].tid, e[i].beg, e[i].end, From 7ad1ec2b4d1de9f04cfe827539446b6de8548a54 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sun, 6 Dec 2020 20:12:16 +0000 Subject: [PATCH 028/488] Use $(ALL_CPPFLAGS), similarly to samtools's Makefile This extra level of indirection simplifies making additions to the CPPFLAGS used during compilation without interfering with the user's setting for $CPPFLAGS. Fix related typo in the MSYS/MinGW part of configure.ac. 
--- Makefile | 6 ++++-- config.mk.in | 2 +- configure.ac | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 245b7a16c..2adc21d25 100644 --- a/Makefile +++ b/Makefile @@ -103,6 +103,8 @@ BUILT_THRASH_PROGRAMS = \ all: lib-static lib-shared $(BUILT_PROGRAMS) plugins $(BUILT_TEST_PROGRAMS) \ htslib_static.mk htslib-uninstalled.pc +ALL_CPPFLAGS = -I. $(CPPFLAGS) + HTSPREFIX = include htslib_vars.mk @@ -133,10 +135,10 @@ show-version: .SUFFIXES: .bundle .c .cygdll .dll .o .pico .so .c.o: - $(CC) $(CFLAGS) -I. $(CPPFLAGS) -c -o $@ $< + $(CC) $(CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $< .c.pico: - $(CC) $(CFLAGS) -I. $(CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< + $(CC) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< LIBHTS_OBJS = \ diff --git a/config.mk.in b/config.mk.in index 976e557df..f8decf0a2 100644 --- a/config.mk.in +++ b/config.mk.in @@ -94,7 +94,7 @@ pluginpath = @pluginpath@ LIBHTS_OBJS += plugin.o PLUGIN_OBJS += $(plugin_OBJS) -plugin.o plugin.pico: CPPFLAGS += -DPLUGINPATH=\"$(pluginpath)\" +plugin.o plugin.pico: ALL_CPPFLAGS += -DPLUGINPATH=\"$(pluginpath)\" # When built as separate plugins, these record their version themselves. hfile_gcs.o hfile_gcs.pico: version.h diff --git a/configure.ac b/configure.ac index ec137b75d..9bd1642d7 100644 --- a/configure.ac +++ b/configure.ac @@ -157,7 +157,7 @@ case $host_alias in # This also sets __USE_MINGW_ANSI_STDIO which in turn makes PRId64, # %lld and %z printf formats work. It also enforces the snprintf to # be C99 compliant so it returns the correct values (in kstring.c). - CPPFLAGS="$CPPCFLAGS -D_XOPEN_SOURCE=600" + CPPFLAGS="$CPPFLAGS -D_XOPEN_SOURCE=600" ;; *) host_result="plain .so" From b581944da4015182a6a9a9bc325ee15b002cba61 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Mon, 7 Dec 2020 15:30:07 +0000 Subject: [PATCH 029/488] Set lines to be skipped independently. 
--- tabix.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tabix.c b/tabix.c index 3013aa550..36c414798 100644 --- a/tabix.c +++ b/tabix.c @@ -498,6 +498,7 @@ int main(int argc, char *argv[]) memset(&args,0,sizeof(args_t)); args.cache_megs = 10; args.download_index = 1; + int32_t new_line_skip = -1; static const struct option loptions[] = { @@ -570,7 +571,7 @@ int main(int argc, char *argv[]) detect = 0; break; case 'S': - conf.line_skip = strtol(optarg,&tmp,10); + new_line_skip = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: -S %s\n", optarg); detect = 0; break; @@ -605,6 +606,9 @@ int main(int argc, char *argv[]) } } + if (new_line_skip >= 0) + conf.line_skip = new_line_skip; + if ( optind==argc ) return usage(stderr, EXIT_FAILURE); if ( list_chroms ) From 7ca2b49fc9200171c3ffdb2a8c403c845a286e9d Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 5 Nov 2020 11:26:09 +0000 Subject: [PATCH 030/488] Don't link plugins with libhts.dylib on macOS PR #1072 changed plugin linking so that plugins are linked back to the dynamic libhts.so/.dylib, to facilitate use when libhts is itself dynamically dlopen()ed with RTLD_LOCAL, e.g., by the Python runtime which uses default dlopen() flags which on Linux means RTLD_LOCAL. This broke plugin loading on macOS when opening plugins in an executable in which libhts.a has been statically linked, as there were then two copies of the library globals (notably hfile.c::schemes), one from the executable's libhts.a and one from the plugin's libhts.NN.dylib. (The Linux loading model does not suffer from this issue.) The default dlopen() flag on macOS is RTLD_GLOBAL, so this can be fixed by reverting the change (on macOS only) and depending on the symbols supplied by a static libhts.a, a dynamically linked libhts.NN.dylib, or a RTLD_GLOBALly dlopen()ed libhts.NN.dylib. 
This rebreaks the case of dlopen()ing libhts on macOS while explicitly specifying RTLD_LOCAL, but this is not a common case. Fixes #1176. Disable the `plugins-dlhts -l` test case on macOS. Add a test of accessing plugins from an executable with a statically linked libhts.a (namely, htsfile) to test/test.pl. --- Makefile | 12 ++++++++++-- test/plugins-dlhts.c | 18 +++++++++++++++--- test/test.pl | 21 +++++++++++++++++++++ 3 files changed, 46 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 2adc21d25..bd637eacc 100644 --- a/Makefile +++ b/Makefile @@ -309,11 +309,19 @@ hts-$(LIBHTS_SOVERSION).dll hts.dll.a: $(LIBHTS_OBJS) hts-object-files: $(LIBHTS_OBJS) touch $@ +# On Unix dlopen("libhts.so.NN", RTLD_LAZY) may default to RTLD_LOCAL. +# Hence plugins need to link to (shared) libhts.so.NN themselves, as they +# may not be able to access libhts symbols via the main program's libhts +# if that was dynamically loaded without an explicit RTLD_GLOBAL. %.so: %.pico libhts.so $(CC) -shared -Wl,-E $(LDFLAGS) -o $@ $< libhts.so $(LIBS) -lpthread -%.bundle: %.o libhts.dylib - $(CC) -bundle -Wl,-undefined,dynamic_lookup $(LDFLAGS) -o $@ $< libhts.dylib $(LIBS) +# For programs *statically* linked to libhts.a, on macOS loading a plugin +# linked to a shared libhts.NN.dylib would lead to conflicting duplicate +# symbols. Fortunately macOS dlopen() defaults to RTLD_GLOBAL so there +# is less need for plugins to link back to libhts themselves. 
+%.bundle: %.o + $(CC) -bundle -Wl,-undefined,dynamic_lookup $(LDFLAGS) -o $@ $< $(LIBS) %.cygdll: %.o libhts.dll.a $(CC) -shared $(LDFLAGS) -o $@ $< libhts.dll.a $(LIBS) diff --git a/test/plugins-dlhts.c b/test/plugins-dlhts.c index aa98ef3f5..f90e3bd74 100644 --- a/test/plugins-dlhts.c +++ b/test/plugins-dlhts.c @@ -101,6 +101,7 @@ void verbose_log(const char *message) int main(int argc, char **argv) { int dlflags = RTLD_NOW; + int skip = 0; int c; while ((c = getopt(argc, argv, "glv")) >= 0) @@ -133,15 +134,26 @@ int main(int argc, char **argv) hclose_abruptly_p = (hclose_abruptly_func *) func(htslib, "hclose_abruptly"); test_hopen("bad-scheme:unsupported", 0); + +#ifdef __APPLE__ + /* Skip -l tests as we don't link plugins back to libhts on macOS, as this + would conflict with a statically linked libhts.a on this platform. */ + skip = (dlflags & RTLD_LOCAL) != 0; +#endif + + if (! skip) { #ifdef HAVE_LIBCURL - test_hopen("https://localhost:99999/invalid_port", 1); + test_hopen("https://localhost:99999/invalid_port", 1); #endif #ifdef ENABLE_GCS - test_hopen("gs:invalid", 1); + test_hopen("gs:invalid", 1); #endif #ifdef ENABLE_S3 - test_hopen("s3:invalid", 1); + test_hopen("s3:invalid", 1); #endif + } + else + verbose_log("Skipping most tests"); verbose_log("Calling hts_lib_shutdown()"); (func(htslib, "hts_lib_shutdown"))(); diff --git a/test/test.pl b/test/test.pl index 1fb6112bd..5db7b2320 100755 --- a/test/test.pl +++ b/test/test.pl @@ -57,6 +57,7 @@ test_convert_padded_header($opts); test_rebgzip($opts); test_logging($opts); +test_plugin_loading($opts); test_realn($opts); print "\nNumber of tests:\n"; @@ -936,6 +937,26 @@ sub test_logging else { passed($opts,$test); } } +sub test_plugin_loading { + my ($opts) = @_; + + my $test = "test_plugin_loading"; + + unless (-e "$$opts{bin}/hfile_libcurl.so" || -e "$$opts{bin}/hfile_libcurl.bundle") { + print "$test: .. 
skipping\n\n"; + return; + } + + # Test that plugins can be loaded from an executable statically linked to libhts.a + my $url = "https://localhost:99999/invalid_port"; + my $cmd = "HTS_PATH=$$opts{bin} $$opts{path}/with-shlib.sh $$opts{bin}/htsfile $url"; + print "$test:\n\t$cmd\n"; + my ($ret, $out) = _cmd("$cmd 2>&1"); + if ($ret == 0) { failed($opts, $test, "successful exit status"); } + elsif ($out =~ /couldn't register/i || $out =~ /not supported/i) { failed($opts, $test, $out); } + else { passed($opts, $test); } +} + sub test_realn { my ($opts) = @_; From 246c146f3f46d184b1dc3877ca35b16d13ee220a Mon Sep 17 00:00:00 2001 From: Tim Gates Date: Thu, 17 Dec 2020 21:48:04 +1100 Subject: [PATCH 031/488] docs: fix simple typo, seperated -> separated There is a small typo in htslib/vcf.h. Should read `separated` rather than `seperated`. --- htslib/vcf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 659ddc7c5..6e476187a 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -182,7 +182,7 @@ typedef struct bcf_dec_t { int m_fmt, m_info, m_id, m_als, m_allele, m_flt; // allocated size (high-water mark); do not change int n_flt; // Number of FILTER fields int *flt; // FILTER keys in the dictionary - char *id, *als; // ID and REF+ALT block (\0-seperated) + char *id, *als; // ID and REF+ALT block (\0-separated) char **allele; // allele[0] is the REF (allele[] pointers to the als block); all null terminated bcf_info_t *info; // INFO bcf_fmt_t *fmt; // FORMAT and individual sample From 78441c964312aba387d50dbe144f0ff6975820ae Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 1 Jan 2021 10:37:22 +0000 Subject: [PATCH 032/488] Don't set $host_alias as that confuses autoconf 2.70 Autoconf 2.70 is more careful about cross compilation, so with this version using AC_FUNC_MMAP implies AC_CANONICAL_HOST and hence computes $build/build_alias/host/host_alias/etc. Setting $host_alias ourselves interferes with that. 
Hat tip Matthias Klose (via debbug#978835). As autoconf 2.70 implicitly uses AC_CANONICAL_HOST, it requires (and its autoreconf --install installs) config.guess and config.sub. Ignore those, and ignore install-sh as well for good measure. --- .gitignore | 3 +++ configure.ac | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 363698f1c..76be407b5 100644 --- a/.gitignore +++ b/.gitignore @@ -11,12 +11,15 @@ autom4te.cache config.cache +config.guess config.h config.h.in config.log config.mk config.status +config.sub configure +install-sh hfile_*.bundle hfile_*.cygdll diff --git a/configure.ac b/configure.ac index 9bd1642d7..f473c97e5 100644 --- a/configure.ac +++ b/configure.ac @@ -137,9 +137,9 @@ AC_ARG_ENABLE([s3], [support Amazon AWS S3 URLs])], [], [enable_s3=check]) -test -n "$host_alias" || host_alias=unknown-`uname -s` -AC_MSG_CHECKING([shared library type for $host_alias]) -case $host_alias in +basic_host=${host_alias:-unknown-`uname -s`} +AC_MSG_CHECKING([shared library type for $basic_host]) +case $basic_host in *-cygwin* | *-CYGWIN*) host_result="Cygwin DLL" PLATFORM=CYGWIN From 44787d9800bba41e463e4db11c2280fac3cdbb7f Mon Sep 17 00:00:00 2001 From: Anders Kaplan Date: Fri, 23 Oct 2020 21:00:41 +0200 Subject: [PATCH 033/488] Replaced bam_construct_seq() with bam_set1(). 
--- Makefile | 2 - cram/cram_decode.c | 41 +++++++-------- cram/cram_samtools.c | 123 ------------------------------------------- cram/cram_samtools.h | 23 -------- htslib.mk | 1 - 5 files changed, 18 insertions(+), 172 deletions(-) delete mode 100644 cram/cram_samtools.c diff --git a/Makefile b/Makefile index bd637eacc..e189ecb28 100644 --- a/Makefile +++ b/Makefile @@ -174,7 +174,6 @@ LIBHTS_OBJS = \ cram/cram_external.o \ cram/cram_index.o \ cram/cram_io.o \ - cram/cram_samtools.o \ cram/cram_stats.o \ cram/mFILE.o \ cram/open_trace_file.o \ @@ -367,7 +366,6 @@ cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) -cram/cram_samtools.o cram/cram_samtools.pico: cram/cram_samtools.c config.h $(cram_h) $(htslib_sam_h) $(sam_internal_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h $(htslib_hts_log_h) $(cram_os_h) cram/mFILE.h cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(htslib_hts_h) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 6aeb0aa37..ad09fb757 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -2839,10 +2839,10 @@ int cram_decode_slice_mt(cram_fd *fd, cram_container *c, cram_slice *s, */ static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, cram_record *cr, int rec, bam_seq_t **bam) { - int bam_idx, rg_len; + 
int ret, rg_len; char name_a[1024], *name; int name_len; - char *aux, *aux_orig; + char *aux; char *seq, *qual; sam_hrecs_t *bfd = sh->hrecs; @@ -2887,7 +2887,6 @@ static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, cr->len = 0; } - if (fd->required_fields & SAM_QUAL) { if (!BLOCK_DATA(s->qual_blk)) return -1; @@ -2896,41 +2895,37 @@ static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, qual = NULL; } - bam_idx = bam_construct_seq(bam, cr->aux_size + rg_len, - name, name_len, - cr->flags, - cr->ref_id, - cr->apos, - cr->aend, - cr->mqual, - cr->ncigar, &s->cigar[cr->cigar], - cr->mate_ref_id, - cr->mate_pos, - cr->tlen, - cr->len, - seq, - qual); - if (bam_idx == -1) - return -1; + ret = bam_set1(*bam, + name_len, name, + cr->flags, cr->ref_id, cr->apos - 1, cr->mqual, + cr->ncigar, &s->cigar[cr->cigar], + cr->mate_ref_id, cr->mate_pos - 1, cr->tlen, + cr->len, seq, qual, + cr->aux_size + rg_len); + if (ret < 0) { + return ret; + } - aux = aux_orig = (char *)bam_aux(*bam); + aux = (char *)bam_aux(*bam); /* Auxiliary strings */ if (cr->aux_size != 0) { memcpy(aux, BLOCK_DATA(s->aux_blk) + cr->aux, cr->aux_size); aux += cr->aux_size; + (*bam)->l_data += cr->aux_size; } /* RG:Z: */ - if (cr->rg != -1) { - int len = bfd->rg[cr->rg].name_len; + if (rg_len > 0) { *aux++ = 'R'; *aux++ = 'G'; *aux++ = 'Z'; + int len = bfd->rg[cr->rg].name_len; memcpy(aux, bfd->rg[cr->rg].name, len); aux += len; *aux++ = 0; + (*bam)->l_data += rg_len; } - return bam_idx + (aux - aux_orig); + return (*bam)->l_data; } /* diff --git a/cram/cram_samtools.c b/cram/cram_samtools.c deleted file mode 100644 index 890a7fad4..000000000 --- a/cram/cram_samtools.c +++ /dev/null @@ -1,123 +0,0 @@ -/* -Copyright (c) 2010-2013, 2017-2019 Genome Research Ltd. -Author: James Bonfield - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - - 1. 
Redistributions of source code must retain the above copyright notice, -this list of conditions and the following disclaimer. - - 2. Redistributions in binary form must reproduce the above copyright notice, -this list of conditions and the following disclaimer in the documentation -and/or other materials provided with the distribution. - - 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger -Institute nor the names of its contributors may be used to endorse or promote -products derived from this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE -FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-*/ - -#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h -#include - -#include -#include -#include - -#include "cram.h" -#include "../htslib/sam.h" -#include "../sam_internal.h" - -/*--------------------------------------------------------------------------- - * Samtools compatibility portion - */ -int bam_construct_seq(bam_seq_t **bp, size_t extra_len, - const char *qname, size_t qname_len, - int flag, - int rname, // Ref ID - int64_t pos, - int64_t end, // aligned start/end coords - int mapq, - uint32_t ncigar, const uint32_t *cigar, - int mrnm, // Mate Ref ID - int64_t mpos, - int64_t isize, - int len, - const char *seq, - const char *qual) { - static const char L[256] = { - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15, 0,15,15, - 15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15, - 15, 1,14, 2,13,15,15, 4,11,15,15,12,15, 3,15,15, - 15,15, 5, 6, 8,15, 7, 9,15,10,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15, - 15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15 - }; - bam1_t *b = (bam1_t *)*bp; - uint8_t *cp; - int i, qname_nuls, bam_len; - - //b->l_aux = extra_len; // we fill this out later - - qname_nuls = 4 - qname_len%4; - bam_len = qname_len + qname_nuls + ncigar*4 + (len+1)/2 + len + extra_len; - if (realloc_bam_data(b, bam_len) < 0) - return -1; - b->l_data = bam_len; - - b->core.tid = rname; - b->core.pos = pos-1; - b->core.bin = bam_reg2bin(pos-1, end); - b->core.qual = mapq; - b->core.l_qname = 
qname_len+qname_nuls; - b->core.l_extranul = qname_nuls-1; - b->core.flag = flag; - b->core.n_cigar = ncigar; - b->core.l_qseq = len; - b->core.mtid = mrnm; - b->core.mpos = mpos-1; - b->core.isize = isize; - - cp = b->data; - - strncpy((char *)cp, qname, qname_len); - for (i = 0; i < qname_nuls; i++) - cp[qname_len+i] = '\0'; - cp += qname_len+qname_nuls; - if (ncigar > 0) memcpy(cp, cigar, ncigar*4); - cp += ncigar*4; - - for (i = 0; i+1 < len; i+=2) { - *cp++ = (L[(uc)seq[i]]<<4) + L[(uc)seq[i+1]]; - } - if (i < len) - *cp++ = L[(uc)seq[i]]<<4; - - if (qual) - memcpy(cp, qual, len); - else - memset(cp, '\xff', len); - - return bam_len; -} diff --git a/cram/cram_samtools.h b/cram/cram_samtools.h index 115a96550..34c1db40e 100644 --- a/cram/cram_samtools.h +++ b/cram/cram_samtools.h @@ -72,27 +72,4 @@ enum cigar_op { typedef bam1_t bam_seq_t; -#ifdef __cplusplus -extern "C" { -#endif - -int bam_construct_seq(bam_seq_t **bp, size_t extra_len, - const char *qname, size_t qname_len, - int flag, - int rname, // Ref ID - int64_t pos, - int64_t end, // aligned start/end coords - int mapq, - uint32_t ncigar, const uint32_t *cigar, - int mrnm, // Mate Ref ID - int64_t mpos, - int64_t isize, - int len, - const char *seq, - const char *qual); - -#ifdef __cplusplus -} -#endif - #endif /* CRAM_SAMTOOLS_H */ diff --git a/htslib.mk b/htslib.mk index b750869c8..ceb9bf3c4 100644 --- a/htslib.mk +++ b/htslib.mk @@ -127,7 +127,6 @@ HTSLIB_ALL = \ $(HTSDIR)/cram/cram_index.h \ $(HTSDIR)/cram/cram_io.c \ $(HTSDIR)/cram/cram_io.h \ - $(HTSDIR)/cram/cram_samtools.c \ $(HTSDIR)/cram/cram_samtools.h \ $(HTSDIR)/cram/cram_stats.c \ $(HTSDIR)/cram/cram_stats.h \ From 999d1819e0b4083e6fe3339188a41cca7be1c508 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 23 Nov 2020 12:22:21 +0000 Subject: [PATCH 034/488] Add a general SAM/BAM/CRAM input filter option (HTS_OPT_SAM_FILTER). 
This uses an expression language and SAM format fields to specify rules which must pass (so filter-in rather than filter-out). The code is structured so that expr.[ch] implements a generic expression language which accepts a function callback for performing variable lookup, with sam.c:bam_sym_lookup() being that function. The sam_read1 function has been changed to keep looping until the filter passes, or we run out of data. This works for all file formats, but does so by adding a new member to htsFile. Technically it's an ABI change, but at least our own usage of it never uses arrays of them and it's stated in the header to be an opaque type (which in theory should also ban that, albeit without any compiler checks). If this is a problem, we could add the same member to BGZF, cram_fd and hFILE as they're all private, but that feels like needless hoop jumping. Syntax supports Grouping: (, ), eg "(1+2)*3" Values: integers, floats, strings or variables Unary ops: +, -, !, ~ eg -10 +10, !10 (0), ~5 (bitwise not) Math ops: *, /, % [TODO: add // for floor division?] Math ops: +, - Bit-wise: &, |, ^ [NB as 3 precedence levels, in that order] Conditionals: >, >=, <, <= Equality: ==, !=, =~, !~ Boolean: &&, || Data types are numerics (integers or doubles, eg 10, -0x2B, 10.2e-7) and strings "str" with backslash as an escape char within strings.
The key-words currently supported are: flag int flag.paired int flag.proper_pair int flag.unmap int flag.munmap int flag.reverse int flag.mreverse int flag.read1 int flag.read2 int flag.secondary int flag.qcfail int flag.dup int flag.supplementary int mapq int mpos / pnext int mrefid int mrname / rnext string ncigar int pos int qlen int qname string refid int rlen int rname string tlen int [XX] tag int / string For example, to return only records where the mate reference name differs from the read's own reference name and the read is aligned with high mapping quality: test_view -i 'sam_filter=mapq >= 30 && mrname != rname' in.bam Filters to find significant deletions: cigar =~ "[0-9]{2}D" rlen - qlen > 10 Duplicates that aren't part of a "proper pair" (equivalent to "samtools view -F 2 -f 0x400") sam_filter=flag.dup && !flag.proper_pair --- Makefile | 10 +- configure.ac | 3 + expr.c | 582 ++++++++++++++++++++++++++++++++++++ expr.h | 64 ++++ hts.c | 26 ++ htslib/hts.h | 12 + sam.c | 289 +++++++++++++++++- test/sam_filter/filter.sh | 34 +++ test/sam_filter/filter.tst | 46 +++ test/sam_filter/int1.out | 1 + test/sam_filter/int2.out | 1 + test/sam_filter/int3.out | 1 + test/sam_filter/string1.out | 6 + test/sam_filter/string2.out | 6 + test/sam_filter/string3.out | 7 + test/sam_filter/string4.out | 8 + test/test_expr.c | 233 +++++++++++++++ 17 files changed, 1313 insertions(+), 16 deletions(-) create mode 100644 expr.c create mode 100644 expr.h create mode 100755 test/sam_filter/filter.sh create mode 100644 test/sam_filter/filter.tst create mode 100644 test/sam_filter/int1.out create mode 100644 test/sam_filter/int2.out create mode 100644 test/sam_filter/int3.out create mode 100644 test/sam_filter/string1.out create mode 100644 test/sam_filter/string2.out create mode 100644 test/sam_filter/string3.out create mode 100644 test/sam_filter/string4.out create mode 100644 test/test_expr.c diff --git a/Makefile b/Makefile index e189ecb28..bd091ae36 100644 --- a/Makefile +++
b/Makefile @@ -77,6 +77,7 @@ BUILT_TEST_PROGRAMS = \ test/plugins-dlhts \ test/sam \ test/test_bgzf \ + test/test_expr \ test/test_kfunc \ test/test_kstring \ test/test_realn \ @@ -148,6 +149,7 @@ LIBHTS_OBJS = \ bcf_sr_sort.o \ bgzf.o \ errmod.o \ + expr.o \ faidx.o \ header.o \ hfile.o \ @@ -331,6 +333,7 @@ hts-object-files: $(LIBHTS_OBJS) bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h) errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h) +expr.o expr.pico: expr.c expr.h config.h $(htslib_kstring_h) kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) knetfile.o knetfile.pico: knetfile.c config.h $(htslib_hts_log_h) $(htslib_knetfile_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) @@ -340,7 +343,7 @@ hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) $(htslib_knetfile_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) -hts.o hts.pico: hts.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) +hts.o hts.pico: hts.c config.h expr.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h 
$(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) @@ -402,6 +405,7 @@ maintainer-check: # MSYS2_ARG_CONV_EXCL="*" make check check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) test/hts_endian + test/test_expr test/test_kfunc test/test_kstring test/test_str2int @@ -411,6 +415,7 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -l ./libhts.$(SHLIB_FLAVOUR) test/test_bgzf test/bgziptest.txt test/test-parse-reg -t test/colons.bam + cd test/sam_filter && ./filter.sh filter.tst cd test/tabix && ./test-tabix.sh tabix.tst cd test/mpileup && ./test-pileup.sh mpileup.tst REF_PATH=: test/sam test/ce.fa test/faidx.fa test/fastqs.fq @@ -441,6 +446,9 @@ test/sam: test/sam.o libhts.a test/test_bgzf: test/test_bgzf.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_bgzf.o libhts.a -lz $(LIBS) -lpthread +test/test_expr: test/test_expr.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_expr.o libhts.a -lz $(LIBS) -lpthread + test/test_kfunc: test/test_kfunc.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a -lz $(LIBS) -lpthread diff --git a/configure.ac b/configure.ac index f473c97e5..e5e64496b 100644 --- a/configure.ac +++ b/configure.ac @@ -399,6 +399,9 @@ dnl Only need to add to static_LIBS if not building as a plugin fi fi +dnl Look for regcomp in various libraries (needed on windows/mingw). +AC_SEARCH_LIBS(regcomp, regex, [libregex=needed], []) + dnl Look for PTHREAD_MUTEX_RECURSIVE. dnl This is normally in pthread.h except on some broken glibc implementations. 
AC_CHECK_DECL(PTHREAD_MUTEX_RECURSIVE, [], [AC_DEFINE([_XOPEN_SOURCE],[600], [Needed for PTHREAD_MUTEX_RECURSIVE])], [[#include ]]) diff --git a/expr.c b/expr.c new file mode 100644 index 000000000..73df14dd1 --- /dev/null +++ b/expr.c @@ -0,0 +1,582 @@ +/* expr.c -- filter expression parsing and processing. + + Copyright (C) 2020 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +// TODO: +// - add maths functions. pow, sqrt, log, min, max, ? +// - ?: operator for conditionals? + +#include + +#include +#include +#include +#include +#include +#include +#include // may need configure rule for this + +#include "expr.h" +#include "textutils_internal.h" + +// Could also cache fexpr_t stack here for kstring reuse? 
+#define MAX_REGEX 10 +struct sam_filter_t { + char *str; + int parsed; + int curr_regex, max_regex; + regex_t preg[MAX_REGEX]; +}; + +/* + * This is designed to be mostly C like with mostly same the precedence rules, + * with the exception of bit operators (widely considered as a mistake in C). + * It's not full C (eg no bit-shifting), but good enough for our purposes. + * + * Supported syntax, in order of precedence: + * + * Grouping: (, ), eg "(1+2)*3" + * Values: integers, floats, strings or variables + * Unary ops: +, -, !, ~ eg -10 +10, !10 (0), ~5 (bitwise not) + * Math ops: *, /, % [TODO: add // for floor division?] + * Math ops: +, - + * Bit-wise: &, |, ^ [NB as 3 precedence levels, in that order] + * Conditionals: >, >=, <, <=, + * Equality: ==, !=, =~ !~ + * Boolean: &&, || + */ + +// Skip to start of term +static char *ws(char *str) { + while (*str && (*str == ' ' || *str == '\t')) + str++; + return str; +} + +static int expression(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res); + +/* + * simple_expr + * : identifier + * | constant + * // | string ? + * | '(' expression ')' +*/ +static int simple_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + // Main recursion step + str = ws(str); + if (*str == '(') { + if (expression(filt, data, fn, str+1, end, res)) return -1; + str = ws(*end); + if (*str != ')') { + fprintf(stderr, "Missing ')'\n"); + return -1; + } + *end = str+1; + + return 0; + } + + // Otherwise a basic element. + int fail = 0; + double d = hts_str2dbl(str, end, &fail); + if (str != *end) { + res->is_str = 0; + res->d = d; + } else { + // Not valid floating point syntax. 
+ // TODO: add function call names in here; len(), sqrt(), pow(), etc + if (*str == '"') { + res->is_str = 1; + char *e = str+1; + int backslash = 0; + while (*e && *e != '"') { + if (*e == '\\') + backslash=1, e+=1+(e[1]!='\0'); + else + e++; + } + + kputsn(str+1, e-(str+1), ks_clear(&res->s)); + if (backslash) { + size_t i, j; + for (i = j = 0; i < res->s.l; i++) { + res->s.s[j++] = res->s.s[i]; + if (res->s.s[i] == '\\') { + switch (res->s.s[++i]) { + case '"': res->s.s[j-1] = '"'; break; + case '\\':res->s.s[j-1] = '\\'; break; + case 't': res->s.s[j-1] = '\t'; break; + case 'n': res->s.s[j-1] = '\n'; break; + case 'r': res->s.s[j-1] = '\r'; break; + default: res->s.s[j++] = res->s.s[i]; + } + } + } + res->s.s[j] = 0; + res->s.l = j; + } + if (*e != '"') + return -1; + *end = e+1; + } else if (fn) + // Look up variable. + return fn(data, str, end, res); + else + return -1; + } + + return 0; +} + +/* + * unary_expr + * : simple_expr + * | '+' simple_expr + * | '-' simple_expr + * | '!' unary_expr // higher precedence + * | '~' unary_expr // higher precedence + */ +static int unary_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + int err; + str = ws(str); + if (*str == '+') { + err = simple_expr(filt, data, fn, str+1, end, res); + err |= res->is_str; + res->is_true = res->d != 0; + } else if (*str == '-') { + err = simple_expr(filt, data, fn, str+1, end, res); + err |= res->is_str; + res->d = -res->d; + res->is_true = res->d != 0; + } else if (*str == '!') { + err = unary_expr(filt, data, fn, str+1, end, res); + if (res->is_str) { + res->is_str = 0; + res->d = 0; + res->is_true = !res->is_true; + } else { + res->d = !(int64_t)res->d; + res->is_true = res->d != 0; + } + } else if (*str == '~') { + err = unary_expr(filt, data, fn, str+1, end, res); + err |= res->is_str; + res->d = ~(int64_t)res->d; + res->is_true = res->d != 0; + } else { + err = simple_expr(filt, data, fn, str, end, res); + } + return err ? 
-1 : 0; +} + + +/* + * mul_expr + * : unary_expr ( + * unary_expr '*' unary_expr + * | unary_expr '/' unary_expr + * | unary_expr '%' unary_expr + * )* + */ +static int mul_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (unary_expr(filt, data, fn, str, end, res)) + return -1; + + str = *end; + fexpr_t val = FEXPR_INIT; + while (*str) { + str = ws(str); + if (*str == '*' || *str == '/' || *str == '%') { + if (unary_expr(filt, data, fn, str+1, end, &val)) return -1; + if (val.is_str || res->is_str) { + fexpr_free(&val); + return -1; // arith on strings + } + } + + if (*str == '*') + res->d *= val.d; + else if (*str == '/') + res->d /= val.d; + else if (*str == '%') + res->d = (int64_t)res->d % (int64_t)val.d; + else + break; + + str = *end; + } + fexpr_free(&val); + + return 0; +} + +/* + * add_expr + * : mul_expr ( + * mul_expr '+' mul_expr + * | mul_expr '-' mul_expr + * )* + */ +static int add_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (mul_expr(filt, data, fn, str, end, res)) + return -1; + + str = *end; + fexpr_t val = FEXPR_INIT; + while (*str) { + str = ws(str); + if (*str == '+' || *str == '-') { + if (mul_expr(filt, data, fn, str+1, end, &val)) return -1; + if (val.is_str || res->is_str) { + fexpr_free(&val); + return -1; // arith on strings + } + } + + if (*str == '+') + res->d += val.d; + else if (*str == '-') + res->d -= val.d; + else + break; + + str = *end; + } + fexpr_free(&val); + + return 0; +} + +/* + * bitand_expr + * : add_expr + * | bitand_expr '&' add_expr + */ +static int bitand_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (add_expr(filt, data, fn, str, end, res)) return -1; + + fexpr_t val = FEXPR_INIT; + for (;;) { + str = ws(*end); + if (*str == '&' && str[1] != '&') { + if (add_expr(filt, data, fn, str+1, end, &val)) return -1; + if (res->is_str || val.is_str) { + fexpr_free(&val); + 
return -1; + } + res->is_true = res->d = (int64_t)res->d & (int64_t)val.d; + } else { + break; + } + } + fexpr_free(&val); + + return 0; +} + +/* + * bitxor_expr + * : bitand_expr + * | bitxor_expr '^' bitand_expr + */ +static int bitxor_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (bitand_expr(filt, data, fn, str, end, res)) return -1; + + fexpr_t val = FEXPR_INIT; + for (;;) { + str = ws(*end); + if (*str == '^') { + if (bitand_expr(filt, data, fn, str+1, end, &val)) return -1; + if (res->is_str || val.is_str) { + fexpr_free(&val); + return -1; + } + res->is_true = res->d = (int64_t)res->d ^ (int64_t)val.d; + } else { + break; + } + } + fexpr_free(&val); + + return 0; +} + +/* + * bitor_expr + * : xor_expr + * | bitor_expr '|' xor_expr + */ +static int bitor_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (bitxor_expr(filt, data, fn, str, end, res)) return -1; + + fexpr_t val = FEXPR_INIT; + for (;;) { + str = ws(*end); + if (*str == '|' && str[1] != '|') { + if (bitxor_expr(filt, data, fn, str+1, end, &val)) return -1; + if (res->is_str || val.is_str) { + fexpr_free(&val); + return -1; + } + res->is_true = res->d = (int64_t)res->d | (int64_t)val.d; + } else { + break; + } + } + fexpr_free(&val); + + return 0; +} + +/* + * cmp_expr + * : bitor_expr + * | cmp_expr '<=' bitor_expr + * | cmp_expr '<' bitor_expr + * | cmp_expr '>=' bitor_expr + * | cmp_expr '>' bitor_expr + */ +static int cmp_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (bitor_expr(filt, data, fn, str, end, res)) return -1; + + str = ws(*end); + fexpr_t val = FEXPR_INIT; + int err = 0; + + if (*str == '>' && str[1] == '=') { + err = cmp_expr(filt, data, fn, str+2, end, &val); + res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s + ? 
strcmp(res->s.s, val.s.s) >= 0 + : !res->is_str && !val.is_str && res->d >= val.d; + res->is_str = 0; + } else if (*str == '>') { + err = cmp_expr(filt, data, fn, str+1, end, &val); + res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) > 0 + : !res->is_str && !val.is_str && res->d > val.d; + res->is_str = 0; + } else if (*str == '<' && str[1] == '=') { + err = cmp_expr(filt, data, fn, str+2, end, &val); + res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) <= 0 + : !res->is_str && !val.is_str && res->d <= val.d; + res->is_str = 0; + } else if (*str == '<') { + err = cmp_expr(filt, data, fn, str+1, end, &val); + res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) < 0 + : !res->is_str && !val.is_str && res->d < val.d; + res->is_str = 0; + } + fexpr_free(&val); + + return err ? -1 : 0; +} + +/* + * eq_expr + * : cmp_expr + * | eq_expr '==' cmp_expr + * | eq_expr '!=' cmp_expr + * | eq_expr '=~' cmp_expr + * | eq_expr '!~' cmp_expr + */ +static int eq_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (cmp_expr(filt, data, fn, str, end, res)) return -1; + + str = ws(*end); + + int err = 0; + fexpr_t val = FEXPR_INIT; + + // numeric vs numeric comparison is as expected + // string vs string comparison is as expected + // numeric vs string is false + if (str[0] == '=' && str[1] == '=') { + if ((err = eq_expr(filt, data, fn, str+2, end, &val))) { + res->is_true = res->d = 0; + } else { + res->is_true = res->d = res->is_str + ? (res->s.s && val.s.s ? strcmp(res->s.s, val.s.s)==0 : 0) + : !res->is_str && !val.is_str && res->d == val.d; + } + res->is_str = 0; + + } else if (str[0] == '!' && str[1] == '=') { + if ((err = eq_expr(filt, data, fn, str+2, end, &val))) { + res->is_true = res->d = 0; + } else { + res->is_true = res->d = res->is_str + ? (res->s.s && val.s.s ? 
strcmp(res->s.s, val.s.s) != 0 : 1) + : res->is_str != val.is_str || res->d != val.d; + } + res->is_str = 0; + + } else if ((str[0] == '=' && str[1] == '~') || + (str[0] == '!' && str[1] == '~')) { + err = eq_expr(filt, data, fn, str+2, end, &val); + if (!val.is_str || !res->is_str) { + fexpr_free(&val); + return -1; + } + if (val.s.s && res->s.s && val.is_true >= 0 && res->is_true >= 0) { + regex_t preg_, *preg; + if (filt->curr_regex >= filt->max_regex) { + // Compile regex if not seen before + if (filt->curr_regex >= MAX_REGEX) { + preg = &preg_; + } else { + preg = &filt->preg[filt->curr_regex]; + filt->max_regex++; + } + + int ec = regcomp(preg, val.s.s, REG_EXTENDED | REG_NOSUB); + if (ec != 0) { + char errbuf[1024]; + regerror(ec, preg, errbuf, 1024); + fprintf(stderr, "Failed regex: %.1024s\n", errbuf); + fexpr_free(&val); + return -1; + } + } else { + preg = &filt->preg[filt->curr_regex]; + } + res->is_true = res->d = regexec(preg, res->s.s, 0, NULL, 0) == 0 + ? *str == '=' // matcn + : *str == '!'; // no-match + if (preg == &preg_) + regfree(preg); + + filt->curr_regex++; + } else { + // nul regexp or input is considered false + res->is_true = 0; + } + res->is_str = 0; + } + fexpr_free(&val); + + return err ? 
-1 : 0; +} + +/* + * and_expr + * : eq_expr + * | and_expr 'and' eq_expr + * | and_expr 'or' eq_expr + */ +static int and_expr(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + if (eq_expr(filt, data, fn, str, end, res)) return -1; + + fexpr_t val = FEXPR_INIT; + for (;;) { + str = ws(*end); + if (str[0] == '&' && str[1] == '&') { + if (eq_expr(filt, data, fn, str+2, end, &val)) return -1; + res->is_true = res->d = + (res->is_true || (res->is_str && res->s.s) || res->d) && + (val.is_true || (val.is_str && val.s.s) || val.d); + res->is_str = 0; + } else if (str[0] == '|' && str[1] == '|') { + if (eq_expr(filt, data, fn, str+2, end, &val)) return -1; + res->is_true = res->d = + res->is_true || (res->is_str && res->s.s) || res->d || + val.is_true || (val.is_str && val.s.s ) || val.d; + res->is_str = 0; + } else { + break; + } + } + fexpr_free(&val); + + return 0; +} + +static int expression(sam_filter_t *filt, void *data, sym_func *fn, + char *str, char **end, fexpr_t *res) { + return and_expr(filt, data, fn, str, end, res); +} + +sam_filter_t *sam_filter_init(const char *str) { + sam_filter_t *f = calloc(1, sizeof(*f)); + if (!f) return NULL; + + // Oversize to permit faster comparisons with memcmp over strcmp + size_t len = strlen(str)+100; + if (!(f->str = malloc(len))) { + free(f); + return NULL; + } + strcpy(f->str, str); + return f; +} + +void sam_filter_free(sam_filter_t *filt) { + if (!filt) + return; + + int i; + for (i = 0; i < filt->max_regex; i++) + regfree(&filt->preg[i]); + + free(filt->str); + free(filt); +} + +int sam_filter_eval(sam_filter_t *filt, void *data, sym_func *fn, + fexpr_t *res) { + char *end = NULL; + + memset(res, 0, sizeof(*res)); + + filt->curr_regex = 0; + if (expression(filt, data, fn, filt->str, &end, res)) + return -1; + + if (end && *ws(end)) { + fprintf(stderr, "Unable to parse expression at %s\n", filt->str); + return -1; + } + + // Strings evaluate to true. 
An empty string is also true, but an + // absent (null) string is false. An empty string has kstring length + // of zero, but a pointer as it's nul-terminated. + if (res->is_str) + res->is_true = res->d = res->s.s != NULL; + else + res->is_true |= res->d != 0; + + return 0; +} diff --git a/expr.h b/expr.h new file mode 100644 index 000000000..c4674fc41 --- /dev/null +++ b/expr.h @@ -0,0 +1,64 @@ +/* expr.c -- filter expression parsing and processing. + + Copyright (C) 2020 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef HTS_EXPR_H +#define HTS_EXPR_H + +#include +#include + +// fexpr_t is our return type and the type for elements within the expr. +// Note we cope with zero-but-true in order to implement a basic +// "exists(something)" check where "something" may even be zero. +// +// Eg in the aux tag searching syntax, "[NM]" should return true if +// NM tag exists even if zero. +// Take care when negating this. 
"[NM] != 0" will be true when +// [NM] is absent, thus consider "[NM] && [NM] != 0". +typedef struct { + char is_str; // Use .s vs .d + char is_true; // Force true if even zero + kstring_t s; // is_str and empty s permitted (eval as false) + double d; // otherwise this +} fexpr_t; + +#define FEXPR_INIT {0, 0, KS_INITIALIZE, 0} + +// Create a SAM filter for expression "str". +// +// Returns a pointer on success, +// NULL on failure +sam_filter_t *sam_filter_init(const char *str); + +// Frees a sam_filter_t created via sam_filter_init +void sam_filter_free(sam_filter_t *filt); + +typedef int (sym_func)(void *data, char *str, char **end, fexpr_t *res); +int sam_filter_eval(sam_filter_t *filt, void *data, sym_func *f, fexpr_t *res); + +static inline void fexpr_free(fexpr_t *f) { + ks_free(&f->s); +} + +#endif /* HTS_EXPR_H */ diff --git a/hts.c b/hts.c index 1ab89400e..519b5b809 100644 --- a/hts.c +++ b/hts.c @@ -50,6 +50,7 @@ DEALINGS IN THE SOFTWARE. */ #include "hts_internal.h" #include "hfile_internal.h" #include "sam_internal.h" +#include "expr.h" #include "htslib/hts_os.h" // drand48 #include "htslib/khash.h" @@ -824,6 +825,10 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { strcmp(o->arg, "LEVEL") == 0) o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0); + else if (strcmp(o->arg, "sam_filter") == 0 || + strcmp(o->arg, "SAM_FILTER") == 0) + o->opt = HTS_OPT_SAM_FILTER, o->val.s = val; + else { hts_log_error("Unknown option '%s'", o->arg); free(o->arg); @@ -863,6 +868,7 @@ int hts_opt_apply(htsFile *fp, hts_opt *opts) { // fall through case CRAM_OPT_VERSION: case CRAM_OPT_PREFIX: + case HTS_OPT_SAM_FILTER: if (hts_set_opt(fp, opts->opt, opts->val.s) != 0) return -1; break; @@ -1231,6 +1237,7 @@ int hts_close(htsFile *fp) save = errno; sam_hdr_destroy(fp->bam_header); hts_idx_destroy(fp->idx); + sam_filter_free(fp->filter); free(fp->fn); free(fp->fn_aux); free(fp->line.s); @@ -1335,6 +1342,13 @@ int hts_set_opt(htsFile *fp, enum 
hts_fmt_option opt, ...) { fp->fp.bgzf->compress_level = level; } + case HTS_OPT_SAM_FILTER: { + va_start(args, opt); + char *expr = va_arg(args, char *); + va_end(args); + return hts_set_filter_expression(fp, expr); + } + default: break; } @@ -1396,6 +1410,18 @@ int hts_set_fai_filename(htsFile *fp, const char *fn_aux) return 0; } +int hts_set_filter_expression(htsFile *fp, const char *expr) +{ + if (fp->filter) + sam_filter_free(fp->filter); + + if (!expr) + return 0; + + return (fp->filter = sam_filter_init(expr)) + ? 0 : -1; +} + hFILE *hts_open_tmpfile(const char *fname, const char *mode, kstring_t *tmpname) { int pid = (int) getpid(); diff --git a/htslib/hts.h b/htslib/hts.h index 7a85ca38c..9ccbb986b 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -224,6 +224,7 @@ typedef struct htsFormat { struct hts_idx_t; typedef struct hts_idx_t hts_idx_t; +typedef struct sam_filter_t sam_filter_t; /** * @brief File handle returned by hts_open() etc. @@ -256,6 +257,7 @@ typedef struct htsFile { hts_idx_t *idx; const char *fnidx; struct sam_hdr_t *bam_header; + sam_filter_t *filter; } htsFile; // A combined thread pool and queue allocation size. @@ -321,6 +323,7 @@ enum hts_fmt_option { HTS_OPT_THREAD_POOL, HTS_OPT_CACHE_SIZE, HTS_OPT_BLOCK_SIZE, + HTS_OPT_SAM_FILTER, }; // For backwards compatibility @@ -607,6 +610,15 @@ HTSLIB_EXPORT int hts_set_fai_filename(htsFile *fp, const char *fn_aux); +/*! + @abstract Sets a filter expression + @return 0 for success, negative on failure + @discussion + To clear an existing filter, specifying expr as NULL. +*/ +HTSLIB_EXPORT +int hts_set_filter_expression(htsFile *fp, const char *expr); + /*! @abstract Determine whether a given htsFile contains a valid EOF block @return 3 for a non-EOF checkable filetype; diff --git a/sam.c b/sam.c index eb2712965..93a9a994a 100644 --- a/sam.c +++ b/sam.c @@ -50,6 +50,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/hfile.h" #include "htslib/hts_endian.h" #include "header.h" +#include "expr.h" #include "htslib/khash.h" KHASH_DECLARE(s2i, kh_cstr_t, int64_t) @@ -3154,40 +3155,283 @@ int sam_set_threads(htsFile *fp, int nthreads) { return 0; } +// Bam record pointer and SAM header combined +typedef struct { + const sam_hdr_t *h; + const bam1_t *b; +} hb_pair; + +// Looks up variable names in str and replaces them with their value. +// Also supports aux tags. +// +// Note the expression parser deliberately overallocates str size so it +// is safe to use memcmp over strcmp. +static int bam_sym_lookup(void *data, char *str, char **end, fexpr_t *res) { + hb_pair *hb = (hb_pair *)data; + const bam1_t *b = hb->b; + + res->is_str = 0; + switch(*str) { + case 'c': + if (memcmp(str, "cigar", 5) == 0) { + *end = str+5; + res->is_str = 1; + ks_clear(&res->s); + uint32_t *cigar = bam_get_cigar(b); + int i, n = b->core.n_cigar, r = 0; + for (i = 0; i < n; i++) { + r |= kputw (bam_cigar_oplen(cigar[i]), &res->s); + r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s); + } + kputs("", &res->s); + return r ? 
0 : -1; + } + break; + + case 'f': + if (memcmp(str, "flag", 4) == 0) { + str = *end = str+4; + if (*str != '.') { + res->d = b->core.flag; + return 0; + } else { + str++; + if (!memcmp(str, "paired", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FPAIRED; + return 0; + } else if (!memcmp(str, "proper_pair", 11)) { + *end = str+11; + res->d = b->core.flag & BAM_FPROPER_PAIR; + return 0; + } else if (!memcmp(str, "unmap", 5)) { + *end = str+5; + res->d = b->core.flag & BAM_FUNMAP; + return 0; + } else if (!memcmp(str, "munmap", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FMUNMAP; + return 0; + } else if (!memcmp(str, "reverse", 7)) { + *end = str+7; + res->d = b->core.flag & BAM_FREVERSE; + return 0; + } else if (!memcmp(str, "mreverse", 8)) { + *end = str+8; + res->d = b->core.flag & BAM_FMREVERSE; + return 0; + } else if (!memcmp(str, "read1", 5)) { + *end = str+5; + res->d = b->core.flag & BAM_FREAD1; + return 0; + } else if (!memcmp(str, "read2", 6)) { + *end = str+5; + res->d = b->core.flag & BAM_FREAD2; + return 0; + } else if (!memcmp(str, "secondary", 9)) { + *end = str+9; + res->d = b->core.flag & BAM_FSECONDARY; + return 0; + } else if (!memcmp(str, "qcfail", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FQCFAIL; + return 0; + } else if (!memcmp(str, "dup", 3)) { + *end = str+3; + res->d = b->core.flag & BAM_FDUP; + return 0; + } else if (!memcmp(str, "supplementary", 13)) { + *end = str+13; + res->d = b->core.flag & BAM_FSUPPLEMENTARY; + return 0; + } else { + hts_log_error("Unrecognised flag string"); + return -1; + } + } + } + break; + + case 'm': + if (memcmp(str, "mapq", 4) == 0) { + *end = str+4; + res->d = b->core.qual; + return 0; + } else if (memcmp(str, "mpos", 4) == 0) { + *end = str+4; + res->d = b->core.mpos+1; + return 0; + } else if (memcmp(str, "mrname", 6) == 0) { + *end = str+6; + res->is_str = 1; + const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); + kputs(rn ? 
rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "mrefid", 6) == 0) { + *end = str+6; + res->d = b->core.mtid; + return 0; + } + break; + + case 'n': + if (memcmp(str, "ncigar", 6) == 0) { + *end = str+6; + res->d = b->core.n_cigar; + return 0; + } + break; + + case 'p': + if (memcmp(str, "pos", 3) == 0) { + *end = str+3; + res->d = b->core.pos+1; + return 0; + } else if (memcmp(str, "pnext", 5) == 0) { + *end = str+5; + res->d = b->core.mpos+1; + return 0; + } + break; + + case 'q': + if (memcmp(str, "qlen", 4) == 0) { + *end = str+4; + res->d = b->core.l_qseq; + return 0; + } else if (memcmp(str, "qname", 5) == 0) { + *end = str+5; + res->is_str = 1; + kputs(bam_get_qname(b), ks_clear(&res->s)); + return 0; + } + break; + + case 'r': + if (memcmp(str, "rlen", 4) == 0) { + *end = str+4; + res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); + return 0; + } else if (memcmp(str, "rname", 5) == 0) { + *end = str+5; + res->is_str = 1; + const char *rn = sam_hdr_tid2name(hb->h, b->core.tid); + kputs(rn ? rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "rnext", 5) == 0) { + *end = str+5; + res->is_str = 1; + const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); + kputs(rn ? rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "refid", 5) == 0) { + *end = str+5; + res->d = b->core.tid; + return 0; + } + break; + + case 't': + if (memcmp(str, "tlen", 4) == 0) { + *end = str+4; + res->d = b->core.isize; + return 0; + } + break; + + case '[': + if (*str == '[' && str[1] && str[2] && str[3] == ']') { + /* aux tags */ + *end = str+4; + + uint8_t *aux = bam_aux_get(b, str+1); + if (aux) { + // we define the truth of a tag to be its presence, even if 0. 
+ res->is_true = 1; + switch (*aux) { + case 'Z': + case 'H': + res->is_str = 1; + kputs((char *)aux+1, ks_clear(&res->s)); + break; + + case 'A': + res->is_str = 1; + kputsn((char *)aux+1, 1, ks_clear(&res->s)); + break; + + case 'i': case 'I': + case 's': case 'S': + case 'c': case 'C': + res->is_str = 0; + res->d = bam_aux2i(aux); + break; + + case 'f': + case 'd': + res->is_str = 0; + res->d = bam_aux2f(aux); + break; + + default: + hts_log_error("Aux type '%c not yet supported by filters", + *aux); + return -1; + } + return 0; + + } else { + // hence absent tags are always false (and strings) + res->is_str = 1; + res->s.l = 0; + res->d = 0; + res->is_true = 0; + return 0; + } + } + break; + } + + // All successful matches in switch should return 0. + // So if we didn't match, it's a parse error. + return -1; +} + // Returns 0 on success, // -1 on EOF, // <-1 on error int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) { + int ret; + + filtered: switch (fp->format.format) { - case bam: { - int r = bam_read1(fp->fp.bgzf, b); - if (h && r >= 0) { + case bam: + ret = bam_read1(fp->fp.bgzf, b); + if (h && ret >= 0) { if (b->core.tid >= h->n_targets || b->core.tid < -1 || b->core.mtid >= h->n_targets || b->core.mtid < -1) { errno = ERANGE; return -3; } } - return r; - } + break; - case cram: { - int ret = cram_get_bam_seq(fp->fp.cram, &b); + case cram: + ret = cram_get_bam_seq(fp->fp.cram, &b); if (ret < 0) return cram_eof(fp->fp.cram) ? 
-1 : -2; if (bam_tag2cigar(b, 1, 1) < 0) return -2; - return ret; - } + break; case sam: { // Consume 1st line after header parsing as it wasn't using peek if (fp->line.l != 0) { - int ret = sam_parse1(&fp->line, h, b); + ret = sam_parse1(&fp->line, h, b); fp->line.l = 0; - return ret; + break; } if (fp->state) { @@ -3255,12 +3499,10 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) fd->curr_idx = 0; } - return 0; + ret = 0; } else { - int ret; err_recover: - ret = hts_getline(fp, KS_SEP_LINE, &fp->line); if (ret < 0) return ret; @@ -3270,8 +3512,8 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) hts_log_warning("Parse error at line %lld", (long long)fp->lineno); if (h->ignore_sam_err) goto err_recover; } - return ret; } + break; } case empty_format: @@ -3282,6 +3524,23 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) errno = EFTYPE; return -3; } + + if (ret >= 0 && fp->filter) { + // Process on-the-fly filter rules + hb_pair hb = {h, b}; + fexpr_t res; + if (sam_filter_eval(fp->filter, &hb, bam_sym_lookup, &res)) { + hts_log_error("Couldn't process filter expression"); + fexpr_free(&res); + return -1; + } + int t = res.is_true; + fexpr_free(&res); + if (!t) + goto filtered; + } + + return ret; } diff --git a/test/sam_filter/filter.sh b/test/sam_filter/filter.sh new file mode 100755 index 000000000..575cb13ae --- /dev/null +++ b/test/sam_filter/filter.sh @@ -0,0 +1,34 @@ +#!/bin/sh +# +# Copyright (C) 2020 Genome Research Ltd. 
+# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Load in the test driver +. ../simple_test_driver.sh + +echo "Testing sam_filter..." + +tv="../test_view" + +test_driver $@ + +exit $? diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst new file mode 100644 index 000000000..f5558a41b --- /dev/null +++ b/test/sam_filter/filter.tst @@ -0,0 +1,46 @@ +# Copyright (C) 2020 Genome Research Ltd. 
+# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# First field: +# INIT = initialisation, not counted in testing +# P = expected to pass (zero return; expected output matches, if present) +# N = expected to return non-zero +# F = expected to fail +# +# Second field (P/N/F only): +# Filename of expected output. If '.', output is not checked +# +# Rest: +# Command to execute. 
$tv is replaced with the path to test_view + +# String matches +P string1.out $tv -i 'sam_filter=qname =~ "\.1" && cigar =~ "D"' ../ce#1000.sam +P string2.out $tv -i 'sam_filter=rname=="CHROMOSOME_II"' ../ce#5b.sam +P string3.out $tv -i 'sam_filter=rname=~"CHROMOSOME_II"' ../ce#5b.sam +P string4.out $tv -i 'sam_filter=cigar=~"D"' ../ce#1000.sam + +# Integer ops +P int1.out $tv -i 'sam_filter=pos % 23 == 11' ../ce#1000.sam |egrep -cv '^@' +P int2.out $tv -i 'sam_filter=qlen/(flag*mapq+pos)>5' ../ce#1000.sam |egrep -cv '^@' + +# Aux tags +P int3.out $tv -i 'sam_filter=[NM]>=10 || [MD]=~"A.*A.*A"' -t4 ../ce#1000.sam |egrep -cv '^@' diff --git a/test/sam_filter/int1.out b/test/sam_filter/int1.out new file mode 100644 index 000000000..6529ff889 --- /dev/null +++ b/test/sam_filter/int1.out @@ -0,0 +1 @@ +98 diff --git a/test/sam_filter/int2.out b/test/sam_filter/int2.out new file mode 100644 index 000000000..e522732c7 --- /dev/null +++ b/test/sam_filter/int2.out @@ -0,0 +1 @@ +38 diff --git a/test/sam_filter/int3.out b/test/sam_filter/int3.out new file mode 100644 index 000000000..ea70ce013 --- /dev/null +++ b/test/sam_filter/int3.out @@ -0,0 +1 @@ +72 diff --git a/test/sam_filter/string1.out b/test/sam_filter/string1.out new file mode 100644 index 000000000..7ba8527b2 --- /dev/null +++ b/test/sam_filter/string1.out @@ -0,0 +1,6 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +SRR065390.14978392 16 CHROMOSOME_I 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 diff --git a/test/sam_filter/string2.out b/test/sam_filter/string2.out new file mode 100644 index 000000000..be94b071e --- /dev/null +++ 
b/test/sam_filter/string2.out @@ -0,0 +1,6 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +II.14978392 16 CHROMOSOME_II 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC XG:i:1 XM:i:5 XN:i:0 XO:i:1 XS:i:-18 AS:i:-18 YT:Z:UU diff --git a/test/sam_filter/string3.out b/test/sam_filter/string3.out new file mode 100644 index 000000000..2424115de --- /dev/null +++ b/test/sam_filter/string3.out @@ -0,0 +1,7 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +II.14978392 16 CHROMOSOME_II 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC XG:i:1 XM:i:5 XN:i:0 XO:i:1 XS:i:-18 AS:i:-18 YT:Z:UU +III 16 CHROMOSOME_III 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA #############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC XG:i:1 XM:i:5 XN:i:0 XO:i:1 XS:i:-18 AS:i:-18 YT:Z:UU diff --git a/test/sam_filter/string4.out b/test/sam_filter/string4.out new file mode 100644 index 000000000..386300c53 --- /dev/null +++ b/test/sam_filter/string4.out @@ -0,0 +1,8 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +SRR065390.14978392 16 CHROMOSOME_I 2 1 27M1D73M * 0 0 CCTAGCCCTAACCCTAACCCTAACCCTAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAA 
#############################@B?8B?BA@@DDBCDDCBC@CDCDCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC AS:i:-18 XS:i:-18 XN:i:0 XM:i:5 XO:i:1 XG:i:1 YT:Z:UU MD:Z:4A0G5G5G5G3^A73 NM:i:6 +SRR065390.32874267 0 CHROMOSOME_I 75 1 13M1D87M * 0 0 CTAAGCCTAAGCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAG DCCCCCCCCCCCCCCCCCCCCCCCBCCCCCCCCCCCCCCCCCCCC@CCCCCCCCCCCCCCA>/=;=9>:/5AA############# AS:i:-8 XS:i:-8 XN:i:0 XM:i:0 XO:i:1 XG:i:1 YT:Z:UU MD:Z:13^T87 NM:i:1 +SRR065390.723611 0 CHROMOSOME_I 155 1 5M1D95M * 0 0 AAGCCAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCC CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCDCCCCCCCCCCCDCCBCABCDADBACDDDBABBDB?AB@@<>;B>B?DB?=@@?@: AS:i:-8 XS:i:-8 XN:i:0 XM:i:0 XO:i:1 XG:i:1 YT:Z:UU MD:Z:5^T95 NM:i:1 diff --git a/test/test_expr.c b/test/test_expr.c new file mode 100644 index 000000000..7ad83fcd2 --- /dev/null +++ b/test/test_expr.c @@ -0,0 +1,233 @@ +/* test-expr.c -- Testing: filter expression parsing and processing. + + Copyright (C) 2020 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include <config.h> + +#include <stdio.h> +#include <string.h> +#include "../expr.h" + +int lookup(void *data, char *str, char **end, fexpr_t *res) { + int foo = 15551; // my favourite palindromic prime + int a = 1; + int b = 2; + int c = 3; + res->is_str = 0; + if (strncmp(str, "foo", 3) == 0) { + *end = str+3; + res->d = foo; + } else if (*str == 'a') { + *end = str+1; + res->d = a; + } else if (*str == 'b') { + *end = str+1; + res->d = b; + } else if (*str == 'c') { + *end = str+1; + res->d = c; + } else if (strncmp(str, "magic", 5) == 0) { + // non-empty string + *end = str+5; + res->is_str = 1; + kputs("plugh", ks_clear(&res->s)); + } else if (strncmp(str, "empty", 5) == 0) { + // empty string + *end = str+5; + res->is_str = 1; + kputs("", ks_clear(&res->s)); + } else if (strncmp(str, "null", 4) == 0) { + // null string (eg aux:Z tag is absent) + *end = str+4; + res->is_str = 1; + ks_clear(&res->s); + + } else { + return -1; + } + + return 0; +} + +typedef struct { + double dval; + char *sval; + char *str; +} test_ev; + +int test(void) { + // These are all valid expressions that should work + test_ev tests[] = { + { 1, NULL, "1"}, + { 1, NULL, "+1"}, + { -1, NULL, "-1"}, + { 0, NULL, "!7"}, + { 1, NULL, "!0"}, + { 1, NULL, "!(!7)"}, + { 1, NULL, "!!7"}, + + { 5, NULL, "2+3"}, + { -1, NULL, "2+-3"}, + { 6, NULL, "1+2+3"}, + { 1, NULL, "-2+3"}, + + { 6, NULL, "2*3"}, + { 6, NULL, "1*2*3"}, + { 0, NULL, "2*0"}, + + { 7, NULL, "(7)"}, + { 7, NULL, "((7))"}, + { 21, NULL, "(1+2)*(3+4)"}, + { 14, NULL, "(4*5)-(-2*-3)"}, + + { 1, NULL, "(1+2)*3==9"}, + { 1, NULL, "(1+2)*3!=8"}, + { 0, NULL, "(1+2)*3!=9"}, + { 0, NULL, "(1+2)*3==8"}, + + { 0, NULL, "1>2"}, + { 1, NULL, "1<2"}, + { 0, NULL, "3<3"}, + { 0, NULL, "3>3"}, + { 1, NULL, 
"9<=9"}, + { 1, NULL, "9>=9"}, + { 1, NULL, "2*4==8"}, + { 1, NULL, "16==0x10"}, + { 1, NULL, "15<0x10"}, + { 1, NULL, "17>0x10"}, + { 0, NULL, "2*4!=8"}, + { 1, NULL, "4+2<3+4"}, + { 0, NULL, "4*2<3+4"}, + { 8, NULL, "4*(2<3)+4"}, // boolean; 4*(1)+4 + + { 1, NULL, "(1<2) == (3>2)"}, + { 1, NULL, "1<2 == 3>2"}, + + { 1, NULL, "2 && 1"}, + { 0, NULL, "2 && 0"}, + { 0, NULL, "0 && 2"}, + { 1, NULL, "2 || 1"}, + { 1, NULL, "2 || 0"}, + { 1, NULL, "0 || 2"}, + { 1, NULL, "1 || 2 && 3"}, + { 1, NULL, "2 && 3 || 1"}, + { 1, NULL, "0 && 3 || 2"}, + { 0, NULL, "0 && 3 || 0"}, + + { 1, NULL, "3 & 1"}, + { 2, NULL, "3 & 2"}, + { 3, NULL, "1 | 2"}, + { 3, NULL, "1 | 3"}, + { 7, NULL, "1 | 6"}, + { 2, NULL, "1 ^ 3"}, + + { 1, NULL, "(1^0)&(4^3)"}, + { 2, NULL, "1 ^(0&4)^ 3"}, + { 2, NULL, "1 ^ 0&4 ^ 3"}, // precedence, & before ^ + + { 6, NULL, "(1|0)^(4|3)"}, + { 7, NULL, "1 |(0^4)| 3"}, + { 7, NULL, "1 | 0^4 | 3"}, // precedence, ^ before | + + { 1, NULL, "4 & 2 || 1"}, + { 1, NULL, "(4 & 2) || 1"}, + { 0, NULL, "4 & (2 || 1)"}, + { 1, NULL, "1 || 4 & 2"}, + { 1, NULL, "1 || (4 & 2)"}, + { 0, NULL, "(1 || 4) & 2"}, + + { 1, NULL, " (2*3)&7 > 4"}, + { 0, NULL, " (2*3)&(7 > 4)"}, // C precedence equiv + { 1, NULL, "((2*3)&7) > 4"}, // Python precendece equiv + { 1, NULL, "((2*3)&7) > 4 && 2*2 <= 4"}, + + { 1, "plugh", "magic"}, + { 1, "", "empty"}, + { 1, NULL, "magic == \"plugh\""}, + { 1, NULL, "magic != \"xyzzy\""}, + + { 1, NULL, "\"abc\" < \"def\""}, + { 1, NULL, "\"abc\" <= \"abc\""}, + { 0, NULL, "\"abc\" < \"ab\""}, + { 0, NULL, "\"abc\" <= \"ab\""}, + + { 0, NULL, "\"abc\" > \"def\""}, + { 1, NULL, "\"abc\" >= \"abc\""}, + { 1, NULL, "\"abc\" > \"ab\""}, + { 1, NULL, "\"abc\" >= \"ab\""}, + + { 1, NULL, "\"abbc\" =~ \"^a+b+c+$\""}, + { 0, NULL, "\"aBBc\" =~ \"^a+b+c+$\""}, + { 1, NULL, "\"aBBc\" !~ \"^a+b+c+$\""}, + { 1, NULL, "\"xyzzy plugh abracadabra\" =~ magic"}, + }; + + int i; + fexpr_t r; + for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) { + sam_filter_t 
*filt = sam_filter_init(tests[i].str); + if (!filt) + return 1; + if (sam_filter_eval(filt, NULL, lookup, &r)) { + fprintf(stderr, "Failed to parse filter string %s\n", + tests[i].str); + return 1; + } + + if (r.is_str && (strcmp(r.s.s, tests[i].sval) != 0 + || r.d != tests[i].dval)) { + fprintf(stderr, "Failed test: %s == %s, got %s, %f\n", + tests[i].str, tests[i].sval, r.s.s, r.d); + return 1; + } else if (!r.is_str && r.d != tests[i].dval) { + fprintf(stderr, "Failed test: %s == %f, got %f\n", + tests[i].str, tests[i].dval, r.d); + return 1; + } + + fexpr_free(&r); + sam_filter_free(filt); + } + + return 0; +} + +int main(int argc, char **argv) { + if (argc > 1) { + fexpr_t v; + sam_filter_t *filt = sam_filter_init(argv[1]); + if (sam_filter_eval(filt, NULL, lookup, &v)) + return 1; + + if (v.is_str) + puts(v.s.s); + else + printf("%g\n", v.d); + + fexpr_free(&v); + sam_filter_free(filt); + return 0; + } + + return test(); +} From 2c75c104d824b0469463c9a62f8daf1594f22ebc Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 24 Nov 2020 17:27:39 +0000 Subject: [PATCH 035/488] Filter tidy ups from review. - Rename all the sam_* types and functions in expr.c to hts_*. This file is intended to be file format agnostic. A few key functions here are now externally visible too. - Renamed fexpr_t to hts_expr_val_t. Similarly the other bits named after fexpr. It was a poor choice as it's not a filter expression, but a *value*. - Renamed expr.[ch] to hts_expr.[ch]. The header is now public too, but included by hts.h so it does not need to be explicitly included. - Fixed a bug where the multi-region CRAM iterator wasn't using the filter (it calls cram_readrec). All other iterators, both single and multi, worked fine. - Added a simpler sam_passes_filter function that combines hts_filter_eval with the internal bam_sym_lookup. This is an externally visible function. - Rename sym_func to hts_expr_sym_func. - Add more documentation. 
- Renamed HTS_OPT_SAM_FILTER back to HTS_OPT_FILTER. It doesn't yet check if you attempt to specify this on e.g. a VCF file, but neither does it break as it's simply ignored. - Restructured sam_read1 into a series of internal format-specific functions. This makes the body of sam_read1 itself far simpler. This now means we can do sensible looping without concern over indenting an enormous chunk of code, and the loop is now small enough to see the logic in a single page. --- Makefile | 7 +- expr.h | 64 --- hts.c | 18 +- expr.c => hts_expr.c | 113 ++--- htslib/hts.h | 6 +- htslib/hts_expr.h | 97 +++++ htslib/sam.h | 9 + htslib_vars.mk | 3 +- sam.c | 865 +++++++++++++++++++------------------ test/sam_filter/filter.tst | 14 +- test/test_expr.c | 24 +- 11 files changed, 651 insertions(+), 569 deletions(-) delete mode 100644 expr.h rename expr.c => hts_expr.c (83%) create mode 100644 htslib/hts_expr.h diff --git a/Makefile b/Makefile index bd091ae36..659936e2b 100644 --- a/Makefile +++ b/Makefile @@ -149,12 +149,12 @@ LIBHTS_OBJS = \ bcf_sr_sort.o \ bgzf.o \ errmod.o \ - expr.o \ faidx.o \ header.o \ hfile.o \ hfile_net.o \ hts.o \ + hts_expr.o \ hts_os.o\ md5.o \ multipart.o \ @@ -333,7 +333,6 @@ hts-object-files: $(LIBHTS_OBJS) bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h) errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h) -expr.o expr.pico: expr.c expr.h config.h $(htslib_kstring_h) kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) knetfile.o knetfile.pico: knetfile.c config.h $(htslib_hts_log_h) $(htslib_knetfile_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) @@ -343,7 +342,8 @@ hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) 
$(htslib_knetfile_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) -hts.o hts.pico: hts.c config.h expr.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) +hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) +hts_expr.o hts_expr.pico: hts_expr.c $(htslib_hts_expr_h) config.h $(htslib_kstring_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) @@ -493,6 +493,7 @@ test/pileup.o: test/pileup.c config.h $(htslib_sam_h) $(htslib_kstring_h) test/plugins-dlhts.o: test/plugins-dlhts.c config.h test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_khash_h) $(htslib_hts_log_h) test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hfile_internal_h) +test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h) test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h) test/test_kstring.o: 
test/test_kstring.c config.h $(htslib_kstring_h) test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_sam_h) diff --git a/expr.h b/expr.h deleted file mode 100644 index c4674fc41..000000000 --- a/expr.h +++ /dev/null @@ -1,64 +0,0 @@ -/* expr.c -- filter expression parsing and processing. - - Copyright (C) 2020 Genome Research Ltd. - - Author: James Bonfield - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notices and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. */ - -#ifndef HTS_EXPR_H -#define HTS_EXPR_H - -#include -#include - -// fexpr_t is our return type and the type for elements within the expr. -// Note we cope with zero-but-true in order to implement a basic -// "exists(something)" check where "something" may even be zero. -// -// Eg in the aux tag searching syntax, "[NM]" should return true if -// NM tag exists even if zero. -// Take care when negating this. "[NM] != 0" will be true when -// [NM] is absent, thus consider "[NM] && [NM] != 0". 
-typedef struct { - char is_str; // Use .s vs .d - char is_true; // Force true if even zero - kstring_t s; // is_str and empty s permitted (eval as false) - double d; // otherwise this -} fexpr_t; - -#define FEXPR_INIT {0, 0, KS_INITIALIZE, 0} - -// Create a SAM filter for expression "str". -// -// Returns a pointer on success, -// NULL on failure -sam_filter_t *sam_filter_init(const char *str); - -// Frees a sam_filter_t created via sam_filter_init -void sam_filter_free(sam_filter_t *filt); - -typedef int (sym_func)(void *data, char *str, char **end, fexpr_t *res); -int sam_filter_eval(sam_filter_t *filt, void *data, sym_func *f, fexpr_t *res); - -static inline void fexpr_free(fexpr_t *f) { - ks_free(&f->s); -} - -#endif /* HTS_EXPR_H */ diff --git a/hts.c b/hts.c index 519b5b809..f854ed725 100644 --- a/hts.c +++ b/hts.c @@ -50,7 +50,7 @@ DEALINGS IN THE SOFTWARE. */ #include "hts_internal.h" #include "hfile_internal.h" #include "sam_internal.h" -#include "expr.h" +#include "htslib/hts_expr.h" #include "htslib/hts_os.h" // drand48 #include "htslib/khash.h" @@ -825,9 +825,9 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { strcmp(o->arg, "LEVEL") == 0) o->opt = HTS_OPT_COMPRESSION_LEVEL, o->val.i = strtol(val, NULL, 0); - else if (strcmp(o->arg, "sam_filter") == 0 || - strcmp(o->arg, "SAM_FILTER") == 0) - o->opt = HTS_OPT_SAM_FILTER, o->val.s = val; + else if (strcmp(o->arg, "filter") == 0 || + strcmp(o->arg, "FILTER") == 0) + o->opt = HTS_OPT_FILTER, o->val.s = val; else { hts_log_error("Unknown option '%s'", o->arg); @@ -868,7 +868,7 @@ int hts_opt_apply(htsFile *fp, hts_opt *opts) { // fall through case CRAM_OPT_VERSION: case CRAM_OPT_PREFIX: - case HTS_OPT_SAM_FILTER: + case HTS_OPT_FILTER: if (hts_set_opt(fp, opts->opt, opts->val.s) != 0) return -1; break; @@ -1237,7 +1237,7 @@ int hts_close(htsFile *fp) save = errno; sam_hdr_destroy(fp->bam_header); hts_idx_destroy(fp->idx); - sam_filter_free(fp->filter); + hts_filter_free(fp->filter); free(fp->fn); 
free(fp->fn_aux); free(fp->line.s); @@ -1342,7 +1342,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { fp->fp.bgzf->compress_level = level; } - case HTS_OPT_SAM_FILTER: { + case HTS_OPT_FILTER: { va_start(args, opt); char *expr = va_arg(args, char *); va_end(args); @@ -1413,12 +1413,12 @@ int hts_set_fai_filename(htsFile *fp, const char *fn_aux) int hts_set_filter_expression(htsFile *fp, const char *expr) { if (fp->filter) - sam_filter_free(fp->filter); + hts_filter_free(fp->filter); if (!expr) return 0; - return (fp->filter = sam_filter_init(expr)) + return (fp->filter = hts_filter_init(expr)) ? 0 : -1; } diff --git a/expr.c b/hts_expr.c similarity index 83% rename from expr.c rename to hts_expr.c index 73df14dd1..3d00ed339 100644 --- a/expr.c +++ b/hts_expr.c @@ -1,4 +1,4 @@ -/* expr.c -- filter expression parsing and processing. +/* hts_expr.c -- filter expression parsing and processing. Copyright (C) 2020 Genome Research Ltd. @@ -36,12 +36,12 @@ DEALINGS IN THE SOFTWARE. */ #include #include // may need configure rule for this -#include "expr.h" +#include "htslib/hts_expr.h" #include "textutils_internal.h" -// Could also cache fexpr_t stack here for kstring reuse? +// Could also cache hts_expr_val_t stack here for kstring reuse? #define MAX_REGEX 10 -struct sam_filter_t { +struct hts_filter_t { char *str; int parsed; int curr_regex, max_regex; @@ -73,8 +73,8 @@ static char *ws(char *str) { return str; } -static int expression(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res); +static int expression(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res); /* * simple_expr @@ -83,8 +83,8 @@ static int expression(sam_filter_t *filt, void *data, sym_func *fn, * // | string ? 
* | '(' expression ')' */ -static int simple_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int simple_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { // Main recursion step str = ws(str); if (*str == '(') { @@ -159,8 +159,8 @@ static int simple_expr(sam_filter_t *filt, void *data, sym_func *fn, * | '!' unary_expr // higher precedence * | '~' unary_expr // higher precedence */ -static int unary_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int unary_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { int err; str = ws(str); if (*str == '+') { @@ -202,19 +202,19 @@ static int unary_expr(sam_filter_t *filt, void *data, sym_func *fn, * | unary_expr '%' unary_expr * )* */ -static int mul_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { if (unary_expr(filt, data, fn, str, end, res)) return -1; str = *end; - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; while (*str) { str = ws(str); if (*str == '*' || *str == '/' || *str == '%') { if (unary_expr(filt, data, fn, str+1, end, &val)) return -1; if (val.is_str || res->is_str) { - fexpr_free(&val); + hts_expr_val_free(&val); return -1; // arith on strings } } @@ -230,7 +230,7 @@ static int mul_expr(sam_filter_t *filt, void *data, sym_func *fn, str = *end; } - fexpr_free(&val); + hts_expr_val_free(&val); return 0; } @@ -242,19 +242,19 @@ static int mul_expr(sam_filter_t *filt, void *data, sym_func *fn, * | mul_expr '-' mul_expr * )* */ -static int add_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char 
*str, char **end, hts_expr_val_t *res) { if (mul_expr(filt, data, fn, str, end, res)) return -1; str = *end; - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; while (*str) { str = ws(str); if (*str == '+' || *str == '-') { if (mul_expr(filt, data, fn, str+1, end, &val)) return -1; if (val.is_str || res->is_str) { - fexpr_free(&val); + hts_expr_val_free(&val); return -1; // arith on strings } } @@ -268,7 +268,7 @@ static int add_expr(sam_filter_t *filt, void *data, sym_func *fn, str = *end; } - fexpr_free(&val); + hts_expr_val_free(&val); return 0; } @@ -278,17 +278,17 @@ static int add_expr(sam_filter_t *filt, void *data, sym_func *fn, * : add_expr * | bitand_expr '&' add_expr */ -static int bitand_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { if (add_expr(filt, data, fn, str, end, res)) return -1; - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; for (;;) { str = ws(*end); if (*str == '&' && str[1] != '&') { if (add_expr(filt, data, fn, str+1, end, &val)) return -1; if (res->is_str || val.is_str) { - fexpr_free(&val); + hts_expr_val_free(&val); return -1; } res->is_true = res->d = (int64_t)res->d & (int64_t)val.d; @@ -296,7 +296,7 @@ static int bitand_expr(sam_filter_t *filt, void *data, sym_func *fn, break; } } - fexpr_free(&val); + hts_expr_val_free(&val); return 0; } @@ -306,17 +306,17 @@ static int bitand_expr(sam_filter_t *filt, void *data, sym_func *fn, * : bitand_expr * | bitxor_expr '^' bitand_expr */ -static int bitxor_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { if (bitand_expr(filt, data, fn, str, end, res)) return -1; - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = 
HTS_EXPR_VAL_INIT; for (;;) { str = ws(*end); if (*str == '^') { if (bitand_expr(filt, data, fn, str+1, end, &val)) return -1; if (res->is_str || val.is_str) { - fexpr_free(&val); + hts_expr_val_free(&val); return -1; } res->is_true = res->d = (int64_t)res->d ^ (int64_t)val.d; @@ -324,7 +324,7 @@ static int bitxor_expr(sam_filter_t *filt, void *data, sym_func *fn, break; } } - fexpr_free(&val); + hts_expr_val_free(&val); return 0; } @@ -334,17 +334,17 @@ static int bitxor_expr(sam_filter_t *filt, void *data, sym_func *fn, * : xor_expr * | bitor_expr '|' xor_expr */ -static int bitor_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { if (bitxor_expr(filt, data, fn, str, end, res)) return -1; - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; for (;;) { str = ws(*end); if (*str == '|' && str[1] != '|') { if (bitxor_expr(filt, data, fn, str+1, end, &val)) return -1; if (res->is_str || val.is_str) { - fexpr_free(&val); + hts_expr_val_free(&val); return -1; } res->is_true = res->d = (int64_t)res->d | (int64_t)val.d; @@ -352,7 +352,7 @@ static int bitor_expr(sam_filter_t *filt, void *data, sym_func *fn, break; } } - fexpr_free(&val); + hts_expr_val_free(&val); return 0; } @@ -365,12 +365,12 @@ static int bitor_expr(sam_filter_t *filt, void *data, sym_func *fn, * | cmp_expr '>=' bitor_expr * | cmp_expr '>' bitor_expr */ -static int cmp_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int cmp_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { if (bitor_expr(filt, data, fn, str, end, res)) return -1; str = ws(*end); - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; int err = 0; if (*str == '>' && str[1] == '=') { @@ -398,7 +398,7 @@ static int cmp_expr(sam_filter_t 
*filt, void *data, sym_func *fn, : !res->is_str && !val.is_str && res->d < val.d; res->is_str = 0; } - fexpr_free(&val); + hts_expr_val_free(&val); return err ? -1 : 0; } @@ -411,14 +411,14 @@ static int cmp_expr(sam_filter_t *filt, void *data, sym_func *fn, * | eq_expr '=~' cmp_expr * | eq_expr '!~' cmp_expr */ -static int eq_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int eq_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { if (cmp_expr(filt, data, fn, str, end, res)) return -1; str = ws(*end); int err = 0; - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; // numeric vs numeric comparison is as expected // string vs string comparison is as expected @@ -447,7 +447,7 @@ static int eq_expr(sam_filter_t *filt, void *data, sym_func *fn, (str[0] == '!' && str[1] == '~')) { err = eq_expr(filt, data, fn, str+2, end, &val); if (!val.is_str || !res->is_str) { - fexpr_free(&val); + hts_expr_val_free(&val); return -1; } if (val.s.s && res->s.s && val.is_true >= 0 && res->is_true >= 0) { @@ -466,7 +466,7 @@ static int eq_expr(sam_filter_t *filt, void *data, sym_func *fn, char errbuf[1024]; regerror(ec, preg, errbuf, 1024); fprintf(stderr, "Failed regex: %.1024s\n", errbuf); - fexpr_free(&val); + hts_expr_val_free(&val); return -1; } } else { @@ -485,7 +485,7 @@ static int eq_expr(sam_filter_t *filt, void *data, sym_func *fn, } res->is_str = 0; } - fexpr_free(&val); + hts_expr_val_free(&val); return err ? 
-1 : 0; } @@ -496,11 +496,11 @@ static int eq_expr(sam_filter_t *filt, void *data, sym_func *fn, * | and_expr 'and' eq_expr * | and_expr 'or' eq_expr */ -static int and_expr(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int and_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { if (eq_expr(filt, data, fn, str, end, res)) return -1; - fexpr_t val = FEXPR_INIT; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; for (;;) { str = ws(*end); if (str[0] == '&' && str[1] == '&') { @@ -519,18 +519,18 @@ static int and_expr(sam_filter_t *filt, void *data, sym_func *fn, break; } } - fexpr_free(&val); + hts_expr_val_free(&val); return 0; } -static int expression(sam_filter_t *filt, void *data, sym_func *fn, - char *str, char **end, fexpr_t *res) { +static int expression(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { return and_expr(filt, data, fn, str, end, res); } -sam_filter_t *sam_filter_init(const char *str) { - sam_filter_t *f = calloc(1, sizeof(*f)); +hts_filter_t *hts_filter_init(const char *str) { + hts_filter_t *f = calloc(1, sizeof(*f)); if (!f) return NULL; // Oversize to permit faster comparisons with memcmp over strcmp @@ -543,7 +543,7 @@ sam_filter_t *sam_filter_init(const char *str) { return f; } -void sam_filter_free(sam_filter_t *filt) { +void hts_filter_free(hts_filter_t *filt) { if (!filt) return; @@ -555,8 +555,9 @@ void sam_filter_free(sam_filter_t *filt) { free(filt); } -int sam_filter_eval(sam_filter_t *filt, void *data, sym_func *fn, - fexpr_t *res) { +int hts_filter_eval(hts_filter_t *filt, + void *data, hts_expr_sym_func *fn, + hts_expr_val_t *res) { char *end = NULL; memset(res, 0, sizeof(*res)); diff --git a/htslib/hts.h b/htslib/hts.h index 9ccbb986b..8360e2cf1 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -34,6 +34,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "hts_defs.h" #include "hts_log.h" +#include "hts_expr.h" #include "kstring.h" #include "kroundup.h" @@ -224,7 +225,6 @@ typedef struct htsFormat { struct hts_idx_t; typedef struct hts_idx_t hts_idx_t; -typedef struct sam_filter_t sam_filter_t; /** * @brief File handle returned by hts_open() etc. @@ -257,7 +257,7 @@ typedef struct htsFile { hts_idx_t *idx; const char *fnidx; struct sam_hdr_t *bam_header; - sam_filter_t *filter; + hts_filter_t *filter; } htsFile; // A combined thread pool and queue allocation size. @@ -323,7 +323,7 @@ enum hts_fmt_option { HTS_OPT_THREAD_POOL, HTS_OPT_CACHE_SIZE, HTS_OPT_BLOCK_SIZE, - HTS_OPT_SAM_FILTER, + HTS_OPT_FILTER, }; // For backwards compatibility diff --git a/htslib/hts_expr.h b/htslib/hts_expr.h new file mode 100644 index 000000000..c628d2e6f --- /dev/null +++ b/htslib/hts_expr.h @@ -0,0 +1,97 @@ +/* hts_expr.h -- filter expression parsing and processing. + + Copyright (C) 2020 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notices and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef HTS_EXPR_H +#define HTS_EXPR_H + +#include <htslib/kstring.h> +#include "hts_defs.h" + +/// Holds a filter variable. This is also used to return the results. +/** + * Note we cope with zero-but-true in order to implement a basic + * "exists(something)" check where "something" may even be zero. + * + * Eg in the aux tag searching syntax, "[NM]" should return true if + * NM tag exists even if zero. + * Take care when negating this. "[NM] != 0" will be true when + * [NM] is absent, thus consider "[NM] && [NM] != 0". + */ +typedef struct { + char is_str; // Use .s vs .d + char is_true; // Force true even if zero + kstring_t s; // is_str and empty s permitted (eval as false) + double d; // otherwise this +} hts_expr_val_t; + +/// Frees a hts_expr_val_t type. +static inline void hts_expr_val_free(hts_expr_val_t *f) { + ks_free(&f->s); +} + +/// Opaque hts_filter_t type. Definition in hts_expr.c +typedef struct hts_filter_t hts_filter_t; + +/// For static initialisation of hts_expr_val_t values +#define HTS_EXPR_VAL_INIT {0, 0, KS_INITIALIZE, 0} + +/// Creates a filter for expression "str". +/** @param str The filter expression + * @return A pointer on success, NULL on failure + */ +HTSLIB_EXPORT +hts_filter_t *hts_filter_init(const char *str); + +/// Frees an hts_filter_t created via hts_filter_init +/** @param filt The filter pointer. + */ +HTSLIB_EXPORT +void hts_filter_free(hts_filter_t *filt); + +/// Type for expression symbol lookups; name -> value.
+typedef int (hts_expr_sym_func)(void *data, char *str, char **end, + hts_expr_val_t *res); + +/// Evaluates a filter expression and returns the value +/** @param filt The filter, produced by hts_filter_init + * @param data Arbitrary caller data, passed into sym_func + * @param sym_func Callback function to look up variables. + * @param res Filled out with the result of the filter evaluation + * @return Returns 0 on success, -1 on failure + * + * sym_func and data may be NULL if the caller does not need its own data + * pointer or if it has no variables to look up. + * + * The type of the returned result may be numeric or string, as defined by + * the is_str member. It can also be explicitly defined to be true even + * for a null value. This may be used to check for the existence of + * something, irrespective of whether that something evaluates to zero. + */ +HTSLIB_EXPORT +int hts_filter_eval(hts_filter_t *filt, + void *data, hts_expr_sym_func *sym_func, + hts_expr_val_t *res); + + +#endif /* HTS_EXPR_H */ diff --git a/htslib/sam.h b/htslib/sam.h index bfd7855a6..3c7b9b693 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1408,6 +1408,15 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, HTSLIB_EXPORT int sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED; +/// sam_passes_filter - Checks whether a record passes an hts_filter. +/** @param h Pointer to the header structure previously read + * @param b Pointer to the BAM record to be checked + * @param filt Pointer to the filter, created from hts_filter_init. + * @return 1 if passes, 0 if not, and <0 on error.
+ */ +HTSLIB_EXPORT +int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt); + /************************************* *** Manipulating auxiliary fields *** *************************************/ diff --git a/htslib_vars.mk b/htslib_vars.mk index f22c7c22b..fa907f50f 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -29,9 +29,10 @@ htslib_bgzf_h = $(HTSPREFIX)htslib/bgzf.h $(htslib_hts_defs_h) htslib_cram_h = $(HTSPREFIX)htslib/cram.h $(htslib_hts_defs_h) $(htslib_hts_h) $(htslib_sam_h) htslib_faidx_h = $(HTSPREFIX)htslib/faidx.h $(htslib_hts_defs_h) $(htslib_hts_h) htslib_hfile_h = $(HTSPREFIX)htslib/hfile.h $(htslib_hts_defs_h) -htslib_hts_h = $(HTSPREFIX)htslib/hts.h $(htslib_hts_defs_h) $(htslib_hts_log_h) $(htslib_kstring_h) $(htslib_kroundup_h) +htslib_hts_h = $(HTSPREFIX)htslib/hts.h $(htslib_hts_defs_h) $(htslib_hts_log_h) $(htslib_kstring_h) $(htslib_kroundup_h) $(htslib_hts_expr_h) htslib_hts_defs_h = $(HTSPREFIX)htslib/hts_defs.h htslib_hts_endian_h = $(HTSPREFIX)htslib/hts_endian.h +htslib_hts_expr_h = $(HTSPREFIX)htslib/hts_expr.h htslib_hts_log_h = $(HTSPREFIX)htslib/hts_log.h $(htslib_hts_defs_h) htslib_hts_os_h = $(HTSPREFIX)htslib/hts_os.h $(htslib_hts_defs_h) htslib_kbitset_h = $(HTSPREFIX)htslib/kbitset.h diff --git a/sam.c b/sam.c index 93a9a994a..d044046eb 100644 --- a/sam.c +++ b/sam.c @@ -50,7 +50,6 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/hfile.h" #include "htslib/hts_endian.h" #include "header.h" -#include "expr.h" #include "htslib/khash.h" KHASH_DECLARE(s2i, kh_cstr_t, int64_t) @@ -1099,20 +1098,288 @@ static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_po return ret; } +// Bam record pointer and SAM header combined +typedef struct { + const sam_hdr_t *h; + const bam1_t *b; +} hb_pair; + +// Looks up variable names in str and replaces them with their value. +// Also supports aux tags. 
+// +// Note the expression parser deliberately overallocates str size so it +// is safe to use memcmp over strcmp. +static int bam_sym_lookup(void *data, char *str, char **end, + hts_expr_val_t *res) { + hb_pair *hb = (hb_pair *)data; + const bam1_t *b = hb->b; + + res->is_str = 0; + switch(*str) { + case 'c': + if (memcmp(str, "cigar", 5) == 0) { + *end = str+5; + res->is_str = 1; + ks_clear(&res->s); + uint32_t *cigar = bam_get_cigar(b); + int i, n = b->core.n_cigar, r = 0; + for (i = 0; i < n; i++) { + r |= kputw (bam_cigar_oplen(cigar[i]), &res->s); + r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s); + } + kputs("", &res->s); + return r ? 0 : -1; + } + break; + + case 'f': + if (memcmp(str, "flag", 4) == 0) { + str = *end = str+4; + if (*str != '.') { + res->d = b->core.flag; + return 0; + } else { + str++; + if (!memcmp(str, "paired", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FPAIRED; + return 0; + } else if (!memcmp(str, "proper_pair", 11)) { + *end = str+11; + res->d = b->core.flag & BAM_FPROPER_PAIR; + return 0; + } else if (!memcmp(str, "unmap", 5)) { + *end = str+5; + res->d = b->core.flag & BAM_FUNMAP; + return 0; + } else if (!memcmp(str, "munmap", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FMUNMAP; + return 0; + } else if (!memcmp(str, "reverse", 7)) { + *end = str+7; + res->d = b->core.flag & BAM_FREVERSE; + return 0; + } else if (!memcmp(str, "mreverse", 8)) { + *end = str+8; + res->d = b->core.flag & BAM_FMREVERSE; + return 0; + } else if (!memcmp(str, "read1", 5)) { + *end = str+5; + res->d = b->core.flag & BAM_FREAD1; + return 0; + } else if (!memcmp(str, "read2", 6)) { + *end = str+5; + res->d = b->core.flag & BAM_FREAD2; + return 0; + } else if (!memcmp(str, "secondary", 9)) { + *end = str+9; + res->d = b->core.flag & BAM_FSECONDARY; + return 0; + } else if (!memcmp(str, "qcfail", 6)) { + *end = str+6; + res->d = b->core.flag & BAM_FQCFAIL; + return 0; + } else if (!memcmp(str, "dup", 3)) { + *end = str+3; + res->d = 
b->core.flag & BAM_FDUP; + return 0; + } else if (!memcmp(str, "supplementary", 13)) { + *end = str+13; + res->d = b->core.flag & BAM_FSUPPLEMENTARY; + return 0; + } else { + hts_log_error("Unrecognised flag string"); + return -1; + } + } + } + break; + + case 'm': + if (memcmp(str, "mapq", 4) == 0) { + *end = str+4; + res->d = b->core.qual; + return 0; + } else if (memcmp(str, "mpos", 4) == 0) { + *end = str+4; + res->d = b->core.mpos+1; + return 0; + } else if (memcmp(str, "mrname", 6) == 0) { + *end = str+6; + res->is_str = 1; + const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); + kputs(rn ? rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "mrefid", 6) == 0) { + *end = str+6; + res->d = b->core.mtid; + return 0; + } + break; + + case 'n': + if (memcmp(str, "ncigar", 6) == 0) { + *end = str+6; + res->d = b->core.n_cigar; + return 0; + } + break; + + case 'p': + if (memcmp(str, "pos", 3) == 0) { + *end = str+3; + res->d = b->core.pos+1; + return 0; + } else if (memcmp(str, "pnext", 5) == 0) { + *end = str+5; + res->d = b->core.mpos+1; + return 0; + } + break; + + case 'q': + if (memcmp(str, "qlen", 4) == 0) { + *end = str+4; + res->d = b->core.l_qseq; + return 0; + } else if (memcmp(str, "qname", 5) == 0) { + *end = str+5; + res->is_str = 1; + kputs(bam_get_qname(b), ks_clear(&res->s)); + return 0; + } + break; + + case 'r': + if (memcmp(str, "rlen", 4) == 0) { + *end = str+4; + res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); + return 0; + } else if (memcmp(str, "rname", 5) == 0) { + *end = str+5; + res->is_str = 1; + const char *rn = sam_hdr_tid2name(hb->h, b->core.tid); + kputs(rn ? rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "rnext", 5) == 0) { + *end = str+5; + res->is_str = 1; + const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); + kputs(rn ? 
rn : "*", ks_clear(&res->s)); + return 0; + } else if (memcmp(str, "refid", 5) == 0) { + *end = str+5; + res->d = b->core.tid; + return 0; + } + break; + + case 't': + if (memcmp(str, "tlen", 4) == 0) { + *end = str+4; + res->d = b->core.isize; + return 0; + } + break; + + case '[': + if (*str == '[' && str[1] && str[2] && str[3] == ']') { + /* aux tags */ + *end = str+4; + + uint8_t *aux = bam_aux_get(b, str+1); + if (aux) { + // we define the truth of a tag to be its presence, even if 0. + res->is_true = 1; + switch (*aux) { + case 'Z': + case 'H': + res->is_str = 1; + kputs((char *)aux+1, ks_clear(&res->s)); + break; + + case 'A': + res->is_str = 1; + kputsn((char *)aux+1, 1, ks_clear(&res->s)); + break; + + case 'i': case 'I': + case 's': case 'S': + case 'c': case 'C': + res->is_str = 0; + res->d = bam_aux2i(aux); + break; + + case 'f': + case 'd': + res->is_str = 0; + res->d = bam_aux2f(aux); + break; + + default: + hts_log_error("Aux type '%c not yet supported by filters", + *aux); + return -1; + } + return 0; + + } else { + // hence absent tags are always false (and strings) + res->is_str = 1; + res->s.l = 0; + res->d = 0; + res->is_true = 0; + return 0; + } + } + break; + } + + // All successful matches in switch should return 0. + // So if we didn't match, it's a parse error. + return -1; +} + +// Returns 1 when accepted by the filter, 0 if not, -1 on error. +int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt) +{ + hb_pair hb = {h, b}; + hts_expr_val_t res; + if (hts_filter_eval(filt, &hb, bam_sym_lookup, &res)) { + hts_log_error("Couldn't process filter expression"); + hts_expr_val_free(&res); + return -1; + } + + int t = res.is_true; + hts_expr_val_free(&res); + + return t; +} + static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg, hts_pos_t *end) { htsFile *fp = fpv; bam1_t *b = bv; - int ret = cram_get_bam_seq(fp->fp.cram, &b); - if (ret < 0) - return cram_eof(fp->fp.cram) ? 
-1 : -2; + int filtered, ret; - if (bam_tag2cigar(b, 1, 1) < 0) - return -2; + do { + ret = cram_get_bam_seq(fp->fp.cram, &b); + if (ret < 0) + return cram_eof(fp->fp.cram) ? -1 : -2; - *tid = b->core.tid; - *beg = b->core.pos; - *end = bam_endpos(b); + if (bam_tag2cigar(b, 1, 1) < 0) + return -2; + + *tid = b->core.tid; + *beg = b->core.pos; + *end = bam_endpos(b); + + filtered = sam_passes_filter(fp->bam_header, b, fp->filter); + if (filtered < 0) + return -2; + } while (filtered == 0); return ret; } @@ -3096,451 +3363,221 @@ static void *sam_format_worker(void *arg) { } pthread_mutex_unlock(&fd->lines_m); - return gl; - - err: - // Possible race between this and fd->curr_bam. - // Easier to not free and leave it on the input list so it - // gets freed there instead? - // sam_free_sp_bams(gb); - if (gl) { - free(gl->data); - free(gl); - } - return NULL; -} - -int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) { - if (fp->state) - return 0; - - if (!(fp->state = sam_state_create(fp))) - return -1; - SAM_state *fd = (SAM_state *)fp->state; - - pthread_mutex_init(&fd->lines_m, NULL); - pthread_mutex_init(&fd->command_m, NULL); - pthread_cond_init(&fd->command_c, NULL); - fd->p = p->pool; - int qsize = p->qsize; - if (!qsize) - qsize = 2*hts_tpool_size(fd->p); - fd->q = hts_tpool_process_init(fd->p, qsize, 0); - if (!fd->q) { - sam_state_destroy(fp); - return -1; - } - - if (fp->format.compression == bgzf) - return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize); - - return 0; -} - -int sam_set_threads(htsFile *fp, int nthreads) { - if (nthreads <= 0) - return 0; - - htsThreadPool p; - p.pool = hts_tpool_init(nthreads); - p.qsize = nthreads*2; - - int ret = sam_set_thread_pool(fp, &p); - if (ret < 0) - return ret; - - SAM_state *fd = (SAM_state *)fp->state; - fd->own_pool = 1; - - return 0; -} - -// Bam record pointer and SAM header combined -typedef struct { - const sam_hdr_t *h; - const bam1_t *b; -} hb_pair; - -// Looks up variable names in str and replaces 
them with their value. -// Also supports aux tags. -// -// Note the expression parser deliberately overallocates str size so it -// is safe to use memcmp over strcmp. -static int bam_sym_lookup(void *data, char *str, char **end, fexpr_t *res) { - hb_pair *hb = (hb_pair *)data; - const bam1_t *b = hb->b; - - res->is_str = 0; - switch(*str) { - case 'c': - if (memcmp(str, "cigar", 5) == 0) { - *end = str+5; - res->is_str = 1; - ks_clear(&res->s); - uint32_t *cigar = bam_get_cigar(b); - int i, n = b->core.n_cigar, r = 0; - for (i = 0; i < n; i++) { - r |= kputw (bam_cigar_oplen(cigar[i]), &res->s); - r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s); - } - kputs("", &res->s); - return r ? 0 : -1; - } - break; - - case 'f': - if (memcmp(str, "flag", 4) == 0) { - str = *end = str+4; - if (*str != '.') { - res->d = b->core.flag; - return 0; - } else { - str++; - if (!memcmp(str, "paired", 6)) { - *end = str+6; - res->d = b->core.flag & BAM_FPAIRED; - return 0; - } else if (!memcmp(str, "proper_pair", 11)) { - *end = str+11; - res->d = b->core.flag & BAM_FPROPER_PAIR; - return 0; - } else if (!memcmp(str, "unmap", 5)) { - *end = str+5; - res->d = b->core.flag & BAM_FUNMAP; - return 0; - } else if (!memcmp(str, "munmap", 6)) { - *end = str+6; - res->d = b->core.flag & BAM_FMUNMAP; - return 0; - } else if (!memcmp(str, "reverse", 7)) { - *end = str+7; - res->d = b->core.flag & BAM_FREVERSE; - return 0; - } else if (!memcmp(str, "mreverse", 8)) { - *end = str+8; - res->d = b->core.flag & BAM_FMREVERSE; - return 0; - } else if (!memcmp(str, "read1", 5)) { - *end = str+5; - res->d = b->core.flag & BAM_FREAD1; - return 0; - } else if (!memcmp(str, "read2", 6)) { - *end = str+5; - res->d = b->core.flag & BAM_FREAD2; - return 0; - } else if (!memcmp(str, "secondary", 9)) { - *end = str+9; - res->d = b->core.flag & BAM_FSECONDARY; - return 0; - } else if (!memcmp(str, "qcfail", 6)) { - *end = str+6; - res->d = b->core.flag & BAM_FQCFAIL; - return 0; - } else if (!memcmp(str, "dup", 
3)) { - *end = str+3; - res->d = b->core.flag & BAM_FDUP; - return 0; - } else if (!memcmp(str, "supplementary", 13)) { - *end = str+13; - res->d = b->core.flag & BAM_FSUPPLEMENTARY; - return 0; - } else { - hts_log_error("Unrecognised flag string"); - return -1; - } - } - } - break; - - case 'm': - if (memcmp(str, "mapq", 4) == 0) { - *end = str+4; - res->d = b->core.qual; - return 0; - } else if (memcmp(str, "mpos", 4) == 0) { - *end = str+4; - res->d = b->core.mpos+1; - return 0; - } else if (memcmp(str, "mrname", 6) == 0) { - *end = str+6; - res->is_str = 1; - const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); - kputs(rn ? rn : "*", ks_clear(&res->s)); - return 0; - } else if (memcmp(str, "mrefid", 6) == 0) { - *end = str+6; - res->d = b->core.mtid; - return 0; - } - break; + return gl; - case 'n': - if (memcmp(str, "ncigar", 6) == 0) { - *end = str+6; - res->d = b->core.n_cigar; - return 0; - } - break; + err: + // Possible race between this and fd->curr_bam. + // Easier to not free and leave it on the input list so it + // gets freed there instead? 
+ // sam_free_sp_bams(gb); + if (gl) { + free(gl->data); + free(gl); + } + return NULL; +} - case 'p': - if (memcmp(str, "pos", 3) == 0) { - *end = str+3; - res->d = b->core.pos+1; - return 0; - } else if (memcmp(str, "pnext", 5) == 0) { - *end = str+5; - res->d = b->core.mpos+1; - return 0; - } - break; +int sam_set_thread_pool(htsFile *fp, htsThreadPool *p) { + if (fp->state) + return 0; - case 'q': - if (memcmp(str, "qlen", 4) == 0) { - *end = str+4; - res->d = b->core.l_qseq; - return 0; - } else if (memcmp(str, "qname", 5) == 0) { - *end = str+5; - res->is_str = 1; - kputs(bam_get_qname(b), ks_clear(&res->s)); - return 0; - } - break; + if (!(fp->state = sam_state_create(fp))) + return -1; + SAM_state *fd = (SAM_state *)fp->state; - case 'r': - if (memcmp(str, "rlen", 4) == 0) { - *end = str+4; - res->d = bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); - return 0; - } else if (memcmp(str, "rname", 5) == 0) { - *end = str+5; - res->is_str = 1; - const char *rn = sam_hdr_tid2name(hb->h, b->core.tid); - kputs(rn ? rn : "*", ks_clear(&res->s)); - return 0; - } else if (memcmp(str, "rnext", 5) == 0) { - *end = str+5; - res->is_str = 1; - const char *rn = sam_hdr_tid2name(hb->h, b->core.mtid); - kputs(rn ? 
rn : "*", ks_clear(&res->s)); - return 0; - } else if (memcmp(str, "refid", 5) == 0) { - *end = str+5; - res->d = b->core.tid; - return 0; - } - break; + pthread_mutex_init(&fd->lines_m, NULL); + pthread_mutex_init(&fd->command_m, NULL); + pthread_cond_init(&fd->command_c, NULL); + fd->p = p->pool; + int qsize = p->qsize; + if (!qsize) + qsize = 2*hts_tpool_size(fd->p); + fd->q = hts_tpool_process_init(fd->p, qsize, 0); + if (!fd->q) { + sam_state_destroy(fp); + return -1; + } - case 't': - if (memcmp(str, "tlen", 4) == 0) { - *end = str+4; - res->d = b->core.isize; - return 0; - } - break; + if (fp->format.compression == bgzf) + return bgzf_thread_pool(fp->fp.bgzf, p->pool, p->qsize); - case '[': - if (*str == '[' && str[1] && str[2] && str[3] == ']') { - /* aux tags */ - *end = str+4; + return 0; +} - uint8_t *aux = bam_aux_get(b, str+1); - if (aux) { - // we define the truth of a tag to be its presence, even if 0. - res->is_true = 1; - switch (*aux) { - case 'Z': - case 'H': - res->is_str = 1; - kputs((char *)aux+1, ks_clear(&res->s)); - break; +int sam_set_threads(htsFile *fp, int nthreads) { + if (nthreads <= 0) + return 0; - case 'A': - res->is_str = 1; - kputsn((char *)aux+1, 1, ks_clear(&res->s)); - break; + htsThreadPool p; + p.pool = hts_tpool_init(nthreads); + p.qsize = nthreads*2; - case 'i': case 'I': - case 's': case 'S': - case 'c': case 'C': - res->is_str = 0; - res->d = bam_aux2i(aux); - break; + int ret = sam_set_thread_pool(fp, &p); + if (ret < 0) + return ret; - case 'f': - case 'd': - res->is_str = 0; - res->d = bam_aux2f(aux); - break; + SAM_state *fd = (SAM_state *)fp->state; + fd->own_pool = 1; - default: - hts_log_error("Aux type '%c not yet supported by filters", - *aux); - return -1; - } - return 0; + return 0; +} - } else { - // hence absent tags are always false (and strings) - res->is_str = 1; - res->s.l = 0; - res->d = 0; - res->is_true = 0; - return 0; - } +// Internal component of sam_read1 below +static int sam_read1_bam(htsFile 
*fp, sam_hdr_t *h, bam1_t *b) { + int ret = bam_read1(fp->fp.bgzf, b); + if (h && ret >= 0) { + if (b->core.tid >= h->n_targets || b->core.tid < -1 || + b->core.mtid >= h->n_targets || b->core.mtid < -1) { + errno = ERANGE; + return -3; } - break; } + return ret; +} - // All successful matches in switch should return 0. - // So if we didn't match, it's a parse error. - return -1; +// Internal component of sam_read1 below +static int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) { + int ret = cram_get_bam_seq(fp->fp.cram, b); + if (ret < 0) + return cram_eof(fp->fp.cram) ? -1 : -2; + + if (bam_tag2cigar(*b, 1, 1) < 0) + return -2; + + return ret; } -// Returns 0 on success, -// -1 on EOF, -// <-1 on error -int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) -{ +// Internal component of sam_read1 below +static int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { int ret; - filtered: - switch (fp->format.format) { - case bam: - ret = bam_read1(fp->fp.bgzf, b); - if (h && ret >= 0) { - if (b->core.tid >= h->n_targets || b->core.tid < -1 || - b->core.mtid >= h->n_targets || b->core.mtid < -1) { - errno = ERANGE; - return -3; + // Consume 1st line after header parsing as it wasn't using peek + if (fp->line.l != 0) { + ret = sam_parse1(&fp->line, h, b); + fp->line.l = 0; + return ret; + } + + if (fp->state) { + SAM_state *fd = (SAM_state *)fp->state; + + if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) { + // We don't support multi-threaded SAM parsing with seeks yet. + int ret; + if ((ret = sam_state_destroy(fp)) < 0) { + errno = -ret; + return -2; } + if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0) + return -1; + fp->fp.bgzf->seeked = 0; + goto err_recover; } - break; - - case cram: - ret = cram_get_bam_seq(fp->fp.cram, &b); - if (ret < 0) - return cram_eof(fp->fp.cram) ? 
-1 : -2; - if (bam_tag2cigar(b, 1, 1) < 0) - return -2; - break; + if (!fd->h) { + fd->h = h; + fd->h->ref_count++; + // Ensure hrecs is initialised now as we don't want multiple + // threads trying to do this simultaneously. + if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0) + return -2; - case sam: { - // Consume 1st line after header parsing as it wasn't using peek - if (fp->line.l != 0) { - ret = sam_parse1(&fp->line, h, b); - fp->line.l = 0; - break; + // We can only do this once we've got a header + if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read, + fp) != 0) + return -2; + fd->dispatcher_set = 1; } - if (fp->state) { - SAM_state *fd = (SAM_state *)fp->state; + if (fd->h != h) { + hts_log_error("SAM multi-threaded decoding does not support changing header"); + return -1; + } - if (fp->format.compression == bgzf && fp->fp.bgzf->seeked) { - // We don't support multi-threaded SAM parsing with seeks yet. - int ret; - if ((ret = sam_state_destroy(fp)) < 0) { - errno = -ret; - return -2; - } - if (bgzf_seek(fp->fp.bgzf, fp->fp.bgzf->seeked, SEEK_SET) < 0) - return -1; - fp->fp.bgzf->seeked = 0; - goto err_recover; + sp_bams *gb = fd->curr_bam; + if (!gb) { + if (fd->errcode) { + // In case reader failed + errno = fd->errcode; + return -2; } + hts_tpool_result *r = hts_tpool_next_result_wait(fd->q); + if (!r) + return -2; + fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r); + hts_tpool_delete_result(r, 0); + } + if (!gb) + return fd->errcode ? -2 : -1; + bam1_t *b_array = (bam1_t *)gb->bams; + if (fd->curr_idx < gb->nbams) + if (!bam_copy1(b, &b_array[fd->curr_idx++])) + return -2; + if (fd->curr_idx == gb->nbams) { + pthread_mutex_lock(&fd->lines_m); + gb->next = fd->bams; + fd->bams = gb; + pthread_mutex_unlock(&fd->lines_m); - if (!fd->h) { - fd->h = h; - fd->h->ref_count++; - // Ensure hrecs is initialised now as we don't want multiple - // threads trying to do this simultaneously. 
- if (!fd->h->hrecs && sam_hdr_fill_hrecs(fd->h) < 0) - return -2; + fd->curr_bam = NULL; + fd->curr_idx = 0; + } - // We can only do this once we've got a header - if (pthread_create(&fd->dispatcher, NULL, sam_dispatcher_read, - fp) != 0) - return -2; - fd->dispatcher_set = 1; - } + ret = 0; - if (fd->h != h) { - hts_log_error("SAM multi-threaded decoding does not support changing header"); - return -1; - } + } else { + err_recover: + ret = hts_getline(fp, KS_SEP_LINE, &fp->line); + if (ret < 0) return ret; - sp_bams *gb = fd->curr_bam; - if (!gb) { - if (fd->errcode) { - // In case reader failed - errno = fd->errcode; - return -2; - } - hts_tpool_result *r = hts_tpool_next_result_wait(fd->q); - if (!r) - return -2; - fd->curr_bam = gb = (sp_bams *)hts_tpool_result_data(r); - hts_tpool_delete_result(r, 0); - } - if (!gb) - return fd->errcode ? -2 : -1; - bam1_t *b_array = (bam1_t *)gb->bams; - if (fd->curr_idx < gb->nbams) - if (!bam_copy1(b, &b_array[fd->curr_idx++])) - return -2; - if (fd->curr_idx == gb->nbams) { - pthread_mutex_lock(&fd->lines_m); - gb->next = fd->bams; - fd->bams = gb; - pthread_mutex_unlock(&fd->lines_m); + ret = sam_parse1(&fp->line, h, b); + fp->line.l = 0; + if (ret < 0) { + hts_log_warning("Parse error at line %lld", (long long)fp->lineno); + if (h->ignore_sam_err) goto err_recover; + } + } - fd->curr_bam = NULL; - fd->curr_idx = 0; - } + return ret; +} - ret = 0; +// Returns 0 on success, +// -1 on EOF, +// <-1 on error +int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) +{ + int ret, pass_filter; - } else { - err_recover: - ret = hts_getline(fp, KS_SEP_LINE, &fp->line); - if (ret < 0) return ret; + do { + switch (fp->format.format) { + case bam: + ret = sam_read1_bam(fp, h, b); + break; - ret = sam_parse1(&fp->line, h, b); - fp->line.l = 0; - if (ret < 0) { - hts_log_warning("Parse error at line %lld", (long long)fp->lineno); - if (h->ignore_sam_err) goto err_recover; - } - } - break; - } + case cram: + ret = sam_read1_cram(fp, h, 
&b); + break; - case empty_format: - errno = EPIPE; - return -3; + case sam: { + ret = sam_read1_sam(fp, h, b); + break; + } - default: - errno = EFTYPE; - return -3; - } + case empty_format: + errno = EPIPE; + return -3; - if (ret >= 0 && fp->filter) { - // Process on-the-fly filter rules - hb_pair hb = {h, b}; - fexpr_t res; - if (sam_filter_eval(fp->filter, &hb, bam_sym_lookup, &res)) { - hts_log_error("Couldn't process filter expression"); - fexpr_free(&res); - return -1; + default: + errno = EFTYPE; + return -3; } - int t = res.is_true; - fexpr_free(&res); - if (!t) - goto filtered; - } - return ret; + pass_filter = (ret >= 0 && fp->filter) + ? sam_passes_filter(h, b, fp->filter) + : 1; + } while (pass_filter == 0); + + return pass_filter < 0 ? -2 : ret; } diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst index f5558a41b..4831c8df8 100644 --- a/test/sam_filter/filter.tst +++ b/test/sam_filter/filter.tst @@ -33,14 +33,14 @@ # Command to execute. $tv is replaced with the path to test_view # String matches -P string1.out $tv -i 'sam_filter=qname =~ "\.1" && cigar =~ "D"' ../ce#1000.sam -P string2.out $tv -i 'sam_filter=rname=="CHROMOSOME_II"' ../ce#5b.sam -P string3.out $tv -i 'sam_filter=rname=~"CHROMOSOME_II"' ../ce#5b.sam -P string4.out $tv -i 'sam_filter=cigar=~"D"' ../ce#1000.sam +P string1.out $tv -i 'filter=qname =~ "\.1" && cigar =~ "D"' ../ce#1000.sam +P string2.out $tv -i 'filter=rname=="CHROMOSOME_II"' ../ce#5b.sam +P string3.out $tv -i 'filter=rname=~"CHROMOSOME_II"' ../ce#5b.sam +P string4.out $tv -i 'filter=cigar=~"D"' ../ce#1000.sam # Integer ops -P int1.out $tv -i 'sam_filter=pos % 23 == 11' ../ce#1000.sam |egrep -cv '^@' -P int2.out $tv -i 'sam_filter=qlen/(flag*mapq+pos)>5' ../ce#1000.sam |egrep -cv '^@' +P int1.out $tv -i 'filter=pos % 23 == 11' ../ce#1000.sam |egrep -cv '^@' +P int2.out $tv -i 'filter=qlen/(flag*mapq+pos)>5' ../ce#1000.sam |egrep -cv '^@' # Aux tags -P int3.out $tv -i 'sam_filter=[NM]>=10 || [MD]=~"A.*A.*A"' 
-t4 ../ce#1000.sam |egrep -cv '^@' +P int3.out $tv -i 'filter=[NM]>=10 || [MD]=~"A.*A.*A"' -t4 ../ce#1000.sam |egrep -cv '^@' diff --git a/test/test_expr.c b/test/test_expr.c index 7ad83fcd2..258548b88 100644 --- a/test/test_expr.c +++ b/test/test_expr.c @@ -26,9 +26,9 @@ DEALINGS IN THE SOFTWARE. */ #include #include -#include "../expr.h" +#include "../htslib/hts_expr.h" -int lookup(void *data, char *str, char **end, fexpr_t *res) { +int lookup(void *data, char *str, char **end, hts_expr_val_t *res) { int foo = 15551; // my favourite palindromic prime int a = 1; int b = 2; @@ -183,12 +183,12 @@ int test(void) { }; int i; - fexpr_t r; + hts_expr_val_t r; for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) { - sam_filter_t *filt = sam_filter_init(tests[i].str); + hts_filter_t *filt = hts_filter_init(tests[i].str); if (!filt) return 1; - if (sam_filter_eval(filt, NULL, lookup, &r)) { + if (hts_filter_eval(filt, NULL, lookup, &r)) { fprintf(stderr, "Failed to parse filter string %s\n", tests[i].str); return 1; @@ -205,8 +205,8 @@ int test(void) { return 1; } - fexpr_free(&r); - sam_filter_free(filt); + hts_expr_val_free(&r); + hts_filter_free(filt); } return 0; @@ -214,9 +214,9 @@ int test(void) { int main(int argc, char **argv) { if (argc > 1) { - fexpr_t v; - sam_filter_t *filt = sam_filter_init(argv[1]); - if (sam_filter_eval(filt, NULL, lookup, &v)) + hts_expr_val_t v; + hts_filter_t *filt = hts_filter_init(argv[1]); + if (hts_filter_eval(filt, NULL, lookup, &v)) return 1; if (v.is_str) @@ -224,8 +224,8 @@ int main(int argc, char **argv) { else printf("%g\n", v.d); - fexpr_free(&v); - sam_filter_free(filt); + hts_expr_val_free(&v); + hts_filter_free(filt); return 0; } From fc2d03e70f60bf553303a2fbb6211fbfa59822dd Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 25 Nov 2020 16:10:22 +0000 Subject: [PATCH 036/488] Add missing return to HTS_OPT_COMPRESSION_LEVEL option. 
--- hts.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hts.c b/hts.c index f854ed725..9e50290cf 100644 --- a/hts.c +++ b/hts.c @@ -1340,6 +1340,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { va_end(args); if (fp->is_bgzf) fp->fp.bgzf->compress_level = level; + return 0; } case HTS_OPT_FILTER: { From c7c433fd314d70b650e31af4c6b07f004bf17962 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 30 Nov 2020 16:43:50 +0000 Subject: [PATCH 037/488] Filter "qlen" tweak, add seq/qual/library, and add functions. "seq", "qual" and "library" are string variables, so we can now match them with e.g. seq =~ "(AT){10}". We have basic functions that operate on strings, interpreting them as arrays of characters. (This could be updated for B aux arrays and larger objects in the future.) Functions are length, min, max and avg. For example avg(qual) or length(seq). Note qual here is binary, so NOT the ASCII phred+33. I wouldn't advise trying to match it in a regexp. "qlen" was previously length of sequence, and not the value of cigar2qlen. This differs when we have CIGAR but seq is "*". Use the cigar2qlen definition so it is symmetric with our "rlen" variable. To permit the previous definition, we can now do "length(seq)". Note that min and max of empty string is NAN. This isn't treated as an error. This permits us to do "avg(qual) > 10" where some entries have seq and qual both as "*" (ie zero length). 
--- Makefile | 4 +- hts_expr.c | 154 ++++++++++++++++++++++++++++++++---- htslib.mk | 2 + htslib/hts.h | 4 +- htslib/hts_expr.h | 6 +- htslib/sam.h | 6 +- htslib_vars.mk | 4 +- sam.c | 87 +++++++++++++++++--- test/sam_filter/filter.tst | 9 +++ test/sam_filter/func1.out | 1 + test/sam_filter/func2.out | 1 + test/sam_filter/func3.out | 1 + test/sam_filter/func4.out | 1 + test/sam_filter/string5.out | 6 ++ test/sam_filter/string6.out | 9 +++ test/sam_filter/string7.out | 11 +++ test/test_expr.c | 2 +- 17 files changed, 270 insertions(+), 38 deletions(-) create mode 100644 test/sam_filter/func1.out create mode 100644 test/sam_filter/func2.out create mode 100644 test/sam_filter/func3.out create mode 100644 test/sam_filter/func4.out create mode 100644 test/sam_filter/string5.out create mode 100644 test/sam_filter/string6.out create mode 100644 test/sam_filter/string7.out diff --git a/Makefile b/Makefile index 659936e2b..840d9a2f8 100644 --- a/Makefile +++ b/Makefile @@ -343,10 +343,10 @@ hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) $(htslib_kn hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) -hts_expr.o hts_expr.pico: hts_expr.c $(htslib_hts_expr_h) config.h $(htslib_kstring_h) +hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) 
$(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) -sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) +sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h) faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h) $(hts_internal_h) bcf_sr_sort.o bcf_sr_sort.pico: bcf_sr_sort.c config.h $(bcf_sr_sort_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h) diff --git a/hts_expr.c b/hts_expr.c index 3d00ed339..1a1b9c52d 100644 --- a/hts_expr.c +++ b/hts_expr.c @@ -23,9 +23,10 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ // TODO: -// - add maths functions. pow, sqrt, log, min, max, ? +// - add maths functions. pow, sqrt, log, ? // - ?: operator for conditionals? +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include #include @@ -34,7 +35,8 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include // may need configure rule for this +#include +#include #include "htslib/hts_expr.h" #include "textutils_internal.h" @@ -60,9 +62,9 @@ struct hts_filter_t { * Unary ops: +, -, !, ~ eg -10 +10, !10 (0), ~5 (bitwise not) * Math ops: *, /, % [TODO: add // for floor division?] 
* Math ops: +, - - * Bit-wise: &, |, ^ [NB as 3 precedence levels, in that order] + * Bit-wise: &, ^, | [NB as 3 precedence levels, in that order] * Conditionals: >, >=, <, <=, - * Equality: ==, !=, =~ !~ + * Equality: ==, !=, =~, !~ * Boolean: &&, || */ @@ -76,11 +78,127 @@ static char *ws(char *str) { static int expression(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res); +/* + * Simple functions operating on strings only. + * length, min, max, avg. + * + * All return 0 on success, + * -1 on failure + */ +static int expr_func_length(hts_expr_val_t *res) { + if (!res->is_str) + return -1; + + res->is_str = 0; + res->d = res->s.l; + return 0; +} + +static int expr_func_min(hts_expr_val_t *res) { + if (!res->is_str) + return -1; + + size_t l = res->s.l; + int v = INT_MAX; + const uint8_t *x = (uint8_t *)res->s.s; + for (l = 0; l < res->s.l; l++) + if (v > x[l]) + v = x[l]; + + res->is_str = 0; + res->d = v == INT_MAX ? NAN : v; + + return 0; +} + +static int expr_func_max(hts_expr_val_t *res) { + if (!res->is_str) + return -1; + + size_t l = res->s.l; + int v = INT_MIN; + const uint8_t *x = (uint8_t *)res->s.s; + for (l = 0; l < res->s.l; l++) + if (v < x[l]) + v = x[l]; + + res->is_str = 0; + res->d = v == INT_MIN ? NAN : v; + + return 0; +} + +static int expr_func_avg(hts_expr_val_t *res) { + if (!res->is_str) + return -1; + + size_t l = res->s.l; + double v = 0; + const uint8_t *x = (uint8_t *)res->s.s; + for (l = 0; l < res->s.l; l++) + v += x[l]; + if (l) + v /= l; + + res->is_str = 0; + res->d = v; + + return 0; +} + +/* + * functions: FUNC(expr). + * Note for simplicity of parsing, the "(" must immediately follow FUNC, + * so "FUNC (x)" is invalid. 
+ */ +static int func_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, + char *str, char **end, hts_expr_val_t *res) { + int func_ok = -1; + switch (*str) { + case 'a': + if (strncmp(str, "avg(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = expr_func_avg(res); + } + break; + + case 'l': + if (strncmp(str, "length(", 7) == 0) { + if (expression(filt, data, fn, str+7, end, res)) return -1; + func_ok = expr_func_length(res); + } + break; + + case 'm': + if (strncmp(str, "min(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = expr_func_min(res); + } else if (strncmp(str, "max(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = expr_func_max(res); + } + break; + } + + if (func_ok < 0) + return -1; + + str = ws(*end); + if (*str != ')') { + fprintf(stderr, "Missing ')'\n"); + return -1; + } + *end = str+1; + + return 0; +} + /* * simple_expr * : identifier * | constant - * // | string ? + * | string + * | func_expr * | '(' expression ')' */ static int simple_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, @@ -141,11 +259,15 @@ static int simple_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, if (*e != '"') return -1; *end = e+1; - } else if (fn) - // Look up variable. 
- return fn(data, str, end, res); - else + } else if (fn) { + // Try lookup as variable, if not as function + if (fn(data, str, end, res) == 0) + return 0; + else + return func_expr(filt, data, fn, str, end, res); + } else { return -1; + } } return 0; @@ -197,9 +319,9 @@ static int unary_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, /* * mul_expr * : unary_expr ( - * unary_expr '*' unary_expr - * | unary_expr '/' unary_expr - * | unary_expr '%' unary_expr + * '*' unary_expr + * | '/' unary_expr + * | '%' unary_expr * )* */ static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, @@ -238,8 +360,8 @@ static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, /* * add_expr * : mul_expr ( - * mul_expr '+' mul_expr - * | mul_expr '-' mul_expr + * '+' mul_expr + * | '-' mul_expr * )* */ static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, @@ -331,8 +453,8 @@ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, /* * bitor_expr - * : xor_expr - * | bitor_expr '|' xor_expr + * : bitxor_expr + * | bitor_expr '|' bitxor_expr */ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { diff --git a/htslib.mk b/htslib.mk index ceb9bf3c4..8cd659fc3 100644 --- a/htslib.mk +++ b/htslib.mk @@ -54,6 +54,7 @@ HTSLIB_PUBLIC_HEADERS = \ $(HTSDIR)/htslib/hts.h \ $(HTSDIR)/htslib/hts_defs.h \ $(HTSDIR)/htslib/hts_endian.h \ + $(HTSDIR)/htslib/hts_expr.h \ $(HTSDIR)/htslib/hts_log.h \ $(HTSDIR)/htslib/hts_os.h \ $(HTSDIR)/htslib/kbitset.h \ @@ -92,6 +93,7 @@ HTSLIB_ALL = \ $(HTSDIR)/hfile_s3.c \ $(HTSDIR)/hfile_s3_write.c \ $(HTSDIR)/hts.c \ + $(HTSDIR)/hts_expr.c \ $(HTSDIR)/hts_internal.h \ $(HTSDIR)/hts_os.c \ $(HTSDIR)/kfunc.c \ diff --git a/htslib/hts.h b/htslib/hts.h index 8360e2cf1..f4e06efdd 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -34,7 +34,6 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "hts_defs.h" #include "hts_log.h" -#include "hts_expr.h" #include "kstring.h" #include "kroundup.h" @@ -225,6 +224,7 @@ typedef struct htsFormat { struct hts_idx_t; typedef struct hts_idx_t hts_idx_t; +struct hts_filter_t; /** * @brief File handle returned by hts_open() etc. @@ -257,7 +257,7 @@ typedef struct htsFile { hts_idx_t *idx; const char *fnidx; struct sam_hdr_t *bam_header; - hts_filter_t *filter; + struct hts_filter_t *filter; } htsFile; // A combined thread pool and queue allocation size. diff --git a/htslib/hts_expr.h b/htslib/hts_expr.h index c628d2e6f..d66a8edd8 100644 --- a/htslib/hts_expr.h +++ b/htslib/hts_expr.h @@ -25,7 +25,7 @@ DEALINGS IN THE SOFTWARE. */ #ifndef HTS_EXPR_H #define HTS_EXPR_H -#include +#include "kstring.h" #include "hts_defs.h" /// Holds a filter variable. This is also used to return the results. @@ -38,7 +38,7 @@ DEALINGS IN THE SOFTWARE. */ * Take care when negating this. "[NM] != 0" will be true when * [NM] is absent, thus consider "[NM] && [NM] != 0". */ -typedef struct { +typedef struct hts_expr_val_t { char is_str; // Use .s vs .d char is_true; // Force true if even zero kstring_t s; // is_str and empty s permitted (eval as false) @@ -85,7 +85,7 @@ typedef int (hts_expr_sym_func)(void *data, char *str, char **end, * * The type of the returned result may be numeric of string, as defined by * the is_str member. It can also be explicitly defined to be true even - * for a null value. This may be used to check for the existance of + * for a null value. This may be used to check for the existence of * something, irrespective of whether that something evaluates to zero. 
*/ HTSLIB_EXPORT diff --git a/htslib/sam.h b/htslib/sam.h index 3c7b9b693..97aa4acf2 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1408,6 +1408,9 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, HTSLIB_EXPORT int sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED; +// Forward declaration, see hts_expr.h for full. +struct hts_filter_t; + /// sam_passes_filter - Checks whether a record passes an hts_filter. /** @param h Pointer to the header structure previously read * @param b Pointer to the BAM record to be checked @@ -1415,7 +1418,8 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, * @return 1 if passes, 0 if not, and <0 on error. */ HTSLIB_EXPORT -int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt); +int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, + struct hts_filter_t *filt); /************************************* *** Manipulating auxiliary fields *** diff --git a/htslib_vars.mk b/htslib_vars.mk index fa907f50f..3c2275d4a 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -29,10 +29,10 @@ htslib_bgzf_h = $(HTSPREFIX)htslib/bgzf.h $(htslib_hts_defs_h) htslib_cram_h = $(HTSPREFIX)htslib/cram.h $(htslib_hts_defs_h) $(htslib_hts_h) $(htslib_sam_h) htslib_faidx_h = $(HTSPREFIX)htslib/faidx.h $(htslib_hts_defs_h) $(htslib_hts_h) htslib_hfile_h = $(HTSPREFIX)htslib/hfile.h $(htslib_hts_defs_h) -htslib_hts_h = $(HTSPREFIX)htslib/hts.h $(htslib_hts_defs_h) $(htslib_hts_log_h) $(htslib_kstring_h) $(htslib_kroundup_h) $(htslib_hts_expr_h) +htslib_hts_h = $(HTSPREFIX)htslib/hts.h $(htslib_hts_defs_h) $(htslib_hts_log_h) $(htslib_kstring_h) $(htslib_kroundup_h) htslib_hts_defs_h = $(HTSPREFIX)htslib/hts_defs.h htslib_hts_endian_h = $(HTSPREFIX)htslib/hts_endian.h -htslib_hts_expr_h = $(HTSPREFIX)htslib/hts_expr.h +htslib_hts_expr_h = $(HTSPREFIX)htslib/hts_expr.h $(htslib_kstring_h) $(htslib_hts_defs_h) htslib_hts_log_h = $(HTSPREFIX)htslib/hts_log.h 
$(htslib_hts_defs_h) htslib_hts_os_h = $(HTSPREFIX)htslib/hts_os.h $(htslib_hts_defs_h) htslib_kbitset_h = $(HTSPREFIX)htslib/kbitset.h diff --git a/sam.c b/sam.c index d044046eb..dbfc42e56 100644 --- a/sam.c +++ b/sam.c @@ -49,6 +49,7 @@ DEALINGS IN THE SOFTWARE. */ #include "sam_internal.h" #include "htslib/hfile.h" #include "htslib/hts_endian.h" +#include "htslib/hts_expr.h" #include "header.h" #include "htslib/khash.h" @@ -1098,6 +1099,34 @@ static int sam_readrec_rest(BGZF *ignored, void *fpv, void *bv, int *tid, hts_po return ret; } +// Internal (for now) func used by bam_sym_lookup. This is copied from +// samtools/bam.c. +static const char *bam_get_library(const bam_hdr_t *h, const bam1_t *b) +{ + const char *rg; + kstring_t lib = { 0, 0, NULL }; + rg = (char *)bam_aux_get(b, "RG"); + + if (!rg) + return NULL; + else + rg++; + + if (sam_hdr_find_tag_id((bam_hdr_t *)h, "RG", "ID", rg, "LB", &lib) < 0) + return NULL; + + static char LB_text[1024]; + int len = lib.l < sizeof(LB_text) - 1 ? lib.l : sizeof(LB_text) - 1; + + memcpy(LB_text, lib.s, len); + LB_text[len] = 0; + + free(lib.s); + + return LB_text; +} + + // Bam record pointer and SAM header combined typedef struct { const sam_hdr_t *h; @@ -1196,6 +1225,16 @@ static int bam_sym_lookup(void *data, char *str, char **end, } break; + case 'l': + if (memcmp(str, "library", 7) == 0) { + *end = str+7; + res->is_str = 1; + const char *lib = bam_get_library(hb->h, b); + kputs(lib ? 
lib : "", ks_clear(&res->s)); + return 0; + } + break; + case 'm': if (memcmp(str, "mapq", 4) == 0) { *end = str+4; @@ -1241,13 +1280,22 @@ static int bam_sym_lookup(void *data, char *str, char **end, case 'q': if (memcmp(str, "qlen", 4) == 0) { *end = str+4; - res->d = b->core.l_qseq; + res->d = bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)); return 0; } else if (memcmp(str, "qname", 5) == 0) { *end = str+5; res->is_str = 1; kputs(bam_get_qname(b), ks_clear(&res->s)); return 0; + } else if (memcmp(str, "qual", 4) == 0) { + *end = str+4; + ks_clear(&res->s); + if (ks_resize(&res->s, b->core.l_qseq+1) < 0) + return -1; + memcpy(res->s.s, bam_get_qual(b), b->core.l_qseq); + res->s.l = b->core.l_qseq; + res->is_str = 1; + return 0; } break; @@ -1275,6 +1323,20 @@ static int bam_sym_lookup(void *data, char *str, char **end, } break; + case 's': + if (memcmp(str, "seq", 3) == 0) { + *end = str+3; + ks_clear(&res->s); + if (ks_resize(&res->s, b->core.l_qseq+1) < 0) + return -1; + nibble2base(bam_get_seq(b), res->s.s, b->core.l_qseq); + res->s.s[b->core.l_qseq] = 0; + res->s.l = b->core.l_qseq; + res->is_str = 1; + return 0; + } + break; + case 't': if (memcmp(str, "tlen", 4) == 0) { *end = str+4; @@ -1362,7 +1424,7 @@ static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t { htsFile *fp = fpv; bam1_t *b = bv; - int filtered, ret; + int pass_filter, ret; do { ret = cram_get_bam_seq(fp->fp.cram, &b); @@ -1376,10 +1438,14 @@ static int cram_readrec(BGZF *ignored, void *fpv, void *bv, int *tid, hts_pos_t *beg = b->core.pos; *end = bam_endpos(b); - filtered = sam_passes_filter(fp->bam_header, b, fp->filter); - if (filtered < 0) - return -2; - } while (filtered == 0); + if (fp->filter) { + pass_filter = sam_passes_filter(fp->bam_header, b, fp->filter); + if (pass_filter < 0) + return -2; + } else { + pass_filter = 1; + } + } while (pass_filter == 0); return ret; } @@ -3423,7 +3489,7 @@ int sam_set_threads(htsFile *fp, int nthreads) { } // Internal 
component of sam_read1 below -static int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { +static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { int ret = bam_read1(fp->fp.bgzf, b); if (h && ret >= 0) { if (b->core.tid >= h->n_targets || b->core.tid < -1 || @@ -3436,7 +3502,7 @@ static int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { } // Internal component of sam_read1 below -static int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) { +static inline int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) { int ret = cram_get_bam_seq(fp->fp.cram, b); if (ret < 0) return cram_eof(fp->fp.cram) ? -1 : -2; @@ -3448,7 +3514,7 @@ static int sam_read1_cram(htsFile *fp, sam_hdr_t *h, bam1_t **b) { } // Internal component of sam_read1 below -static int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { +static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { int ret; // Consume 1st line after header parsing as it wasn't using peek @@ -3558,10 +3624,9 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) ret = sam_read1_cram(fp, h, &b); break; - case sam: { + case sam: ret = sam_read1_sam(fp, h, b); break; - } case empty_format: errno = EPIPE; diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst index 4831c8df8..effb77a26 100644 --- a/test/sam_filter/filter.tst +++ b/test/sam_filter/filter.tst @@ -37,6 +37,9 @@ P string1.out $tv -i 'filter=qname =~ "\.1" && cigar =~ "D"' ../ce#1000.sam P string2.out $tv -i 'filter=rname=="CHROMOSOME_II"' ../ce#5b.sam P string3.out $tv -i 'filter=rname=~"CHROMOSOME_II"' ../ce#5b.sam P string4.out $tv -i 'filter=cigar=~"D"' ../ce#1000.sam +P string5.out $tv -i 'filter=seq =~ "(AT){2}"' ../ce#1000.sam +P string6.out $tv -i 'filter=library=="x"' ../xx#rg.sam +P string7.out $tv -i 'filter=library!="x"' ../xx#rg.sam # Integer ops P int1.out $tv -i 'filter=pos % 23 == 11' ../ce#1000.sam |egrep -cv '^@' @@ -44,3 +47,9 @@ P int2.out $tv -i 'filter=qlen/(flag*mapq+pos)>5' 
../ce#1000.sam |egrep -cv ' # Aux tags P int3.out $tv -i 'filter=[NM]>=10 || [MD]=~"A.*A.*A"' -t4 ../ce#1000.sam |egrep -cv '^@' + +# Functions. +P func1.out $tv -i 'filter=length(seq) != qlen' ../ce#5b.sam | egrep -cv '^@' +P func2.out $tv -i 'filter=min(qual) >= 20' ../ce#1000.sam | egrep -cv '^@' +P func3.out $tv -i 'filter=max(qual) <= 20' ../ce#1000.sam | egrep -cv '^@' +P func4.out $tv -i 'filter=avg(qual) >= 20 && avg(qual) <= 30' ../ce#1000.sam | egrep -cv '^@' diff --git a/test/sam_filter/func1.out b/test/sam_filter/func1.out new file mode 100644 index 000000000..d00491fd7 --- /dev/null +++ b/test/sam_filter/func1.out @@ -0,0 +1 @@ +1 diff --git a/test/sam_filter/func2.out b/test/sam_filter/func2.out new file mode 100644 index 000000000..d81cc0710 --- /dev/null +++ b/test/sam_filter/func2.out @@ -0,0 +1 @@ +42 diff --git a/test/sam_filter/func3.out b/test/sam_filter/func3.out new file mode 100644 index 000000000..0cfbf0888 --- /dev/null +++ b/test/sam_filter/func3.out @@ -0,0 +1 @@ +2 diff --git a/test/sam_filter/func4.out b/test/sam_filter/func4.out new file mode 100644 index 000000000..103a99dd8 --- /dev/null +++ b/test/sam_filter/func4.out @@ -0,0 +1 @@ +604 diff --git a/test/sam_filter/string5.out b/test/sam_filter/string5.out new file mode 100644 index 000000000..4be462117 --- /dev/null +++ b/test/sam_filter/string5.out @@ -0,0 +1,6 @@ +@SQ SN:CHROMOSOME_I LN:1009800 +@SQ SN:CHROMOSOME_II LN:5000 +@SQ SN:CHROMOSOME_III LN:5000 +@SQ SN:CHROMOSOME_IV LN:5000 +@SQ SN:CHROMOSOME_V LN:5000 +SRR065390.9154510 16 CHROMOSOME_I 56 0 100M * 0 0 TTCATATGGGCAGGGAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTA ##############################@B;@?>>B1?BCBBC@>CDB>B@CA@CCAC=AA>>AC;CCACCCCCCC=CCCCCCCCCCCCBCCCCCCCC AS:i:-28 XS:i:-28 XN:i:0 XM:i:14 XO:i:0 XG:i:0 YT:Z:UU MD:Z:0C0C0T1A0G0C0C0T0A0A0G0C0C0T85 NM:i:14 diff --git a/test/sam_filter/string6.out b/test/sam_filter/string6.out new file mode 100644 index 000000000..c2766bd73 --- 
/dev/null +++ b/test/sam_filter/string6.out @@ -0,0 +1,9 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:xx LN:20 AS:? SP:? UR:? M5:bbf4de6d8497a119dda6e074521643dc +@RG ID:x1 SM:x1 +@RG ID:x2 SM:x2 LB:x PG:foo:bar PI:1111 +@PG ID:emacs PN:emacs VN:23.1.1 +@CO also test +@CO other headers +b1 16 xx 1 1 10M * 0 0 AAAAAAAAAA ********** RG:Z:x2 +b2 16 xx 11 1 10M * 0 0 TTTTTTTTTT ********** RG:Z:x2 diff --git a/test/sam_filter/string7.out b/test/sam_filter/string7.out new file mode 100644 index 000000000..8efe886b3 --- /dev/null +++ b/test/sam_filter/string7.out @@ -0,0 +1,11 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:xx LN:20 AS:? SP:? UR:? M5:bbf4de6d8497a119dda6e074521643dc +@RG ID:x1 SM:x1 +@RG ID:x2 SM:x2 LB:x PG:foo:bar PI:1111 +@PG ID:emacs PN:emacs VN:23.1.1 +@CO also test +@CO other headers +a1 16 xx 1 1 10M * 0 0 AAAAAAAAAA ********** RG:Z:x1 +c1 16 xx 1 1 10M * 0 0 AAAAAAAAAA ********** +a2 16 xx 11 1 10M * 0 0 TTTTTTTTTT ********** RG:Z:x1 +c2 16 xx 11 1 10M * 0 0 TTTTTTTTTT ********** diff --git a/test/test_expr.c b/test/test_expr.c index 258548b88..606a9b3b5 100644 --- a/test/test_expr.c +++ b/test/test_expr.c @@ -158,7 +158,7 @@ int test(void) { { 1, NULL, " (2*3)&7 > 4"}, { 0, NULL, " (2*3)&(7 > 4)"}, // C precedence equiv - { 1, NULL, "((2*3)&7) > 4"}, // Python precendece equiv + { 1, NULL, "((2*3)&7) > 4"}, // Python precedence equiv { 1, NULL, "((2*3)&7) > 4 && 2*2 <= 4"}, { 1, "plugh", "magic"}, From cecf738a4a200547250fd9b22e4365af1a9610fb Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 7 Jan 2021 09:37:05 +0000 Subject: [PATCH 038/488] Remove knet. (PR #1200) There's little benefit to keeping it (basically FTP and unsecured http), while curl is already required for htsget, refget (CRAM ref cache), AWS S3 and GCS protocols. Curl is available on all platforms we support. The public API in htslib/knetfile.h is retained for compatibility. The knet_* functions have been replaced by stubs that use hFILE internally. 
htslib/knetfile.h has HTS_DEPRECATED markers for these functions, so anyone compiling against the new code will get a warning that we intend to delete them in the future. The knet_tell macro does direct access of knetFile::offset so we cannot simply wrap up htell as we'd like. Similarly knet_fileno directly queries knetFile::fd. However the only use I've found of this was in old copies of bgzf and bcftools main which then did fstat on it so it's only filled out for local files. (That's good because getting it out of curl isn't supported by our code.) --- INSTALL | 10 +- Makefile | 7 +- NEWS | 6 + hfile.c | 80 +++- hfile_internal.h | 3 - hfile_libcurl.c | 2 +- hfile_net.c | 112 ------ htslib.mk | 3 - htslib/knetfile.h | 62 ++- htslib_vars.mk | 1 - knetfile.c | 645 -------------------------------- test/header_syms.pl | 7 +- test/maintainer/check_spaces.pl | 2 - 13 files changed, 125 insertions(+), 815 deletions(-) delete mode 100644 hfile_net.c delete mode 100644 knetfile.c diff --git a/INSTALL b/INSTALL index 3ee36d0cf..7ae91da33 100644 --- a/INSTALL +++ b/INSTALL @@ -41,9 +41,10 @@ a development ('-dev' or '-devel') package separate from the main library. Disabling libbzip2 and liblzma will make some CRAM files unreadable, so is not recommended. -Using libcurl provides HTSlib with better network protocol support, for -example it enables the use of https:// URLs. It is also required if -direct access to Amazon S3 or Google Cloud Storage is enabled. +Using libcurl provides HTSlib with network protocol support, for +example it enables the use of ftp://, http://, and https:// URLs. +It is also required if direct access to Amazon S3 or Google Cloud +Storage is enabled. Amazon S3 support requires an HMAC function to calculate a message authentication code. 
On MacOS, the CCHmac function from the standard @@ -131,8 +132,7 @@ various features and specify further optional external requirements: --enable-libcurl Use libcurl () to implement network access to - remote files via FTP, HTTP, HTTPS, etc. By default, HTSlib uses its - own simple networking code to provide access via FTP and HTTP only. + remote files via FTP, HTTP, HTTPS, etc. --enable-gcs Implement network access to Google Cloud Storage. By default or with diff --git a/Makefile b/Makefile index 840d9a2f8..eac4b809d 100644 --- a/Makefile +++ b/Makefile @@ -144,7 +144,6 @@ show-version: LIBHTS_OBJS = \ kfunc.o \ - knetfile.o \ kstring.o \ bcf_sr_sort.o \ bgzf.o \ @@ -152,7 +151,6 @@ LIBHTS_OBJS = \ faidx.o \ header.o \ hfile.o \ - hfile_net.o \ hts.o \ hts_expr.o \ hts_os.o\ @@ -334,12 +332,10 @@ hts-object-files: $(LIBHTS_OBJS) bgzf.o bgzf.pico: bgzf.c config.h $(htslib_hts_h) $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_thread_pool_h) $(htslib_hts_endian_h) cram/pooled_alloc.h $(hts_internal_h) $(htslib_khash_h) errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htslib_hts_os_h) kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) -knetfile.o knetfile.pico: knetfile.c config.h $(htslib_hts_log_h) $(htslib_knetfile_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(htslib_kstring_h) $(hts_internal_h) $(htslib_khash_h) hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) -hfile_net.o hfile_net.pico: hfile_net.c config.h $(hfile_internal_h) $(htslib_knetfile_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h 
$(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) @@ -544,8 +540,7 @@ test-shlib-exports: header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt ( echo "Error: Found unexported symbols (listed above)" ; false ) # Extract symbols that should be exported from public headers using ctags -# Filter out macros in htslib/hts_defs.h, and knet_win32_ functions that -# aren't needed on non-Windows platforms. +# Filter out macros in htslib/hts_defs.h. header-exports.txt: test/header_syms.pl htslib/*.h test/header_syms.pl htslib/*.h | sort -u -o $@ diff --git a/NEWS b/NEWS index 2fd012bf9..658b7cad2 100644 --- a/NEWS +++ b/NEWS @@ -1,7 +1,13 @@ Noteworthy changes in release a.b ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* The knet networking code has been removed. It only supported the http + and ftp protocols, and a better and safer alternative using libcurl + has been available since release 1.3. If you need access to ftp:// and + http:// URLs, HTSlib should be built with libcurl support. (#1200) +* The old htslib/knetfile.h interfaces have been marked as deprecated. Any + code still using them should be updated to use hFILE instead. 
(#1200) Noteworthy changes in release 1.11 (22nd September 2020) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/hfile.c b/hfile.c index 8bbba5cf9..218520a7c 100644 --- a/hfile.c +++ b/hfile.c @@ -1061,7 +1061,6 @@ static int load_hfile_plugins() hfile_add_scheme_handler("data", &data); hfile_add_scheme_handler("file", &file); hfile_add_scheme_handler("preload", &preload); - init_add_plugin(NULL, hfile_plugin_init_net, "knetfile"); init_add_plugin(NULL, hfile_plugin_init_mem, "mem"); init_add_plugin(NULL, hfile_plugin_init_crypt4gh_needed, "crypt4gh-needed"); @@ -1213,3 +1212,82 @@ char *haddextension(struct kstring_t *buffer, const char *filename, kputs(trailing, buffer) >= 0) return buffer->s; else return NULL; } + + +/* + * ---------------------------------------------------------------------- + * Minimal stub functions for knet, added after the removal of + * hfile_net.c and knetfile.c. + * + * They exist purely for ABI compatibility, but are simply wrappers to + * hFILE. API should be compatible except knet_fileno (unused?). + * + * CULL THESE and knetfile.h at the next .so version bump. + */ +typedef struct knetFile_s { + // As per htslib/knetfile.h. Duplicated here as we don't wish to + // have any dependence on the deprecated knetfile.h interface, plus + // it's hopefully only temporary. + int type, fd; + int64_t offset; + char *host, *port; + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; + int64_t file_size; + char *path, *http_host; + + // Our local addition + hFILE *hf; +} knetFile; + +HTSLIB_EXPORT +knetFile *knet_open(const char *fn, const char *mode) { + knetFile *fp = calloc(1, sizeof(*fp)); + if (!fp) return NULL; + if (!(fp->hf = hopen(fn, mode))) { + free(fp); + fp = NULL; + } + + // FD backend is the only one implementing knet_fileno + fp->fd = fp->hf->backend == &fd_backend + ? 
((hFILE_fd *)fp->hf)->fd + : -1; + + return fp; +} + +HTSLIB_EXPORT +knetFile *knet_dopen(int fd, const char *mode) { + knetFile *fp = calloc(1, sizeof(*fp)); + if (!fp) return NULL; + if (!(fp->hf = hdopen(fd, mode))) { + free(fp); + return NULL; // hdopen failed: bail out before fp is dereferenced below + } + fp->fd = fd; + return fp; +} + +HTSLIB_EXPORT +ssize_t knet_read(knetFile *fp, void *buf, size_t len) { + ssize_t r = hread(fp->hf, buf, len); + fp->offset += r>0?r:0; + return r; +} + +HTSLIB_EXPORT +off_t knet_seek(knetFile *fp, off_t off, int whence) { + off_t r = hseek(fp->hf, off, whence); + if (r >= 0) + fp->offset = r; + return r; +} + +HTSLIB_EXPORT +int knet_close(knetFile *fp) { + int r = hclose(fp->hf); + free(fp); + return r; +} diff --git a/hfile_internal.h b/hfile_internal.h index 386689626..70cc99c57 100644 --- a/hfile_internal.h +++ b/hfile_internal.h @@ -179,9 +179,6 @@ extern int hfile_plugin_init_s3(struct hFILE_plugin *self); extern int hfile_plugin_init_s3_write(struct hFILE_plugin *self); #endif -/* This one is never built as a separate plugin. */ -extern int hfile_plugin_init_net(struct hFILE_plugin *self); - // Callback to allow headers to be set in http connections. Currently used // to allow s3 to renew tokens when seeking. Kept internal for now, // although we may consider exposing it in the API later. diff --git a/hfile_libcurl.c b/hfile_libcurl.c index 090db348a..2de7ccbd9 100644 --- a/hfile_libcurl.c +++ b/hfile_libcurl.c @@ -1221,7 +1221,7 @@ libcurl_open(const char *url, const char *modes, http_headers *headers) err = curl_easy_setopt(fp->easy, CURLOPT_PRIVATE, fp); // Avoid many repeated CWD calls with FTP, instead requesting the filename - // by full path (as done in knet, but not strictly compliant with RFC1738). + // by full path (but not strictly compliant with RFC1738). 
err |= curl_easy_setopt(fp->easy, CURLOPT_FTP_FILEMETHOD, CURLFTPMETHOD_NOCWD); if (mode == 'r') { diff --git a/hfile_net.c b/hfile_net.c deleted file mode 100644 index 5443b2240..000000000 --- a/hfile_net.c +++ /dev/null @@ -1,112 +0,0 @@ -/* hfile_net.c -- network backend for low-level input/output streams. - - Copyright (C) 2013-2015 Genome Research Ltd. - - Author: John Marshall - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. 
*/ - -#include - -#include -#include - -#include "hfile_internal.h" - -#include "htslib/knetfile.h" - -typedef struct { - hFILE base; - knetFile *netfp; -} hFILE_net; - -static int net_inited = 0; - -#ifdef _WIN32 -static void net_exit(void) -{ - knet_win32_destroy(); -} -#endif - -static int net_init(void) -{ -#ifdef _WIN32 - if (knet_win32_init() != 0) return -1; - - // In the unlikely event atexit() fails, it's better to succeed here and - // carry on and do the I/O; then eventually when the program exits, we'll - // merely have failed to clean up properly, as if we had aborted. - (void) atexit(net_exit); -#endif - - net_inited = 1; - return 0; -} - -static ssize_t net_read(hFILE *fpv, void *buffer, size_t nbytes) -{ - hFILE_net *fp = (hFILE_net *) fpv; - return knet_read(fp->netfp, buffer, nbytes); -} - -static off_t net_seek(hFILE *fpv, off_t offset, int whence) -{ - hFILE_net *fp = (hFILE_net *) fpv; - return knet_seek(fp->netfp, offset, whence); -} - -static int net_close(hFILE *fpv) -{ - hFILE_net *fp = (hFILE_net *) fpv; - return knet_close(fp->netfp); -} - -static const struct hFILE_backend net_backend = -{ - net_read, NULL, net_seek, NULL, net_close -}; - -hFILE *hopen_net(const char *filename, const char *mode) -{ - hFILE_net *fp; - - // Do any networking initialisation if this is the first use. - if (! 
net_inited) { if (net_init() < 0) return NULL; } - - fp = (hFILE_net *) hfile_init(sizeof (hFILE_net), mode, 0); - if (fp == NULL) return NULL; - - fp->netfp = knet_open(filename, mode); - if (fp->netfp == NULL) { hfile_destroy((hFILE *) fp); return NULL; } - - fp->base.backend = &net_backend; - return &fp->base; -} - -int hfile_plugin_init_net(struct hFILE_plugin *self) -{ - static const struct hFILE_scheme_handler handler = - { hopen_net, hfile_always_remote, "knetfile", 0 }; - - self->name = "knetfile"; - hfile_add_scheme_handler("http", &handler); - hfile_add_scheme_handler("ftp", &handler); - return 0; -} diff --git a/htslib.mk b/htslib.mk index 8cd659fc3..254c25797 100644 --- a/htslib.mk +++ b/htslib.mk @@ -62,7 +62,6 @@ HTSLIB_PUBLIC_HEADERS = \ $(HTSDIR)/htslib/khash.h \ $(HTSDIR)/htslib/khash_str2int.h \ $(HTSDIR)/htslib/klist.h \ - $(HTSDIR)/htslib/knetfile.h \ $(HTSDIR)/htslib/kseq.h \ $(HTSDIR)/htslib/ksort.h \ $(HTSDIR)/htslib/kstring.h \ @@ -89,7 +88,6 @@ HTSLIB_ALL = \ $(HTSDIR)/hfile.c \ $(HTSDIR)/hfile_gcs.c \ $(HTSDIR)/hfile_libcurl.c \ - $(HTSDIR)/hfile_net.c \ $(HTSDIR)/hfile_s3.c \ $(HTSDIR)/hfile_s3_write.c \ $(HTSDIR)/hts.c \ @@ -97,7 +95,6 @@ HTSLIB_ALL = \ $(HTSDIR)/hts_internal.h \ $(HTSDIR)/hts_os.c \ $(HTSDIR)/kfunc.c \ - $(HTSDIR)/knetfile.c \ $(HTSDIR)/kstring.c \ $(HTSDIR)/md5.c \ $(HTSDIR)/multipart.c \ diff --git a/htslib/knetfile.h b/htslib/knetfile.h index 87fba4adc..598ed379a 100644 --- a/htslib/knetfile.h +++ b/htslib/knetfile.h @@ -50,19 +50,20 @@ #define KNF_TYPE_FTP 2 #define KNF_TYPE_HTTP 3 +// Kept for API/ABI compatibility only. Do not use directly! 
typedef struct knetFile_s { - int type, fd; - int64_t offset; - char *host, *port; - - // the following are for FTP only - int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; - char *response, *retr, *size_cmd; - int64_t seek_offset; // for lazy seek - int64_t file_size; - - // the following are for HTTP only - char *path, *http_host; + int type, fd; + int64_t offset; + char *host, *port; + + // the following are for FTP only + int ctrl_fd, pasv_ip[4], pasv_port, max_response, no_reconnect, is_ready; + char *response, *retr, *size_cmd; + int64_t seek_offset; // for lazy seek + int64_t file_size; + + // the following are for HTTP only + char *path, *http_host; } knetFile; #define knet_tell(fp) ((fp)->offset) @@ -72,35 +73,30 @@ typedef struct knetFile_s { extern "C" { #endif -#ifdef _WIN32 - int knet_win32_init(); - void knet_win32_destroy(); -#endif - HTSLIB_EXPORT - knetFile *knet_open(const char *fn, const char *mode); + knetFile *knet_open(const char *fn, const char *mode) HTS_DEPRECATED("Please use hopen instead"); - /* - This only works with local files. - */ + /* + This only works with local files. + */ HTSLIB_EXPORT - knetFile *knet_dopen(int fd, const char *mode); + knetFile *knet_dopen(int fd, const char *mode) HTS_DEPRECATED("Please use hdopen instead"); - /* - If ->is_ready==0, this routine updates ->fd; otherwise, it simply - reads from ->fd. - */ + /* + If ->is_ready==0, this routine updates ->fd; otherwise, it simply + reads from ->fd. + */ HTSLIB_EXPORT - ssize_t knet_read(knetFile *fp, void *buf, size_t len); + ssize_t knet_read(knetFile *fp, void *buf, size_t len) HTS_DEPRECATED("Please use hread instead"); - /* - This routine only sets ->offset and ->is_ready=0. It does not - communicate with the FTP server. - */ + /* + This routine only sets ->offset and ->is_ready=0. It does not + communicate with the FTP server. 
+ */ HTSLIB_EXPORT - off_t knet_seek(knetFile *fp, off_t off, int whence); + off_t knet_seek(knetFile *fp, off_t off, int whence) HTS_DEPRECATED("Please use hseek instead"); HTSLIB_EXPORT - int knet_close(knetFile *fp); + int knet_close(knetFile *fp) HTS_DEPRECATED("Please use hclose instead"); #ifdef __cplusplus } diff --git a/htslib_vars.mk b/htslib_vars.mk index 3c2275d4a..85835f872 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -40,7 +40,6 @@ htslib_kfunc_h = $(HTSPREFIX)htslib/kfunc.h $(htslib_hts_defs_h) htslib_khash_h = $(HTSPREFIX)htslib/khash.h $(htslib_kstring_h) $(htslib_kroundup_h) htslib_khash_str2int_h = $(HTSPREFIX)htslib/khash_str2int.h $(htslib_khash_h) htslib_klist_h = $(HTSPREFIX)htslib/klist.h -htslib_knetfile_h = $(HTSPREFIX)htslib/knetfile.h $(htslib_hts_defs_h) htslib_kroundup_h = $(HTSPREFIX)htslib/kroundup.h htslib_kseq_h = $(HTSPREFIX)htslib/kseq.h htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h diff --git a/knetfile.c b/knetfile.c deleted file mode 100644 index f0a608ea4..000000000 --- a/knetfile.c +++ /dev/null @@ -1,645 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, 2012-2014, 2017 Genome Research Ltd (GRL). - 2010 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Probably I will not do socket programming in the next few years and - therefore I decide to heavily annotate this file, for Linux and - Windows as well. -ac */ - -#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#ifndef _WIN32 -#include -#include -#include -#include -#endif - -#include "htslib/knetfile.h" -#include "htslib/hts_log.h" - -/* In winsock.h, the type of a socket is SOCKET, which is: "typedef - * u_int SOCKET". An invalid SOCKET is: "(SOCKET)(~0)", or signed - * integer -1. In knetfile.c, I use "int" for socket type - * throughout. This should be improved to avoid confusion. - * - * In Linux/Mac, recv() and read() do almost the same thing. You can see - * in the header file that netread() is simply an alias of read(). In - * Windows, however, they are different and using recv() is mandatory. - */ - -/* This function tests if the file handler is ready for reading (or - * writing if is_read==0). */ -static int socket_wait(int fd, int is_read) -{ - fd_set fds, *fdr = 0, *fdw = 0; - struct timeval tv; - int ret; - tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out - FD_ZERO(&fds); - FD_SET(fd, &fds); - if (is_read) fdr = &fds; - else fdw = &fds; - ret = select(fd+1, fdr, fdw, 0, &tv); -#ifndef _WIN32 - if (ret == -1) perror("select"); -#else - if (ret == 0) - hts_log_warning("Select timed out"); - else if (ret == SOCKET_ERROR) - hts_log_error("Select returned error %d", WSAGetLastError()); -#endif - return ret; -} - -#ifndef _WIN32 -/* This function does not work with Windows due to the lack of - * getaddrinfo() in winsock. 
It is addapted from an example in "Beej's - * Guide to Network Programming" (http://beej.us/guide/bgnet/). */ -# ifdef __SUNPRO_C -# pragma error_messages(off, E_END_OF_LOOP_CODE_NOT_REACHED) -# endif -static int socket_connect(const char *host, const char *port) -{ -#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) - - int ai_err, on = 1, fd; - struct linger lng = { 0, 0 }; - struct addrinfo hints, *res = 0; - memset(&hints, 0, sizeof(struct addrinfo)); - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - /* In Unix/Mac, getaddrinfo() is the most convenient way to get - * server information. */ - if ((ai_err = getaddrinfo(host, port, &hints, &res)) != 0) { hts_log_error("Can't resolve %s:%s: %s", host, port, gai_strerror(ai_err)); return -1; } - if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); - /* The following two setsockopt() are used by ftplib - * (http://nbpfaus.net/~pfau/ftplib/). I am not sure if they - * necessary. */ - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); - if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); - if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); - freeaddrinfo(res); - return fd; -} -# ifdef __SUNPRO_C -# pragma error_messages(off, E_END_OF_LOOP_CODE_NOT_REACHED) -# endif -#else -/* MinGW's printf has problem with "%lld" */ -char *int64tostr(char *buf, int64_t x) -{ - int cnt; - int i = 0; - do { - buf[i++] = '0' + x % 10; - x /= 10; - } while (x); - buf[i] = 0; - for (cnt = i, i = 0; i < cnt/2; ++i) { - int c = buf[i]; buf[i] = buf[cnt-i-1]; buf[cnt-i-1] = c; - } - return buf; -} - -int64_t strtoint64(const char *buf) -{ - int64_t x; - for (x = 0; *buf != '\0'; ++buf) - x = x * 10 + ((int64_t) *buf - 48); - return x; -} -/* In windows, the first thing is to establish the TCP connection. 
*/ -int knet_win32_init() -{ - WSADATA wsaData; - return WSAStartup(MAKEWORD(2, 2), &wsaData); -} -void knet_win32_destroy() -{ - WSACleanup(); -} -/* A slightly modfied version of the following function also works on - * Mac (and presummably Linux). However, this function is not stable on - * my Mac. It sometimes works fine but sometimes does not. Therefore for - * non-Windows OS, I do not use this one. */ -static SOCKET socket_connect(const char *host, const char *port) -{ -#define __err_connect(func) \ - do { \ - hts_log_error("The %s operation returned error %d", func, WSAGetLastError()); \ - return -1; \ - } while (0) - - int on = 1; - SOCKET fd; - struct linger lng = { 0, 0 }; - struct sockaddr_in server; - struct hostent *hp = 0; - // open socket - if ((fd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP)) == INVALID_SOCKET) __err_connect("socket"); - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char*)&on, sizeof(on)) == -1) __err_connect("setsockopt"); - if (setsockopt(fd, SOL_SOCKET, SO_LINGER, (char*)&lng, sizeof(lng)) == -1) __err_connect("setsockopt"); - // get host info - if (isalpha(host[0])) hp = gethostbyname(host); - else { - struct in_addr addr; - addr.s_addr = inet_addr(host); - hp = gethostbyaddr((char*)&addr, 4, AF_INET); - } - if (hp == 0) __err_connect("gethost"); - // connect - server.sin_addr.s_addr = *((unsigned long*)hp->h_addr); - server.sin_family= AF_INET; - server.sin_port = htons(atoi(port)); - if (connect(fd, (struct sockaddr*)&server, sizeof(server)) != 0) __err_connect("connect"); - // freehostent(hp); // strangely in MSDN, hp is NOT freed (memory leak?!) - return fd; -} -#endif - -static off_t my_netread(int fd, void *buf, off_t len) -{ - off_t rest = len, curr, l = 0; - /* recv() and read() may not read the required length of data with - * one call. They have to be called repeatedly. 
*/ - while (rest) { - if (socket_wait(fd, 1) <= 0) break; // socket is not ready for reading - curr = netread(fd, (void*)((char*)buf + l), rest); - /* According to the glibc manual, section 13.2, a zero returned - * value indicates end-of-file (EOF), which should mean that - * read() will not return zero if EOF has not been met but data - * are not immediately available. */ - if (curr == 0) break; - l += curr; rest -= curr; - } - return l; -} - -/************************* - * FTP specific routines * - *************************/ - -static int kftp_get_response(knetFile *ftp) -{ -#ifndef _WIN32 - unsigned char c; -#else - char c; -#endif - int n = 0; - char *p; - if (socket_wait(ftp->ctrl_fd, 1) <= 0) return 0; - while (netread(ftp->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O - //fputc(c, stderr); - if (n >= ftp->max_response) { - ftp->max_response = ftp->max_response? ftp->max_response<<1 : 256; - ftp->response = (char*)realloc(ftp->response, ftp->max_response); - } - ftp->response[n++] = c; - if (c == '\n') { - if (n >= 4 && isdigit((int)((unsigned char) ftp->response[0])) && isdigit((int)((unsigned char) ftp->response[1])) && isdigit((int)((unsigned char) ftp->response[2])) - && ftp->response[3] != '-') break; - n = 0; - continue; - } - } - if (n < 2) return -1; - ftp->response[n-2] = 0; - return strtol(ftp->response, &p, 0); -} - -static int kftp_send_cmd(knetFile *ftp, const char *cmd, int is_get) -{ - if (socket_wait(ftp->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing - int len = strlen(cmd); - if ( netwrite(ftp->ctrl_fd, cmd, len) != len ) return -1; - return is_get? 
kftp_get_response(ftp) : 0; -} - -static int kftp_pasv_prep(knetFile *ftp) -{ - char *p; - int v[6]; - kftp_send_cmd(ftp, "PASV\r\n", 1); - for (p = ftp->response; *p && *p != '('; ++p); - if (*p != '(') return -1; - ++p; - sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); - memcpy(ftp->pasv_ip, v, 4 * sizeof(int)); - ftp->pasv_port = (v[4]<<8&0xff00) + v[5]; - return 0; -} - - -static int kftp_pasv_connect(knetFile *ftp) -{ - char host[80], port[10]; - if (ftp->pasv_port == 0) { - hts_log_error("Must call kftp_pasv_prep() first"); - return -1; - } - sprintf(host, "%d.%d.%d.%d", ftp->pasv_ip[0], ftp->pasv_ip[1], ftp->pasv_ip[2], ftp->pasv_ip[3]); - sprintf(port, "%d", ftp->pasv_port); - ftp->fd = socket_connect(host, port); - if (ftp->fd == -1) return -1; - return 0; -} - -int kftp_connect(knetFile *ftp) -{ - ftp->ctrl_fd = socket_connect(ftp->host, ftp->port); - if (ftp->ctrl_fd == -1) return -1; - kftp_get_response(ftp); - kftp_send_cmd(ftp, "USER anonymous\r\n", 1); - kftp_send_cmd(ftp, "PASS kftp@\r\n", 1); - kftp_send_cmd(ftp, "TYPE I\r\n", 1); - return 0; -} - -int kftp_reconnect(knetFile *ftp) -{ - if (ftp->ctrl_fd != -1) { - netclose(ftp->ctrl_fd); - ftp->ctrl_fd = -1; - } - netclose(ftp->fd); - ftp->fd = -1; - return kftp_connect(ftp); -} - -// initialize ->type, ->host, ->retr and ->size -knetFile *kftp_parse_url(const char *fn, const char *mode) -{ - knetFile *fp; - char *p; - int l; - if (strstr(fn, "ftp://") != fn) return 0; - for (p = (char*)fn + 6; *p && *p != '/'; ++p); - if (*p != '/') return 0; - l = p - fn - 6; - fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_FTP; - fp->fd = -1; - /* the Linux/Mac version of socket_connect() also recognizes a port - * like "ftp", but the Windows version does not. 
*/ - fp->port = strdup("21"); - fp->host = (char*)calloc(l + 1, 1); - if (strchr(mode, 'c')) fp->no_reconnect = 1; - strncpy(fp->host, fn + 6, l); - fp->retr = (char*)calloc(strlen(p) + 8, 1); - sprintf(fp->retr, "RETR %s\r\n", p); - fp->size_cmd = (char*)calloc(strlen(p) + 8, 1); - sprintf(fp->size_cmd, "SIZE %s\r\n", p); - fp->seek_offset = 0; - return fp; -} -// place ->fd at offset off -int kftp_connect_file(knetFile *fp) -{ - int ret; - long long file_size; - if (fp->fd != -1) { - netclose(fp->fd); - if (fp->no_reconnect) kftp_get_response(fp); - } - kftp_pasv_prep(fp); - kftp_send_cmd(fp, fp->size_cmd, 1); -#ifndef _WIN32 - // If the file does not exist, the response will be "550 Could not get file - // size". Be silent on failure, hts_idx_load can be trying the existence of .csi or .tbi. - if ( sscanf(fp->response,"%*d %lld", &file_size) != 1 ) return -1; -#else - const char *p = fp->response; - while (*p != ' ') ++p; - while (*p < '0' || *p > '9') ++p; - file_size = strtoint64(p); -#endif - fp->file_size = file_size; - if (fp->offset>=0) { - char tmp[32]; -#ifndef _WIN32 - sprintf(tmp, "REST %lld\r\n", (long long)fp->offset); -#else - strcpy(tmp, "REST "); - int64tostr(tmp + 5, fp->offset); - strcat(tmp, "\r\n"); -#endif - kftp_send_cmd(fp, tmp, 1); - } - kftp_send_cmd(fp, fp->retr, 0); - kftp_pasv_connect(fp); - ret = kftp_get_response(fp); - if (ret != 150) { - hts_log_error("%s", fp->response); - netclose(fp->fd); - fp->fd = -1; - return -1; - } - fp->is_ready = 1; - return 0; -} - - -/************************** - * HTTP specific routines * - **************************/ - -knetFile *khttp_parse_url(const char *fn, const char *mode) -{ - knetFile *fp; - char *p, *proxy, *q; - int l; - if (strstr(fn, "http://") != fn) return 0; - // set ->http_host - for (p = (char*)fn + 7; *p && *p != '/'; ++p); - l = p - fn - 7; - fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->http_host = (char*)calloc(l + 1, 1); - strncpy(fp->http_host, fn + 7, l); - 
fp->http_host[l] = 0; - for (q = fp->http_host; *q && *q != ':'; ++q); - if (*q == ':') *q++ = 0; - // get http_proxy - proxy = getenv("http_proxy"); - // set ->host, ->port and ->path - if (proxy == 0) { - fp->host = strdup(fp->http_host); // when there is no proxy, server name is identical to http_host name. - fp->port = strdup(*q? q : "80"); - fp->path = strdup(*p? p : "/"); - } else { - fp->host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); - for (q = fp->host; *q && *q != ':'; ++q); - if (*q == ':') *q++ = 0; - fp->port = strdup(*q? q : "80"); - fp->path = strdup(fn); - } - fp->type = KNF_TYPE_HTTP; - fp->ctrl_fd = fp->fd = -1; - fp->seek_offset = 0; - return fp; -} - -int khttp_connect_file(knetFile *fp) -{ - int ret, l = 0; - char *buf, *p; - if (fp->fd != -1) netclose(fp->fd); - fp->fd = socket_connect(fp->host, fp->port); - buf = (char*)calloc(0x10000, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. - l += sprintf(buf + l, "GET %s HTTP/1.0\r\nHost: %s\r\n", fp->path, fp->http_host); - if (fp->offset != 0) l += sprintf(buf + l, "Range: bytes=%lld-\r\n", (long long)fp->offset); - l += sprintf(buf + l, "\r\n"); - if ( netwrite(fp->fd, buf, l) != l ) { free(buf); return -1; } - l = 0; - while (netread(fp->fd, buf + l, 1)) { // read HTTP header; FIXME: bad efficiency - if (buf[l] == '\n' && l >= 3) - if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; - ++l; - } - buf[l] = 0; - if (l < 14) { // prematured header - free(buf); - netclose(fp->fd); - fp->fd = -1; - return -1; - } - ret = strtol(buf + 8, &p, 0); // HTTP return code - if (ret == 200 && fp->offset>0) { // 200 (complete result); then skip beginning of the file - off_t rest = fp->offset; - while (rest) { - off_t l = rest < 0x10000? 
rest : 0x10000; - rest -= my_netread(fp->fd, buf, l); - } - } else if (ret != 206 && ret != 200) { - // failed to open file - free(buf); - netclose(fp->fd); - switch (ret) { - case 401: errno = EPERM; break; - case 403: errno = EACCES; break; - case 404: errno = ENOENT; break; - case 407: errno = EPERM; break; - case 408: errno = ETIMEDOUT; break; - case 410: errno = ENOENT; break; - case 503: errno = EAGAIN; break; - case 504: errno = ETIMEDOUT; break; - default: errno = (ret >= 400 && ret < 500)? EINVAL : EIO; break; - } - fp->fd = -1; - return -1; - } - free(buf); - fp->is_ready = 1; - return 0; -} - -/******************** - * Generic routines * - ********************/ - -knetFile *knet_open(const char *fn, const char *mode) -{ - knetFile *fp = 0; - if (mode[0] != 'r') { - hts_log_error("Only mode \"r\" is supported"); - errno = ENOTSUP; - return 0; - } - if (strstr(fn, "ftp://") == fn) { - fp = kftp_parse_url(fn, mode); - if (fp == 0) return 0; - if (kftp_connect(fp) == -1) { - knet_close(fp); - return 0; - } - kftp_connect_file(fp); - } else if (strstr(fn, "http://") == fn) { - fp = khttp_parse_url(fn, mode); - if (fp == 0) return 0; - khttp_connect_file(fp); - } else { // local file -#ifdef _WIN32 - /* In windows, O_BINARY is necessary. In Linux/Mac, O_BINARY may - * be undefined on some systems, although it is defined on my - * Mac and the Linux I have tested on. 
*/ - int fd = open(fn, O_RDONLY | O_BINARY); -#else - int fd = open(fn, O_RDONLY); -#endif - if (fd == -1) { - perror("open"); - return 0; - } - fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_LOCAL; - fp->fd = fd; - fp->ctrl_fd = -1; - } - if (fp && fp->fd == -1) { - knet_close(fp); - return 0; - } - return fp; -} - -knetFile *knet_dopen(int fd, const char *mode) -{ - knetFile *fp = (knetFile*)calloc(1, sizeof(knetFile)); - fp->type = KNF_TYPE_LOCAL; - fp->fd = fd; - return fp; -} - -ssize_t knet_read(knetFile *fp, void *buf, size_t len) -{ - off_t l = 0; - if (fp->fd == -1) return 0; - if (fp->type == KNF_TYPE_FTP) { - if (fp->is_ready == 0) { - if (!fp->no_reconnect) kftp_reconnect(fp); - kftp_connect_file(fp); - } - } else if (fp->type == KNF_TYPE_HTTP) { - if (fp->is_ready == 0) - khttp_connect_file(fp); - } - if (fp->type == KNF_TYPE_LOCAL) { // on Windows, the following block is necessary; not on UNIX - size_t rest = len; - ssize_t curr; - while (rest) { - do { - curr = read(fp->fd, (void*)((char*)buf + l), rest); - } while (curr < 0 && EINTR == errno); - if (curr < 0) return -1; - if (curr == 0) break; - l += curr; rest -= curr; - } - } else l = my_netread(fp->fd, buf, len); - fp->offset += l; - return l; -} - -off_t knet_seek(knetFile *fp, off_t off, int whence) -{ - if (whence == SEEK_SET && off == fp->offset) return 0; - if (fp->type == KNF_TYPE_LOCAL) { - /* Be aware that lseek() returns the offset after seeking, while fseek() returns zero on success. 
*/ - off_t offset = lseek(fp->fd, off, whence); - if (offset == -1) return -1; - fp->offset = offset; - return fp->offset; - } else if (fp->type == KNF_TYPE_FTP) { - if (whence == SEEK_CUR) fp->offset += off; - else if (whence == SEEK_SET) fp->offset = off; - else if (whence == SEEK_END) fp->offset = fp->file_size + off; - else return -1; - fp->is_ready = 0; - return fp->offset; - } else if (fp->type == KNF_TYPE_HTTP) { - if (whence == SEEK_END) { // FIXME: can we allow SEEK_END in future? - hts_log_error("SEEK_END is not supported for HTTP. Offset is unchanged"); - errno = ESPIPE; - return -1; - } - if (whence == SEEK_CUR) fp->offset += off; - else if (whence == SEEK_SET) fp->offset = off; - else return -1; - fp->is_ready = 0; - return fp->offset; - } - errno = EINVAL; - hts_log_error("%s", strerror(errno)); - return -1; -} - -int knet_close(knetFile *fp) -{ - if (fp == 0) return 0; - if (fp->ctrl_fd != -1) netclose(fp->ctrl_fd); // FTP specific - if (fp->fd != -1) { - /* On Linux/Mac, netclose() is an alias of close(), but on - * Windows, it is an alias of closesocket(). 
*/ - if (fp->type == KNF_TYPE_LOCAL) close(fp->fd); - else netclose(fp->fd); - } - free(fp->host); free(fp->port); - free(fp->response); free(fp->retr); // FTP specific - free(fp->path); free(fp->http_host); // HTTP specific - free(fp->size_cmd); - free(fp); - return 0; -} - -#ifdef KNETFILE_MAIN -int main(void) -{ - char *buf; - knetFile *fp; - int type = 4, l; -#ifdef _WIN32 - knet_win32_init(); -#endif - buf = calloc(0x100000, 1); - if (type == 0) { - fp = knet_open("knetfile.c", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 1) { // NCBI FTP, large file - fp = knet_open("ftp://ftp.ncbi.nih.gov/1000genomes/ftp/data/NA12878/alignment/NA12878.chrom6.SLX.SRP000032.2009_06.bam", "r"); - knet_seek(fp, 2500000000ll, SEEK_SET); - l = knet_read(fp, buf, 255); - } else if (type == 2) { - fp = knet_open("ftp://ftp.sanger.ac.uk/pub4/treefam/tmp/index.shtml", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 3) { - fp = knet_open("http://www.sanger.ac.uk/Users/lh3/index.shtml", "r"); - knet_seek(fp, 1000, SEEK_SET); - } else if (type == 4) { - fp = knet_open("http://www.sanger.ac.uk/Users/lh3/ex1.bam", "r"); - knet_read(fp, buf, 10000); - knet_seek(fp, 20000, SEEK_SET); - knet_seek(fp, 10000, SEEK_SET); - l = knet_read(fp, buf+10000, 10000000) + 10000; - } - if (type != 4 && type != 1) { - knet_read(fp, buf, 255); - buf[255] = 0; - printf("%s\n", buf); - } else write(fileno(stdout), buf, l); - knet_close(fp); - free(buf); - return 0; -} -#endif diff --git a/test/header_syms.pl b/test/header_syms.pl index fc9cfa303..fe5128a78 100755 --- a/test/header_syms.pl +++ b/test/header_syms.pl @@ -31,7 +31,7 @@ # Roughly equivalent Exuberant-ctags command is: # ctags -f - -n -I HTS_RESULT_USED -I HTS_DEPRECATED+ -I HTS_FORMAT+ \ -# -I KS_ATTR_PRINTF+ -I knet_win32_destroy+ -I knet_win32_init+ +# -I KS_ATTR_PRINTF+ # Unfortunately this is not the default ctags on all platforms, hence this # script. 
@@ -45,8 +45,9 @@ GetOptions('show-processed' => \$show_processed); -# List of functions to strip from the output -my %ignore = map { $_ => 1 } qw(knet_win32_init knet_win32_destroy); +# List of functions to strip from the output. Currently empty, +# but this functionality is retained for potential future use. +my %ignore = map { $_ => 1 } qw( ); foreach my $file (@ARGV) { extract_symbols($file, $show_processed, \%ignore); diff --git a/test/maintainer/check_spaces.pl b/test/maintainer/check_spaces.pl index 0daf24406..d5a53b1ed 100755 --- a/test/maintainer/check_spaces.pl +++ b/test/maintainer/check_spaces.pl @@ -45,14 +45,12 @@ sub check { my %allow_tabs = map { ("$root/$_", 1) } ( 'kfunc.c', -'knetfile.c', 'kstring.c', 'md5.c', 'htslib/khash.h', 'htslib/kseq.h', 'htslib/ksort.h', 'htslib/kstring.h', -'htslib/knetfile.h', 'htslib/klist.h', 'htslib/kbitset.h', 'os/rand.c', From a7f7abcb3e21646f15d4cb94d07920ddb828cf36 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 23 Sep 2019 16:19:36 +0100 Subject: [PATCH 039/488] Bug fix CRAM 2.1 output. In tested I spotted v2.1 output still uses the rANS codec, which is incorrect. It does this because rans is enabled when opening a file if the version is >= 3.0, but we set the version after opening the file and that didn't explicitly disable rans again. Io_lib's Scramble does these bits in reverse (set default vers and then open file) so worked fine. --- cram/cram_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 1904fa6cb..0a6814c9b 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -4890,8 +4890,8 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { } fd->version = major*256 + minor; - if (CRAM_MAJOR_VERS(fd->version) >= 3) - fd->use_rans = 1; + fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 
1 : 0; + break; } From f5203fc035aab279d29ed3054664429adefc7b1d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 16 Sep 2019 14:39:58 +0100 Subject: [PATCH 040/488] Use htscodecs as a submodule. The cram/rANS_static.[ch] implementation has been removed. Updates the build system to make the required parts of htscodecs. Co-Authored-By: Rob Davies --- .appveyor.yml | 1 + .cirrus.yml | 5 +- .gitignore | 1 + .gitmodules | 4 + .travis.yml | 1 + INSTALL | 14 + Makefile | 19 +- configure.ac | 35 +- cram/cram_io.c | 7 +- cram/rANS_byte.h | 352 ----------- cram/rANS_static.c | 912 ----------------------------- cram/rANS_static.h | 51 -- htscodecs | 1 + htscodecs.mk.in | 5 + htscodecs_bundled.mk | 3 + htscodecs_external.mk | 6 + htslib.mk | 13 +- test/maintainer/check_copyright.pl | 3 + test/maintainer/check_spaces.pl | 3 + 19 files changed, 107 insertions(+), 1329 deletions(-) create mode 100644 .gitmodules delete mode 100644 cram/rANS_byte.h delete mode 100644 cram/rANS_static.c delete mode 100644 cram/rANS_static.h create mode 160000 htscodecs create mode 100644 htscodecs.mk.in create mode 100644 htscodecs_bundled.mk create mode 100644 htscodecs_external.mk diff --git a/.appveyor.yml b/.appveyor.yml index 6f94944bc..f8944daef 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -33,6 +33,7 @@ build_script: - set HOME=. 
- set MSYSTEM=MINGW64 - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% + - git submodule update --init --recursive - "sh -lc \"autoheader && autoconf && ./configure --enable-werror CFLAGS='-g -O3' && make -j2\"" #build_script: diff --git a/.cirrus.yml b/.cirrus.yml index 6b5ff0b98..d4256b11a 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -25,6 +25,7 @@ libdeflate_template: &LIBDEFLATE compile_template: &COMPILE compile_script: | + git submodule update --init --recursive if test "x$USE_LIBDEFLATE" = "xyes"; then CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate" --with-libdeflate' else @@ -132,7 +133,7 @@ centos_task: install_script: | yum install -y autoconf automake make gcc perl-Data-Dumper zlib-devel \ bzip2 bzip2-devel xz-devel curl-devel openssl-devel ncurses-devel \ - diffutils + diffutils git << : *COMPILE << : *TEST @@ -158,7 +159,7 @@ macosx_task: USE_LIBDEFLATE: yes package_install_script: - - HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz + - HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz git << : *LIBDEFLATE << : *COMPILE diff --git a/.gitignore b/.gitignore index 76be407b5..0855b6a88 100644 --- a/.gitignore +++ b/.gitignore @@ -28,6 +28,7 @@ hfile_*.so hts-object-files htslib_static.mk +htscodecs.mk cyg*.dll lib*.a diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 000000000..cb6f98d42 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "htscodecs"] + path = htscodecs + url = https://github.com/samtools/htscodecs.git + fetchRecurseSubmodules = true diff --git a/.travis.yml b/.travis.yml index 9c604d52c..d85224133 100644 --- a/.travis.yml +++ b/.travis.yml @@ -83,6 +83,7 @@ before_script: popd && \ popd fi + git submodule update --init --recursive script: - | diff --git a/INSTALL b/INSTALL index 7ae91da33..92a358a73 100644 --- a/INSTALL +++ b/INSTALL @@ -58,6 +58,20 @@ test temporary directory with e.g.: make check TEST_OPTS="-t C:/msys64/tmp/_" Whilst 
the code may work on Windows with other environments, these have not be verified. +Update htscodecs submodule +========================== + +Note that this section only applies to git checkouts. If you're building +from a release tar file, you can skip this section. + +Some parts of HTSlib are provided by the external "htscodecs" project. This +is included as a submodule. When building from the git repository, +either clone the project using "git clone -r", or run: + + git submodule update --init --recursive + +to ensure the correct version of the submodule is present. + Building Configure ================== diff --git a/Makefile b/Makefile index eac4b809d..53a39d13a 100644 --- a/Makefile +++ b/Makefile @@ -106,8 +106,15 @@ all: lib-static lib-shared $(BUILT_PROGRAMS) plugins $(BUILT_TEST_PROGRAMS) \ ALL_CPPFLAGS = -I. $(CPPFLAGS) +# Usually htscodecs.mk is generated by running configure or config.status, +# but if those aren't used create a default here. +htscodecs.mk: + echo '# Default htscodecs.mk generated by Makefile' > $@ + echo 'include $$(HTSPREFIX)htscodecs_bundled.mk' >> $@ + HTSPREFIX = include htslib_vars.mk +include htscodecs.mk # If not using GNU make, you need to copy the version number from version.sh # into here. 
@@ -178,8 +185,8 @@ LIBHTS_OBJS = \ cram/mFILE.o \ cram/open_trace_file.o \ cram/pooled_alloc.o \ - cram/rANS_static.o \ cram/string_alloc.o \ + $(HTSCODECS_OBJS) \ $(NONCONFIGURE_OBJS) # Without configure we wish to have a rich set of default figures, @@ -205,6 +212,9 @@ sam_internal_h = sam_internal.h $(htslib_sam_h) textutils_internal_h = textutils_internal.h $(htslib_kstring_h) thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) +htscodecs_rANS_static_h = htscodecs/htscodecs/rANS_static.h + +htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h # To be effective, config.mk needs to appear after most Makefile variables are # set but before most rules appear, so that it can both use previously-set @@ -364,15 +374,15 @@ cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) -cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) cram/rANS_static.h $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) +cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h $(htslib_hts_log_h) $(cram_os_h) cram/mFILE.h cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) 
$(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(htslib_hts_h) cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c config.h cram/pooled_alloc.h $(cram_misc_h) -cram/rANS_static.o cram/rANS_static.pico: cram/rANS_static.c config.h cram/rANS_static.h cram/rANS_byte.h cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/string_alloc.h thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) $(htslib_hts_log_h) +htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_rANS_static_h) bgzip: bgzip.o libhts.a $(CC) $(LDFLAGS) -o $@ bgzip.o libhts.a $(LIBS) -lpthread @@ -605,6 +615,7 @@ testclean: mostlyclean: testclean -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM version.h + -rm -f htscodecs/htscodecs/*.o htscodecs/htscodecs/*.pico -rm -f hts-object-files clean: mostlyclean clean-$(SHLIB_FLAVOUR) @@ -612,7 +623,7 @@ clean: mostlyclean clean-$(SHLIB_FLAVOUR) distclean maintainer-clean: clean -rm -f config.cache config.h config.log config.mk config.status - -rm -f TAGS *.pc.tmp *-uninstalled.pc htslib_static.mk + -rm -f TAGS *.pc.tmp *-uninstalled.pc htslib_static.mk htscodecs.mk -rm -rf autom4te.cache clean-so: diff --git a/configure.ac b/configure.ac index e5e64496b..890d7aa31 100644 --- a/configure.ac +++ b/configure.ac @@ -1,6 +1,6 @@ # Configure script for htslib, a C library for high-throughput sequencing data. # -# Copyright (C) 2015-2019 Genome Research Ltd. +# Copyright (C) 2015-2020 Genome Research Ltd. # # Author: John Marshall # @@ -34,7 +34,7 @@ m4_include([m4/hts_hide_dynamic_syms.m4]) m4_include([m4/pkg.m4]) dnl Copyright notice to be copied into the generated configure script -AC_COPYRIGHT([Portions copyright (C) 2018 Genome Research Ltd. +AC_COPYRIGHT([Portions copyright (C) 2020 Genome Research Ltd. 
This configure script is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law.]) @@ -108,6 +108,12 @@ AC_ARG_ENABLE([plugins], [], [enable_plugins=no]) AC_SUBST(enable_plugins) +AC_ARG_WITH([external-htscodecs], + [AS_HELP_STRING([--with-external-htscodecs], + [get htscodecs functions from a shared library])], + [], [with_external_htscodecs=no]) +AC_SUBST(with_external_htscodecs) + AC_ARG_WITH([libdeflate], [AS_HELP_STRING([--with-libdeflate], [use libdeflate for faster crc and deflate algorithms])], @@ -280,6 +286,29 @@ produced elsewhere unreadable) or resolve this error to build HTSlib.]) static_LIBS="$static_LIBS -llzma" fi +AS_IF([test "x$with_external_htscodecs" != "xno"], + [libhtscodecs=ok + AC_CHECK_HEADER([htscodecs/rANS_static4x16.h],[], + [libhtscodecs='missing header'],[;]) + AC_CHECK_LIB([htscodecs],[rans_compress_bound_4x16], + [:],[libhtscodecs='missing library']) + AS_IF([test "$libhtscodecs" = "ok"], + [AC_DEFINE([HAVE_EXTERNAL_LIBHTSCODECS], 1, [Define if using an external libhtscodecs]) + LIBS="-lhtscodecs $LIBS" + private_LIBS="-lhtscodecs $private_LIBS" + static_LIBS="-lhtscodecs $static_LIBS" + selected_htscodecs_mk="htscodecs_external.mk"], + [MSG_ERROR([libhtscodecs development files not found: $libhtscodecs + +You asked to use an external htscodecs library, but do not have the +required header / library files. You either need to supply these and +if necessary set CPPFLAGS and LDFLAGS so the compiler can find them; +or configure using --without-external-htscodecs to build the required +functions from the htscodecs submodule. 
+])])], + [selected_htscodecs_mk="htscodecs_bundled.mk"]) +AC_SUBST([selected_htscodecs_mk]) + AS_IF([test "x$with_libdeflate" != "xno"], [libdeflate=ok AC_CHECK_HEADER([libdeflate.h],[],[libdeflate='missing header'],[;]) @@ -421,5 +450,5 @@ AC_SUBST([private_LIBS]) AC_SUBST([static_LDFLAGS]) AC_SUBST([static_LIBS]) -AC_CONFIG_FILES([config.mk htslib.pc.tmp:htslib.pc.in]) +AC_CONFIG_FILES([config.mk htscodecs.mk htslib.pc.tmp:htslib.pc.in]) AC_OUTPUT diff --git a/cram/cram_io.c b/cram/cram_io.c index 0a6814c9b..d5ad50000 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -73,7 +73,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "os.h" #include "../htslib/hts.h" #include "open_trace_file.h" -#include "rANS_static.h" + +#if defined(HAVE_EXTERNAL_LIBHTSCODECS) +#include <htscodecs/rANS_static.h> +#else +#include "../htscodecs/htscodecs/rANS_static.h" +#endif //#define REF_DEBUG diff --git a/cram/rANS_byte.h b/cram/rANS_byte.h deleted file mode 100644 index f8bcae248..000000000 --- a/cram/rANS_byte.h +++ /dev/null @@ -1,352 +0,0 @@ -/* rans_byte.h originally from https://github.com/rygorous/ryg_rans - * - * This is a public-domain implementation of several rANS variants. rANS is an - * entropy coder from the ANS family, as described in Jarek Duda's paper - * "Asymmetric numeral systems" (http://arxiv.org/abs/1311.2540). - */ - -/*-------------------------------------------------------------------------- */ - -// Simple byte-aligned rANS encoder/decoder - public domain - Fabian 'ryg' Giesen 2014 -// -// Not intended to be "industrial strength"; just meant to illustrate the general -// idea. - -#ifndef RANS_BYTE_HEADER -#define RANS_BYTE_HEADER - -#include <stdint.h> - -#ifdef assert -#define RansAssert assert -#else -#define RansAssert(x) -#endif - -// READ ME FIRST: -// -// This is designed like a typical arithmetic coder API, but there's three -// twists you absolutely should be aware of before you start hacking: -// -// 1. 
You need to encode data in *reverse* - last symbol first. rANS works -// like a stack: last in, first out. -// 2. Likewise, the encoder outputs bytes *in reverse* - that is, you give -// it a pointer to the *end* of your buffer (exclusive), and it will -// slowly move towards the beginning as more bytes are emitted. -// 3. Unlike basically any other entropy coder implementation you might -// have used, you can interleave data from multiple independent rANS -// encoders into the same bytestream without any extra signaling; -// you can also just write some bytes by yourself in the middle if -// you want to. This is in addition to the usual arithmetic encoder -// property of being able to switch models on the fly. Writing raw -// bytes can be useful when you have some data that you know is -// incompressible, and is cheaper than going through the rANS encode -// function. Using multiple rANS coders on the same byte stream wastes -// a few bytes compared to using just one, but execution of two -// independent encoders can happen in parallel on superscalar and -// Out-of-Order CPUs, so this can be *much* faster in tight decoding -// loops. -// -// This is why all the rANS functions take the write pointer as an -// argument instead of just storing it in some context struct. - -// -------------------------------------------------------------------------- - -// L ('l' in the paper) is the lower bound of our normalization interval. -// Between this and our byte-aligned emission, we use 31 (not 32!) bits. -// This is done intentionally because exact reciprocals for 31-bit uints -// fit in 32-bit uints: this permits some optimizations during encoding. -#define RANS_BYTE_L (1u << 23) // lower bound of our normalization interval - -// State for a rANS encoder. Yep, that's all there is to it. -typedef uint32_t RansState; - -// Initialize a rANS encoder. -static inline void RansEncInit(RansState* r) -{ - *r = RANS_BYTE_L; -} - -// Renormalize the encoder. Internal function. 
-static inline RansState RansEncRenorm(RansState x, uint8_t** pptr, uint32_t freq, uint32_t scale_bits) -{ - uint32_t x_max = ((RANS_BYTE_L >> scale_bits) << 8) * freq; // this turns into a shift. - if (x >= x_max) { - uint8_t* ptr = *pptr; - do { - *--ptr = (uint8_t) (x & 0xff); - x >>= 8; - } while (x >= x_max); - *pptr = ptr; - } - return x; -} - -// Encodes a single symbol with range start "start" and frequency "freq". -// All frequencies are assumed to sum to "1 << scale_bits", and the -// resulting bytes get written to ptr (which is updated). -// -// NOTE: With rANS, you need to encode symbols in *reverse order*, i.e. from -// beginning to end! Likewise, the output bytestream is written *backwards*: -// ptr starts pointing at the end of the output buffer and keeps decrementing. -static inline void RansEncPut(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits) -{ - // renormalize - RansState x = RansEncRenorm(*r, pptr, freq, scale_bits); - - // x = C(s,x) - *r = ((x / freq) << scale_bits) + (x % freq) + start; -} - -// Flushes the rANS encoder. -static inline void RansEncFlush(RansState* r, uint8_t** pptr) -{ - uint32_t x = *r; - uint8_t* ptr = *pptr; - - ptr -= 4; - ptr[0] = (uint8_t) (x >> 0); - ptr[1] = (uint8_t) (x >> 8); - ptr[2] = (uint8_t) (x >> 16); - ptr[3] = (uint8_t) (x >> 24); - - *pptr = ptr; -} - -// Initializes a rANS decoder. -// Unlike the encoder, the decoder works forwards as you'd expect. -static inline void RansDecInit(RansState* r, uint8_t** pptr) -{ - uint32_t x; - uint8_t* ptr = *pptr; - - x = ((uint32_t) ptr[0]) << 0; - x |= ((uint32_t) ptr[1]) << 8; - x |= ((uint32_t) ptr[2]) << 16; - x |= ((uint32_t) ptr[3]) << 24; - ptr += 4; - - *pptr = ptr; - *r = x; -} - -// Returns the current cumulative frequency (map it to a symbol yourself!) 
-static inline uint32_t RansDecGet(RansState* r, uint32_t scale_bits) -{ - return *r & ((1u << scale_bits) - 1); -} - -// Advances in the bit stream by "popping" a single symbol with range start -// "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits", -// and the resulting bytes get written to ptr (which is updated). -static inline void RansDecAdvance(RansState* r, uint8_t** pptr, uint32_t start, uint32_t freq, uint32_t scale_bits) -{ - uint32_t mask = (1u << scale_bits) - 1; - - // s, x = D(x) - uint32_t x = *r; - x = freq * (x >> scale_bits) + (x & mask) - start; - - // renormalize - if (x < RANS_BYTE_L) { - uint8_t* ptr = *pptr; - do x = (x << 8) | *ptr++; while (x < RANS_BYTE_L); - *pptr = ptr; - } - - *r = x; -} - -// -------------------------------------------------------------------------- - -// That's all you need for a full encoder; below here are some utility -// functions with extra convenience or optimizations. - -// Encoder symbol description -// This (admittedly odd) selection of parameters was chosen to make -// RansEncPutSymbol as cheap as possible. -typedef struct { - uint32_t x_max; // (Exclusive) upper bound of pre-normalization interval - uint32_t rcp_freq; // Fixed-point reciprocal frequency - uint32_t bias; // Bias - uint16_t cmpl_freq; // Complement of frequency: (1 << scale_bits) - freq - uint16_t rcp_shift; // Reciprocal shift -} RansEncSymbol; - -// Decoder symbols are straightforward. -typedef struct { - uint16_t start; // Start of range. - uint16_t freq; // Symbol frequency. -} RansDecSymbol; - -// Initializes an encoder symbol to start "start" and frequency "freq" -static inline void RansEncSymbolInit(RansEncSymbol* s, uint32_t start, uint32_t freq, uint32_t scale_bits) -{ - RansAssert(scale_bits <= 16); - RansAssert(start <= (1u << scale_bits)); - RansAssert(freq <= (1u << scale_bits) - start); - - // Say M := 1 << scale_bits. 
- // - // The original encoder does: - // x_new = (x/freq)*M + start + (x%freq) - // - // The fast encoder does (schematically): - // q = mul_hi(x, rcp_freq) >> rcp_shift (division) - // r = x - q*freq (remainder) - // x_new = q*M + bias + r (new x) - // plugging in r into x_new yields: - // x_new = bias + x + q*(M - freq) - // =: bias + x + q*cmpl_freq (*) - // - // and we can just precompute cmpl_freq. Now we just need to - // set up our parameters such that the original encoder and - // the fast encoder agree. - - s->x_max = ((RANS_BYTE_L >> scale_bits) << 8) * freq; - s->cmpl_freq = (uint16_t) ((1 << scale_bits) - freq); - if (freq < 2) { - // freq=0 symbols are never valid to encode, so it doesn't matter what - // we set our values to. - // - // freq=1 is tricky, since the reciprocal of 1 is 1; unfortunately, - // our fixed-point reciprocal approximation can only multiply by values - // smaller than 1. - // - // So we use the "next best thing": rcp_freq=0xffffffff, rcp_shift=0. - // This gives: - // q = mul_hi(x, rcp_freq) >> rcp_shift - // = mul_hi(x, (1<<32) - 1)) >> 0 - // = floor(x - x/(2^32)) - // = x - 1 if 1 <= x < 2^32 - // and we know that x>0 (x=0 is never in a valid normalization interval). - // - // So we now need to choose the other parameters such that - // x_new = x*M + start - // plug it in: - // x*M + start (desired result) - // = bias + x + q*cmpl_freq (*) - // = bias + x + (x - 1)*(M - 1) (plug in q=x-1, cmpl_freq) - // = bias + 1 + (x - 1)*M - // = x*M + (bias + 1 - M) - // - // so we have start = bias + 1 - M, or equivalently - // bias = start + M - 1. 
- s->rcp_freq = ~0u; - s->rcp_shift = 0; - s->bias = start + (1 << scale_bits) - 1; - } else { - // Alverson, "Integer Division using reciprocals" - // shift=ceil(log2(freq)) - uint32_t shift = 0; - while (freq > (1u << shift)) - shift++; - - s->rcp_freq = (uint32_t) (((1ull << (shift + 31)) + freq-1) / freq); - s->rcp_shift = shift - 1; - - // With these values, 'q' is the correct quotient, so we - // have bias=start. - s->bias = start; - } - - s->rcp_shift += 32; // Avoid the extra >>32 in RansEncPutSymbol -} - -// Initialize a decoder symbol to start "start" and frequency "freq" -static inline void RansDecSymbolInit(RansDecSymbol* s, uint32_t start, uint32_t freq) -{ - RansAssert(start <= (1 << 16)); - RansAssert(freq <= (1 << 16) - start); - s->start = (uint16_t) start; - s->freq = (uint16_t) freq; -} - -// Encodes a given symbol. This is faster than straight RansEnc since we can do -// multiplications instead of a divide. -// -// See RansEncSymbolInit for a description of how this works. -static inline void RansEncPutSymbol(RansState* r, uint8_t** pptr, RansEncSymbol const* sym) -{ - RansAssert(sym->x_max != 0); // can't encode symbol with freq=0 - - // renormalize - uint32_t x = *r; - uint32_t x_max = sym->x_max; - - if (x >= x_max) { - uint8_t* ptr = *pptr; - do { - *--ptr = (uint8_t) (x & 0xff); - x >>= 8; - } while (x >= x_max); - *pptr = ptr; - } - - // x = C(s,x) - // NOTE: written this way so we get a 32-bit "multiply high" when - // available. If you're on a 64-bit platform with cheap multiplies - // (e.g. x64), just bake the +32 into rcp_shift. - //uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> 32) >> sym->rcp_shift; - - // The extra >>32 has already been added to RansEncSymbolInit - uint32_t q = (uint32_t) (((uint64_t)x * sym->rcp_freq) >> sym->rcp_shift); - *r = x + sym->bias + q * sym->cmpl_freq; -} - -// Equivalent to RansDecAdvance that takes a symbol. 
-static inline void RansDecAdvanceSymbol(RansState* r, uint8_t** pptr, RansDecSymbol const* sym, uint32_t scale_bits) -{ - RansDecAdvance(r, pptr, sym->start, sym->freq, scale_bits); -} - -// Advances in the bit stream by "popping" a single symbol with range start -// "start" and frequency "freq". All frequencies are assumed to sum to "1 << scale_bits". -// No renormalization or output happens. -static inline void RansDecAdvanceStep(RansState* r, uint32_t start, uint32_t freq, uint32_t scale_bits) -{ - uint32_t mask = (1u << scale_bits) - 1; - - // s, x = D(x) - uint32_t x = *r; - *r = freq * (x >> scale_bits) + (x & mask) - start; -} - -// Equivalent to RansDecAdvanceStep that takes a symbol. -static inline void RansDecAdvanceSymbolStep(RansState* r, RansDecSymbol const* sym, uint32_t scale_bits) -{ - RansDecAdvanceStep(r, sym->start, sym->freq, scale_bits); -} - -// Renormalize. -static inline void RansDecRenorm(RansState* r, uint8_t** pptr) -{ - // renormalize - uint32_t x = *r; - - if (x < RANS_BYTE_L) { - uint8_t* ptr = *pptr; - x = (x << 8) | *ptr++; - if (x < RANS_BYTE_L) - x = (x << 8) | *ptr++; - *pptr = ptr; - } - - *r = x; -} - -// Renormalize, with extra checks for falling off the end of the input. -static inline void RansDecRenormSafe(RansState* r, uint8_t** pptr, uint8_t *ptr_end) -{ - uint32_t x = *r; - uint8_t* ptr = *pptr; - if (x >= RANS_BYTE_L || ptr >= ptr_end) return; - x = (x << 8) | *ptr++; - if (x < RANS_BYTE_L && ptr < ptr_end) - x = (x << 8) | *ptr++; - *pptr = ptr; - *r = x; -} - - -#endif // RANS_BYTE_HEADER diff --git a/cram/rANS_static.c b/cram/rANS_static.c deleted file mode 100644 index 584f8b561..000000000 --- a/cram/rANS_static.c +++ /dev/null @@ -1,912 +0,0 @@ -/* - * Copyright (c) 2014-2019 Genome Research Ltd. - * Author(s): James Bonfield - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. 
Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials provided - * with the distribution. - * - * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger - * Institute nor the names of its contributors may be used to endorse - * or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS - * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH - * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -/* - * Author: James Bonfield, Wellcome Trust Sanger Institute. 2014 - */ - -#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "rANS_static.h" -#include "rANS_byte.h" - -#define TF_SHIFT 12 -#define TOTFREQ (1<0?(a):-(a)) -#ifndef BLK_SIZE -# define BLK_SIZE 1024*1024 -#endif - -// Room to allow for expanded BLK_SIZE on worst case compression. 
-#define BLK_SIZE2 ((int)(1.05*BLK_SIZE)) - -/*----------------------------------------------------------------------------- - * Memory to memory compression functions. - * - * These are original versions without any manual loop unrolling. They - * are easier to understand, but can be up to 2x slower. - */ - -unsigned char *rans_compress_O0(unsigned char *in, unsigned int in_size, - unsigned int *out_size) { - unsigned char *out_buf = malloc(1.05*in_size + 257*257*3 + 9); - unsigned char *cp, *out_end; - RansEncSymbol syms[256]; - RansState rans0, rans1, rans2, rans3; - uint8_t* ptr; - int F[256] = {0}, i, j, tab_size, rle, x, fsum = 0; - int m = 0, M = 0; - uint64_t tr; - - if (!out_buf) - return NULL; - - ptr = out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9; - - // Compute statistics - for (i = 0; i < in_size; i++) { - F[in[i]]++; - } - tr = ((uint64_t)TOTFREQ<<31)/in_size + (1<<30)/in_size; - normalise_harder: - // Normalise so T[i] == TOTFREQ - for (fsum = m = M = j = 0; j < 256; j++) { - if (!F[j]) - continue; - - if (m < F[j]) - m = F[j], M = j; - - if ((F[j] = (F[j]*tr)>>31) == 0) - F[j] = 1; - fsum += F[j]; - } - - fsum++; - if (fsum < TOTFREQ) { - F[M] += TOTFREQ-fsum; - } else if (fsum-TOTFREQ > F[M]/2) { - // Corner case to avoid excessive frequency reduction - tr = 2104533975; goto normalise_harder; // equiv to *0.98. - } else { - F[M] -= fsum-TOTFREQ; - } - - //printf("F[%d]=%d\n", M, F[M]); - assert(F[M]>0); - - // Encode statistics. 
- cp = out_buf+9; - - for (x = rle = j = 0; j < 256; j++) { - if (F[j]) { - // j - if (rle) { - rle--; - } else { - *cp++ = j; - if (!rle && j && F[j-1]) { - for(rle=j+1; rle<256 && F[rle]; rle++) - ; - rle -= j+1; - *cp++ = rle; - } - //fprintf(stderr, "%d: %d %d\n", j, rle, N[j]); - } - - // F[j] - if (F[j]<128) { - *cp++ = F[j]; - } else { - *cp++ = 128 | (F[j]>>8); - *cp++ = F[j]&0xff; - } - RansEncSymbolInit(&syms[j], x, F[j], TF_SHIFT); - x += F[j]; - } - } - *cp++ = 0; - - //write(1, out_buf+4, cp-(out_buf+4)); - tab_size = cp-out_buf; - - RansEncInit(&rans0); - RansEncInit(&rans1); - RansEncInit(&rans2); - RansEncInit(&rans3); - - switch (i=(in_size&3)) { - case 3: RansEncPutSymbol(&rans2, &ptr, &syms[in[in_size-(i-2)]]); // fall through - case 2: RansEncPutSymbol(&rans1, &ptr, &syms[in[in_size-(i-1)]]); // fall through - case 1: RansEncPutSymbol(&rans0, &ptr, &syms[in[in_size-(i-0)]]); // fall through - case 0: - break; - } - for (i=(in_size &~3); i>0; i-=4) { - RansEncSymbol *s3 = &syms[in[i-1]]; - RansEncSymbol *s2 = &syms[in[i-2]]; - RansEncSymbol *s1 = &syms[in[i-3]]; - RansEncSymbol *s0 = &syms[in[i-4]]; - - RansEncPutSymbol(&rans3, &ptr, s3); - RansEncPutSymbol(&rans2, &ptr, s2); - RansEncPutSymbol(&rans1, &ptr, s1); - RansEncPutSymbol(&rans0, &ptr, s0); - } - - RansEncFlush(&rans3, &ptr); - RansEncFlush(&rans2, &ptr); - RansEncFlush(&rans1, &ptr); - RansEncFlush(&rans0, &ptr); - - // Finalise block size and return it - *out_size = (out_end - ptr) + tab_size; - - cp = out_buf; - - *cp++ = 0; // order - *cp++ = ((*out_size-9)>> 0) & 0xff; - *cp++ = ((*out_size-9)>> 8) & 0xff; - *cp++ = ((*out_size-9)>>16) & 0xff; - *cp++ = ((*out_size-9)>>24) & 0xff; - - *cp++ = (in_size>> 0) & 0xff; - *cp++ = (in_size>> 8) & 0xff; - *cp++ = (in_size>>16) & 0xff; - *cp++ = (in_size>>24) & 0xff; - - memmove(out_buf + tab_size, ptr, out_end-ptr); - - return out_buf; -} - -typedef struct { - unsigned char R[TOTFREQ]; -} ari_decoder; - -unsigned char 
*rans_uncompress_O0(unsigned char *in, unsigned int in_size, - unsigned int *out_size) { - /* Load in the static tables */ - unsigned char *cp = in + 9; - unsigned char *cp_end = in + in_size; - int i, j, x, rle; - unsigned int out_sz, in_sz; - char *out_buf; - ari_decoder D; - RansDecSymbol syms[256]; - - if (in_size < 26) // Need at least this many bytes just to start - return NULL; - - if (*in++ != 0) // Order-0 check - return NULL; - - in_sz = ((((uint32_t) in[0])<<0) | (((uint32_t) in[1])<<8) | - (((uint32_t) in[2])<<16) | (((uint32_t) in[3])<<24)); - out_sz = ((((uint32_t) in[4])<<0) | (((uint32_t) in[5])<<8) | - (((uint32_t) in[6])<<16) | (((uint32_t) in[7])<<24)); - if (in_sz != in_size-9) - return NULL; - - if (out_sz >= INT_MAX) - return NULL; // protect against some overflow cases - - // Precompute reverse lookup of frequency. - rle = x = 0; - j = *cp++; - do { - int F, C; - if (cp > cp_end - 16) return NULL; // Not enough input bytes left - if ((F = *cp++) >= 128) { - F &= ~128; - F = ((F & 127) << 8) | *cp++; - } - C = x; - - RansDecSymbolInit(&syms[j], C, F); - - /* Build reverse lookup table */ - if (x + F > TOTFREQ) - return NULL; - memset(&D.R[x], j, F); - - x += F; - - if (!rle && j+1 == *cp) { - j = *cp++; - rle = *cp++; - } else if (rle) { - rle--; - j++; - if (j > 255) - return NULL; - } else { - j = *cp++; - } - } while(j); - - if (x < TOTFREQ-1 || x > TOTFREQ) - return NULL; - if (x < TOTFREQ) // historically we fill 4095, not 4096 - D.R[x] = D.R[x-1]; - - if (cp > cp_end - 16) return NULL; // Not enough input bytes left - - RansState rans0, rans1, rans2, rans3; - uint8_t *ptr = cp; - RansDecInit(&rans0, &ptr); - RansDecInit(&rans1, &ptr); - RansDecInit(&rans2, &ptr); - RansDecInit(&rans3, &ptr); - - out_buf = malloc(out_sz); - if (!out_buf) - return NULL; - - int out_end = (out_sz&~3); - - RansState R[4]; - R[0] = rans0; - R[1] = rans1; - R[2] = rans2; - R[3] = rans3; - uint32_t mask = (1u << TF_SHIFT)-1; - - for (i=0; i < out_end; i+=4) { - 
uint32_t m[4] = {R[0] & mask, - R[1] & mask, - R[2] & mask, - R[3] & mask}; - uint8_t c[4] = {D.R[m[0]], - D.R[m[1]], - D.R[m[2]], - D.R[m[3]]}; - out_buf[i+0] = c[0]; - out_buf[i+1] = c[1]; - out_buf[i+2] = c[2]; - out_buf[i+3] = c[3]; - - // In theory all TOTFREQ elements of D.R are filled out, but it's - // possible this may not be true (invalid input). We could - // check with x == TOTFREQ after filling out D.R matrix, but - // for historical reasons this sums to TOTFREQ-1 leaving one - // byte in D.R uninitialised. Or we could check here that - // syms[c[0..3]].freq > 0 and initialising syms, but that is - // slow. - // - // We take the former approach and accept a potential for garbage in - // -> garbage out in the rare 1 in TOTFREQ case as the overhead of - // continuous validation of freq > 0 is steep on this tight loop. - - // RansDecAdvanceSymbolStep(&R[0], &syms[c[0]], TF_SHIFT); - // RansDecAdvanceSymbolStep(&R[1], &syms[c[1]], TF_SHIFT); - // RansDecAdvanceSymbolStep(&R[2], &syms[c[2]], TF_SHIFT); - // RansDecAdvanceSymbolStep(&R[3], &syms[c[3]], TF_SHIFT); - R[0] = syms[c[0]].freq * (R[0]>>TF_SHIFT); - R[0] += m[0] - syms[c[0]].start; - R[1] = syms[c[1]].freq * (R[1]>>TF_SHIFT); - R[1] += m[1] - syms[c[1]].start; - R[2] = syms[c[2]].freq * (R[2]>>TF_SHIFT); - R[2] += m[2] - syms[c[2]].start; - R[3] = syms[c[3]].freq * (R[3]>>TF_SHIFT); - R[3] += m[3] - syms[c[3]].start; - - if (ptr < cp_end - 8) { // Each renorm reads no more than 2 bytes - RansDecRenorm(&R[0], &ptr); - RansDecRenorm(&R[1], &ptr); - RansDecRenorm(&R[2], &ptr); - RansDecRenorm(&R[3], &ptr); - } else { - RansDecRenormSafe(&R[0], &ptr, cp_end); - RansDecRenormSafe(&R[1], &ptr, cp_end); - RansDecRenormSafe(&R[2], &ptr, cp_end); - RansDecRenormSafe(&R[3], &ptr, cp_end); - } - } - - switch(out_sz&3) { - case 3: - out_buf[out_end+2] = D.R[RansDecGet(&R[2], TF_SHIFT)]; - // fall through - case 2: - out_buf[out_end+1] = D.R[RansDecGet(&R[1], TF_SHIFT)]; - // fall through - case 1: - 
out_buf[out_end] = D.R[RansDecGet(&R[0], TF_SHIFT)]; - // fall through - default: - break; - } - - *out_size = out_sz; - - return (unsigned char *)out_buf; -} - -unsigned char *rans_compress_O1(unsigned char *in, unsigned int in_size, - unsigned int *out_size) { - unsigned char *out_buf = NULL, *out_end, *cp; - unsigned int last_i, tab_size, rle_i, rle_j; - RansEncSymbol (*syms)[256] = NULL; /* syms[256][256] */ - int (*F)[256] = NULL; /* F[256][256] */ - int *T = NULL; /* T[256] */ - int i, j; - unsigned char c; - - if (in_size < 4) - return rans_compress_O0(in, in_size, out_size); - - syms = malloc(256 * sizeof(*syms)); - if (!syms) goto cleanup; - F = calloc(256, sizeof(*F)); - if (!F) goto cleanup; - T = calloc(256, sizeof(*T)); - if (!T) goto cleanup; - out_buf = malloc(1.05*in_size + 257*257*3 + 9); - if (!out_buf) goto cleanup; - - out_end = out_buf + (int)(1.05*in_size) + 257*257*3 + 9; - cp = out_buf+9; - - //for (last = 0, i=in_size-1; i>=0; i--) { - // F[last][c = in[i]]++; - // T[last]++; - // last = c; - //} - - for (last_i=i=0; i>2)]]++; - F[0][in[2*(in_size>>2)]]++; - F[0][in[3*(in_size>>2)]]++; - T[0]+=3; - - // Normalise so T[i] == TOTFREQ - for (rle_i = i = 0; i < 256; i++) { - int t2, m, M; - unsigned int x; - - if (T[i] == 0) - continue; - - //uint64_t p = (TOTFREQ * TOTFREQ) / t; - double p = ((double)TOTFREQ)/T[i]; - normalise_harder: - for (t2 = m = M = j = 0; j < 256; j++) { - if (!F[i][j]) - continue; - - if (m < F[i][j]) - m = F[i][j], M = j; - - //if ((F[i][j] = (F[i][j] * p) / TOTFREQ) == 0) - if ((F[i][j] *= p) == 0) - F[i][j] = 1; - t2 += F[i][j]; - } - - t2++; - if (t2 < TOTFREQ) { - F[i][M] += TOTFREQ-t2; - } else if (t2-TOTFREQ >= F[i][M]/2) { - // Corner case to avoid excessive frequency reduction - p = .98; goto normalise_harder; - } else { - F[i][M] -= t2-TOTFREQ; - } - - // Store frequency table - // i - if (rle_i) { - rle_i--; - } else { - *cp++ = i; - // FIXME: could use order-0 statistics to observe which alphabet - // 
symbols are present and base RLE on that ordering instead. - if (i && T[i-1]) { - for(rle_i=i+1; rle_i<256 && T[rle_i]; rle_i++) - ; - rle_i -= i+1; - *cp++ = rle_i; - } - } - - int *F_i_ = F[i]; - x = 0; - rle_j = 0; - for (j = 0; j < 256; j++) { - if (F_i_[j]) { - //fprintf(stderr, "F[%d][%d]=%d, x=%d\n", i, j, F_i_[j], x); - - // j - if (rle_j) { - rle_j--; - } else { - *cp++ = j; - if (!rle_j && j && F_i_[j-1]) { - for(rle_j=j+1; rle_j<256 && F_i_[rle_j]; rle_j++) - ; - rle_j -= j+1; - *cp++ = rle_j; - } - } - - // F_i_[j] - if (F_i_[j]<128) { - *cp++ = F_i_[j]; - } else { - *cp++ = 128 | (F_i_[j]>>8); - *cp++ = F_i_[j]&0xff; - } - - RansEncSymbolInit(&syms[i][j], x, F_i_[j], TF_SHIFT); - x += F_i_[j]; - } - } - *cp++ = 0; - } - *cp++ = 0; - - //write(1, out_buf+4, cp-(out_buf+4)); - tab_size = cp - out_buf; - assert(tab_size < 257*257*3); - - RansState rans0, rans1, rans2, rans3; - RansEncInit(&rans0); - RansEncInit(&rans1); - RansEncInit(&rans2); - RansEncInit(&rans3); - - uint8_t* ptr = out_end; - - int isz4 = in_size>>2; - int i0 = 1*isz4-2; - int i1 = 2*isz4-2; - int i2 = 3*isz4-2; - int i3 = 4*isz4-2; - - unsigned char l0 = in[i0+1]; - unsigned char l1 = in[i1+1]; - unsigned char l2 = in[i2+1]; - unsigned char l3 = in[i3+1]; - - // Deal with the remainder - l3 = in[in_size-1]; - for (i3 = in_size-2; i3 > 4*isz4-2; i3--) { - unsigned char c3 = in[i3]; - RansEncPutSymbol(&rans3, &ptr, &syms[c3][l3]); - l3 = c3; - } - - for (; i0 >= 0; i0--, i1--, i2--, i3--) { - unsigned char c0, c1, c2, c3; - RansEncSymbol *s3 = &syms[c3 = in[i3]][l3]; - RansEncSymbol *s2 = &syms[c2 = in[i2]][l2]; - RansEncSymbol *s1 = &syms[c1 = in[i1]][l1]; - RansEncSymbol *s0 = &syms[c0 = in[i0]][l0]; - - RansEncPutSymbol(&rans3, &ptr, s3); - RansEncPutSymbol(&rans2, &ptr, s2); - RansEncPutSymbol(&rans1, &ptr, s1); - RansEncPutSymbol(&rans0, &ptr, s0); - - l0 = c0; - l1 = c1; - l2 = c2; - l3 = c3; - } - - RansEncPutSymbol(&rans3, &ptr, &syms[0][l3]); - RansEncPutSymbol(&rans2, &ptr, 
&syms[0][l2]); - RansEncPutSymbol(&rans1, &ptr, &syms[0][l1]); - RansEncPutSymbol(&rans0, &ptr, &syms[0][l0]); - - RansEncFlush(&rans3, &ptr); - RansEncFlush(&rans2, &ptr); - RansEncFlush(&rans1, &ptr); - RansEncFlush(&rans0, &ptr); - - *out_size = (out_end - ptr) + tab_size; - - cp = out_buf; - *cp++ = 1; // order - - *cp++ = ((*out_size-9)>> 0) & 0xff; - *cp++ = ((*out_size-9)>> 8) & 0xff; - *cp++ = ((*out_size-9)>>16) & 0xff; - *cp++ = ((*out_size-9)>>24) & 0xff; - - *cp++ = (in_size>> 0) & 0xff; - *cp++ = (in_size>> 8) & 0xff; - *cp++ = (in_size>>16) & 0xff; - *cp++ = (in_size>>24) & 0xff; - - memmove(out_buf + tab_size, ptr, out_end-ptr); - - cleanup: - free(syms); - free(F); - free(T); - - return out_buf; -} - -unsigned char *rans_uncompress_O1(unsigned char *in, unsigned int in_size, - unsigned int *out_size) { - /* Load in the static tables */ - unsigned char *cp = in + 9; - unsigned char *ptr_end = in + in_size; - int i, j = -999, x, rle_i, rle_j; - unsigned int out_sz, in_sz; - char *out_buf = NULL; - ari_decoder *D = NULL; /* D[256] */ - RansDecSymbol (*syms)[256] = NULL; /* syms[256][256] */ - - if (in_size < 27) // Need at least this many bytes to start - return NULL; - - if (*in++ != 1) // Order-1 check - return NULL; - - in_sz = ((((uint32_t) in[0])<<0) | (((uint32_t) in[1])<<8) | - (((uint32_t) in[2])<<16) | (((uint32_t) in[3])<<24)); - out_sz = ((((uint32_t) in[4])<<0) | (((uint32_t) in[5])<<8) | - (((uint32_t) in[6])<<16) | (((uint32_t) in[7])<<24)); - if (in_sz != in_size-9) - return NULL; - - if (out_sz >= INT_MAX) - return NULL; // protect against some overflow cases - - // calloc may add 2% overhead to CRAM decode, but on linux with glibc it's - // often the same thing due to using mmap. - D = calloc(256, sizeof(*D)); - if (!D) goto cleanup; - syms = malloc(256 * sizeof(*syms)); - if (!syms) goto cleanup; - /* These memsets prevent illegal memory access in syms due to - broken compressed data. 
As D is calloc'd, all illegal transitions - will end up in either row or column 0 of syms. */ - memset(&syms[0], 0, sizeof(syms[0])); - for (i = 1; i < 256; i++) memset(&syms[i][0], 0, sizeof(syms[0][0])); - - //fprintf(stderr, "out_sz=%d\n", out_sz); - - //i = *cp++; - rle_i = 0; - i = *cp++; - do { - rle_j = x = 0; - j = *cp++; - do { - int F, C; - if (cp > ptr_end - 16) goto cleanup; // Not enough input bytes left - if ((F = *cp++) >= 128) { - F &= ~128; - F = ((F & 127) << 8) | *cp++; - } - C = x; - - //fprintf(stderr, "i=%d j=%d F=%d C=%d\n", i, j, F, C); - - if (!F) - F = TOTFREQ; - - RansDecSymbolInit(&syms[i][j], C, F); - - /* Build reverse lookup table */ - if (x + F > TOTFREQ) - goto cleanup; - memset(&D[i].R[x], j, F); - - x += F; - - if (!rle_j && j+1 == *cp) { - j = *cp++; - rle_j = *cp++; - } else if (rle_j) { - rle_j--; - j++; - if (j > 255) - goto cleanup; - } else { - j = *cp++; - } - } while(j); - - if (x < TOTFREQ-1 || x > TOTFREQ) - goto cleanup; - if (x < TOTFREQ) // historically we fill 4095, not 4096 - D[i].R[x] = D[i].R[x-1]; - - if (!rle_i && i+1 == *cp) { - i = *cp++; - rle_i = *cp++; - } else if (rle_i) { - rle_i--; - i++; - if (i > 255) - goto cleanup; - } else { - i = *cp++; - } - } while (i); - - // Precompute reverse lookup of frequency. 
- - RansState rans0, rans1, rans2, rans3; - uint8_t *ptr = cp; - if (ptr > ptr_end - 16) goto cleanup; // Not enough input bytes left - RansDecInit(&rans0, &ptr); if (rans0 < RANS_BYTE_L) goto cleanup; - RansDecInit(&rans1, &ptr); if (rans1 < RANS_BYTE_L) goto cleanup; - RansDecInit(&rans2, &ptr); if (rans2 < RANS_BYTE_L) goto cleanup; - RansDecInit(&rans3, &ptr); if (rans3 < RANS_BYTE_L) goto cleanup; - - int isz4 = out_sz>>2; - int l0 = 0; - int l1 = 0; - int l2 = 0; - int l3 = 0; - int i4[] = {0*isz4, 1*isz4, 2*isz4, 3*isz4}; - - RansState R[4]; - R[0] = rans0; - R[1] = rans1; - R[2] = rans2; - R[3] = rans3; - - /* Allocate output buffer */ - out_buf = malloc(out_sz); - if (!out_buf) goto cleanup; - - for (; i4[0] < isz4; i4[0]++, i4[1]++, i4[2]++, i4[3]++) { - uint32_t m[4] = {R[0] & ((1u << TF_SHIFT)-1), - R[1] & ((1u << TF_SHIFT)-1), - R[2] & ((1u << TF_SHIFT)-1), - R[3] & ((1u << TF_SHIFT)-1)}; - - uint8_t c[4] = {D[l0].R[m[0]], - D[l1].R[m[1]], - D[l2].R[m[2]], - D[l3].R[m[3]]}; - - out_buf[i4[0]] = c[0]; - out_buf[i4[1]] = c[1]; - out_buf[i4[2]] = c[2]; - out_buf[i4[3]] = c[3]; - - //RansDecAdvanceSymbolStep(&R[0], &syms[l0][c[0]], TF_SHIFT); - //RansDecAdvanceSymbolStep(&R[1], &syms[l1][c[1]], TF_SHIFT); - //RansDecAdvanceSymbolStep(&R[2], &syms[l2][c[2]], TF_SHIFT); - //RansDecAdvanceSymbolStep(&R[3], &syms[l3][c[3]], TF_SHIFT); - - R[0] = syms[l0][c[0]].freq * (R[0]>>TF_SHIFT); - R[0] += m[0] - syms[l0][c[0]].start; - R[1] = syms[l1][c[1]].freq * (R[1]>>TF_SHIFT); - R[1] += m[1] - syms[l1][c[1]].start; - R[2] = syms[l2][c[2]].freq * (R[2]>>TF_SHIFT); - R[2] += m[2] - syms[l2][c[2]].start; - R[3] = syms[l3][c[3]].freq * (R[3]>>TF_SHIFT); - R[3] += m[3] - syms[l3][c[3]].start; - - if (ptr < ptr_end - 8) { // Each renorm reads no more than 2 bytes - RansDecRenorm(&R[0], &ptr); - RansDecRenorm(&R[1], &ptr); - RansDecRenorm(&R[2], &ptr); - RansDecRenorm(&R[3], &ptr); - } else { - RansDecRenormSafe(&R[0], &ptr, ptr_end); - RansDecRenormSafe(&R[1], &ptr, 
ptr_end); - RansDecRenormSafe(&R[2], &ptr, ptr_end); - RansDecRenormSafe(&R[3], &ptr, ptr_end); - } - - l0 = c[0]; - l1 = c[1]; - l2 = c[2]; - l3 = c[3]; - } - - // Remainder - for (; i4[3] < out_sz; i4[3]++) { - unsigned char c3 = D[l3].R[RansDecGet(&R[3], TF_SHIFT)]; - out_buf[i4[3]] = c3; - - uint32_t m = R[3] & ((1u << TF_SHIFT)-1); - R[3] = syms[l3][c3].freq * (R[3]>>TF_SHIFT) + m - syms[l3][c3].start; - RansDecRenormSafe(&R[3], &ptr, ptr_end); - l3 = c3; - } - - *out_size = out_sz; - - cleanup: - if (D) - free(D); - free(syms); - - return (unsigned char *)out_buf; -} - -/*----------------------------------------------------------------------------- - * Simple interface to the order-0 vs order-1 encoders and decoders. - */ -unsigned char *rans_compress(unsigned char *in, unsigned int in_size, - unsigned int *out_size, int order) { - return order - ? rans_compress_O1(in, in_size, out_size) - : rans_compress_O0(in, in_size, out_size); -} - -unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size, - unsigned int *out_size) { - /* Both rans_uncompress functions need to be able to read at least 9 - bytes. */ - if (in_size < 9) - return NULL; - return in[0] - ? rans_uncompress_O1(in, in_size, out_size) - : rans_uncompress_O0(in, in_size, out_size); -} - - -#ifdef TEST_MAIN -/*----------------------------------------------------------------------------- - * Main. - * - * This is a simple command line tool for testing order-0 and order-1 - * compression using the rANS codec. Simply compile with - * - * gcc -DTEST_MAIN -O3 -I. 
cram/rANS_static.c -o cram/rANS_static - * - * Usage: cram/rANS_static -o0 < file > file.o0 - * cram/rANS_static -d < file.o0 > file2 - * - * cram/rANS_static -o1 < file > file.o1 - * cram/rANS_static -d < file.o1 > file2 - */ -int main(int argc, char **argv) { - int opt, order = 0; - unsigned char in_buf[BLK_SIZE2+257*257*3]; - int decode = 0; - FILE *infp = stdin, *outfp = stdout; - struct timeval tv1, tv2; - size_t bytes = 0; - - extern char *optarg; - extern int optind; - - while ((opt = getopt(argc, argv, "o:d")) != -1) { - switch (opt) { - case 'o': - order = atoi(optarg); - break; - - case 'd': - decode = 1; - break; - } - } - - order = order ? 1 : 0; // Only support O(0) and O(1) - - if (optind < argc) { - if (!(infp = fopen(argv[optind], "rb"))) { - perror(argv[optind]); - return 1; - } - optind++; - } - - if (optind < argc) { - if (!(outfp = fopen(argv[optind], "wb"))) { - perror(argv[optind]); - fclose(infp); - return 1; - } - optind++; - } - - gettimeofday(&tv1, NULL); - - if (decode) { - // Only used in some test implementations of RC_GetFreq() - //RC_init(); - //RC_init2(); - - for (;;) { - uint32_t in_size, out_size; - unsigned char *out; - - if (9 != fread(in_buf, 1, 9, infp)) - break; - in_size = *(int *)&in_buf[1]; - if (in_size != fread(in_buf+9, 1, in_size, infp)) { - fprintf(stderr, "Truncated input\n"); - exit(1); - } - out = rans_uncompress(in_buf, in_size+9, &out_size); - if (!out) - abort(); - - fwrite(out, 1, out_size, outfp); - free(out); - - bytes += out_size; - } - } else { - for (;;) { - uint32_t in_size, out_size; - unsigned char *out; - - in_size = fread(in_buf, 1, BLK_SIZE, infp); - if (in_size <= 0) - break; - - out = rans_compress(in_buf, in_size, &out_size, order); - - fwrite(out, 1, out_size, outfp); - free(out); - - bytes += in_size; - } - } - - gettimeofday(&tv2, NULL); - - fprintf(stderr, "Took %ld microseconds, %5.1f MB/s\n", - (long)(tv2.tv_sec - tv1.tv_sec)*1000000 + - tv2.tv_usec - tv1.tv_usec, - (double)bytes / 
((long)(tv2.tv_sec - tv1.tv_sec)*1000000 + - tv2.tv_usec - tv1.tv_usec)); - - if (infp != stdin) fclose(infp); - if (outfp != stdout) fclose(outfp); - - return 0; -} -#endif diff --git a/cram/rANS_static.h b/cram/rANS_static.h deleted file mode 100644 index 5c3cf2cbf..000000000 --- a/cram/rANS_static.h +++ /dev/null @@ -1,51 +0,0 @@ -/* - * Copyright (c) 2014 Genome Research Ltd. - * Author(s): James Bonfield - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * - * 1. Redistributions of source code must retain the above copyright notice, - * this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following - * disclaimer in the documentation and/or other materials provided - * with the distribution. - * - * 3. Neither the names Genome Research Ltd and Wellcome Trust Sanger - * Institute nor the names of its contributors may be used to endorse - * or promote products derived from this software without specific - * prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS - * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A - * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH - * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - - -#ifndef RANS_STATIC_H -#define RANS_STATIC_H - -#ifdef __cplusplus -extern "C" { -#endif - -unsigned char *rans_compress(unsigned char *in, unsigned int in_size, - unsigned int *out_size, int order); -unsigned char *rans_uncompress(unsigned char *in, unsigned int in_size, - unsigned int *out_size); - -#ifdef __cplusplus -} -#endif - -#endif /* RANS_STATIC_H */ diff --git a/htscodecs b/htscodecs new file mode 160000 index 000000000..4e06c5d79 --- /dev/null +++ b/htscodecs @@ -0,0 +1 @@ +Subproject commit 4e06c5d795b2a603bebf141cb88082901b41399c diff --git a/htscodecs.mk.in b/htscodecs.mk.in new file mode 100644 index 000000000..f3bba4f31 --- /dev/null +++ b/htscodecs.mk.in @@ -0,0 +1,5 @@ +# This is @configure_input@ + +# This file selects Makefile rules for htscodecs + +include $(HTSPREFIX)@selected_htscodecs_mk@ diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk new file mode 100644 index 000000000..fac3fd956 --- /dev/null +++ b/htscodecs_bundled.mk @@ -0,0 +1,3 @@ +HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c + +HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) diff --git a/htscodecs_external.mk b/htscodecs_external.mk new file mode 100644 index 000000000..8d5640842 --- /dev/null +++ b/htscodecs_external.mk @@ -0,0 +1,6 @@ +HTSCODECS_SOURCES = +HTSCODECS_OBJS = + +htscodecs_rANS_static_h = + +htscodecs_rANS_byte_h = diff --git a/htslib.mk b/htslib.mk index 254c25797..4d0fa0205 100644 --- a/htslib.mk +++ b/htslib.mk @@ -35,6 +35,13 @@ HTSPREFIX = $(HTSDIR)/ include $(HTSDIR)/htslib_vars.mk +# This file provides the HTSCODECS_SOURCES variable. It may not be present +# in a freshly checked-out htslib, so is only included if available. The +# absence is unlikely to cause a problem as there will be plenty of other +# missing files that will trigger a build in htslib, and when that happens +# htslib's makefile will create it. +-include $(HTSDIR)/htscodecs.mk + # Rules for rebuilding an in-development htslib's static and shared libraries. 
# If your program foo links with libhts, adding the appropriate prerequisite # will cause the library to be rebuilt as necessary: @@ -138,13 +145,11 @@ HTSLIB_ALL = \ $(HTSDIR)/cram/os.h \ $(HTSDIR)/cram/pooled_alloc.c \ $(HTSDIR)/cram/pooled_alloc.h \ - $(HTSDIR)/cram/rANS_byte.h \ - $(HTSDIR)/cram/rANS_static.c \ - $(HTSDIR)/cram/rANS_static.h \ $(HTSDIR)/cram/string_alloc.c \ $(HTSDIR)/cram/string_alloc.h \ $(HTSDIR)/os/lzma_stub.h \ - $(HTSDIR)/os/rand.c + $(HTSDIR)/os/rand.c \ + $(HTSCODECS_SOURCES) $(HTSDIR)/config.h: +cd $(HTSDIR) && $(MAKE) config.h diff --git a/test/maintainer/check_copyright.pl b/test/maintainer/check_copyright.pl index 161a7214c..43fb5be4e 100755 --- a/test/maintainer/check_copyright.pl +++ b/test/maintainer/check_copyright.pl @@ -43,6 +43,9 @@ sub check { # Only check C, perl and shell files return unless (/(?:\.[ch]|\.pl|\.sh)$/); + # Exclude htscodecs submodule + return if (/\/htscodecs\//); + # Exclusions: my %exclude = map { ("$root/$_", 1) } ( 'config.h', # Auto-generated diff --git a/test/maintainer/check_spaces.pl b/test/maintainer/check_spaces.pl index d5a53b1ed..81b4ededc 100755 --- a/test/maintainer/check_spaces.pl +++ b/test/maintainer/check_spaces.pl @@ -43,6 +43,9 @@ sub check { # Only check C, perl and shell files return unless (/(?:\.[ch]|\.pl|\.sh)$/); + # Exclude htscodecs submodule + return if (/\/htscodecs\//); + my %allow_tabs = map { ("$root/$_", 1) } ( 'kfunc.c', 'kstring.c', From 4a38d3d5dcb26e09ab33dd5195995b58120b3958 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 16 Sep 2019 17:56:23 +0100 Subject: [PATCH 041/488] Added CRAM 3.1 encode and decode support. Note: this is experimental, subject to the 3.1 spec being agreed upon by the GA4GH File Formats committee. Example usage (high compression): ./test/test_view -C -o version=3.1 -o use_fqz -o use_bzip2 -o use_arith \ -o seqs_per_slice=100000 /tmp/30.cram -p /tmp/31-max.cram Also added support for compression profiles. 
Primarily these are CRAM only, but for consistency they're also used as synonyms for compression levels in bgzf. In the process of adding CRAM 3.1 several improvements and refactoring took place to the cram encoder (mainly by dint of copying over the cram_compress_block function from io_lib): - Added a "Gzip -1" deflate strategy as sometimes it's smaller than gzip or comparable size (but considerably faster). This is purely for metrics and optimisation. - Refactored the cram_compress_block function so it can deal with many more compression methods / strategies. It now has a number of lookup tables which need to be kept synchronised (see comments). - Added a new cram_compress_block2 function which takes a slice. This is used for the fqzcomp code to obtain the read lengths. - Cram_compress_block now has a consistency metric too. This is to learn when data isn't changing rapidly, so that we are not too quick to reevaluate. --- Makefile | 17 +- cram/cram_encode.c | 121 ++++++-- cram/cram_io.c | 701 +++++++++++++++++++++++++++--------------- cram/cram_io.h | 3 + cram/cram_structs.h | 45 +-- hts.c | 52 +++- htscodecs_bundled.mk | 7 +- htscodecs_external.mk | 9 + htslib/cram.h | 45 ++- htslib/hts.h | 13 + 10 files changed, 691 insertions(+), 322 deletions(-) diff --git a/Makefile b/Makefile index 53a39d13a..d6525a516 100644 --- a/Makefile +++ b/Makefile @@ -212,9 +212,18 @@ sam_internal_h = sam_internal.h $(htslib_sam_h) textutils_internal_h = textutils_internal.h $(htslib_kstring_h) thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) +htscodecs_arith_dynamic_h = htscodecs/htscodecs/arith_dynamic.h +htscodecs_fqzcomp_qual_h = htscodecs/htscodecs/fqzcomp_qual.h +htscodecs_pack_h = htscodecs/htscodecs/pack.h htscodecs_rANS_static_h = htscodecs/htscodecs/rANS_static.h +htscodecs_rANS_static4x16_h = htscodecs/htscodecs/rANS_static4x16.h +htscodecs_tokenise_name3_h = htscodecs/htscodecs/tokenise_name3.h +htscodecs_varint_h = htscodecs/htscodecs/varint.h
htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h +htscodecs_c_range_coder_h = htscodecs/htscodecs/c_range_coder.h +htscodecs_c_simple_model_h = htscodecs/htscodecs/c_simple_model.h $(htscodecs_c_range_coder_h) +htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h # To be effective, config.mk needs to appear after most Makefile variables are # set but before most rules appear, so that it can both use previously-set @@ -374,7 +383,7 @@ cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) -cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) +cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h $(htslib_hts_log_h) $(cram_os_h) cram/mFILE.h cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(htslib_hts_h) @@ -382,7 +391,13 @@ cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c config.h 
cram/po cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/string_alloc.h thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) $(htslib_hts_log_h) +htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_c_simple_model.h) +htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model.h) +htscodecs/htscodecs/pack.o htscodecs/htscodecs/pack.pico: htscodecs/htscodecs/pack.c config.h $(htscodecs_pack_h) +htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_rANS_static_h) +htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h) + bgzip: bgzip.o libhts.a $(CC) $(LDFLAGS) -o $@ bgzip.o libhts.a $(LIBS) -lpthread diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 66f27eebf..7d2f5a9a5 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -825,86 +825,139 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { /* Compress the CORE Block too, with minimal zlib level */ if (level > 5 && s->block[0]->uncomp_size > 500) - cram_compress_block(fd, s->block[0], NULL, 1<block[0], NULL, 1<use_bz2) method |= 1<use_rans) - method |= (1<use_rans) { + method_ranspr = (1< 1) + method_ranspr 
|= + (1< 5) + method_ranspr |= (1<version >= (3<<8)+1); + if (fd->use_rans) { + methodF |= v31_or_above ? method_ranspr : method_rans; + method |= v31_or_above ? method_ranspr : method_rans; + } + + int method_arith = 0; + if (fd->use_arith) { + method_arith = (1< 1) + method_arith |= + (1<use_arith && v31_or_above) { + methodF |= method_arith; + method |= method_arith; + } if (fd->use_lzma) method |= (1<= 6) + if (level >= 5) { + method |= 1<use_fqz) { + qmethod |= 1<level > 4) { + qmethod |= 1<level > 6) { + qmethod |= (1<block[DS_IN], fd->m[DS_IN], //IN (seq) - method, level)) + if (cram_compress_block2(fd, s, s->block[DS_IN], fd->m[DS_IN], //IN (seq) + method, level)) return -1; if (fd->level == 0) { /* Do nothing */ } else if (fd->level == 1) { - if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], - methodF, 1)) + if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS], + qmethodF, 1)) return -1; for (i = DS_aux; i <= DS_aux_oz; i++) { if (s->block[i]) - if (cram_compress_block(fd, s->block[i], fd->m[i], - method, 1)) + if (cram_compress_block2(fd, s, s->block[i], fd->m[i], + method, 1)) return -1; } } else if (fd->level < 3) { - if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], - method, 1)) + if (cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS], + qmethod, 1)) return -1; - if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA], - method, 1)) + if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA], + method, 1)) return -1; if (s->block[DS_BB]) - if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB], - method, 1)) + if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB], + method, 1)) return -1; for (i = DS_aux; i <= DS_aux_oz; i++) { if (s->block[i]) - if (cram_compress_block(fd, s->block[i], fd->m[i], - method, level)) + if (cram_compress_block2(fd, s, s->block[i], fd->m[i], + method, level)) return -1; } } else { - if (cram_compress_block(fd, s->block[DS_QS], fd->m[DS_QS], - method, level)) + if 
(cram_compress_block2(fd, s, s->block[DS_QS], fd->m[DS_QS], + qmethod, level)) return -1; - if (cram_compress_block(fd, s->block[DS_BA], fd->m[DS_BA], - method, level)) + if (cram_compress_block2(fd, s, s->block[DS_BA], fd->m[DS_BA], + method, level)) return -1; if (s->block[DS_BB]) - if (cram_compress_block(fd, s->block[DS_BB], fd->m[DS_BB], - method, level)) + if (cram_compress_block2(fd, s, s->block[DS_BB], fd->m[DS_BB], + method, level)) return -1; for (i = DS_aux; i <= DS_aux_oz; i++) { if (s->block[i]) - if (cram_compress_block(fd, s->block[i], fd->m[i], - method, level)) + if (cram_compress_block2(fd, s, s->block[i], fd->m[i], + method, level)) return -1; } } // NAME: best is generally xz, bzip2, zlib then rans1 - if (cram_compress_block(fd, s->block[DS_RN], fd->m[DS_RN], - method & ~(1<version >= (3<<8)+1 && fd->use_tok) + method_rn |= fd->use_arith ? (1<block[DS_RN], fd->m[DS_RN], + method_rn, level)) return -1; // NS shows strong local correlation as rearrangements are localised if (s->block[DS_NS] != s->block[0]) - if (cram_compress_block(fd, s->block[DS_NS], fd->m[DS_NS], - method, level)) + if (cram_compress_block2(fd, s, s->block[DS_NS], fd->m[DS_NS], + method, level)) return -1; @@ -920,8 +973,8 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { if (s->aux_block[i]->method != RAW) continue; - if (cram_compress_block(fd, s->aux_block[i], s->aux_block[i]->m, - method, level)) + if (cram_compress_block2(fd, s, s->aux_block[i], s->aux_block[i]->m, + method, level)) return -1; } } @@ -938,7 +991,7 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { if (s->block[i]->method != RAW) continue; - if (cram_compress_block(fd, s->block[i], fd->m[i], + if (cram_compress_block2(fd, s, s->block[i], fd->m[i], methodF, level)) return -1; } diff --git a/cram/cram_io.c b/cram/cram_io.c index d5ad50000..f771d2496 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -76,8 +76,16 @@ OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(HAVE_EXTERNAL_LIBHTSCODECS) #include +#include +#include +#include +#include #else #include "../htscodecs/htscodecs/rANS_static.h" +#include "../htscodecs/htscodecs/rANS_static4x16.h" +#include "../htscodecs/htscodecs/arith_dynamic.h" +#include "../htscodecs/htscodecs/tokenise_name3.h" +#include "../htscodecs/htscodecs/fqzcomp_qual.h" #endif //#define REF_DEBUG @@ -103,6 +111,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define TRIAL_SPAN 50 #define NTRIALS 3 +#define CRAM_DEFAULT_LEVEL 5 /* ---------------------------------------------------------------------- * ITF8 encoding and decoding. @@ -1182,6 +1191,62 @@ int cram_uncompress_block(cram_block *b) { break; } + case FQZ: { + uncomp_size = b->uncomp_size; + uncomp = fqz_decompress((char *)b->data, b->comp_size, &uncomp_size, NULL, 0); + if (!uncomp) + return -1; + free(b->data); + b->data = (unsigned char *)uncomp; + b->alloc = uncomp_size; + b->method = RAW; + break; + } + + case RANS_PR0: { + unsigned int usize = b->uncomp_size, usize2; + uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2); + if (!uncomp || usize != usize2) + return -1; + b->orig_method = RANS_PR0 + (b->data[0]&1) + + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); + free(b->data); + b->data = (unsigned char *)uncomp; + b->alloc = usize2; + b->method = RAW; + b->uncomp_size = usize2; // Just incase it differs + //fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); + break; + } + + case ARITH_PR0: { + unsigned int usize = b->uncomp_size, usize2; + uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2); + if (!uncomp || usize != usize2) + return -1; + b->orig_method = ARITH_PR0 + (b->data[0]&1) + + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); + free(b->data); + b->data = (unsigned char *)uncomp; + b->alloc = usize2; + b->method = RAW; + b->uncomp_size = usize2; // Just incase it differs + 
//fprintf(stderr, "Expanded %d to %d\n", b->comp_size, b->uncomp_size); + break; + } + + case NAME_TOK3: { + uint32_t out_len; + uint8_t *cp = decode_names(b->data, b->comp_size, &out_len); + b->orig_method = NAME_TOK3; + b->method = RAW; + free(b->data); + b->data = cp; + b->alloc = out_len; + b->uncomp_size = out_len; + break; + } + default: return -1; } @@ -1189,12 +1254,14 @@ int cram_uncompress_block(cram_block *b) { return 0; } -static char *cram_compress_by_method(char *in, size_t in_size, +static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, int content_id, size_t *out_size, enum cram_block_method method, int level, int strat) { switch (method) { case GZIP: + case GZIP_RLE: + case GZIP_1: // Read names bizarrely benefit from zlib over libdeflate for // mid-range compression levels. Focusing purely of ratio or // speed, libdeflate still wins. It also seems to win for @@ -1230,6 +1297,32 @@ static char *cram_compress_by_method(char *in, size_t in_size, #endif } + case FQZ: + case FQZ_b: + case FQZ_c: + case FQZ_d: { + // Extract the necessary portion of the slice into an fqz_slice struct. + // These previously were the same thing, but this permits us to detach + // the codec from the rest of this CRAM implementation. + fqz_slice *f = malloc(2*s->hdr->num_records * sizeof(uint32_t) + sizeof(fqz_slice)); + if (!f) + return NULL; + f->num_records = s->hdr->num_records; + f->len = (uint32_t *)(((char *)f) + sizeof(fqz_slice)); + f->flags = f->len + s->hdr->num_records; + int i; + for (i = 0; i < s->hdr->num_records; i++) { + f->flags[i] = s->crecs[i].flags; + f->len[i] = (i+1 < s->hdr->num_records + ? 
s->crecs[i+1].qual - s->crecs[i].qual + : s->block[DS_QS]->uncomp_size - s->crecs[i].qual); + } + char *comp = fqz_compress(strat & 0xff /* cram vers */, f, + in, in_size, out_size, strat >> 8, NULL); + free(f); + return comp; + } + case LZMA: #ifdef HAVE_LIBLZMA return lzma_mem_deflate(in, in_size, out_size, level); @@ -1237,23 +1330,67 @@ static char *cram_compress_by_method(char *in, size_t in_size, return NULL; #endif - case RANS0: { + case RANS0: + case RANS1: { unsigned int out_size_i; unsigned char *cp; - cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 0); + cp = rans_compress((unsigned char *)in, in_size, &out_size_i, + method == RANS0 ? 0 : 1); *out_size = out_size_i; return (char *)cp; } - case RANS1: { + case RANS_PR0: + case RANS_PR1: + case RANS_PR64: + case RANS_PR9: + case RANS_PR128: + case RANS_PR129: + case RANS_PR192: + case RANS_PR193: { unsigned int out_size_i; unsigned char *cp; - cp = rans_compress((unsigned char *)in, in_size, &out_size_i, 1); + // see enum cram_block. We map RANS_* methods to order bit-fields + static int methmap[] = { 1, 64,9, 128,129, 192,193 }; + + cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i, + method == RANS_PR0 ? 0 : methmap[method - RANS_PR1]); + *out_size = out_size_i; + return (char *)cp; + } + + case ARITH_PR0: + case ARITH_PR1: + case ARITH_PR64: + case ARITH_PR9: + case ARITH_PR128: + case ARITH_PR129: + case ARITH_PR192: + case ARITH_PR193: { + unsigned int out_size_i; + unsigned char *cp; + + // see enum cram_block. We map ARITH_* methods to order bit-fields + static int methmap[] = { 1, 64,9, 128,129, 192,193 }; + + cp = arith_compress_to((unsigned char *)in, in_size, NULL, &out_size_i, + method == ARITH_PR0 ? 
0 : methmap[method - ARITH_PR1]); *out_size = out_size_i; return (char *)cp; } + case NAME_TOK3: + case NAME_TOKA: { + int out_len; + int lev = level; + if (method == NAME_TOK3 && lev > 3) + lev = 3; + uint8_t *cp = encode_names(in, in_size, lev, strat, &out_len, NULL); + *out_size = out_len; + return (char *)cp; + } + case RAW: break; @@ -1275,13 +1412,34 @@ static char *cram_compress_by_method(char *in, size_t in_size, * * Method and level -1 implies defaults, as specified in cram_fd. */ -int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, - int method, int level) { +int cram_compress_block2(cram_fd *fd, cram_slice *s, + cram_block *b, cram_metrics *metrics, + int method, int level) { char *comp = NULL; size_t comp_size = 0; int strat; + // Internally we have parameterised methods that externally map + // to the same CRAM method value. + // See enum_cram_block_method. + int methmap[] = { + // Externally defined values + RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3, + + // Reserved for possible expansion + 0, 0, + + // Internally parameterised versions matching back to above + // external values + GZIP, GZIP, + FQZ, FQZ, FQZ, + RANS, + RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, RANSPR, + TOK3, + ARITH, ARITH, ARITH, ARITH, ARITH, ARITH, ARITH, + }; + if (b->method != RAW) { // Maybe already compressed if s->block[0] was compressed and // we have e.g. 
s->block[DS_BA] set to s->block[0] due to only @@ -1314,14 +1472,10 @@ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, if (metrics) { pthread_mutex_lock(&fd->metrics_lock); if (metrics->trial > 0 || --metrics->next_trial <= 0) { - size_t sz_best = INT_MAX; - size_t sz_gz_rle = 0; - size_t sz_gz_def = 0; - size_t sz_rans0 = 0; - size_t sz_rans1 = 0; - size_t sz_bzip2 = 0; - size_t sz_lzma = 0; - int method_best = 0; + int m; + size_t sz_best = b->uncomp_size; + size_t sz[CRAM_MAX_METHOD] = {0}; + int method_best = 0; // RAW char *c_best = NULL, *c = NULL; if (metrics->revised_method) @@ -1332,243 +1486,201 @@ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, if (metrics->next_trial <= 0) { metrics->next_trial = TRIAL_SPAN; metrics->trial = NTRIALS; - metrics->sz_gz_rle /= 2; - metrics->sz_gz_def /= 2; - metrics->sz_rans0 /= 2; - metrics->sz_rans1 /= 2; - metrics->sz_bzip2 /= 2; - metrics->sz_lzma /= 2; - } - - pthread_mutex_unlock(&fd->metrics_lock); - - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_gz_rle, GZIP, 1, Z_RLE); - if (c && sz_best > sz_gz_rle) { - sz_best = sz_gz_rle; - method_best = GZIP_RLE; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_gz_rle = b->uncomp_size*2+1000; - } - - //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_rle); - } - - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_gz_def, GZIP, level, - Z_FILTERED); - if (c && sz_best > sz_gz_def) { - sz_best = sz_gz_def; - method_best = GZIP; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_gz_def = b->uncomp_size*2+1000; - } - - //fprintf(stderr, "Block %d; %d->%d\n", b->content_id, b->uncomp_size, sz_gz_def); + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] /= 2; } - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_rans0, RANS0, 0, 0); - if (c && sz_best > sz_rans0) { - sz_best = sz_rans0; - 
method_best = RANS0; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_rans0 = b->uncomp_size*2+1000; - } + // Compress this block using the best method + if (metrics->stats && metrics->stats->nvals > 16) { + // No point trying bit-pack if 17+ symbols. + if (method & (1<metrics_lock); - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_rans1, RANS1, 0, 0); - if (c && sz_best > sz_rans1) { - sz_best = sz_rans1; - method_best = RANS1; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); + for (m = 0; m < CRAM_MAX_METHOD; m++) { + if (method & (1<version); break; + case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break; + case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; + case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; + case NAME_TOK3:strat = 0; break; + case NAME_TOKA:strat = 1; break; + default: strat = 0; + } + + c = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, + b->content_id, &sz[m], m, lvl, strat); + + if (c && sz_best > sz[m]) { + sz_best = sz[m]; + method_best = m; + if (c_best) + free(c_best); + c_best = c; + } else if (c) { + free(c); + } else { + sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw + } } else { - sz_rans1 = b->uncomp_size*2+1000; + sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw } } + //fprintf(stderr, "sz_best = %d\n", sz_best); - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_bzip2, BZIP2, level, 0); - if (c && sz_best > sz_bzip2) { - sz_best = sz_bzip2; - method_best = BZIP2; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_bzip2 = b->uncomp_size*2+1000; - } - } + if (c_best) { + free(b->data); + b->data = (unsigned char *)c_best; + //printf("method_best = %s\n", cram_block_method2str(method_best)); - if (method & (1<data, b->uncomp_size, - b->content_id, &sz_lzma, LZMA, level, 0); - if (c && sz_best > sz_lzma) { - sz_best = sz_lzma; - method_best = 
LZMA; - if (c_best) - free(c_best); - c_best = c; - } else if (c) { - free(c); - } else { - sz_lzma = b->uncomp_size*2+1000; - } + b->method = method_best; // adjusted to methmap[method_best] later + b->comp_size = sz_best; } - //fprintf(stderr, "sz_best = %d\n", sz_best); - - free(b->data); - b->data = (unsigned char *)c_best; - //printf("method_best = %s\n", cram_block_method2str(method_best)); - b->method = method_best == GZIP_RLE ? GZIP : method_best; - b->comp_size = sz_best; - + // Accumulate stats for all methods tried pthread_mutex_lock(&fd->metrics_lock); - metrics->sz_gz_rle += sz_gz_rle; - metrics->sz_gz_def += sz_gz_def; - metrics->sz_rans0 += sz_rans0; - metrics->sz_rans1 += sz_rans1; - metrics->sz_bzip2 += sz_bzip2; - metrics->sz_lzma += sz_lzma; + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] += sz[m]+50; // don't be overly sure on small blocks + + // When enough trials performed, find the best on average if (--metrics->trial == 0) { int best_method = RAW; int best_sz = INT_MAX; - // Scale methods by cost - if (fd->level <= 3) { - metrics->sz_rans1 *= 1.02; - metrics->sz_gz_def *= 1.04; - metrics->sz_bzip2 *= 1.08; - metrics->sz_lzma *= 1.10; + // Relative costs of methods. 
See enum_cram_block_method and methmap + double meth_cost[32] = { + // Externally defined methods + 1, // 0 raw + 1.04, // 1 gzip (Z_FILTERED) + 1.08, // 2 bzip2 + 1.04, // 3 lzma + 1.00, // 4 rans (O0) + 1.00, // 5 ranspr (O0) + 1.03, // 6 arithpr (O0) + 1.05, // 7 fqz + 1.05, // 8 tok3 (rans) + 9, 9, // 9,10 reserved + + // Paramterised versions of above + 1.01, // gzip rle + 1.02, // gzip -1 + + 1.05, 1.05, 1.05, // FQZ_b,c,d + + 1.01, // rans O1 + + 1.01, // rans_pr1 + 1.00, // rans_pr64; if smaller, usually fast + 1.03, // rans_pr65/9 + 1.00, // rans_pr128 + 1.01, // rans_pr129 + 1.00, // rans_pr192 + 1.01, // rans_pr193 + + 1.07, // tok3 arith + + 1.04, // arith_pr1 + 1.04, // arith_pr64 + 1.04, // arith_pr65 + 1.03, // arith_pr128 + 1.04, // arith_pr129 + 1.04, // arith_pr192 + 1.04, // arith_pr193 + }; + + // Scale methods by cost based on compression level + if (fd->level <= 1) { + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] *= 1+(meth_cost[m]-1)*4; + } else if (fd->level <= 3) { + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] *= 1+(meth_cost[m]-1); } else if (fd->level <= 6) { - metrics->sz_rans1 *= 1.01; - metrics->sz_gz_def *= 1.02; - metrics->sz_bzip2 *= 1.03; - metrics->sz_lzma *= 1.05; + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] *= 1+(meth_cost[m]-1)/2; + } else if (fd->level <= 7) { + for (m = 0; m < CRAM_MAX_METHOD; m++) + metrics->sz[m] *= 1+(meth_cost[m]-1)/3; + } // else cost is ignored + + for (m = 0; m < CRAM_MAX_METHOD; m++) { + if ((!metrics->sz[m]) || (!(method & (1< metrics->sz[m]) + best_sz = metrics->sz[m], best_method = m; } - if (method & (1< metrics->sz_gz_rle) - best_sz = metrics->sz_gz_rle, best_method = GZIP_RLE; - - if (method & (1< metrics->sz_gz_def) - best_sz = metrics->sz_gz_def, best_method = GZIP; - - if (method & (1< metrics->sz_rans0) - best_sz = metrics->sz_rans0, best_method = RANS0; - - if (method & (1< metrics->sz_rans1) - best_sz = metrics->sz_rans1, best_method = RANS1; - - if (method 
& (1< metrics->sz_bzip2) - best_sz = metrics->sz_bzip2, best_method = BZIP2; - - if (method & (1< metrics->sz_lzma) - best_sz = metrics->sz_lzma, best_method = LZMA; - - if (best_method == GZIP_RLE) { - metrics->method = GZIP; - metrics->strat = Z_RLE; + if (best_method != metrics->method) { + metrics->trial = (NTRIALS+1)/2; // be sure + //metrics->next_trial /= 1.5; + metrics->consistency = 0; } else { - metrics->method = best_method; - metrics->strat = Z_FILTERED; + metrics->next_trial *= MIN(2, 1+metrics->consistency/4.0); + metrics->consistency++; } + metrics->method = best_method; + switch (best_method) { + case GZIP: strat = Z_FILTERED; break; + case GZIP_1: strat = Z_DEFAULT_STRATEGY; break; + case GZIP_RLE: strat = Z_RLE; break; + case FQZ: strat = CRAM_MAJOR_VERS(fd->version); break; + case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break; + case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; + case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; + default: strat = 0; + } + metrics->strat = strat; + // If we see at least MAXFAIL trials in a row for a specific // compression method with more than MAXDELTA aggregate // size then we drop this from the list of methods used // for this block type. 
#define MAXDELTA 0.20 #define MAXFAILS 4 - if (best_method == GZIP_RLE) { - metrics->gz_rle_cnt = 0; - metrics->gz_rle_extra = 0; - } else if (best_sz < metrics->sz_gz_rle) { - double r = (double)metrics->sz_gz_rle / best_sz - 1; - if (++metrics->gz_rle_cnt >= MAXFAILS && - (metrics->gz_rle_extra += r) >= MAXDELTA) - method &= ~(1<gz_def_cnt = 0; - metrics->gz_def_extra = 0; - } else if (best_sz < metrics->sz_gz_def) { - double r = (double)metrics->sz_gz_def / best_sz - 1; - if (++metrics->gz_def_cnt >= MAXFAILS && - (metrics->gz_def_extra += r) >= MAXDELTA) - method &= ~(1<cnt[m] = 0; + metrics->extra[m] = 0; + } else if (best_sz < metrics->sz[m]) { + double r = (double)metrics->sz[m] / best_sz - 1; + int mul = 1+(fd->level>=7); + if (++metrics->cnt[m] >= MAXFAILS*mul && + (metrics->extra[m] += r) >= MAXDELTA*mul) + method &= ~(1<sz[m] > best_sz) + method &= ~(1<rans0_cnt = 0; - metrics->rans0_extra = 0; - } else if (best_sz < metrics->sz_rans0) { - double r = (double)metrics->sz_rans0 / best_sz - 1; - if (++metrics->rans0_cnt >= MAXFAILS && - (metrics->rans0_extra += r) >= MAXDELTA) - method &= ~(1<rans1_cnt = 0; - metrics->rans1_extra = 0; - } else if (best_sz < metrics->sz_rans1) { - double r = (double)metrics->sz_rans1 / best_sz - 1; - if (++metrics->rans1_cnt >= MAXFAILS && - (metrics->rans1_extra += r) >= MAXDELTA) - method &= ~(1<bzip2_cnt = 0; - metrics->bzip2_extra = 0; - } else if (best_sz < metrics->sz_bzip2) { - double r = (double)metrics->sz_bzip2 / best_sz - 1; - if (++metrics->bzip2_cnt >= MAXFAILS && - (metrics->bzip2_extra += r) >= MAXDELTA) - method &= ~(1<lzma_cnt = 0; - metrics->lzma_extra = 0; - } else if (best_sz < metrics->sz_lzma) { - double r = (double)metrics->sz_lzma / best_sz - 1; - if (++metrics->lzma_cnt >= MAXFAILS && - (metrics->lzma_extra += r) >= MAXDELTA) - method &= ~(1<revised_method) - // fprintf(stderr, "%d: method from %x to %x\n", + //if (fd->verbose > 1 && method != metrics->revised_method) + // fprintf(stderr, "%d: 
revising method from %x to %x\n", // b->content_id, metrics->revised_method, method); metrics->revised_method = method; } @@ -1578,40 +1690,55 @@ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, method = metrics->method; pthread_mutex_unlock(&fd->metrics_lock); - comp = cram_compress_by_method((char *)b->data, b->uncomp_size, + comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, b->content_id, &comp_size, method, - level, strat); + method == GZIP_1 ? 1 : level, + strat); if (!comp) return -1; - free(b->data); - b->data = (unsigned char *)comp; - b->comp_size = comp_size; - b->method = method; + + if (comp_size < b->uncomp_size) { + free(b->data); + b->data = (unsigned char *)comp; + b->comp_size = comp_size; + b->method = method; + } else { + free(comp); + } } } else { // no cached metrics, so just do zlib? - comp = cram_compress_by_method((char *)b->data, b->uncomp_size, + comp = cram_compress_by_method(s, (char *)b->data, b->uncomp_size, b->content_id, &comp_size, GZIP, level, Z_FILTERED); if (!comp) { - hts_log_error("Compression failed"); + hts_log_error("Compression failed!"); return -1; } - free(b->data); - b->data = (unsigned char *)comp; - b->comp_size = comp_size; - b->method = GZIP; + + if (comp_size < b->uncomp_size) { + free(b->data); + b->data = (unsigned char *)comp; + b->comp_size = comp_size; + b->method = GZIP; + } else { + free(comp); + } + strat = Z_FILTERED; } hts_log_info("Compressed block ID %d from %d to %d by method %s", b->content_id, b->uncomp_size, b->comp_size, cram_block_method2str(b->method)); - if (b->method == RANS1) - b->method = RANS0; // Spec just has RANS (not 0/1) with auto-sensing + b->method = methmap[b->method]; return 0; } +int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, + int method, int level) { + return cram_compress_block2(fd, NULL, b, metrics, method, level); +} cram_metrics *cram_new_metrics(void) { cram_metrics *m = calloc(1, sizeof(*m)); @@ -1628,13 
+1755,36 @@ cram_metrics *cram_new_metrics(void) { char *cram_block_method2str(enum cram_block_method m) { switch(m) { - case RAW: return "RAW"; - case GZIP: return "GZIP"; - case BZIP2: return "BZIP2"; - case LZMA: return "LZMA"; - case RANS0: return "RANS0"; - case RANS1: return "RANS1"; - case GZIP_RLE: return "GZIP_RLE"; + case RAW: return "RAW"; + case GZIP: return "GZIP"; + case BZIP2: return "BZIP2"; + case LZMA: return "LZMA"; + case RANS0: return "RANS0"; + case RANS1: return "RANS1"; + case GZIP_RLE: return "GZIP_RLE"; + case GZIP_1: return "GZIP_1"; + case FQZ: return "FQZ"; + case FQZ_b: return "FQZ_b"; + case FQZ_c: return "FQZ_c"; + case FQZ_d: return "FQZ_d"; + case RANS_PR0: return "RANS_PR0"; + case RANS_PR1: return "RANS_PR1"; + case RANS_PR64: return "RANS_PR64"; + case RANS_PR9: return "RANS_PR9"; + case RANS_PR128: return "RANS_PR128"; + case RANS_PR129: return "RANS_PR129"; + case RANS_PR192: return "RANS_PR192"; + case RANS_PR193: return "RANS_PR193"; + case NAME_TOK3: return "TOK3_R"; + case NAME_TOKA: return "TOK3_A"; + case ARITH_PR0: return "ARITH_PR0"; + case ARITH_PR1: return "ARITH_PR1"; + case ARITH_PR64: return "ARITH_PR64"; + case ARITH_PR9: return "ARITH_PR9"; + case ARITH_PR128: return "ARITH_PR128"; + case ARITH_PR129: return "ARITH_PR129"; + case ARITH_PR192: return "ARITH_PR192"; + case ARITH_PR193: return "ARITH_PR193"; case BM_ERROR: break; } return "?"; @@ -3496,12 +3646,7 @@ void reset_metrics(cram_fd *fd) { m->next_trial = TRIAL_SPAN; m->revised_method = 0; - m->sz_gz_rle = 0; - m->sz_gz_def = 0; - m->sz_rans0 = 0; - m->sz_rans1 = 0; - m->sz_bzip2 = 0; - m->sz_lzma = 0; + memset(m->sz, 0, sizeof(m->sz)); } } @@ -4435,7 +4580,7 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { if (!fd) return NULL; - fd->level = 5; + fd->level = CRAM_DEFAULT_LEVEL; for (i = 0; mode[i]; i++) { if (mode[i] >= '0' && mode[i] <= '9') { fd->level = mode[i] - '0'; @@ -4510,6 +4655,7 @@ cram_fd *cram_dopen(hFILE *fp, 
const char *filename, const char *mode) { fd->lossy_read_names = 0; fd->use_bz2 = 0; fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3); + fd->use_tok = (CRAM_MAJOR_VERS(fd->version) >= 3) && (CRAM_MINOR_VERS(fd->version) >= 1); fd->use_lzma = 0; fd->multi_seq = -1; fd->multi_seq_user = -1; @@ -4793,6 +4939,8 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { case CRAM_OPT_SEQS_PER_SLICE: fd->seqs_per_slice = va_arg(args, int); + if (fd->bases_per_slice == BASES_PER_SLICE) + fd->bases_per_slice = fd->seqs_per_slice * 500; break; case CRAM_OPT_BASES_PER_SLICE: @@ -4833,6 +4981,18 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { fd->use_rans = va_arg(args, int); break; + case CRAM_OPT_USE_TOK: + fd->use_tok = va_arg(args, int); + break; + + case CRAM_OPT_USE_FQZ: + fd->use_fqz = va_arg(args, int); + break; + + case CRAM_OPT_USE_ARITH: + fd->use_arith = va_arg(args, int); + break; + case CRAM_OPT_USE_LZMA: fd->use_lzma = va_arg(args, int); break; @@ -4888,8 +5048,8 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { } if (!((major == 1 && minor == 0) || (major == 2 && (minor == 0 || minor == 1)) || - (major == 3 && minor == 0))) { - hts_log_error("Unknown version string; use 1.0, 2.0, 2.1 or 3.0"); + (major == 3 && (minor == 0 || minor == 1)))) { + hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0 or 3.1"); errno = EINVAL; return -1; } @@ -4897,6 +5057,8 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0; + fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) >= 3 && + CRAM_MINOR_VERS(fd->version) >= 1)) ? 
1 : 0; break; } @@ -4960,6 +5122,41 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { fd->level = va_arg(args, int); break; + case HTS_OPT_PROFILE: { + enum hts_profile_option prof = va_arg(args, int); + switch (prof) { + case HTS_PROFILE_FAST: + if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 1; + fd->use_tok = 0; + fd->seqs_per_slice = 10000; + break; + + case HTS_PROFILE_NORMAL: + break; + + case HTS_PROFILE_SMALL: + if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 6; + fd->use_bz2 = 1; + fd->use_fqz = 1; + fd->seqs_per_slice = 25000; + break; + + case HTS_PROFILE_ARCHIVE: + if (fd->level == CRAM_DEFAULT_LEVEL) fd->level = 7; + fd->use_bz2 = 1; + fd->use_fqz = 1; + fd->use_arith = 1; + if (fd->level > 7) + fd->use_lzma = 1; + fd->seqs_per_slice = 100000; + break; + } + + if (fd->bases_per_slice == BASES_PER_SLICE) + fd->bases_per_slice = fd->seqs_per_slice * 500; + break; + } + default: hts_log_error("Unknown CRAM option code %d", opt); errno = EINVAL; diff --git a/cram/cram_io.h b/cram/cram_io.h index 194305398..3954a49af 100644 --- a/cram/cram_io.h +++ b/cram/cram_io.h @@ -467,6 +467,9 @@ int cram_uncompress_block(cram_block *b); */ int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, int method, int level); +int cram_compress_block2(cram_fd *fd, cram_slice *s, + cram_block *b, cram_metrics *metrics, + int method, int level); cram_metrics *cram_new_metrics(void); char *cram_block_method2str(enum cram_block_method m); diff --git a/cram/cram_structs.h b/cram/cram_structs.h index ce7ad665a..1c51b09f7 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -194,15 +194,7 @@ struct cram_slice; /* Now in htslib/cram.h enum cram_block_method { - BM_ERROR = -1, - RAW = 0, - GZIP = 1, - BZIP2 = 2, - LZMA = 3, - RANS = 4, // Generic; either order - RANS0 = 4, - RANS1 = 10, // Not externalised; stored as RANS (generic) - GZIP_RLE = 11, // NB: not externalised in CRAM + ... 
}; */ @@ -218,39 +210,29 @@ enum cram_content_type { }; */ +/* Maximum simultaneous codecs allowed, 1 per bit */ +#define CRAM_MAX_METHOD 32 + /* Compression metrics */ struct cram_metrics { // number of trials and time to next trial int trial; int next_trial; + int consistency; // aggregate sizes during trials - int sz_gz_rle; - int sz_gz_def; - int sz_rans0; - int sz_rans1; - int sz_bzip2; - int sz_lzma; + int sz[CRAM_MAX_METHOD]; // resultant method from trials - int method; + int method, revised_method; int strat; // Revisions of method, to allow culling of continually failing ones. - int gz_rle_cnt; - int gz_def_cnt; - int rans0_cnt; - int rans1_cnt; - int bzip2_cnt; - int lzma_cnt; - int revised_method; - - double gz_rle_extra; - double gz_def_extra; - double rans0_extra; - double rans1_extra; - double bzip2_extra; - double lzma_extra; + int cnt[CRAM_MAX_METHOD]; + + double extra[CRAM_MAX_METHOD]; + + cram_stats *stats; }; // Hash aux key (XX:i) to cram_metrics @@ -729,6 +711,9 @@ struct cram_fd { int use_bz2; int use_rans; int use_lzma; + int use_fqz; + int use_tok; + int use_arith; int shared_ref; unsigned int required_fields; int store_md; diff --git a/hts.c b/hts.c index 9e50290cf..8e4552f87 100644 --- a/hts.c +++ b/hts.c @@ -761,6 +761,34 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { strcmp(o->arg, "USE_LZMA") == 0) o->opt = CRAM_OPT_USE_LZMA, o->val.i = atoi(val); + else if (strcmp(o->arg, "use_tok") == 0 || + strcmp(o->arg, "USE_TOK") == 0) + o->opt = CRAM_OPT_USE_TOK, o->val.i = atoi(val); + + else if (strcmp(o->arg, "use_fqz") == 0 || + strcmp(o->arg, "USE_FQZ") == 0) + o->opt = CRAM_OPT_USE_FQZ, o->val.i = atoi(val); + + else if (strcmp(o->arg, "use_arith") == 0 || + strcmp(o->arg, "USE_ARITH") == 0) + o->opt = CRAM_OPT_USE_ARITH, o->val.i = atoi(val); + + else if (strcmp(o->arg, "fast") == 0 || + strcmp(o->arg, "FAST") == 0) + o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_FAST; + + else if (strcmp(o->arg, "normal") == 0 || + 
strcmp(o->arg, "NORMAL") == 0) + o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_NORMAL; + + else if (strcmp(o->arg, "small") == 0 || + strcmp(o->arg, "SMALL") == 0) + o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_SMALL; + + else if (strcmp(o->arg, "archive") == 0 || + strcmp(o->arg, "ARCHIVE") == 0) + o->opt = HTS_OPT_PROFILE, o->val.i = HTS_PROFILE_ARCHIVE; + else if (strcmp(o->arg, "reference") == 0 || strcmp(o->arg, "REFERENCE") == 0) o->opt = CRAM_OPT_REFERENCE, o->val.s = val; @@ -1340,7 +1368,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { va_end(args); if (fp->is_bgzf) fp->fp.bgzf->compress_level = level; - return 0; + break; } case HTS_OPT_FILTER: { @@ -1350,6 +1378,28 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { return hts_set_filter_expression(fp, expr); } + case HTS_OPT_PROFILE: { + va_start(args, opt); + enum hts_profile_option prof = va_arg(args, int); + va_end(args); + if (fp->is_bgzf) { + switch (prof) { +#ifdef HAVE_LIBDEFLATE + case HTS_PROFILE_FAST: fp->fp.bgzf->compress_level = 2; break; + case HTS_PROFILE_NORMAL: fp->fp.bgzf->compress_level = -1; break; + case HTS_PROFILE_SMALL: fp->fp.bgzf->compress_level = 10; break; + case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 12; break; +#else + case HTS_PROFILE_FAST: fp->fp.bgzf->compress_level = 1; break; + case HTS_PROFILE_NORMAL: fp->fp.bgzf->compress_level = -1; break; + case HTS_PROFILE_SMALL: fp->fp.bgzf->compress_level = 8; break; + case HTS_PROFILE_ARCHIVE: fp->fp.bgzf->compress_level = 9; break; +#endif + } + } // else CRAM manages this in its own way + break; + } + default: break; } diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index fac3fd956..d947f985f 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -1,3 +1,8 @@ -HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c +HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ + $(HTSPREFIX)htscodecs/htscodecs/fqzcomp_qual.c \ + 
$(HTSPREFIX)htscodecs/htscodecs/pack.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \ + $(HTSPREFIX)htscodecs/htscodecs/tokenise_name3.c HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) diff --git a/htscodecs_external.mk b/htscodecs_external.mk index 8d5640842..f8d4d7d4a 100644 --- a/htscodecs_external.mk +++ b/htscodecs_external.mk @@ -1,6 +1,15 @@ HTSCODECS_SOURCES = HTSCODECS_OBJS = +htscodecs_arith_dynamic_h = +htscodecs_fqzcomp_qual_h = +htscodecs_pack_h = htscodecs_rANS_static_h = +htscodecs_rANS_static4x16_h = +htscodecs_tokenise_name3_h = +htscodecs_varint_h = htscodecs_rANS_byte_h = +htscodecs_c_range_coder_h = +htscodecs_c_simple_model_h = +htscodecs_pooled_alloc_h = diff --git a/htslib/cram.h b/htslib/cram.h index bbbabe82e..890896388 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -53,10 +53,46 @@ enum cram_block_method { GZIP = 1, BZIP2 = 2, LZMA = 3, - RANS = 4, // Generic; either order - RANS0 = 4, - RANS1 = 10, // Not externalised; stored as RANS (generic) + RANS = 4, RANS0 = RANS, + RANSPR = 5, RANS_PR0 = RANSPR, + ARITH = 6, ARITH_PR0 = ARITH, + FQZ = 7, + TOK3 = 8, NAME_TOK3 = TOK3, + + // Methods not externalised, but used in metrics. + // Externally they become one of the above methods. 
GZIP_RLE = 11, // NB: not externalised in CRAM + GZIP_1, // Z_DEFAULT_STRATEGY level 1, NB: not externalised in CRAM + + FQZ_b, FQZ_c, FQZ_d, // Various preset FQZ methods + + //RANS0, // Order 0 + RANS1, + + //RANS_PR0, // Order 0 + RANS_PR1, // Order 1 + RANS_PR64, // O0 + RLE + RANS_PR9, // O1 + X4 + RANS_PR128, // O0 + Pack + RANS_PR129, // O1 + Pack + RANS_PR192, // O0 + RLE + pack + RANS_PR193, // O1 + RLE + pack + + //NAME_TOK3, // tok+rans + NAME_TOKA, // tok+arith + + //ARITH_PR0, // Order 0 + ARITH_PR1, // Order 1 + ARITH_PR64, // O0 + RLE + ARITH_PR9, // O1 + X4 + ARITH_PR128, // O0 + Pack + ARITH_PR129, // O1 + Pack + ARITH_PR192, // O0 + RLE + pack + ARITH_PR193, // O1 + RLE + pack + + // NB: must end on no more than 31 unless we change to a + // 64-bit method type. + }; enum cram_content_type { @@ -306,6 +342,9 @@ int cram_uncompress_block(cram_block *b); HTSLIB_EXPORT int cram_compress_block(cram_fd *fd, cram_block *b, cram_metrics *metrics, int method, int level); +int cram_compress_block2(cram_fd *fd, cram_slice *s, + cram_block *b, cram_metrics *metrics, + int method, int level); /**@}*/ /**@{ ---------------------------------------------------------------------- diff --git a/htslib/hts.h b/htslib/hts.h index f4e06efdd..3c6a3dcb6 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -316,6 +316,9 @@ enum hts_fmt_option { CRAM_OPT_STORE_MD, CRAM_OPT_STORE_NM, CRAM_OPT_RANGE_NOSEEK, // CRAM_OPT_RANGE minus the seek + CRAM_OPT_USE_TOK, + CRAM_OPT_USE_FQZ, + CRAM_OPT_USE_ARITH, // General purpose HTS_OPT_COMPRESSION_LEVEL = 100, @@ -324,6 +327,16 @@ enum hts_fmt_option { HTS_OPT_CACHE_SIZE, HTS_OPT_BLOCK_SIZE, HTS_OPT_FILTER, + HTS_OPT_PROFILE, +}; + +// Profile options for encoding; primarily used at present in CRAM +// but also usable in BAM as a synonym for deflate compression levels. 
+enum hts_profile_option { + HTS_PROFILE_FAST, + HTS_PROFILE_NORMAL, + HTS_PROFILE_SMALL, + HTS_PROFILE_ARCHIVE, }; // For backwards compatibility From a22e8ca60df122e23412dd255d3ffb4fc9f6a350 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 18 Sep 2019 16:34:34 +0100 Subject: [PATCH 042/488] Minor tweaks to codec learning params. --- cram/cram_io.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index f771d2496..06affc592 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -108,7 +108,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define PATH_MAX FILENAME_MAX #endif -#define TRIAL_SPAN 50 +#define TRIAL_SPAN 70 #define NTRIALS 3 #define CRAM_DEFAULT_LEVEL 5 @@ -1573,18 +1573,18 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, // Externally defined methods 1, // 0 raw 1.04, // 1 gzip (Z_FILTERED) - 1.08, // 2 bzip2 - 1.04, // 3 lzma + 1.07, // 2 bzip2 + 1.08, // 3 lzma 1.00, // 4 rans (O0) 1.00, // 5 ranspr (O0) - 1.03, // 6 arithpr (O0) + 1.04, // 6 arithpr (O0) 1.05, // 7 fqz 1.05, // 8 tok3 (rans) 9, 9, // 9,10 reserved // Paramterised versions of above 1.01, // gzip rle - 1.02, // gzip -1 + 1.01, // gzip -1 1.05, 1.05, 1.05, // FQZ_b,c,d @@ -1745,7 +1745,7 @@ cram_metrics *cram_new_metrics(void) { if (!m) return NULL; m->trial = NTRIALS-1; - m->next_trial = TRIAL_SPAN; + m->next_trial = TRIAL_SPAN/2; // learn quicker at start m->method = RAW; m->strat = 0; m->revised_method = 0; From 9be58e5a68e01eddd679f41a95836d3a6ecae0b7 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 18 Sep 2019 11:38:47 +0100 Subject: [PATCH 043/488] Ensure threads in the pool have a big enough stack Some rANS codecs require over 2 Mbytes of temporary space for the encode/decoder state. On some platforms, notably MacOS the default thread stack size is not big enough to hold this. 
Previously we fixed this by using malloc instead, but that has other undesirable side effects such as repeated mmap/munmap and wasting time zeroing pages over and over. Our traditional fix here has been to use pthread_once to get a single malloc and then reuse this block, to avoid the mmap issues in glibc. However, this is complicated; ensuring the thread stack is sufficiently big is much easier. --- thread_pool.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/thread_pool.c b/thread_pool.c index 2bc427d51..f56b781b3 100644 --- a/thread_pool.c +++ b/thread_pool.c @@ -42,6 +42,10 @@ DEALINGS IN THE SOFTWARE. */ #include "thread_pool_internal.h" #include "htslib/hts_log.h" +// Minimum stack size for threads. Required for some rANS codecs +// that use over 2Mbytes of stack for encoder / decoder state +#define HTS_MIN_THREAD_STACK (3 * 1024 * 1024) + static void hts_tpool_process_detach_locked(hts_tpool *p, hts_tpool_process *q); @@ -716,6 +720,9 @@ static void wake_next_worker(hts_tpool_process *q, int locked) { */ hts_tpool *hts_tpool_init(int n) { int t_idx = 0; + size_t stack_size = 0; + pthread_attr_t pattr; + int pattr_init_done = 0; hts_tpool *p = malloc(sizeof(*p)); if (!p) return NULL; @@ -748,18 +755,32 @@ hts_tpool *hts_tpool_init(int n) { pthread_mutex_lock(&p->pool_m); + // Ensure new threads have a reasonably large stack. On some platforms, + // for example MacOS which defaults to 512Kb, this is not big enough + // for some of the rANS codecs.
+ + if (pthread_attr_init(&pattr) < 0) + goto cleanup; + pattr_init_done = 1; + if (pthread_attr_getstacksize(&pattr, &stack_size) < 0) + goto cleanup; + if (stack_size < HTS_MIN_THREAD_STACK) { + if (pthread_attr_setstacksize(&pattr, HTS_MIN_THREAD_STACK) < 0) + goto cleanup; + } + for (t_idx = 0; t_idx < n; t_idx++) { hts_tpool_worker *w = &p->t[t_idx]; p->t_stack[t_idx] = 0; w->p = p; w->idx = t_idx; pthread_cond_init(&w->pending_c, NULL); - if (0 != pthread_create(&w->tid, NULL, tpool_worker, w)) { + if (0 != pthread_create(&w->tid, &pattr, tpool_worker, w)) goto cleanup; - } } pthread_mutex_unlock(&p->pool_m); + pthread_attr_destroy(&pattr); return p; @@ -778,6 +799,8 @@ hts_tpool *hts_tpool_init(int n) { pthread_cond_destroy(&p->t[j].pending_c); } pthread_mutex_destroy(&p->pool_m); + if (pattr_init_done) + pthread_attr_destroy(&pattr); free(p->t_stack); free(p->t); free(p); From 4182364d6e79cd640a492968c54fa6f91d8bef29 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 23 Oct 2020 18:04:24 +0100 Subject: [PATCH 044/488] Add warning about unofficial status of CRAM 3.1. This cannot be avoided! It's spam, but it will be removed once the spec becomes official. If there are no changes, that does mean this htslib will be compliant, but spammy, however we can then make a new release. --- cram/cram_io.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/cram/cram_io.c b/cram/cram_io.c index 06affc592..fa8b3e346 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -5053,6 +5053,15 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { errno = EINVAL; return -1; } + + if (major > 3 || (major == 3 && minor > 0)) { + hts_log_warning( + "CRAM version %s is still in draft and is subject to\n" + "change. Please consider this a technology demonstration " + "and do not use for\n" + "long term archival of data.", s); + } + fd->version = major*256 + minor; fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 
1 : 0; From 1e79b61467de646c63f614a599fcdba7a76bf03b Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 19 Sep 2019 09:57:35 +0100 Subject: [PATCH 045/488] Add some very minimal 3.1 cram tests Not sure we have any files large enough to really put it through its paces, but this is a reasonable start. --- test/test.pl | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/test/test.pl b/test/test.pl index 5db7b2320..823a414c5 100755 --- a/test/test.pl +++ b/test/test.pl @@ -582,6 +582,34 @@ sub test_view testv $opts, "./test_view $tv_args $cram > $cram.sam_"; testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + ## Experimental CRAM 3.1 support. + # SAM -> CRAM31u -> SAM + foreach my $profile (qw/fast normal small archive/) { + $cram = "$base.tmp.cram"; + testv $opts, "./test_view $tv_args -t $ref -S -l7 -C -o VERSION=3.1 -o $profile $sam > $cram"; + testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + } + + # BAM -> CRAM31 -> BAM -> SAM + $cram = "$bam.cram"; + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.1 $bam > $cram"; + testv $opts, "./test_view $tv_args -b -D $cram > $cram.bam"; + testv $opts, "./test_view $tv_args $cram.bam > $cram.bam.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.bam.sam_"; + + # CRAM31 -> CRAM30 + $cram = "$base.tmp.cram"; + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 $cram > $cram.cram"; + + # CRAM30 -> CRAM31 + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.1 $cram.cram > $cram"; + + # CRAM31 -> CRAM31 + multi-slice + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.1 -o seqs_per_slice=7 -o slices_per_container=5 $cram.cram > $cram"; + testv $opts, "./test_view $tv_args $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + # Java pre-made CRAM -> SAM my $jcram = "${base}_java.cram"; if (-e $jcram) { From 3e05b18bc8f9e74de3cfbb305358f3cc57b06d1e Mon Sep 17 
00:00:00 2001 From: James Bonfield Date: Mon, 3 Feb 2020 12:42:07 +0000 Subject: [PATCH 046/488] Updated to v0.5 htscodecs --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 4e06c5d79..99ed6bcca 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 4e06c5d795b2a603bebf141cb88082901b41399c +Subproject commit 99ed6bcca2192a3c210fac04939c7e51a0b15a3c From 0d1971cbebb269d90a563464e4f56987290e20ab Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 7 Feb 2020 15:28:37 +0000 Subject: [PATCH 047/488] First tranche of CRAM V4.0 support. We can read/write CRAM 3.1. We can read CRAM 4.0, but writing 4.0 is corrupted. --- Makefile | 8 +- cram/cram_codecs.c | 1751 +++++++++++++++++++++++++++++++++++------ cram/cram_codecs.h | 60 +- cram/cram_decode.c | 516 +++++++----- cram/cram_encode.c | 735 ++++++++--------- cram/cram_external.c | 10 +- cram/cram_io.c | 847 ++++++++++++++++---- cram/cram_io.h | 305 +------ cram/cram_structs.h | 50 +- hts.c | 2 +- htscodecs_bundled.mk | 1 + htscodecs_external.mk | 1 + htslib/cram.h | 12 +- test/test.pl | 28 + 14 files changed, 3013 insertions(+), 1313 deletions(-) diff --git a/Makefile b/Makefile index d6525a516..4168c0463 100644 --- a/Makefile +++ b/Makefile @@ -217,6 +217,7 @@ htscodecs_fqzcomp_qual_h = htscodecs/htscodecs/fqzcomp_qual.h htscodecs_pack_h = htscodecs/htscodecs/pack.h htscodecs_rANS_static_h = htscodecs/htscodecs/rANS_static.h htscodecs_rANS_static4x16_h = htscodecs/htscodecs/rANS_static4x16.h +htscodecs_rle_h = htscodecs/htscodecs/rle.h htscodecs_tokenise_name3_h = htscodecs/htscodecs/tokenise_name3.h htscodecs_varint_h = htscodecs/htscodecs/varint.h @@ -378,12 +379,12 @@ probaln.o probaln.pico: probaln.c config.h $(htslib_hts_h) realn.o realn.pico: realn.c config.h $(htslib_hts_h) $(htslib_sam_h) textutils.o textutils.pico: textutils.c config.h $(htslib_hfile_h) $(htslib_kstring_h) $(htslib_sam_h) $(hts_internal_h) -cram/cram_codecs.o
cram/cram_codecs.pico: cram/cram_codecs.c config.h $(cram_h) +cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(htslib_hts_endian_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(cram_h) cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) -cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) +cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h $(htslib_hts_log_h) $(cram_os_h) cram/mFILE.h cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) $(cram_open_trace_file_h) $(cram_misc_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(htslib_hts_h) @@ -394,8 +395,9 @@ thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) htscodecs/htscodecs/arith_dynamic.o 
htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_c_simple_model.h) htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model.h) htscodecs/htscodecs/pack.o htscodecs/htscodecs/pack.pico: htscodecs/htscodecs/pack.c config.h $(htscodecs_pack_h) -htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) +htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_rANS_static_h) +htscodecs/htscodecs/rle.o htscodecs/htscodecs/rle.pico: htscodecs/htscodecs/rle.c config.h $(htscodecs_varint_h) $(htscodecs_rle_h) htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 80131b633..a598f3964 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -42,6 +42,19 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include +#include + +#include "../htslib/hts_endian.h" + +#if defined(HAVE_EXTERNAL_LIBHTSCODECS) +#include +#include +#include +#else +#include "../htscodecs/htscodecs/varint.h" +#include "../htscodecs/htscodecs/pack.h" +#include "../htscodecs/htscodecs/rle.h" +#endif #include "cram.h" @@ -149,8 +162,8 @@ static void store_bytes_MSB(cram_block *block, char *bytes, int len) { #endif /* Local optimised copy for inlining */ -static inline unsigned int get_bits_MSB(cram_block *block, int nbits) { - unsigned int val = 0; +static inline int64_t get_bits_MSB(cram_block *block, int nbits) { + uint64_t val = 0; int i; #if 0 @@ -239,7 +252,7 @@ static inline unsigned int get_bits_MSB(cram_block *block, int nbits) { * characters with exactly the correct frequency distribution we check * for it elsewhere.) */ -static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) { +static int store_bits_MSB(cram_block *block, uint64_t val, int nbits) { //fprintf(stderr, " store_bits: %02x %d\n", val, nbits); /* @@ -248,15 +261,15 @@ static int store_bits_MSB(cram_block *block, unsigned int val, int nbits) { */ unsigned int mask; - if (block->byte+4 >= block->alloc) { + if (block->byte+8 >= block->alloc) { if (block->byte) { block->alloc *= 2; - block->data = realloc(block->data, block->alloc + 4); + block->data = realloc(block->data, block->alloc + 8); if (!block->data) return -1; } else { block->alloc = 1024; - block->data = realloc(block->data, block->alloc + 4); + block->data = realloc(block->data, block->alloc + 8); if (!block->data) return -1; block->data[0] = 0; // initialise first byte of buffer @@ -314,7 +327,6 @@ static char *cram_extract_block(cram_block *b, int size) { */ int cram_external_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - int l; char *cp; cram_block *b; @@ -325,16 +337,36 @@ int cram_external_decode_int(cram_slice *slice, cram_codec *c, cp = (char *)b->data + b->idx; // E_INT and 
E_LONG are guaranteed single item queries - l = safe_itf8_get(cp, (char *)b->data + b->uncomp_size, (int32_t *)out); - b->idx += l; + int err = 0; + *(int32_t *)out = c->vv->varint_get32(&cp, (char *)b->data + b->uncomp_size, &err); + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? -1 : 0; +} + +int cram_external_decode_sint(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the external block */ + b = cram_get_block_by_id(slice, c->u.external.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int32_t *)out = c->vv->varint_get32s(&cp, (char *)b->data + b->uncomp_size, &err); + b->idx = cp - (char *)b->data; *out_size = 1; - return l > 0 ? 0 : -1; + return err ? -1 : 0; } int cram_external_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - int64_t l; char *cp; cram_block *b; @@ -345,11 +377,32 @@ int cram_external_decode_long(cram_slice *slice, cram_codec *c, cp = (char *)b->data + b->idx; // E_INT and E_LONG are guaranteed single item queries - l = safe_ltf8_get(cp, (char *)b->data + b->uncomp_size, (int64_t *)out); - b->idx += l; + int err = 0; + *(int64_t *)out = c->vv->varint_get64(&cp, (char *)b->data + b->uncomp_size, &err); + b->idx = cp - (char *)b->data; *out_size = 1; - return l > 0 ? 0 : -1; + return err ? 
-1 : 0; +} + +int cram_external_decode_slong(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the external block */ + b = cram_get_block_by_id(slice, c->u.external.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int64_t *)out = c->vv->varint_get64s(&cp, (char *)b->data + b->uncomp_size, &err); + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? -1 : 0; } int cram_external_decode_char(cram_slice *slice, cram_codec *c, @@ -400,9 +453,26 @@ void cram_external_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_external_decode_init(char *data, int size, + +int cram_external_decode_size(cram_slice *slice, cram_codec *c) { + cram_block *b; + + /* Find the external block */ + b = cram_get_block_by_id(slice, c->u.external.content_id); + if (!b) + return -1; + + return b->uncomp_size; +} + +cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) { + return cram_get_block_by_id(slice, c->u.external.content_id); +} + +cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c = NULL; char *cp = data; @@ -415,15 +485,21 @@ cram_codec *cram_external_decode_init(char *data, int size, c->codec = E_EXTERNAL; if (option == E_INT) c->decode = cram_external_decode_int; + else if (option == E_SINT) + c->decode = cram_external_decode_sint; else if (option == E_LONG) c->decode = cram_external_decode_long; + else if (option == E_SLONG) + c->decode = cram_external_decode_slong; else if (option == E_BYTE_ARRAY || option == E_BYTE) c->decode = cram_external_decode_char; else c->decode = cram_external_decode_block; c->free = cram_external_decode_free; + c->size = cram_external_decode_size; + c->get_block = cram_external_get_block; - cp += 
safe_itf8_get(cp, data + size, &c->u.external.content_id); + c->u.external.content_id = vv->varint_get32(&cp, data+size, NULL); if (cp - data != size) goto malformed; @@ -441,16 +517,25 @@ cram_codec *cram_external_decode_init(char *data, int size, int cram_external_encode_int(cram_slice *slice, cram_codec *c, char *in, int in_size) { uint32_t *i32 = (uint32_t *)in; + return c->vv->varint_put32_blk(c->out, *i32) >= 0 ? 0 : -1; +} - return itf8_put_blk(c->out, *i32) >= 0 ? 0 : -1; +int cram_external_encode_sint(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int32_t *i32 = (int32_t *)in; + return c->vv->varint_put32s_blk(c->out, *i32) >= 0 ? 0 : -1; } int cram_external_encode_long(cram_slice *slice, cram_codec *c, char *in, int in_size) { uint64_t *i64 = (uint64_t *)in; + return c->vv->varint_put64_blk(c->out, *i64) >= 0 ? 0 : -1; +} - ltf8_put_blk(c->out, *i64); - return 0; +int cram_external_encode_slong(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *i64 = (int64_t *)in; + return c->vv->varint_put64s_blk(c->out, *i64) >= 0 ? 
0 : -1; } int cram_external_encode_char(cram_slice *slice, cram_codec *c, @@ -470,7 +555,7 @@ void cram_external_encode_free(cram_codec *c) { int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, int version) { - char tmp[99], *tp = tmp; + char tmp[99], *tp = tmp, *tpend = tmp+99; int len = 0, r = 0, n; if (prefix) { @@ -479,9 +564,9 @@ int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, len += l; } - tp += itf8_put(tp, c->u.e_external.content_id); - len += (n = itf8_put_blk(b, c->codec)); r |= n; - len += (n = itf8_put_blk(b, tp-tmp)); r |= n; + tp += c->vv->varint_put32(tp, tpend, c->u.e_external.content_id); + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + len += (n = c->vv->varint_put32_blk(b, tp-tmp)); r |= n; BLOCK_APPEND(b, tmp, tp-tmp); len += tp-tmp; @@ -495,7 +580,7 @@ int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, cram_codec *cram_external_encode_init(cram_stats *st, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { cram_codec *c; c = malloc(sizeof(*c)); @@ -505,13 +590,18 @@ cram_codec *cram_external_encode_init(cram_stats *st, c->free = cram_external_encode_free; if (option == E_INT) c->encode = cram_external_encode_int; + else if (option == E_SINT) + c->encode = cram_external_encode_sint; else if (option == E_LONG) c->encode = cram_external_encode_long; + else if (option == E_SLONG) + c->encode = cram_external_encode_slong; else if (option == E_BYTE_ARRAY || option == E_BYTE) c->encode = cram_external_encode_char; else abort(); c->store = cram_external_encode_store; + c->flush = NULL; c->u.e_external.content_id = (size_t)dat; @@ -526,107 +616,1194 @@ int cram_beta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char int64_t *out_i = (int64_t *)out; int i, n = *out_size; - if (c->u.beta.nbits) { - if (cram_not_enough_bits(in, c->u.beta.nbits * n)) - return -1; + if (c->u.beta.nbits) { + if 
(cram_not_enough_bits(in, c->u.beta.nbits * n)) + return -1; + + for (i = 0; i < n; i++) + out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; + } else { + for (i = 0; i < n; i++) + out_i[i] = -c->u.beta.offset; + } + + return 0; +} + +int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int32_t *out_i = (int32_t *)out; + int i, n = *out_size; + + if (c->u.beta.nbits) { + if (cram_not_enough_bits(in, c->u.beta.nbits * n)) + return -1; + + for (i = 0; i < n; i++) + out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; + } else { + for (i = 0; i < n; i++) + out_i[i] = -c->u.beta.offset; + } + + return 0; +} + +int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int i, n = *out_size; + + + if (c->u.beta.nbits) { + if (cram_not_enough_bits(in, c->u.beta.nbits * n)) + return -1; + + if (out) + for (i = 0; i < n; i++) + out[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; + else + for (i = 0; i < n; i++) + get_bits_MSB(in, c->u.beta.nbits); + } else { + if (out) + for (i = 0; i < n; i++) + out[i] = -c->u.beta.offset; + } + + return 0; +} + +void cram_beta_decode_free(cram_codec *c) { + if (c) + free(c); +} + +cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_BETA; + if (option == E_INT || option == E_SINT) + c->decode = cram_beta_decode_int; + else if (option == E_LONG || option == E_SLONG) + c->decode = cram_beta_decode_long; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode = cram_beta_decode_char; + else { + hts_log_error("BYTE_ARRAYs not supported by this codec"); + free(c); + return NULL; + } + c->free = cram_beta_decode_free; + + c->u.beta.nbits = -1; + c->u.beta.offset = vv->varint_get32(&cp, data + 
size, NULL); + if (cp < data + size) // Ensure test below works + c->u.beta.nbits = vv->varint_get32(&cp, data + size, NULL); + + if (cp - data != size + || c->u.beta.nbits < 0 || c->u.beta.nbits > 8 * sizeof(int)) { + hts_log_error("Malformed beta header stream"); + free(c); + return NULL; + } + + return c; +} + +int cram_beta_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { + int len = 0, r = 0, n; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + // codec length + len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_beta.offset) + + c->vv->varint_size(c->u.e_beta.nbits))); + r |= n; + len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.offset)); r |= n; + len += (n = c->vv->varint_put32_blk(b, c->u.e_beta.nbits)); r |= n; + + if (r > 0) return len; + + block_err: + return -1; +} + +int cram_beta_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *syms = (int64_t *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, + c->u.e_beta.nbits); + + return r; +} + +int cram_beta_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int *syms = (int *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, + c->u.e_beta.nbits); + + return r; +} + +int cram_beta_encode_char(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + unsigned char *syms = (unsigned char *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, + c->u.e_beta.nbits); + + return r; +} + +void cram_beta_encode_free(cram_codec *c) { + if (c) free(c); +} + +cram_codec *cram_beta_encode_init(cram_stats *st, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + int min_val, max_val, 
len = 0; + int64_t range; + + c = malloc(sizeof(*c)); + if (!c) + return NULL; + c->codec = E_BETA; + c->free = cram_beta_encode_free; + if (option == E_INT || option == E_SINT) + c->encode = cram_beta_encode_int; + else if (option == E_LONG || option == E_SLONG) + c->encode = cram_beta_encode_long; + else + c->encode = cram_beta_encode_char; + c->store = cram_beta_encode_store; + c->flush = NULL; + + if (dat) { + min_val = ((int *)dat)[0]; + max_val = ((int *)dat)[1]; + } else { + min_val = INT_MAX; + max_val = INT_MIN; + int i; + for (i = 0; i < MAX_STAT_VAL; i++) { + if (!st->freqs[i]) + continue; + if (min_val > i) + min_val = i; + max_val = i; + } + if (st->h) { + khint_t k; + + for (k = kh_begin(st->h); k != kh_end(st->h); k++) { + if (!kh_exist(st->h, k)) + continue; + + i = kh_key(st->h, k); + if (min_val > i) + min_val = i; + if (max_val < i) + max_val = i; + } + } + } + + assert(max_val >= min_val); + c->u.e_beta.offset = -min_val; + range = (int64_t) max_val - min_val; + while (range) { + len++; + range >>= 1; + } + c->u.e_beta.nbits = len; + + return c; +} + +/* + * --------------------------------------------------------------------------- + * XPACK: Packing multiple values into a single byte. A fast transform that + * reduces time taken by entropy encoder and may also improve compression. + * + * This also has the additional requirement that the data series is not + * interleaved with another, permitting efficient encoding and decoding + * of all elements enmasse instead of needing to only extract the bits + * necessary per item. 
+ */ +int cram_xpack_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n = *out_size; + + if (c->u.xpack.nbits) { + for (i = 0; i < n; i++) + out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)]; + } else { + for (i = 0; i < n; i++) + out_i[i] = c->u.xpack.rmap[0]; + } + + return 0; +} + +int cram_xpack_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int32_t *out_i = (int32_t *)out; + int i, n = *out_size; + + if (c->u.xpack.nbits) { + if (cram_not_enough_bits(in, c->u.xpack.nbits * n)) + return -1; + + for (i = 0; i < n; i++) + out_i[i] = c->u.xpack.rmap[get_bits_MSB(in, c->u.xpack.nbits)]; + } else { + for (i = 0; i < n; i++) + out_i[i] = c->u.xpack.rmap[0]; + } + + return 0; +} + +static int cram_xpack_decode_expand_char(cram_slice *slice, cram_codec *c) { + cram_block *b = slice->block_by_id[512 + c->codec_id]; + if (b) + return 0; + + // get sub-codec data. + cram_block *sub_b = c->u.xpack.sub_codec->get_block(slice, c->u.xpack.sub_codec); + if (!sub_b) + return -1; + + // Allocate local block to expand into + b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0); + if (!b) + return -1; + int n = sub_b->uncomp_size * 8/c->u.xpack.nbits; + BLOCK_GROW(b, n); + b->uncomp_size = n; + + uint8_t p[256]; + int z; + for (z = 0; z < 256; z++) + p[z] = c->u.xpack.rmap[z]; + hts_unpack(sub_b->data, sub_b->uncomp_size, b->data, b->uncomp_size, + 8 / c->u.xpack.nbits, p); + + return 0; + + block_err: + return -1; +} + +int cram_xpack_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + // FIXME: we need to ban data-series interleaving in the spec for this to work. + + // Remember this may be called when threaded and multi-slice per container. + // Hence one cram_codec instance, multiple slices, multiple blocks. + // We therefore have to cache appropriate block info in slice and not codec. 
+ // b = cram_get_block_by_id(slice, c->external.content_id); + if (c->u.xpack.nval > 1) { + cram_xpack_decode_expand_char(slice, c); + cram_block *b = slice->block_by_id[512 + c->codec_id]; + if (!b) + return -1; + + if (out) + memcpy(out, b->data + b->byte, *out_size); + b->byte += *out_size; + } else { + memset(out, c->u.xpack.rmap[0], *out_size); + } + + return 0; +} + +void cram_xpack_decode_free(cram_codec *c) { + if (!c) return; + + if (c->u.xpack.sub_codec) + c->u.xpack.sub_codec->free(c->u.xpack.sub_codec); + + //free(slice->block_by_id[512 + c->codec_id]); + //slice->block_by_id[512 + c->codec_id] = 0; + + free(c); +} + +int cram_xpack_decode_size(cram_slice *slice, cram_codec *c) { + cram_xpack_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]->uncomp_size; +} + +cram_block *cram_xpack_get_block(cram_slice *slice, cram_codec *c) { + cram_xpack_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]; +} + +cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data; + char *endp = data+size; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_XPACK; + if (option == E_LONG) + c->decode = cram_xpack_decode_long; + else if (option == E_INT) + c->decode = cram_xpack_decode_int; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode = cram_xpack_decode_char; + else { + fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n"); + return NULL; + } + c->free = cram_xpack_decode_free; + c->size = cram_xpack_decode_size; + c->get_block = cram_xpack_get_block; + + c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL); + c->u.xpack.nval = vv->varint_get32(&cp, endp, NULL); + int i; + for (i = 0; i < c->u.xpack.nval; i++) { + uint32_t v = vv->varint_get32(&cp, endp, NULL); + if (v >= 256) return NULL; + c->u.xpack.rmap[i] = v; // reverse map: e.g 0-3 to P,A,C,K 
+ } + + int encoding = vv->varint_get32(&cp, endp, NULL); + int sub_size = vv->varint_get32(&cp, endp, NULL); + if (sub_size < 0 || endp - cp < sub_size) + goto malformed; + c->u.xpack.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size, + option, version, vv); + if (c->u.xpack.sub_codec == NULL) + goto malformed; + cp += sub_size; + + if (cp - data != size + || c->u.xpack.nbits < 0 || c->u.xpack.nbits > 8 * sizeof(int64_t)) { + malformed: + fprintf(stderr, "Malformed xpack header stream\n"); + free(c); + return NULL; + } + + return c; +} + +int cram_xpack_encode_flush(cram_codec *c) { + // Pack the buffered up data + int meta_len; + uint64_t out_len; + uint8_t out_meta[1024]; + uint8_t *out = hts_pack(BLOCK_DATA(c->out), BLOCK_SIZE(c->out), + out_meta, &meta_len, &out_len); + + // We now need to pass this through the next layer of transform + if (c->u.e_xpack.sub_codec->encode(NULL, // also indicates flush incoming + c->u.e_xpack.sub_codec, + (char *)out, out_len)) + return -1; + + int r = 0; + if (c->u.e_xpack.sub_codec->flush) + r = c->u.e_xpack.sub_codec->flush(c->u.e_xpack.sub_codec); + + free(out); + return r; +} + +int cram_xpack_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { + int len = 0, r = 0, n; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + // Store sub-codec + cram_codec *tc = c->u.e_xpack.sub_codec; + cram_block *tb = cram_new_block(0, 0); + if (!tb) + return -1; + int len2 = tc->store(tc, tb, NULL, version); + + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + + // codec length + int len1 = 0, i; + for (i = 0; i < c->u.e_xpack.nval; i++) + len1 += (n = c->vv->varint_size(c->u.e_xpack.rmap[i])), r |= n; + len += (n = c->vv->varint_put32_blk(b, c->vv->varint_size(c->u.e_xpack.nbits) + + c->vv->varint_size(c->u.e_xpack.nval) + + len1 + len2)); r |= n; + + // The map and sub-codec + len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.nbits)); r |= n; + len += 
(n = c->vv->varint_put32_blk(b, c->u.e_xpack.nval)); r |= n; + for (i = 0; i < c->u.e_xpack.nval; i++) + len += (n = c->vv->varint_put32_blk(b, c->u.e_xpack.rmap[i])), r |= n; + + BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb)); + + cram_free_block(tb); + + return r > 0 ? len + len2 : -1; + + block_err: + return -1; +} + +// Same as cram_beta_encode_long +int cram_xpack_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *syms = (int64_t *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits); + + return r; +} + +int cram_xpack_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int *syms = (int *)in; + int i, r = 0; + + for (i = 0; i < in_size; i++) + r |= store_bits_MSB(c->out, c->u.e_xpack.map[syms[i]], c->u.e_xpack.nbits); + + return r; +} + +int cram_xpack_encode_char(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + BLOCK_APPEND(c->out, in, in_size); + return 0; + + block_err: + return -1; +} + +void cram_xpack_encode_free(cram_codec *c) { + if (!c) return; + + if (c->u.e_xpack.sub_codec) + c->u.e_xpack.sub_codec->free(c->u.e_xpack.sub_codec); + + cram_free_block(c->out); + + free(c); +} + +cram_codec *cram_xpack_encode_init(cram_stats *st, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_XPACK; + c->free = cram_xpack_encode_free; + if (option == E_LONG) + c->encode = cram_xpack_encode_long; + else if (option == E_INT) + c->encode = cram_xpack_encode_int; + else + c->encode = cram_xpack_encode_char; + c->store = cram_xpack_encode_store; + c->flush = cram_xpack_encode_flush; + + cram_xpack_encoder *e = (cram_xpack_encoder *)dat; + c->u.e_xpack.nbits = e->nbits; + c->u.e_xpack.nval = e->nval; + c->u.e_xpack.sub_codec = cram_encoder_init(e->sub_encoding, NULL, + E_BYTE_ARRAY, e->sub_codec_dat, + version, vv); 
+ + // Initialise fwd and rev maps + memcpy(c->u.e_xpack.map, e->map, sizeof(e->map)); // P,A,C,K to 0,1,2,3 + int i, n; + for (i = n = 0; i < 256; i++) + if (e->map[i] != -1) + c->u.e_xpack.rmap[n++] = i; // 0,1,2,3 to P,A,C,K + if (n != e->nval) { + fprintf(stderr, "Incorrectly specified number of map items in PACK\n"); + return NULL; + } + + return c; +} + +/* + * --------------------------------------------------------------------------- + * XDELTA: subtract successive values, zig-zag to turn +/- to + only, + * and then var-int encode the result. + * + * This also has the additional requirement that the data series is not + * interleaved with another, permitting efficient encoding and decoding + * of all elements enmasse instead of needing to only extract the bits + * necessary per item. + */ + +static uint8_t zigzag8 (int8_t x) { return (x << 1) ^ (x >> 7); } +static uint16_t zigzag16(int16_t x) { return (x << 1) ^ (x >> 15); } +static uint32_t zigzag32(int32_t x) { return (x << 1) ^ (x >> 31); } + +//static int8_t unzigzag8 (uint8_t x) { return (x >> 1) ^ -(x & 1); } +static int16_t unzigzag16(uint16_t x) { return (x >> 1) ^ -(x & 1); } +static int32_t unzigzag32(uint32_t x) { return (x >> 1) ^ -(x & 1); } + +int cram_xdelta_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + return -1; +} + +int cram_xdelta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + // Slow value-by-value method for now + uint32_t *out32 = (uint32_t *)out; + int i; + for (i = 0; i < *out_size; i++) { + uint32_t v; + int one = 1; + if (c->u.e_xdelta.sub_codec->decode(slice, c->u.e_xdelta.sub_codec, in, + (char *)&v, &one) < 0) + return -1; + uint32_t d = unzigzag32(v); + c->u.xdelta.last = out32[i] = d + c->u.xdelta.last; + } + + return 0; +} + +static int cram_xdelta_decode_expand_char(cram_slice *slice, cram_codec *c) { + return -1; +} + +int cram_xdelta_decode_char(cram_slice *slice, cram_codec *c, 
cram_block *in, char *out, int *out_size) { + return -1; +} + +static inline int16_t le_int2(int16_t i) { + int16_t s; + i16_to_le(i, (uint8_t *)&s); + return s; +} + +int cram_xdelta_decode_block(cram_slice *slice, cram_codec *c, cram_block *in, + char *out_, int *out_size) { + cram_block *out = (cram_block *)out_; + cram_block *b = c->u.e_xdelta.sub_codec->get_block(slice, c->u.e_xdelta.sub_codec); + int i = 0; + + const int w = c->u.xdelta.word_size; + uint32_t npad = (w - *out_size%w)%w; + uint32_t out_sz = *out_size + npad; + c->u.xdelta.last = 0; // reset for each new array + + for (i = 0; i < out_sz; i += w) { + uint16_t v; + // Need better interface + char *cp = (char *)b->data + b->byte; + char *cp_end = (char *)b->data + b->uncomp_size; + int err = 0; + v = c->vv->varint_get32(&cp, cp_end, &err); + if (err) + return -1; + b->byte = cp - (char *)b->data; + + switch(w) { + case 2: { + int16_t d = unzigzag16(v), z; + c->u.xdelta.last = d + c->u.xdelta.last; + z = le_int2(c->u.xdelta.last); + BLOCK_APPEND(out, &z, 2-npad); + npad = 0; + break; + } + default: + fprintf(stderr, "Unsupported word size by XDELTA\n"); + return -1; + } + } + + return 0; + + block_err: + return -1; +} + +void cram_xdelta_decode_free(cram_codec *c) { + if (!c) return; + + if (c->u.xdelta.sub_codec) + c->u.xdelta.sub_codec->free(c->u.xdelta.sub_codec); + + free(c); +} + +int cram_xdelta_decode_size(cram_slice *slice, cram_codec *c) { + cram_xdelta_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]->uncomp_size; +} + +cram_block *cram_xdelta_get_block(cram_slice *slice, cram_codec *c) { + cram_xdelta_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]; +} + +cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data; + char *endp = data+size; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + 
c->codec = E_XDELTA; + if (option == E_LONG) + c->decode = cram_xdelta_decode_long; + else if (option == E_INT) + c->decode = cram_xdelta_decode_int; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode = cram_xdelta_decode_char; + else if (option == E_BYTE_ARRAY_BLOCK) { + option = E_BYTE_ARRAY; + c->decode = cram_xdelta_decode_block; + } else + return NULL; + c->free = cram_xdelta_decode_free; + c->size = cram_xdelta_decode_size; + c->get_block = cram_xdelta_get_block; + + c->u.xdelta.word_size = vv->varint_get32(&cp, endp, NULL); + c->u.xdelta.last = 0; + + int encoding = vv->varint_get32(&cp, endp, NULL); + int sub_size = vv->varint_get32(&cp, endp, NULL); + if (sub_size < 0 || endp - cp < sub_size) + goto malformed; + c->u.xdelta.sub_codec = cram_decoder_init(hdr, encoding, cp, sub_size, + option, version, vv); + if (c->u.xdelta.sub_codec == NULL) + goto malformed; + cp += sub_size; + + if (cp - data != size) { + malformed: + fprintf(stderr, "Malformed xdelta header stream\n"); + free(c); + return NULL; + } + + return c; +} + +int cram_xdelta_encode_flush(cram_codec *c) { + int r = -1; + cram_block *b = cram_new_block(0, 0); + if (!b) + return -1; + + switch (c->u.e_xdelta.word_size) { + case 2: { + // Delta + zigzag transform. + // Subtracting two 8-bit values has a 9-bit result (-255 to 255). + // However think of it as turning a wheel clockwise or anti-clockwise. + // If it has 256 gradations then a -ve rotation followed by a +ve + // rotation of the same amount reverses it regardless. + // + // Similarly the zig-zag transformation doesn't invent any extra bits, + // so the entire thing can be done in-situ. This may permit faster + // SIMD loops if we break apart the steps. 
+ + // uint16_t last = 0, d; + // for (i = 0; i < n; i++) { + // d = io[i] - last; + // last = io[i]; + // io[i] = zigzag16(vd); + // } + + // --- vs --- + + // for (i = n-1; i >= 1; i--) + // io[i] -= io[i-1]; + // for (i = 0; i < n; i++) + // io[i] = zigzag16(io[i]); + + // varint: need array variant for speed here. + // With zig-zag + int i, n = BLOCK_SIZE(c->out)/2;; + uint16_t *dat = (uint16_t *)BLOCK_DATA(c->out), last = 0; + + if (n*2 < BLOCK_SIZE(c->out)) { + // half word + last = *(uint8_t *)dat; + c->vv->varint_put32_blk(b, zigzag16(last)); + dat = (uint16_t *)(((uint8_t *)dat)+1); + } + + for (i = 0; i < n; i++) { + uint16_t d = dat[i] - last; // possibly unaligned + last = dat[i]; + c->vv->varint_put32_blk(b, zigzag16(d)); + } + + break; + } + + case 4: { + int i, n = BLOCK_SIZE(c->out)/4;; + uint32_t *dat = (uint32_t *)BLOCK_DATA(c->out), last = 0; + + for (i = 0; i < n; i++) { + uint32_t d = dat[i] - last; + last = dat[i]; + c->vv->varint_put32_blk(b, zigzag32(d)); + } + + break; + } + + case 1: { + int i, n = BLOCK_SIZE(c->out);; + uint8_t *dat = (uint8_t *)BLOCK_DATA(c->out), last = 0; + + for (i = 0; i < n; i++) { + uint32_t d = dat[i] - last; + last = dat[i]; + c->vv->varint_put32_blk(b, zigzag8(d)); + } + + break; + } + + default: + goto err; + } + + if (c->u.e_xdelta.sub_codec->encode(NULL, c->u.e_xdelta.sub_codec, + (char *)b->data, b->byte)) + goto err; + + r = 0; + + err: + cram_free_block(b); + return r; + +} + +int cram_xdelta_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { + int len = 0, r = 0, n; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + // Store sub-codec + cram_codec *tc = c->u.e_xdelta.sub_codec; + cram_block *tb = cram_new_block(0, 0); + if (!tb) + return -1; + int len2 = tc->store(tc, tb, NULL, version); + + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + + // codec length + len += (n = c->vv->varint_put32_blk(b, 
c->vv->varint_size(c->u.e_xdelta.word_size) + + len2)); r |= n; + + // This and sub-codec + len += (n = c->vv->varint_put32_blk(b, c->u.e_xdelta.word_size)); r |= n; + BLOCK_APPEND(b, BLOCK_DATA(tb), BLOCK_SIZE(tb)); + + cram_free_block(tb); + + return r > 0 ? len + len2 : -1; + + block_err: + return -1; +} + +// Same as cram_beta_encode_long +int cram_xdelta_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + return -1; +} + +int cram_xdelta_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + return -1; +} + +int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + char *dat = malloc(in_size*5), *cp = dat, *cp_end = dat + in_size*5; + if (!dat) + return -1; + + c->u.e_xdelta.last = 0; // reset for each new array + switch(c->u.e_xdelta.word_size) { + case 2: { + int i, part; + + part = in_size%2; + if (part) { + uint16_t z = in[0]; + c->u.e_xdelta.last = le_int2(z); + cp += c->vv->varint_put32(cp, cp_end, zigzag16(c->u.e_xdelta.last)); + } + + uint16_t *in16 = (uint16_t *)(in+part); + for (i = 0; i < in_size/2; i++) { + uint16_t d = le_int2(in16[i]) - c->u.e_xdelta.last; + c->u.e_xdelta.last = le_int2(in16[i]); + cp += c->vv->varint_put32(cp, cp_end, zigzag16(d)); + } + + break; + } + } + if (c->u.e_xdelta.sub_codec->encode(slice, c->u.e_xdelta.sub_codec, + (char *)dat, cp-dat)) { + free(dat); + return -1; + } + + free(dat); + return 0; +} + +void cram_xdelta_encode_free(cram_codec *c) { + if (!c) return; + + if (c->u.e_xdelta.sub_codec) + c->u.e_xdelta.sub_codec->free(c->u.e_xdelta.sub_codec); + + cram_free_block(c->out); + + free(c); +} + +cram_codec *cram_xdelta_encode_init(cram_stats *st, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = E_XDELTA; + c->free = cram_xdelta_encode_free; + if (option == E_LONG) + c->encode = cram_xdelta_encode_long; + else if (option == E_INT) 
+ c->encode = cram_xdelta_encode_int; + else + c->encode = cram_xdelta_encode_char; + c->store = cram_xdelta_encode_store; + c->flush = cram_xdelta_encode_flush; + + cram_xdelta_encoder *e = (cram_xdelta_encoder *)dat; + c->u.e_xdelta.word_size = e->word_size; + c->u.e_xdelta.last = 0; + c->u.e_xdelta.sub_codec = cram_encoder_init(e->sub_encoding, NULL, + E_BYTE_ARRAY, + e->sub_codec_dat, + version, vv); + + return c; +} + +/* + * --------------------------------------------------------------------------- + * XRLE + * + * This also has the additional requirement that the data series is not + * interleaved with another, permitting efficient encoding and decoding + * of all elements enmasse instead of needing to only extract the bits + * necessary per item. + */ +int cram_xrle_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + // TODO if and when needed + return -1; +} + +int cram_xrle_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + // TODO if and when needed + return -1; +} + +// Expands an XRLE transform and caches result in slice->block_by_id[] +static int cram_xrle_decode_expand_char(cram_slice *slice, cram_codec *c) { + cram_block *b = slice->block_by_id[512 + c->codec_id]; + if (b) + return 0; - for (i = 0; i < n; i++) - out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; - } else { - for (i = 0; i < n; i++) - out_i[i] = -c->u.beta.offset; + b = slice->block_by_id[512 + c->codec_id] = cram_new_block(0, 0); + if (!b) + return -1; + cram_block *lit_b = c->u.xrle.lit_codec->get_block(slice, c->u.xrle.lit_codec); + if (!lit_b) + return -1; + unsigned char *lit_dat = lit_b->data; + unsigned int lit_sz = lit_b->uncomp_size; + unsigned int len_sz = c->u.xrle.len_codec->size(slice, c->u.xrle.len_codec); + + cram_block *len_b = c->u.xrle.len_codec->get_block(slice, c->u.xrle.len_codec); + if (!len_b) + return -1; + unsigned char *len_dat = len_b->data; + + uint8_t 
rle_syms[256]; + int rle_nsyms = 0; + int i; + for (i = 0; i < 256; i++) { + if (c->u.xrle.rep_score[i] > 0) + rle_syms[rle_nsyms++] = i; } + uint64_t out_sz; + int nb = var_get_u64(len_dat, len_dat+len_sz, &out_sz); + if (!(b->data = malloc(out_sz))) + return -1; + rle_decode(lit_dat, lit_sz, + len_dat+nb, len_sz-nb, + rle_syms, rle_nsyms, + b->data, &out_sz); + b->uncomp_size = out_sz; + return 0; } -int cram_beta_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - int32_t *out_i = (int32_t *)out; - int i, n = *out_size; +int cram_xrle_decode_size(cram_slice *slice, cram_codec *c) { + cram_xrle_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]->uncomp_size; +} - if (c->u.beta.nbits) { - if (cram_not_enough_bits(in, c->u.beta.nbits * n)) - return -1; +cram_block *cram_xrle_get_block(cram_slice *slice, cram_codec *c) { + cram_xrle_decode_expand_char(slice, c); + return slice->block_by_id[512 + c->codec_id]; +} - for (i = 0; i < n; i++) - out_i[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; - } else { - for (i = 0; i < n; i++) - out_i[i] = -c->u.beta.offset; - } +int cram_xrle_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { + int n = *out_size; + + cram_xrle_decode_expand_char(slice, c); + cram_block *b = slice->block_by_id[512 + c->codec_id]; + memcpy(out, b->data + b->idx, n); + b->idx += n; return 0; -} -int cram_beta_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { - int i, n = *out_size; + // Old code when not cached + while (n > 0) { + if (c->u.xrle.cur_len == 0) { + unsigned char lit; + int one = 1; + if (c->u.xrle.lit_codec->decode(slice, c->u.xrle.lit_codec, in, + (char *)&lit, &one) < 0) + return -1; + c->u.xrle.cur_lit = lit; + if (c->u.xrle.rep_score[lit] > 0) { + if (c->u.xrle.len_codec->decode(slice, c->u.xrle.len_codec, in, + (char *)&c->u.xrle.cur_len, &one) < 0) + return -1; + } // else 
cur_len still zero + //else fprintf(stderr, "%d\n", lit); - if (c->u.beta.nbits) { - if (cram_not_enough_bits(in, c->u.beta.nbits * n)) - return -1; + c->u.xrle.cur_len++; + } - if (out) - for (i = 0; i < n; i++) - out[i] = get_bits_MSB(in, c->u.beta.nbits) - c->u.beta.offset; - else - for (i = 0; i < n; i++) - get_bits_MSB(in, c->u.beta.nbits); - } else { - if (out) - for (i = 0; i < n; i++) - out[i] = -c->u.beta.offset; + if (n >= c->u.xrle.cur_len) { + memset(out, c->u.xrle.cur_lit, c->u.xrle.cur_len); + out += c->u.xrle.cur_len; + n -= c->u.xrle.cur_len; + c->u.xrle.cur_len = 0; + } else { + memset(out, c->u.xrle.cur_lit, n); + out += n; + c->u.xrle.cur_len -= n; + n = 0; + } } return 0; } -void cram_beta_decode_free(cram_codec *c) { - if (c) - free(c); +void cram_xrle_decode_free(cram_codec *c) { + if (!c) return; + + if (c->u.xrle.len_codec) + c->u.xrle.len_codec->free(c->u.xrle.len_codec); + + if (c->u.xrle.lit_codec) + c->u.xrle.lit_codec->free(c->u.xrle.lit_codec); + + free(c); } -cram_codec *cram_beta_decode_init(char *data, int size, +cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c; char *cp = data; + char *endp = data+size; + int err = 0; if (!(c = malloc(sizeof(*c)))) return NULL; - c->codec = E_BETA; - if (option == E_INT) - c->decode = cram_beta_decode_int; - else if (option == E_LONG) - c->decode = cram_beta_decode_long; + c->codec = E_XRLE; + if (option == E_LONG) + c->decode = cram_xrle_decode_long; + else if (option == E_INT) + c->decode = cram_xrle_decode_int; else if (option == E_BYTE_ARRAY || option == E_BYTE) - c->decode = cram_beta_decode_char; + c->decode = cram_xrle_decode_char; else { - hts_log_error("BYTE_ARRAYs not supported by this codec"); - free(c); + fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n"); return NULL; } - c->free = cram_beta_decode_free; - - c->u.beta.nbits = -1; - cp += 
safe_itf8_get(cp, data + size, &c->u.beta.offset); - if (cp < data + size) // Ensure test below works - cp += safe_itf8_get(cp, data + size, &c->u.beta.nbits); + c->free = cram_xrle_decode_free; + c->size = cram_xrle_decode_size; + c->get_block = cram_xrle_get_block; + c->u.xrle.cur_len = 0; + c->u.xrle.cur_lit = -1; - if (cp - data != size - || c->u.beta.nbits < 0 || c->u.beta.nbits > 8 * sizeof(int)) { - hts_log_error("Malformed beta header stream"); - free(c); - return NULL; + // RLE map + int i, j, nrle = vv->varint_get32(&cp, endp, &err); + memset(c->u.xrle.rep_score, 0, 256*sizeof(*c->u.xrle.rep_score)); + for (i = 0; i < nrle && i < 256; i++) { + j = vv->varint_get32(&cp, endp, &err); + if (j >= 0 && j < 256) + c->u.xrle.rep_score[j] = 1; } + // Length and literal sub encodings + c->u.xrle.len_encoding = vv->varint_get32(&cp, endp, &err); + int sub_size = vv->varint_get32(&cp, endp, &err); + if (sub_size < 0 || endp - cp < sub_size) + goto malformed; + c->u.xrle.len_codec = cram_decoder_init(hdr, c->u.xrle.len_encoding, + cp, sub_size, E_INT, version, vv); + if (c->u.xrle.len_codec == NULL) + goto malformed; + cp += sub_size; + + c->u.xrle.lit_encoding = vv->varint_get32(&cp, endp, &err); + sub_size = vv->varint_get32(&cp, endp, &err); + if (sub_size < 0 || endp - cp < sub_size) + goto malformed; + c->u.xrle.lit_codec = cram_decoder_init(hdr, c->u.xrle.lit_encoding, + cp, sub_size, option, version, vv); + if (c->u.xrle.lit_codec == NULL) + goto malformed; + cp += sub_size; + + if (err) + goto malformed; + return c; + + malformed: + fprintf(stderr, "Malformed xrle header stream\n"); + free(c); + return NULL; } -int cram_beta_encode_store(cram_codec *c, cram_block *b, - char *prefix, int version) { +int cram_xrle_encode_flush(cram_codec *c) { + uint8_t *out_lit, *out_len; + uint64_t out_lit_size, out_len_size; + uint8_t rle_syms[256]; + int rle_nsyms = 0, i; + + for (i = 0; i < 256; i++) + if (c->u.e_xrle.rep_score[i] > 0) + rle_syms[rle_nsyms++] = i; + + if 
(!c->u.e_xrle.to_flush) { + c->u.e_xrle.to_flush = (char *)BLOCK_DATA(c->out); + c->u.e_xrle.to_flush_size = BLOCK_SIZE(c->out); + } + + out_len = malloc(c->u.e_xrle.to_flush_size+8); + if (!out_len) + return -1; + + int nb = var_put_u64(out_len, NULL, c->u.e_xrle.to_flush_size); + + out_lit = rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size, + out_len+nb, &out_len_size, + rle_syms, &rle_nsyms, + NULL, &out_lit_size); + out_len_size += nb; + + + // TODO: can maybe "gift" the sub codec the data block, to remove + // one level of memcpy. + if (c->u.e_xrle.len_codec->encode(NULL, + c->u.e_xrle.len_codec, + (char *)out_len, out_len_size)) + return -1; + + if (c->u.e_xrle.lit_codec->encode(NULL, + c->u.e_xrle.lit_codec, + (char *)out_lit, out_lit_size)) + return -1; + + free(out_len); + free(out_lit); + + return 0; +} + +int cram_xrle_encode_store(cram_codec *c, cram_block *b, + char *prefix, int version) { int len = 0, r = 0, n; + cram_codec *tc; + cram_block *b_rle, *b_len, *b_lit; if (prefix) { size_t l = strlen(prefix); @@ -634,118 +1811,134 @@ int cram_beta_encode_store(cram_codec *c, cram_block *b, len += l; } - len += (n = itf8_put_blk(b, c->codec)); r |= n; - len += (n = itf8_put_blk(b, itf8_size(c->u.e_beta.offset) - + itf8_size(c->u.e_beta.nbits))); // codec length - r |= n; - len += (n = itf8_put_blk(b, c->u.e_beta.offset)); r |= n; - len += (n = itf8_put_blk(b, c->u.e_beta.nbits)); r |= n; + // List of symbols to RLE + b_rle = cram_new_block(0, 0); + if (!b_rle) + return -1; + int i, nrle = 0, len1 = 0; + for (i = 0; i < 256; i++) { + if (c->u.e_xrle.rep_score[i] > 0) { + nrle++; + len1 += (n = c->vv->varint_put32_blk(b_rle,i)); r |= n; + } + } - if (r > 0) return len; + // Store length and literal sub-codecs to get encoded length + tc = c->u.e_xrle.len_codec; + b_len = cram_new_block(0, 0); + if (!b_len) + return -1; + int len2 = tc->store(tc, b_len, NULL, version); + + tc = c->u.e_xrle.lit_codec; + b_lit = cram_new_block(0, 0); + if 
(!b_lit) + return -1; + int len3 = tc->store(tc, b_lit, NULL, version); + + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + len += (n = c->vv->varint_put32_blk(b, len1 + len2 + len3 + + c->vv->varint_size(nrle))); r |= n; + len += (n = c->vv->varint_put32_blk(b, nrle)); r |= n; + BLOCK_APPEND(b, BLOCK_DATA(b_rle), BLOCK_SIZE(b_rle)); + BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len)); + BLOCK_APPEND(b, BLOCK_DATA(b_lit), BLOCK_SIZE(b_lit)); + + cram_free_block(b_rle); + cram_free_block(b_len); + cram_free_block(b_lit); + + if (r > 0) + return len + len1 + len2 + len3; block_err: return -1; } -int cram_beta_encode_long(cram_slice *slice, cram_codec *c, - char *in, int in_size) { - int64_t *syms = (int64_t *)in; - int i, r = 0; - - for (i = 0; i < in_size; i++) - r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, - c->u.e_beta.nbits); +int cram_xrle_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + // TODO if and when needed + return -1; +} - return r; +int cram_xrle_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + // TODO if and when needed + return -1; } -int cram_beta_encode_int(cram_slice *slice, cram_codec *c, - char *in, int in_size) { - int *syms = (int *)in; - int i, r = 0; +int cram_xrle_encode_char(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + if (c->u.e_xrle.to_flush) { + if (!c->out && !(c->out = cram_new_block(0, 0))) + return -1; + BLOCK_APPEND(c->out, c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size); + c->u.e_xrle.to_flush = NULL; + c->u.e_xrle.to_flush_size = 0; + } - for (i = 0; i < in_size; i++) - r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, - c->u.e_beta.nbits); + if (c->out && BLOCK_SIZE(c->out) > 0) { + // Gathering data + BLOCK_APPEND(c->out, in, in_size); + return 0; + } - return r; + // else cache copy of the data we're about to send to flush instead. 
+ c->u.e_xrle.to_flush = in; + c->u.e_xrle.to_flush_size = in_size; + return 0; + + block_err: + return -1; } -int cram_beta_encode_char(cram_slice *slice, cram_codec *c, - char *in, int in_size) { - unsigned char *syms = (unsigned char *)in; - int i, r = 0; +void cram_xrle_encode_free(cram_codec *c) { + if (!c) return; - for (i = 0; i < in_size; i++) - r |= store_bits_MSB(c->out, syms[i] + c->u.e_beta.offset, - c->u.e_beta.nbits); + if (c->u.e_xrle.len_codec) + c->u.e_xrle.len_codec->free(c->u.e_xrle.len_codec); + if (c->u.e_xrle.lit_codec) + c->u.e_xrle.lit_codec->free(c->u.e_xrle.lit_codec); - return r; -} + cram_free_block(c->out); -void cram_beta_encode_free(cram_codec *c) { - if (c) free(c); + free(c); } -cram_codec *cram_beta_encode_init(cram_stats *st, - enum cram_external_type option, - void *dat, - int version) { +cram_codec *cram_xrle_encode_init(cram_stats *st, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { cram_codec *c; - int min_val, max_val, len = 0; - int64_t range; - c = malloc(sizeof(*c)); - if (!c) + if (!(c = malloc(sizeof(*c)))) return NULL; - c->codec = E_BETA; - c->free = cram_beta_encode_free; - if (option == E_INT) - c->encode = cram_beta_encode_int; - else if (option == E_LONG) - c->encode = cram_beta_encode_long; - else - c->encode = cram_beta_encode_char; - c->store = cram_beta_encode_store; - if (dat) { - min_val = ((int *)dat)[0]; - max_val = ((int *)dat)[1]; - } else { - min_val = INT_MAX; - max_val = INT_MIN; - int i; - for (i = 0; i < MAX_STAT_VAL; i++) { - if (!st->freqs[i]) - continue; - if (min_val > i) - min_val = i; - max_val = i; - } - if (st->h) { - khint_t k; + c->codec = E_XRLE; + c->free = cram_xrle_encode_free; + if (option == E_LONG) + c->encode = cram_xrle_encode_long; + else if (option == E_INT) + c->encode = cram_xrle_encode_int; + else + c->encode = cram_xrle_encode_char; + c->store = cram_xrle_encode_store; + c->flush = cram_xrle_encode_flush; - for (k = kh_begin(st->h); k != 
kh_end(st->h); k++) { - if (!kh_exist(st->h, k)) - continue; + cram_xrle_encoder *e = (cram_xrle_encoder *)dat; - i = kh_key(st->h, k); - if (min_val > i) - min_val = i; - if (max_val < i) - max_val = i; - } - } - } + c->u.e_xrle.len_codec = cram_encoder_init(e->len_encoding, NULL, + E_BYTE, e->len_dat, + version, vv); + c->u.e_xrle.lit_codec = cram_encoder_init(e->lit_encoding, NULL, + E_BYTE, e->lit_dat, + version, vv); + c->u.e_xrle.cur_lit = -1; + c->u.e_xrle.cur_len = -1; + c->u.e_xrle.to_flush = NULL; + c->u.e_xrle.to_flush_size = 0; - assert(max_val >= min_val); - c->u.e_beta.offset = -min_val; - range = (int64_t) max_val - min_val; - while (range) { - len++; - range >>= 1; - } - c->u.e_beta.nbits = len; + memcpy(c->u.e_xrle.rep_score, e->rep_score, 256*sizeof(*c->u.e_xrle.rep_score)); return c; } @@ -803,9 +1996,10 @@ void cram_subexp_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_subexp_decode_init(char *data, int size, +cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c; char *cp = data; @@ -822,8 +2016,8 @@ cram_codec *cram_subexp_decode_init(char *data, int size, c->free = cram_subexp_decode_free; c->u.subexp.k = -1; - cp += safe_itf8_get(cp, data + size, &c->u.subexp.offset); - cp += safe_itf8_get(cp, data + size, &c->u.subexp.k); + c->u.subexp.offset = vv->varint_get32(&cp, data + size, NULL); + c->u.subexp.k = vv->varint_get32(&cp, data + size, NULL); if (cp - data != size || c->u.subexp.k < 0) { hts_log_error("Malformed subexp header stream"); @@ -867,9 +2061,10 @@ void cram_gamma_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_gamma_decode_init(char *data, int size, +cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c = NULL; char *cp = data; @@ -888,7 
+2083,7 @@ cram_codec *cram_gamma_decode_init(char *data, int size, c->decode = cram_gamma_decode; c->free = cram_gamma_decode_free; - cp += safe_itf8_get(cp, data + size, &c->u.gamma.offset); + c->u.gamma.offset = vv->varint_get32(&cp, data+size, NULL); if (cp - data != size) goto malformed; @@ -1082,9 +2277,10 @@ int cram_huffman_decode_long(cram_slice *slice, cram_codec *c, /* * Initialises a huffman decoder from an encoding data stream. */ -cram_codec *cram_huffman_decode_init(char *data, int size, +cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { int32_t ncodes = 0, i, j; char *cp = data, *data_end = &data[size]; cram_codec *h; @@ -1092,14 +2288,14 @@ cram_codec *cram_huffman_decode_init(char *data, int size, int32_t val, last_len, max_len = 0; uint32_t max_val; // needs one more bit than val const int max_code_bits = sizeof(val) * 8 - 1; - int l; + int err = 0; if (option == E_BYTE_ARRAY_BLOCK) { hts_log_error("BYTE_ARRAYs not supported by this codec"); return NULL; } - cp += safe_itf8_get(cp, data_end, &ncodes); + ncodes = vv->varint_get32(&cp, data_end, &err); if (ncodes < 0) { hts_log_error("Invalid number of symbols in huffman stream"); return NULL; @@ -1129,21 +2325,26 @@ cram_codec *cram_huffman_decode_init(char *data, int size, /* Read symbols and bit-lengths */ if (option == E_LONG) { - for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) { - l = safe_ltf8_get(cp, data_end, &codes[i].symbol); - } + for (i = 0; i < ncodes; i++) + codes[i].symbol = vv->varint_get64(&cp, data_end, &err); + } else if (option == E_SLONG) { + for (i = 0; i < ncodes; i++) + codes[i].symbol = vv->varint_get64s(&cp, data_end, &err); + } else if (option == E_INT || option == E_BYTE) { + for (i = 0; i < ncodes; i++) + codes[i].symbol = vv->varint_get32(&cp, data_end, &err); + } else if (option == E_SINT) { + for (i = 0; i < ncodes; i++) + 
codes[i].symbol = vv->varint_get32s(&cp, data_end, &err); } else { - for (i = 0, l = 1; i < ncodes && l > 0; i++, cp += l) { - int32_t i32; - l = safe_itf8_get(cp, data_end, &i32); - codes[i].symbol = i32; - } + free(h); + return NULL; } - if (l < 1) + if (err) goto malformed; - cp += safe_itf8_get(cp, data_end, &i); + i = vv->varint_get32(&cp, data_end, &err); if (i != ncodes) goto malformed; @@ -1154,9 +2355,9 @@ cram_codec *cram_huffman_decode_init(char *data, int size, return h; } - for (i = 0, l = 1; i < ncodes; i++, cp += l) { - l = safe_itf8_get(cp, data_end, &codes[i].len); - if (l < 1) + for (i = 0; i < ncodes; i++) { + codes[i].len = vv->varint_get32(&cp, data_end, &err); + if (err) break; if (codes[i].len < 0) { hts_log_error("Huffman code length (%d) is negative", codes[i].len); @@ -1165,7 +2366,7 @@ cram_codec *cram_huffman_decode_init(char *data, int size, if (max_len < codes[i].len) max_len = codes[i].len; } - if (l < 1 || cp - data != size || max_len >= ncodes) + if (err || cp - data != size || max_len >= ncodes) goto malformed; /* 31 is max. 
bits available in val */ @@ -1229,12 +2430,12 @@ cram_codec *cram_huffman_decode_init(char *data, int size, h->decode = cram_huffman_decode_char0; else h->decode = cram_huffman_decode_char; - } else if (option == E_LONG) { + } else if (option == E_LONG || option == E_SLONG) { if (h->u.huffman.codes[0].len == 0) h->decode = cram_huffman_decode_long0; else h->decode = cram_huffman_decode_long; - } else if (option == E_INT) { + } else if (option == E_INT || option == E_SINT || option == E_BYTE) { if (h->u.huffman.codes[0].len == 0) h->decode = cram_huffman_decode_int0; else @@ -1389,7 +2590,7 @@ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, * Therefore 6*ncodes + 5 + 5 + 1 + 5 is max memory */ char *tmp = malloc(6*c->u.e_huffman.nvals+16); - char *tp = tmp; + char *tp = tmp, *tpend = tmp+6*c->u.e_huffman.nvals+16; if (!tmp) return -1; @@ -1400,24 +2601,33 @@ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, len += l; } - tp += itf8_put(tp, c->u.e_huffman.nvals); + tp += c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals); if (c->u.e_huffman.option == E_LONG) { for (i = 0; i < c->u.e_huffman.nvals; i++) { - tp += ltf8_put(tp, codes[i].symbol); + tp += c->vv->varint_put64(tp, tpend, codes[i].symbol); } - } else { + } else if (c->u.e_huffman.option == E_SLONG) { + for (i = 0; i < c->u.e_huffman.nvals; i++) { + tp += c->vv->varint_put64s(tp, tpend, codes[i].symbol); + } + } else if (c->u.e_huffman.option == E_INT || c->u.e_huffman.option == E_BYTE) { + for (i = 0; i < c->u.e_huffman.nvals; i++) { + tp += c->vv->varint_put32(tp, tpend, codes[i].symbol); + } + } else if (c->u.e_huffman.option == E_SINT) { for (i = 0; i < c->u.e_huffman.nvals; i++) { - tp += itf8_put(tp, codes[i].symbol); + tp += c->vv->varint_put32s(tp, tpend, codes[i].symbol); } + } else { + return -1; } - tp += itf8_put(tp, c->u.e_huffman.nvals); - for (i = 0; i < c->u.e_huffman.nvals; i++) { - tp += itf8_put(tp, codes[i].len); - } + tp += 
c->vv->varint_put32(tp, tpend, c->u.e_huffman.nvals); + for (i = 0; i < c->u.e_huffman.nvals; i++) + tp += c->vv->varint_put32(tp, tpend, codes[i].len); - len += (n = itf8_put_blk(b, c->codec)); r |= n; - len += (n = itf8_put_blk(b, tp-tmp)); r |= n; + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + len += (n = c->vv->varint_put32_blk(b, tp-tmp)); r |= n; BLOCK_APPEND(b, tmp, tp-tmp); len += tp-tmp; @@ -1433,7 +2643,7 @@ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, cram_codec *cram_huffman_encode_init(cram_stats *st, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { int *vals = NULL, *freqs = NULL, vals_alloc = 0, *lens = NULL, code, len; int *new_vals, *new_freqs; int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k; @@ -1594,18 +2804,21 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, c->encode = cram_huffman_encode_char0; else c->encode = cram_huffman_encode_char; - } else if (option == E_INT) { + } else if (option == E_INT || option == E_SINT) { if (c->u.e_huffman.codes[0].len == 0) c->encode = cram_huffman_encode_int0; else c->encode = cram_huffman_encode_int; - } else if (option == E_LONG) { + } else if (option == E_LONG || option == E_SLONG) { if (c->u.e_huffman.codes[0].len == 0) c->encode = cram_huffman_encode_long0; else c->encode = cram_huffman_encode_long; + } else { + return NULL; } c->store = cram_huffman_encode_store; + c->flush = NULL; return c; @@ -1658,14 +2871,13 @@ void cram_byte_array_len_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_byte_array_len_decode_init(char *data, int size, +cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c; char *cp = data; char *endp = data + size; - int32_t encoding = 0; - int32_t sub_size = -1; if (!(c = malloc(sizeof(*c)))) return NULL; @@ -1676,23 +2888,22 @@ 
cram_codec *cram_byte_array_len_decode_init(char *data, int size, c->u.byte_array_len.len_codec = NULL; c->u.byte_array_len.val_codec = NULL; - cp += safe_itf8_get(cp, endp, &encoding); - cp += safe_itf8_get(cp, endp, &sub_size); + int encoding = vv->varint_get32(&cp, endp, NULL); + int sub_size = vv->varint_get32(&cp, endp, NULL); if (sub_size < 0 || endp - cp < sub_size) goto malformed; - c->u.byte_array_len.len_codec = cram_decoder_init(encoding, cp, sub_size, - E_INT, version); + c->u.byte_array_len.len_codec = cram_decoder_init(hdr, encoding, cp, sub_size, + E_INT, version, vv); if (c->u.byte_array_len.len_codec == NULL) goto no_codec; cp += sub_size; - sub_size = -1; - cp += safe_itf8_get(cp, endp, &encoding); - cp += safe_itf8_get(cp, endp, &sub_size); + encoding = vv->varint_get32(&cp, endp, NULL); + sub_size = vv->varint_get32(&cp, endp, NULL); if (sub_size < 0 || endp - cp < sub_size) goto malformed; - c->u.byte_array_len.val_codec = cram_decoder_init(encoding, cp, sub_size, - option, version); + c->u.byte_array_len.val_codec = cram_decoder_init(hdr, encoding, cp, sub_size, + option, version, vv); if (c->u.byte_array_len.val_codec == NULL) goto no_codec; cp += sub_size; @@ -1760,8 +2971,8 @@ int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b, len3 = tc->store(tc, b_val, NULL, version); if (len3 < 0) goto block_err; - len += (n = itf8_put_blk(b, c->codec)); r |= n; - len += (n = itf8_put_blk(b, len2+len3)); r |= n; + len += (n = c->vv->varint_put32_blk(b, c->codec)); r |= n; + len += (n = c->vv->varint_put32_blk(b, len2+len3)); r |= n; BLOCK_APPEND(b, BLOCK_DATA(b_len), BLOCK_SIZE(b_len)); BLOCK_APPEND(b, BLOCK_DATA(b_val), BLOCK_SIZE(b_val)); @@ -1780,7 +2991,7 @@ int cram_byte_array_len_encode_store(cram_codec *c, cram_block *b, cram_codec *cram_byte_array_len_encode_init(cram_stats *st, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { cram_codec *c; cram_byte_array_len_encoder *e = 
(cram_byte_array_len_encoder *)dat; @@ -1791,15 +3002,16 @@ cram_codec *cram_byte_array_len_encode_init(cram_stats *st, c->free = cram_byte_array_len_encode_free; c->encode = cram_byte_array_len_encode; c->store = cram_byte_array_len_encode_store; + c->flush = NULL; c->u.e_byte_array_len.len_codec = cram_encoder_init(e->len_encoding, st, E_INT, e->len_dat, - version); + version, vv); c->u.e_byte_array_len.val_codec = cram_encoder_init(e->val_encoding, NULL, E_BYTE_ARRAY, e->val_dat, - version); + version, vv); if (!c->u.e_byte_array_len.len_codec || !c->u.e_byte_array_len.val_codec) { @@ -1896,11 +3108,13 @@ void cram_byte_array_stop_decode_free(cram_codec *c) { free(c); } -cram_codec *cram_byte_array_stop_decode_init(char *data, int size, +cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { cram_codec *c = NULL; unsigned char *cp = (unsigned char *)data; + int err = 0; if (size < (CRAM_MAJOR_VERS(version) == 1 ? 
5 : 2)) goto malformed; @@ -1929,11 +3143,10 @@ cram_codec *cram_byte_array_stop_decode_init(char *data, int size, + ((unsigned int) cp[3]<<24); cp += 4; } else { - cp += safe_itf8_get((char *) cp, data + size, - &c->u.byte_array_stop.content_id); + c->u.byte_array_stop.content_id = vv->varint_get32((char **)&cp, data+size, &err); } - if ((char *)cp - data != size) + if ((char *)cp - data != size || err) goto malformed; return c; @@ -1971,19 +3184,20 @@ int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b, len += l; } - cp += itf8_put(cp, c->codec); + cp += c->vv->varint_put32(cp, buf+20, c->codec); if (CRAM_MAJOR_VERS(version) == 1) { - cp += itf8_put(cp, 5); + cp += c->vv->varint_put32(cp, buf+20, 5); *cp++ = c->u.e_byte_array_stop.stop; *cp++ = (c->u.e_byte_array_stop.content_id >> 0) & 0xff; *cp++ = (c->u.e_byte_array_stop.content_id >> 8) & 0xff; *cp++ = (c->u.e_byte_array_stop.content_id >> 16) & 0xff; *cp++ = (c->u.e_byte_array_stop.content_id >> 24) & 0xff; } else { - cp += itf8_put(cp, 1 + itf8_size(c->u.e_byte_array_stop.content_id)); + cp += c->vv->varint_put32(cp, buf+20, 1 + + c->vv->varint_size(c->u.e_byte_array_stop.content_id)); *cp++ = c->u.e_byte_array_stop.stop; - cp += itf8_put(cp, c->u.e_byte_array_stop.content_id); + cp += c->vv->varint_put32(cp, buf+20, c->u.e_byte_array_stop.content_id); } BLOCK_APPEND(b, buf, cp-buf); @@ -1998,7 +3212,7 @@ int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b, cram_codec *cram_byte_array_stop_encode_init(cram_stats *st, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { cram_codec *c; c = malloc(sizeof(*c)); @@ -2008,6 +3222,7 @@ cram_codec *cram_byte_array_stop_encode_init(cram_stats *st, c->free = cram_byte_array_stop_encode_free; c->encode = cram_byte_array_stop_encode; c->store = cram_byte_array_stop_encode_store; + c->flush = NULL; c->u.e_byte_array_stop.stop = ((int *)dat)[0]; c->u.e_byte_array_stop.content_id = ((int *)dat)[1]; @@ 
-2036,10 +3251,11 @@ const char *cram_encoding2str(enum cram_encoding t) { } } -static cram_codec *(*decode_init[])(char *data, +static cram_codec *(*decode_init[])(cram_block_compression_hdr *hdr, + char *data, int size, enum cram_external_type option, - int version) = { + int version, varint_vec *vv) = { NULL, cram_external_decode_init, NULL, @@ -2052,12 +3268,18 @@ static cram_codec *(*decode_init[])(char *data, cram_gamma_decode_init, }; -cram_codec *cram_decoder_init(enum cram_encoding codec, +cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr, + enum cram_encoding codec, char *data, int size, enum cram_external_type option, - int version) { + int version, varint_vec *vv) { if (codec >= E_NULL && codec < E_NUM_CODECS && decode_init[codec]) { - return decode_init[codec](data, size, option, version); + cram_codec *r = decode_init[codec](hdr, data, size, option, version, vv); + if (r) { + r->vv = vv; + r->codec_id = hdr->ncodecs++; + } + return r; } else { hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec)); return NULL; @@ -2067,7 +3289,7 @@ cram_codec *cram_decoder_init(enum cram_encoding codec, static cram_codec *(*encode_init[])(cram_stats *stx, enum cram_external_type option, void *opt, - int version) = { + int version, varint_vec *vv) = { NULL, cram_external_encode_init, NULL, @@ -2084,14 +3306,19 @@ cram_codec *cram_encoder_init(enum cram_encoding codec, cram_stats *st, enum cram_external_type option, void *dat, - int version) { + int version, varint_vec *vv) { if (st && !st->nvals) return NULL; if (encode_init[codec]) { cram_codec *r; - if ((r = encode_init[codec](st, option, dat, version))) + if ((r = encode_init[codec](st, option, dat, version, vv))) r->out = NULL; + if (!r) { + hts_log_error("Unable to initialise codec of type %s", cram_encoding2str(codec)); + return NULL; + } + r->vv = vv; return r; } else { hts_log_error("Unimplemented codec of type %s", cram_encoding2str(codec)); @@ -2167,6 +3394,8 @@ int 
cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { c->encode = cram_external_encode_long; else if (c->decode == cram_external_decode_char) c->encode = cram_external_encode_char; + else if (c->decode == cram_external_decode_block) + c->encode = cram_external_encode_char; else return -1; break; @@ -2223,6 +3452,26 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { return -1; break; + case E_XPACK: { + // shares struct with decode + cram_codec t = *c; + t.free = cram_xpack_encode_free; + t.store = cram_xpack_encode_store; + if (t.decode == cram_xpack_decode_long) + t.encode = cram_xpack_encode_long; + else if (t.decode == cram_xpack_decode_int) + t.encode = cram_xpack_encode_int; + else if (t.decode == cram_xpack_decode_char) + t.encode = cram_xpack_encode_char; + else + return -1; + t.u.e_xpack.sub_codec = t.u.xpack.sub_codec; + if (cram_codec_decoder2encoder(fd, t.u.e_xpack.sub_codec) == -1) + return -1; + *c = t; + break; + } + case E_BYTE_ARRAY_LEN: { cram_codec *t = malloc(sizeof(*t)); if (!t) return -1; diff --git a/cram/cram_codecs.h b/cram/cram_codecs.h index 31a170031..850a2a92e 100644 --- a/cram/cram_codecs.h +++ b/cram/cram_codecs.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2015, 2018 Genome Research Ltd. +Copyright (c) 2012-2015, 2018, 2020 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -73,6 +73,46 @@ typedef struct { int32_t nbits; } cram_beta_decoder; +// A PACK transform, packing multiple values into a single byte +typedef struct { + int32_t nbits; + enum cram_encoding sub_encoding; + void *sub_codec_dat; + struct cram_codec *sub_codec; + int nval; // number of items in maps + uint32_t rmap[256]; // 0,1,2,3 -> P,A,C,K + int map[256]; // P,A,C,K -> 0,1,2,3 // NB: max input is uint8_tb? Or use hash? +} cram_xpack_decoder; +typedef cram_xpack_decoder cram_xpack_encoder; + +// Transforms symbols X,Y,Z to bytes 0,1,2. 
+typedef struct { + enum cram_encoding len_encoding; + enum cram_encoding lit_encoding; + void *len_dat; + void *lit_dat; + struct cram_codec *len_codec; + struct cram_codec *lit_codec; + int cur_len; + int cur_lit; + int rep_score[256]; + char *to_flush; + size_t to_flush_size; +} cram_xrle_decoder; +typedef cram_xrle_decoder cram_xrle_encoder; + +// DELTA + zigzag + varint encoding +typedef struct { + // FIXME: define endian here too. Require little endian? + int64_t last; + uint8_t word_size; // 1, 2, 4, 8 + //uint8_t sign; // true if input data is already signed + enum cram_encoding sub_encoding; + void *sub_codec_dat; + struct cram_codec *sub_codec; +} cram_xdelta_decoder; +typedef cram_xdelta_decoder cram_xdelta_encoder; + typedef struct { int32_t offset; } cram_gamma_decoder; @@ -112,6 +152,8 @@ typedef struct { typedef struct cram_codec { enum cram_encoding codec; cram_block *out; + varint_vec *vv; + int codec_id; void (*free)(struct cram_codec *codec); int (*decode)(cram_slice *slice, struct cram_codec *codec, cram_block *in, char *out, int *out_size); @@ -119,6 +161,9 @@ typedef struct cram_codec { char *in, int in_size); int (*store)(struct cram_codec *codec, cram_block *b, char *prefix, int version); + int (*size)(cram_slice *slice, struct cram_codec *codec); + int (*flush)(struct cram_codec *codec); + cram_block *(*get_block)(cram_slice *slice, struct cram_codec *codec); union { cram_huffman_decoder huffman; @@ -128,23 +173,30 @@ typedef struct cram_codec { cram_subexp_decoder subexp; cram_byte_array_len_decoder byte_array_len; cram_byte_array_stop_decoder byte_array_stop; + cram_xpack_decoder xpack; + cram_xrle_decoder xrle; + cram_xdelta_decoder xdelta; cram_huffman_encoder e_huffman; cram_external_decoder e_external; cram_byte_array_stop_decoder e_byte_array_stop; cram_byte_array_len_encoder e_byte_array_len; cram_beta_decoder e_beta; + cram_xpack_decoder e_xpack; + cram_xrle_decoder e_xrle; + cram_xdelta_decoder e_xdelta; } u; } cram_codec; const 
char *cram_encoding2str(enum cram_encoding t); -cram_codec *cram_decoder_init(enum cram_encoding codec, char *data, int size, +cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr, + enum cram_encoding codec, char *data, int size, enum cram_external_type option, - int version); + int version, varint_vec *vv); cram_codec *cram_encoder_init(enum cram_encoding codec, cram_stats *st, enum cram_external_type option, void *dat, - int version); + int version, varint_vec *vv); //int cram_decode(void *codes, char *in, int in_size, char *out, int *out_size); //void cram_decoder_free(void *codes); diff --git a/cram/cram_decode.c b/cram/cram_decode.c index ad09fb757..06e297331 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -66,12 +66,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Returns number of bytes decoded on success * -1 on failure */ -int cram_decode_TD(char *cp, const char *endp, cram_block_compression_hdr *h) { +int cram_decode_TD(cram_fd *fd, char *cp, const char *endp, + cram_block_compression_hdr *h) { char *op = cp; unsigned char *dat; cram_block *b; int32_t blk_size = 0; - int nTL, i, sz; + int nTL, i, sz, err = 0; if (!(b = cram_new_block(0, 0))) return -1; @@ -85,14 +86,14 @@ int cram_decode_TD(char *cp, const char *endp, cram_block_compression_hdr *h) { } /* Decode */ - cp += safe_itf8_get(cp, endp, &blk_size); + blk_size = fd->vv.varint_get32(&cp, endp, &err); if (!blk_size) { h->nTL = 0; cram_free_block(b); return cp - op; } - if (blk_size < 0 || endp - cp < blk_size) { + if (err || blk_size < 0 || endp - cp < blk_size) { cram_free_block(b); return -1; } @@ -143,7 +144,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, cram_block *b) { char *cp, *endp, *cp_copy; cram_block_compression_hdr *hdr = calloc(1, sizeof(*hdr)); - int i; + int i, err = 0; int32_t map_size = 0, map_count = 0; if (!hdr) @@ -160,27 +161,16 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd 
*fd, endp = cp + b->uncomp_size; if (CRAM_MAJOR_VERS(fd->version) == 1) { - int32_t i32; - cp += safe_itf8_get(cp, endp, &hdr->ref_seq_id); -/* - * LARGE_POS used in this code is purely a debugging mechanism for testing - * whether the htslib API can cope with 64-bit quantities. These are - * possible in SAM, but not *yet* in BAM or CRAM. - * - * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. - * - * At some point it is expected these ifdefs will become a version check - * instead. - */ -#ifdef LARGE_POS - cp += safe_ltf8_get(cp, endp, &hdr->ref_seq_start); - cp += safe_ltf8_get(cp, endp, &hdr->ref_seq_span); -#else - cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_start=i32; - cp += safe_itf8_get(cp, endp, &i32); hdr->ref_seq_span=i32; -#endif - cp += safe_itf8_get(cp, endp, &hdr->num_records); - cp += safe_itf8_get(cp, endp, &hdr->num_landmarks); + hdr->ref_seq_id = fd->vv.varint_get32(&cp, endp, &err); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + hdr->ref_seq_start = fd->vv.varint_get64(&cp, endp, &err); + hdr->ref_seq_span = fd->vv.varint_get64(&cp, endp, &err); + } else { + hdr->ref_seq_start = fd->vv.varint_get32(&cp, endp, &err); + hdr->ref_seq_span = fd->vv.varint_get32(&cp, endp, &err); + } + hdr->num_records = fd->vv.varint_get32(&cp, endp, &err); + hdr->num_landmarks = fd->vv.varint_get32(&cp, endp, &err); if (hdr->num_landmarks < 0 || hdr->num_landmarks >= SIZE_MAX / sizeof(int32_t) || endp - cp < hdr->num_landmarks) { @@ -191,9 +181,8 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, free(hdr); return NULL; } - for (i = 0; i < hdr->num_landmarks; i++) { - cp += safe_itf8_get(cp, endp, &hdr->landmark[i]); - } + for (i = 0; i < hdr->num_landmarks; i++) + hdr->landmark[i] = fd->vv.varint_get32(&cp, endp, &err);; } hdr->preservation_map = kh_init(map); @@ -211,11 +200,12 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, /* Initialise defaults for preservation map */ 
hdr->read_names_included = 0; hdr->AP_delta = 1; + hdr->qs_seq_orient = 1; memcpy(hdr->substitution_matrix, "CGTNAGTNACTNACGNACGT", 20); /* Preservation map */ - cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp; - cp += safe_itf8_get(cp, endp, &map_count); + map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp; + map_count = fd->vv.varint_get32(&cp, endp, &err); for (i = 0; i < map_count; i++) { pmap_t hd; khint_t k; @@ -269,6 +259,18 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, hdr->no_ref = !hd.i; break; + case CRAM_KEY('Q','O'): + hd.i = *cp++; + k = kh_put(map, hdr->preservation_map, "QO", &r); + if (-1 == r) { + cram_free_compression_header(hdr); + return NULL; + } + + kh_val(hdr->preservation_map, k) = hd; + hdr->qs_seq_orient = hd.i; + break; + case CRAM_KEY('S','M'): if (endp - cp < 5) { cram_free_compression_header(hdr); @@ -311,7 +313,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, break; case CRAM_KEY('T','D'): { - int sz = cram_decode_TD(cp, endp, hdr); // tag dictionary + int sz = cram_decode_TD(fd, cp, endp, hdr); // tag dictionary if (sz < 0) { cram_free_compression_header(hdr); return NULL; @@ -342,8 +344,9 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } /* Record encoding map */ - cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp; - cp += safe_itf8_get(cp, endp, &map_count); + map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp; + map_count = fd->vv.varint_get32(&cp, endp, &err); + int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 
1 : 0; for (i = 0; i < map_count; i++) { char *key = cp; int32_t encoding = E_NULL; @@ -359,8 +362,8 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } cp += 2; - cp += safe_itf8_get(cp, endp, &encoding); - cp += safe_itf8_get(cp, endp, &size); + encoding = fd->vv.varint_get32(&cp, endp, &err); + size = fd->vv.varint_get32(&cp, endp, &err); offset = cp - (char *)b->data; @@ -394,31 +397,20 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, ds_id = DS_RL; type = E_INT; } else if (key[0] == 'A' && key[1] == 'P') { ds_id = DS_AP; -#ifdef LARGE_POS - type = E_LONG, -#else - type = E_INT; -#endif + type = is_v4 ? E_SLONG : E_INT; } else if (key[0] == 'R' && key[1] == 'G') { - ds_id = DS_RG; type = E_INT; + ds_id = DS_RG; + type = is_v4 ? E_SINT : E_INT; } else if (key[0] == 'M' && key[1] == 'F') { ds_id = DS_MF; type = E_INT; } else if (key[0] == 'N' && key[1] == 'S') { ds_id = DS_NS; type = E_INT; } else if (key[0] == 'N' && key[1] == 'P') { ds_id = DS_NP; -#ifdef LARGE_POS - type = E_LONG, -#else - type = E_INT; -#endif + type = is_v4 ? E_LONG : E_INT; } else if (key[0] == 'T' && key[1] == 'S') { ds_id = DS_TS; -#ifdef LARGE_POS - type = E_LONG, -#else - type = E_INT; -#endif + type = is_v4 ? 
E_SLONG : E_INT; } else if (key[0] == 'N' && key[1] == 'F') { ds_id = DS_NF; type = E_INT; } else if (key[0] == 'T' && key[1] == 'C') { @@ -471,8 +463,8 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, key); hdr->codecs[ds_id]->free(hdr->codecs[ds_id]); } - hdr->codecs[ds_id] = cram_decoder_init(encoding, cp, size, - type, fd->version); + hdr->codecs[ds_id] = cram_decoder_init(hdr, encoding, cp, size, + type, fd->version, &fd->vv); if (!hdr->codecs[ds_id]) { cram_free_compression_header(hdr); return NULL; @@ -502,13 +494,13 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } /* Tag encoding map */ - cp += safe_itf8_get(cp, endp, &map_size); cp_copy = cp; - cp += safe_itf8_get(cp, endp, &map_count); + map_size = fd->vv.varint_get32(&cp, endp, &err); cp_copy = cp; + map_count = fd->vv.varint_get32(&cp, endp, &err); for (i = 0; i < map_count; i++) { int32_t encoding = E_NULL; int32_t size = 0; cram_map *m = malloc(sizeof(*m)); // FIXME: use pooled_alloc - uint8_t *key; + uint8_t key[3]; if (!m || endp - cp < 6) { free(m); @@ -516,19 +508,19 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, return NULL; } - key = (uint8_t *) cp + 1; - m->key = (key[0]<<16)|(key[1]<<8)|key[2]; - - cp += 4; // Strictly ITF8, but this suffices - cp += safe_itf8_get(cp, endp, &encoding); - cp += safe_itf8_get(cp, endp, &size); + m->key = fd->vv.varint_get32(&cp, endp, &err); + key[0] = m->key>>16; + key[1] = m->key>>8; + key[2] = m->key; + encoding = fd->vv.varint_get32(&cp, endp, &err); + size = fd->vv.varint_get32(&cp, endp, &err); m->encoding = encoding; m->size = size; m->offset = cp - (char *)b->data; if (size < 0 || endp - cp < size || - !(m->codec = cram_decoder_init(encoding, cp, size, - E_BYTE_ARRAY_BLOCK, fd->version))) { + !(m->codec = cram_decoder_init(hdr, encoding, cp, size, + E_BYTE_ARRAY_BLOCK, fd->version, &fd->vv))) { cram_free_compression_header(hdr); free(m); return NULL; @@ -539,7 +531,7 @@ 
cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, m->next = hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])]; hdr->tag_encoding_map[CRAM_MAP(key[0],key[1])] = m; } - if (cp - cp_copy != map_size) { + if (err || cp - cp_copy != map_size) { cram_free_compression_header(hdr); return NULL; } @@ -955,7 +947,7 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { cram_block_slice_hdr *hdr; unsigned char *cp; unsigned char *cp_end; - int i; + int i, err = 0; if (b->method != RAW) { /* Spec. says slice header should be RAW, but we can future-proof @@ -976,17 +968,14 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { hdr->content_type = b->content_type; if (b->content_type == MAPPED_SLICE) { - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_id); -#ifdef LARGE_POS - cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_start); - cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->ref_seq_span); -#else - int32_t i32; - cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32); - hdr->ref_seq_start = i32; - cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32); - hdr->ref_seq_span = i32; -#endif + hdr->ref_seq_id = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + hdr->ref_seq_start = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); + hdr->ref_seq_span = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); + } else { + hdr->ref_seq_start = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); + hdr->ref_seq_span = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); + } if (hdr->ref_seq_start < 0 || hdr->ref_seq_span < 0) { free(hdr); hts_log_error("Negative values not permitted for header " @@ -994,19 +983,15 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { return NULL; } } - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_records); + hdr->num_records = 
fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err); hdr->record_counter = 0; if (CRAM_MAJOR_VERS(fd->version) == 2) { - int32_t i32 = 0; - cp += safe_itf8_get((char *)cp, (char *)cp_end, &i32); - hdr->record_counter = i32; + hdr->record_counter = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { - cp += safe_ltf8_get((char *)cp, (char *)cp_end, &hdr->record_counter); + hdr->record_counter = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); } - - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_blocks); - - cp += safe_itf8_get((char *)cp, (char *)cp_end, &hdr->num_content_ids); + hdr->num_blocks = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); + hdr->num_content_ids = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); if (hdr->num_content_ids < 1 || hdr->num_content_ids >= SIZE_MAX / sizeof(int32_t)) { /* Slice must have at least one data block, @@ -1020,20 +1005,18 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { return NULL; } - for (i = 0; i < hdr->num_content_ids; i++) { - int l = safe_itf8_get((char *)cp, (char *)cp_end, - &hdr->block_content_ids[i]); - if (l <= 0) { - free(hdr->block_content_ids); - free(hdr); - return NULL; - } - cp += l; + for (i = 0; i < hdr->num_content_ids; i++) + hdr->block_content_ids[i] = fd->vv.varint_get32((char **)&cp, + (char *)cp_end, + &err); + if (err) { + free(hdr->block_content_ids); + free(hdr); + return NULL; } - if (b->content_type == MAPPED_SLICE) { - cp += safe_itf8_get((char *)cp, (char *) cp_end, &hdr->ref_base_id); - } + if (b->content_type == MAPPED_SLICE) + hdr->ref_base_id = fd->vv.varint_get32((char **)&cp, (char *) cp_end, &err); if (CRAM_MAJOR_VERS(fd->version) != 1) { if (cp_end - cp < 16) { @@ -1046,7 +1029,11 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { memset(hdr->md5, 0, 16); } - return hdr; + if (!err) + return hdr; + + free(hdr); + return NULL; } @@ 
-1113,8 +1100,13 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, uint32_t nm = 0; int32_t md_dist = 0; int orig_aux = 0; - int decode_md = s->decode_md && s->ref && !has_MD && cr->ref_id >= 0; - int decode_nm = s->decode_md && s->ref && !has_NM && cr->ref_id >= 0; + // CRAM < 4.0 decode_md is off/on + // CRAM >= 4.0 decode_md is auto/on (auto=on if MD* present, off otherwise) + int do_md = CRAM_MAJOR_VERS(fd->version) >= 4 + ? (s->decode_md > 0) + : (s->decode_md != 0); + int decode_md = s->ref && cr->ref_id >= 0 && ((do_md && !has_MD) || has_MD < 0); + int decode_nm = s->ref && cr->ref_id >= 0 && ((do_md && !has_NM) || has_NM < 0); uint32_t ds = s->data_series; sam_hrecs_t *bfd = sh->hrecs; @@ -1127,7 +1119,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (decode_md) { orig_aux = BLOCK_SIZE(s->aux_blk); - BLOCK_APPEND(s->aux_blk, "MDZ", 3); + if (has_MD == 0) + BLOCK_APPEND(s->aux_blk, "MDZ", 3); } if (ds & CRAM_FN) { @@ -1807,32 +1800,67 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (decode_md) { BLOCK_APPEND_CHAR(s->aux_blk, '\0'); // null terminate MD:Z: - cr->aux_size += BLOCK_SIZE(s->aux_blk) - orig_aux; + size_t sz = BLOCK_SIZE(s->aux_blk) - orig_aux; + if (has_MD < 0) { + // has_MD < 0; already have MDZ allocated in aux at -has_MD, + // but wrote MD to end of aux (at orig_aux). + // We need some memmoves to shuffle it around. 
+ char tmp_MD_[1024], *tmp_MD = tmp_MD_; + unsigned char *orig_aux_p = BLOCK_DATA(s->aux_blk) + orig_aux; + if (sz > 1024) { + tmp_MD = malloc(sz); + if (!tmp_MD) + return -1; + } + memcpy(tmp_MD, orig_aux_p, sz); + memmove(&BLOCK_DATA(s->aux_blk)[-has_MD] + sz, + &BLOCK_DATA(s->aux_blk)[-has_MD], + orig_aux_p - &BLOCK_DATA(s->aux_blk)[-has_MD]); + memcpy(&BLOCK_DATA(s->aux_blk)[-has_MD], tmp_MD, sz); + if (tmp_MD != tmp_MD_) + free(tmp_MD); + + if (-has_NM > -has_MD) + // we inserted before NM, so move it up a bit + has_NM -= sz; + } + // else has_MD == 0 and we've already appended MD to the end. + + cr->aux_size += sz; } if (decode_nm) { - char buf[7]; - size_t buf_size; - buf[0] = 'N'; buf[1] = 'M'; - if (nm <= UINT8_MAX) { - buf_size = 4; - buf[2] = 'C'; - buf[3] = (nm>> 0) & 0xff; - } else if (nm <= UINT16_MAX) { - buf_size = 5; - buf[2] = 'S'; - buf[3] = (nm>> 0) & 0xff; - buf[4] = (nm>> 8) & 0xff; + if (has_NM == 0) { + char buf[7]; + size_t buf_size; + buf[0] = 'N'; buf[1] = 'M'; + if (nm <= UINT8_MAX) { + buf_size = 4; + buf[2] = 'C'; + buf[3] = (nm>> 0) & 0xff; + } else if (nm <= UINT16_MAX) { + buf_size = 5; + buf[2] = 'S'; + buf[3] = (nm>> 0) & 0xff; + buf[4] = (nm>> 8) & 0xff; + } else { + buf_size = 7; + buf[2] = 'I'; + buf[3] = (nm>> 0) & 0xff; + buf[4] = (nm>> 8) & 0xff; + buf[5] = (nm>>16) & 0xff; + buf[6] = (nm>>24) & 0xff; + } + BLOCK_APPEND(s->aux_blk, buf, buf_size); + cr->aux_size += buf_size; } else { - buf_size = 7; - buf[2] = 'I'; - buf[3] = (nm>> 0) & 0xff; - buf[4] = (nm>> 8) & 0xff; - buf[5] = (nm>>16) & 0xff; - buf[6] = (nm>>24) & 0xff; + // Preallocated space for NM at -has_NM into aux block + unsigned char *buf = BLOCK_DATA(s->aux_blk) + -has_NM; + buf[0] = (nm>> 0) & 0xff; + buf[1] = (nm>> 8) & 0xff; + buf[2] = (nm>>16) & 0xff; + buf[3] = (nm>>24) & 0xff; } - BLOCK_APPEND(s->aux_blk, buf, buf_size); - cr->aux_size += buf_size; } return r; @@ -1914,7 +1942,11 @@ static int cram_decode_aux_1_0(cram_container *c, cram_slice *s, return 
-1; } -static int cram_decode_aux(cram_container *c, cram_slice *s, +// has_MD and has_NM are filled out with 0 for none present, +// 1 for present and verbatim, and -pos for present as placeholder +// (MD*, NM*) to be generated and filled out at offset +pos. +static int cram_decode_aux(cram_fd *fd, + cram_container *c, cram_slice *s, cram_block *blk, cram_record *cr, int *has_MD, int *has_NM) { int i, r = 0, out_sz = 1; @@ -1946,29 +1978,64 @@ static int cram_decode_aux(cram_container *c, cram_slice *s, for (i = 0; i < cr->ntags; i++) { int32_t id, out_sz = 1; - unsigned char tag_data[3]; + unsigned char tag_data[7]; cram_map *m; if (TN[0] == 'M' && TN[1] == 'D' && has_MD) - *has_MD = 1; + *has_MD = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1); if (TN[0] == 'N' && TN[1] == 'M' && has_NM) - *has_NM = 1; + *has_NM = (BLOCK_SIZE(s->aux_blk)+3) * (TN[2] == '*' ? -1 : 1);; //printf("Tag %d/%d\n", i+1, cr->ntags); - tag_data[0] = *TN++; - tag_data[1] = *TN++; - tag_data[2] = *TN++; + tag_data[0] = TN[0]; + tag_data[1] = TN[1]; + tag_data[2] = TN[2]; id = (tag_data[0]<<16) | (tag_data[1]<<8) | tag_data[2]; - m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id); - if (!m) - return -1; - BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); + if (CRAM_MAJOR_VERS(fd->version) >= 4 && TN[2] == '*') { + // Place holder, fill out contents later. + int tag_data_size; + if (TN[0] == 'N' && TN[1] == 'M') { + // Use a fixed size, so we can allocate room for it now. + memcpy(&tag_data[2], "I\0\0\0\0", 5); + tag_data_size = 7; + } else if (TN[0] == 'R' && TN[1] == 'G') { + // RG is variable size, but known already. Insert now + TN += 3; + // Equiv to fd->header->hrecs->rg[cr->rg], but this is the + // new header API equivalent. 
+ const char *rg = sam_hdr_line_name(fd->header, "RG", cr->rg); + if (!rg) + continue; - if (!m->codec) return -1; - r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz); - if (r) break; - cr->aux_size += out_sz + 3; + size_t rg_len = strlen(rg); + tag_data[2] = 'Z'; + BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); + BLOCK_APPEND(s->aux_blk, rg, rg_len); + BLOCK_APPEND_CHAR(s->aux_blk, '\0'); + cr->aux_size += 3 + rg_len + 1; + cr->rg = -1; // prevents auto-add later + continue; + } else { + // Unknown size. We'll insert MD into stream later. + tag_data[2] = 'Z'; + tag_data_size = 3; + } + BLOCK_APPEND(s->aux_blk, (char *)tag_data, tag_data_size); + cr->aux_size += tag_data_size; + TN += 3; + } else { + TN += 3; + m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id); + if (!m) + return -1; + BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); + + if (!m->codec) return -1; + r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz); + if (r) break; + cr->aux_size += out_sz + 3; + } } return r; @@ -2011,7 +2078,7 @@ static int cram_decode_slice_xref(cram_slice *s, int required_fields) { * Or do we just admit defeat and output 0 for tlen? It's the * safe option... */ - if (cr->tlen == INT_MIN) { + if (cr->tlen == INT64_MIN) { int id1 = rec, id2 = rec; int64_t aleft = cr->apos, aright = cr->aend; int64_t tlen; @@ -2120,9 +2187,16 @@ static int cram_decode_slice_xref(cram_slice *s, int required_fields) { cr->mate_ref_id = -1; } - if (cr->tlen == INT_MIN) - cr->tlen = 0; // Just in case + if (cr->tlen == INT64_MIN) + cr->tlen = 0; // Just incase } + + for (rec = 0; rec < s->hdr->num_records; rec++) { + cram_record *cr = &s->crecs[rec]; + if (cr->explicit_tlen != INT64_MIN) + cr->tlen = cr->explicit_tlen; + } + return 0; } @@ -2137,6 +2211,31 @@ static char *md5_print(unsigned char *md5, char *out) { return out; } +/* + * Utility function to decode tlen (ISIZE), as it's called + * in multiple places. 
+ * + * Returns codec return value (0 on success). + */ +static int cram_decode_tlen(cram_fd *fd, cram_container *c, cram_slice *s, + cram_block *blk, int64_t *tlen) { + int out_sz = 1, r = 0; + + if (!c->comp_hdr->codecs[DS_TS]) return -1; + if (CRAM_MAJOR_VERS(fd->version) < 4) { + int32_t i32; + r |= c->comp_hdr->codecs[DS_TS] + ->decode(s, c->comp_hdr->codecs[DS_TS], blk, + (char *)&i32, &out_sz); + *tlen = i32; + } else { + r |= c->comp_hdr->codecs[DS_TS] + ->decode(s, c->comp_hdr->codecs[DS_TS], blk, + (char *)tlen, &out_sz); + } + return r; +} + /* * Decode an entire slice from container blocks. Fills out s->crecs[] array. * Returns 0 on success @@ -2454,18 +2553,18 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (ds & CRAM_AP) { if (!c->comp_hdr->codecs[DS_AP]) goto block_err; -#ifdef LARGE_POS - r |= c->comp_hdr->codecs[DS_AP] - ->decode(s, c->comp_hdr->codecs[DS_AP], blk, - (char *)&cr->apos, &out_sz); -#else - int32_t i32; - r |= c->comp_hdr->codecs[DS_AP] - ->decode(s, c->comp_hdr->codecs[DS_AP], blk, - (char *)&i32, &out_sz); - cr->apos = i32; -#endif - if (r) goto block_err; + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + r |= c->comp_hdr->codecs[DS_AP] + ->decode(s, c->comp_hdr->codecs[DS_AP], blk, + (char *)&cr->apos, &out_sz); + } else { + int32_t i32; + r |= c->comp_hdr->codecs[DS_AP] + ->decode(s, c->comp_hdr->codecs[DS_AP], blk, + (char *)&i32, &out_sz); + cr->apos = i32; + } + if (r) goto block_err;; if (c->comp_hdr->AP_delta) cr->apos += s->last_apos; s->last_apos= cr->apos; @@ -2505,6 +2604,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, cr->mate_pos = 0; cr->mate_line = -1; cr->mate_ref_id = -1; + cr->explicit_tlen = INT64_MIN; if ((ds & CRAM_CF) && (cf & CRAM_FLAG_DETACHED)) { if (ds & CRAM_MF) { if (CRAM_MAJOR_VERS(fd->version) == 1) { @@ -2560,39 +2660,30 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, // } if (ds & CRAM_NP) { - if (!c->comp_hdr->codecs[DS_NP]) goto 
block_err; -#ifdef LARGE_POS - r |= c->comp_hdr->codecs[DS_NP] - ->decode(s, c->comp_hdr->codecs[DS_NP], blk, - (char *)&cr->mate_pos, &out_sz); -#else - int32_t i32; - r |= c->comp_hdr->codecs[DS_NP] - ->decode(s, c->comp_hdr->codecs[DS_NP], blk, - (char *)&i32, &out_sz); - cr->mate_pos = i32; -#endif + if (!c->comp_hdr->codecs[DS_NP]) goto block_err;; + if (CRAM_MAJOR_VERS(fd->version) < 4) { + int32_t i32; + r |= c->comp_hdr->codecs[DS_NP] + ->decode(s, c->comp_hdr->codecs[DS_NP], blk, + (char *)&i32, &out_sz); + cr->mate_pos = i32; + } else { + r |= c->comp_hdr->codecs[DS_NP] + ->decode(s, c->comp_hdr->codecs[DS_NP], blk, + (char *)&cr->mate_pos, &out_sz); + } if (r) goto block_err; } if (ds & CRAM_TS) { if (!c->comp_hdr->codecs[DS_TS]) goto block_err; -#ifdef LARGE_POS - r |= c->comp_hdr->codecs[DS_TS] - ->decode(s, c->comp_hdr->codecs[DS_TS], blk, - (char *)&cr->tlen, &out_sz); -#else - int32_t i32; - r |= c->comp_hdr->codecs[DS_TS] - ->decode(s, c->comp_hdr->codecs[DS_TS], blk, - (char *)&i32, &out_sz); - cr->tlen = i32; -#endif + r = cram_decode_tlen(fd, c, s, blk, &cr->tlen); if (r) goto block_err; } else { - cr->tlen = INT_MIN; + cr->tlen = INT64_MIN; } } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_MATE_DOWNSTREAM)) { + // else not detached if (ds & CRAM_NF) { if (!c->comp_hdr->codecs[DS_NF]) goto block_err; r |= c->comp_hdr->codecs[DS_NF] @@ -2606,15 +2697,32 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, //dstring_nappend(name_ds, name, cr->name_len); cr->mate_ref_id = -1; - cr->tlen = INT_MIN; + cr->tlen = INT64_MIN; cr->mate_pos = 0; } else { cr->mate_flags = 0; - cr->tlen = INT_MIN; + cr->tlen = INT64_MIN; + } + if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) { + if (ds & CRAM_TS) { + r = cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen); + if (r) return r; + } else { + cr->mate_flags = 0; + cr->tlen = INT64_MIN; + } + } + } else if ((ds & CRAM_CF) && (cf & CRAM_FLAG_EXPLICIT_TLEN)) { + if (ds & CRAM_TS) { + r = 
cram_decode_tlen(fd, c, s, blk, &cr->explicit_tlen); + if (r) return r; + } else { + cr->mate_flags = 0; + cr->tlen = INT64_MIN; } } else { cr->mate_flags = 0; - cr->tlen = INT_MIN; + cr->tlen = INT64_MIN; } /* else if (!name[0]) { @@ -2634,7 +2742,7 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if (CRAM_MAJOR_VERS(fd->version) == 1) r |= cram_decode_aux_1_0(c, s, blk, cr); else - r |= cram_decode_aux(c, s, blk, cr, &has_MD, &has_NM); + r |= cram_decode_aux(fd, c, s, blk, cr, &has_MD, &has_NM); if (r) goto block_err; /* Fake up dynamic string growth and appending */ @@ -2705,6 +2813,16 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, memset(qual, 255, cr->len); } } + + if (!c->comp_hdr->qs_seq_orient && (ds & CRAM_QS) && (cr->flags & BAM_FREVERSE)) { + int i, j; + for (i = 0, j = cr->len-1; i < j; i++, j--) { + unsigned char c; + c = qual[i]; + qual[i] = qual[j]; + qual[j] = c; + } + } } pthread_mutex_lock(&fd->ref_lock); @@ -2853,18 +2971,28 @@ static int cram_to_bam(sam_hdr_t *sh, cram_fd *fd, cram_slice *s, name_len = cr->name_len; } else { name = name_a; - name_len = strlen(fd->prefix); - memcpy(name, fd->prefix, name_len); - name += name_len; - *name++ = ':'; - if (cr->mate_line >= 0 && cr->mate_line < rec) - name = (char *)append_uint64((unsigned char *)name, - s->hdr->record_counter + - cr->mate_line + 1); - else - name = (char *)append_uint64((unsigned char *)name, - s->hdr->record_counter + - rec + 1); + if (cr->mate_line >= 0 && cr->mate_line < s->max_rec && + s->crecs[cr->mate_line].name_len > 0) { + // Copy our mate if non-zero. 
+ memcpy(name_a, BLOCK_DATA(s->name_blk)+s->crecs[cr->mate_line].name, + s->crecs[cr->mate_line].name_len); + name = name_a + s->crecs[cr->mate_line].name_len; + } else { + // Otherwise generate a name based on prefix + name_len = strlen(fd->prefix); + memcpy(name, fd->prefix, name_len); + name += name_len; + *name++ = ':'; + if (cr->mate_line >= 0 && cr->mate_line < rec) { + name = (char *)append_uint64((unsigned char *)name, + s->hdr->record_counter + + cr->mate_line + 1); + } else { + name = (char *)append_uint64((unsigned char *)name, + s->hdr->record_counter + + rec + 1); + } + } name_len = name - name_a; name = name_a; } diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 7d2f5a9a5..cdd73b8a7 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -95,27 +95,11 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, * the total size (stored as a variable length string). */ -/* - * LARGE_POS used in this code is purely a debugging mechanism for testing - * whether the htslib API can cope with 64-bit quantities. These are - * possible in SAM, but not *yet* in BAM or CRAM. - * - * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. - * - * At some point it is expected these ifdefs will become a version check - * instead. 
- */ - // Duplicated from container itself, and removed in 1.1 if (CRAM_MAJOR_VERS(fd->version) == 1) { r |= itf8_put_blk(cb, h->ref_seq_id); -#ifdef LARGE_POS - r |= ltf8_put_blk(cb, h->ref_seq_start); - r |= ltf8_put_blk(cb, h->ref_seq_span); -#else r |= itf8_put_blk(cb, h->ref_seq_start); r |= itf8_put_blk(cb, h->ref_seq_span); -#endif r |= itf8_put_blk(cb, h->num_records); r |= itf8_put_blk(cb, h->num_landmarks); for (i = 0; i < h->num_landmarks; i++) { @@ -128,7 +112,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, /* Create in-memory preservation map */ /* FIXME: should create this when we create the container */ - { + if (h->num_records > 0) { khint_t k; int r; @@ -166,6 +150,12 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, if (-1 == r) return NULL; kh_val(h->preservation_map, k).i = h->AP_delta; + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + k = kh_put(map, h->preservation_map, "QO", &r); + if (-1 == r) return NULL; + kh_val(h->preservation_map, k).i = h->qs_seq_orient; + } + if (fd->no_ref || fd->embed_ref) { // Reference Required == No k = kh_put(map, h->preservation_map, "RR", &r); @@ -196,26 +186,12 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, switch(CRAM_KEY(key[0], key[1])) { case CRAM_KEY('M','I'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('U','I'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('P','I'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('A','P'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('R','N'): - BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); - break; - case CRAM_KEY('R','R'): + case CRAM_KEY('Q','O'): BLOCK_APPEND_CHAR(map, kh_val(pmap, k).i); break; @@ -251,7 +227,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, } case CRAM_KEY('T','D'): { - r |= itf8_put_blk(map, BLOCK_SIZE(h->TD_blk)); + r |= 
(fd->vv.varint_put32_blk(map, BLOCK_SIZE(h->TD_blk)) <= 0); BLOCK_APPEND(map, BLOCK_DATA(h->TD_blk), BLOCK_SIZE(h->TD_blk)); @@ -266,8 +242,8 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, mc++; } } - r |= itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); - r |= itf8_put_blk(cb, mc); + r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0); + r |= (fd->vv.varint_put32_blk(cb, mc) <= 0); BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); /* rec encoding map */ @@ -467,33 +443,11 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, return NULL; mc++; } - r |= itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); - r |= itf8_put_blk(cb, mc); + r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0); + r |= (fd->vv.varint_put32_blk(cb, mc) <= 0); BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); /* tag encoding map */ -#if 0 - mp = map; mc = 0; - if (h->tag_encoding_map) { - HashItem *hi; - HashIter *iter = HashTableIterCreate(); - if (!iter) - return NULL; - - while ((hi = HashTableIterNext(h->tag_encoding_map, iter))) { - cram_map *m = hi->data.p; - int sz; - - mp += itf8_put(mp, (hi->key[0]<<16)|(hi->key[1]<<8)|hi->key[2]); - if (-1 == (sz = m->codec->store(m->codec, mp, NULL, fd->version))) - return NULL; - mp += sz; - mc++; - } - - HashTableIterDestroy(iter); - } -#else mc = 0; BLOCK_SIZE(map) = 0; if (c->tags_used) { @@ -507,16 +461,16 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, key = kh_key(c->tags_used, k); cram_codec *cd = kh_val(c->tags_used, k)->codec; - r |= itf8_put_blk(map, key); + r |= (fd->vv.varint_put32_blk(map, key) <= 0); if (-1 == cd->store(cd, map, NULL, fd->version)) return NULL; mc++; } } -#endif - r |= itf8_put_blk(cb, BLOCK_SIZE(map) + itf8_size(mc)); - r |= itf8_put_blk(cb, mc); + + r |= (fd->vv.varint_put32_blk(cb, BLOCK_SIZE(map) + fd->vv.varint_size(mc)) <= 0); + r |= (fd->vv.varint_put32_blk(cb, 
mc) <= 0); BLOCK_APPEND(cb, BLOCK_DATA(map), BLOCK_SIZE(map)); hts_log_info("Wrote compression block header in %d bytes", (int)BLOCK_SIZE(cb)); @@ -548,38 +502,38 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { if (!b) return NULL; - cp = buf = malloc(16+5*(8+s->hdr->num_blocks)); + cp = buf = malloc(22+16+5*(8+s->hdr->num_blocks)); if (NULL == buf) { cram_free_block(b); return NULL; } - cp += itf8_put(cp, s->hdr->ref_seq_id); -#ifdef LARGE_POS - cp += ltf8_put(cp, s->hdr->ref_seq_start); - cp += ltf8_put(cp, s->hdr->ref_seq_span); -#else - cp += itf8_put(cp, s->hdr->ref_seq_start); - cp += itf8_put(cp, s->hdr->ref_seq_span); -#endif - cp += itf8_put(cp, s->hdr->num_records); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_id); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start); + cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span); + } else { + cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_start); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_span); + } + cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) - cp += itf8_put(cp, s->hdr->record_counter); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->record_counter); else if (CRAM_MAJOR_VERS(fd->version) >= 3) - cp += ltf8_put(cp, s->hdr->record_counter); - cp += itf8_put(cp, s->hdr->num_blocks); - cp += itf8_put(cp, s->hdr->num_content_ids); + cp += fd->vv.varint_put64(cp, NULL, s->hdr->record_counter); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_blocks); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->num_content_ids); for (j = 0; j < s->hdr->num_content_ids; j++) { - cp += itf8_put(cp, s->hdr->block_content_ids[j]); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->block_content_ids[j]); } if (s->hdr->content_type == MAPPED_SLICE) - cp += itf8_put(cp, s->hdr->ref_base_id); + cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_base_id); if (CRAM_MAJOR_VERS(fd->version) != 1) { 
memcpy(cp, s->hdr->md5, 16); cp += 16; } - assert(cp-buf <= 16+5*(8+s->hdr->num_blocks)); + assert(cp-buf <= 22+16+5*(8+s->hdr->num_blocks)); b->data = (unsigned char *)buf; b->comp_size = b->uncomp_size = cp-buf; @@ -602,6 +556,7 @@ static int cram_encode_slice_read(cram_fd *fd, int64_t *last_pos) { int r = 0; int32_t i32; + int64_t i64; unsigned char uc; //fprintf(stderr, "Encode seq %d, %d/%d FN=%d, %s\n", rec, core->byte, core->bit, cr->nfeature, s->name_ds->str + cr->name); @@ -620,24 +575,22 @@ static int cram_encode_slice_read(cram_fd *fd, r |= h->codecs[DS_RL]->encode(s, h->codecs[DS_RL], (char *)&cr->len, 1); if (c->pos_sorted) { -#ifdef LARGE_POS - int64_t i64; - i64 = cr->apos - *last_pos; - r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); -#else - i32 = cr->apos - *last_pos; - r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); -#endif + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + i64 = cr->apos - *last_pos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); + } else { + i32 = cr->apos - *last_pos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); + } *last_pos = cr->apos; } else { -#ifdef LARGE_POS - int64_t i64; - i64 = cr->apos; - r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); -#else - i32 = cr->apos; - r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); -#endif + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + i64 = cr->apos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i64, 1); + } else { + i32 = cr->apos; + r |= h->codecs[DS_AP]->encode(s, h->codecs[DS_AP], (char *)&i32, 1); + } } r |= h->codecs[DS_RG]->encode(s, h->codecs[DS_RG], (char *)&cr->rg, 1); @@ -649,23 +602,30 @@ static int cram_encode_slice_read(cram_fd *fd, r |= h->codecs[DS_NS]->encode(s, h->codecs[DS_NS], (char *)&cr->mate_ref_id, 1); -#ifdef LARGE_POS - r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], - (char *)&cr->mate_pos, 1); - - r |= 
h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], - (char *)&cr->tlen, 1); -#else - i32 = cr->mate_pos; - r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], - (char *)&i32, 1); - i32 = cr->tlen; - r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], - (char *)&i32, 1); -#endif - } else if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { - r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF], - (char *)&cr->mate_line, 1); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], + (char *)&cr->mate_pos, 1); + r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], + (char *)&cr->tlen, 1); + } else { + i32 = cr->mate_pos; + r |= h->codecs[DS_NP]->encode(s, h->codecs[DS_NP], + (char *)&i32, 1); + i32 = cr->tlen; + r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], + (char *)&i32, 1); + } + } else { + if (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) { + r |= h->codecs[DS_NF]->encode(s, h->codecs[DS_NF], + (char *)&cr->mate_line, 1); + } + if (cr->cram_flags & CRAM_FLAG_EXPLICIT_TLEN) { + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + r |= h->codecs[DS_TS]->encode(s, h->codecs[DS_TS], + (char *)&cr->tlen, 1); + } + } } /* Aux tags */ @@ -716,7 +676,7 @@ static int cram_encode_slice_read(cram_fd *fd, // BLOCK_DATA(s->soft_blk) + f->S.seq_idx, // f->S.len); - //if (IS_CRAM_3_VERS(fd)) { + //if (CRAM_MAJOR_VERS(fd->version) >= 3) { // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], // BLOCK_DATA(s->seqs_blk) + f->S.seq_idx, // f->S.len); @@ -726,7 +686,7 @@ static int cram_encode_slice_read(cram_fd *fd, //seq = DSTRING_STR(s->seqs_ds) + f->S.seq_idx; //r |= h->codecs[DS_IN]->encode(s, h->codecs[DS_IN], // seq, f->S.len); - //if (IS_CRAM_3_VERS(fd)) { + //if (CRAM_MAJOR_VERS(fd->version) >= 3) { // r |= h->codecs[DS_BB]->encode(s, h->codecs[DS_BB], // BLOCK_DATA(s->seqs_blk) + f->I.seq_idx, // f->I.len); @@ -822,6 +782,7 @@ static int cram_encode_slice_read(cram_fd *fd, static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { int 
level = fd->level, i; int method = 1<version >= (3<<8)+1); /* Compress the CORE Block too, with minimal zlib level */ if (level > 5 && s->block[0]->uncomp_size > 500) @@ -843,7 +804,6 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { method_ranspr |= (1<version >= (3<<8)+1); if (fd->use_rans) { methodF |= v31_or_above ? method_ranspr : method_rans; method |= v31_or_above ? method_ranspr : method_rans; @@ -856,7 +816,7 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { method_arith |= (1<use_arith && v31_or_above) { methodF |= method_arith; @@ -893,6 +853,11 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { } } + pthread_mutex_lock(&fd->metrics_lock); + for (i = 0; i < DS_END; i++) + fd->m[i]->stats = c->stats[i]; + pthread_mutex_unlock(&fd->metrics_lock); + /* Specific compression methods for certain block types */ if (cram_compress_block2(fd, s, s->block[DS_IN], fd->m[DS_IN], //IN (seq) method, level)) @@ -949,13 +914,13 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { // NAME: best is generally xz, bzip2, zlib then rans1 int method_rn = method & ~(method_rans | method_ranspr | 1<version >= (3<<8)+1 && fd->use_tok) - method_rn |= fd->use_arith ? (1<use_arith ? (1<block[DS_RN], fd->m[DS_RN], method_rn, level)) return -1; // NS shows strong local correlation as rearrangements are localised - if (s->block[DS_NS] != s->block[0]) + if (s->block[DS_NS] && s->block[DS_NS] != s->block[0]) if (cram_compress_block2(fd, s, s->block[DS_NS], fd->m[DS_NS], method, level)) return -1; @@ -1000,6 +965,98 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { return 0; } +/* + * Allocates a block associated with the cram codec associated with + * data series ds_id or the internal codec_id (depending on codec + * type). + * + * The ds_ids are what end up written to disk as an external block. 
+ * The c_ids are internal and used when daisy-chaining transforms + * such as MAP and RLE. These blocks are also allocated, but + * are ephemeral in nature. (The codecs themselves cannot allocate + * these as the same codec pointer may be operating on multiple slices + * if we're using a multi-slice container.) + * + * Returns 0 on success + * -1 on failure + */ +static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) { + if (!codec) + return 0; + + switch(codec->codec) { + // Codecs which are hard-coded to use the CORE block + case E_GOLOMB: + case E_HUFFMAN: + case E_BETA: + case E_SUBEXP: + case E_GOLOMB_RICE: + case E_GAMMA: + codec->out = s->block[0]; + break; + + // Codecs that emit directly to external blocks + case E_EXTERNAL: + if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id))) + return -1; + codec->u.external.content_id = ds_id; + codec->out = s->block[ds_id]; + break; + + case E_BYTE_ARRAY_STOP: // Why no sub-codec? + if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id))) + return -1; + codec->u.byte_array_stop.content_id = ds_id; + codec->out = s->block[ds_id]; + break; + + + // Codecs that contain sub-codecs which may in turn emit to external blocks + case E_BYTE_ARRAY_LEN: { + cram_codec *bal = codec->u.e_byte_array_len.len_codec; + if (cram_allocate_block(bal, s, bal->u.external.content_id)) + return -1; + bal = codec->u.e_byte_array_len.val_codec; + if (cram_allocate_block(bal, s, bal->u.external.content_id)) + return -1; + + break; + } + + case E_XRLE: + if (cram_allocate_block(codec->u.e_xrle.len_codec, s, ds_id)) + //ds_id == DS_QS ? 
DS_QS_len : ds_id)) + return -1; + if (cram_allocate_block(codec->u.e_xrle.lit_codec, s, ds_id)) + return -1; + + break; + + case E_XPACK: + if (cram_allocate_block(codec->u.e_xpack.sub_codec, s, ds_id)) + return -1; + codec->out = cram_new_block(0, 0); // ephemeral + if (!codec->out) + return -1; + + break; + + case E_XDELTA: + if (cram_allocate_block(codec->u.e_xdelta.sub_codec, s, ds_id)) + return -1; + codec->out = cram_new_block(0, 0); // ephemeral + if (!codec->out) + return -1; + + break; + + default: + break; + } + + return 0; +} + /* * Encodes a single slice from a container * @@ -1032,7 +1089,7 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, c->num_records += s->hdr->num_records; int ntags = c->tags_used ? c->tags_used->n_occupied : 0; - s->block = calloc(DS_END + ntags, sizeof(s->block[0])); + s->block = calloc(DS_END + ntags*2, sizeof(s->block[0])); s->hdr->block_content_ids = malloc(DS_END * sizeof(int32_t)); if (!s->block || !s->hdr->block_content_ids) return -1; @@ -1065,55 +1122,9 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, /* * All the data-series blocks if appropriate. 
*/ - for (id = DS_BF; id < DS_TN; id++) { - if (h->codecs[id] && (h->codecs[id]->codec == E_EXTERNAL || - h->codecs[id]->codec == E_BYTE_ARRAY_STOP || - h->codecs[id]->codec == E_BYTE_ARRAY_LEN)) { - switch (h->codecs[id]->codec) { - case E_EXTERNAL: - if (!(s->block[id] = cram_new_block(EXTERNAL, id))) - return -1; - h->codecs[id]->u.external.content_id = id; - break; - - case E_BYTE_ARRAY_STOP: - if (!(s->block[id] = cram_new_block(EXTERNAL, id))) - return -1; - h->codecs[id]->u.byte_array_stop.content_id = id; - break; - - case E_BYTE_ARRAY_LEN: { - cram_codec *cc; - - cc = h->codecs[id]->u.e_byte_array_len.len_codec; - if (cc->codec == E_EXTERNAL) { - int eid = cc->u.external.content_id; - if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) - return -1; - cc->u.external.content_id = eid; - cc->out = s->block[eid]; - } - - cc = h->codecs[id]->u.e_byte_array_len.val_codec; - if (cc->codec == E_EXTERNAL) { - int eid = cc->u.external.content_id; - if (!s->block[eid]) - if (!(s->block[eid] = cram_new_block(EXTERNAL, eid))) - return -1; - cc->u.external.content_id = eid; - cc->out = s->block[eid]; - } - break; - } - default: - break; - } - } else { - if (!(id == DS_BB && !h->codecs[DS_BB])) - s->block[id] = s->block[0]; - } - if (h->codecs[id]) - h->codecs[id]->out = s->block[id]; + for (id = DS_QS; id < DS_TN; id++) { + if (cram_allocate_block(h->codecs[id], s, id) < 0) + return -1; } /* @@ -1138,11 +1149,21 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, s->block[0]->comp_size = s->block[0]->uncomp_size; // Make sure the fixed blocks point to the correct sources + if (s->block[DS_IN]) cram_free_block(s->block[DS_IN]); s->block[DS_IN] = s->base_blk; s->base_blk = NULL; + if (s->block[DS_QS]) cram_free_block(s->block[DS_QS]); s->block[DS_QS] = s->qual_blk; s->qual_blk = NULL; + if (s->block[DS_RN]) cram_free_block(s->block[DS_RN]); s->block[DS_RN] = s->name_blk; s->name_blk = NULL; + if (s->block[DS_SC]) cram_free_block(s->block[DS_SC]); 
s->block[DS_SC] = s->soft_blk; s->soft_blk = NULL; + // Finalise any data transforms. + for (id = DS_QS; id < DS_TN; id++) { + if (h->codecs[id] && h->codecs[id]->flush) + h->codecs[id]->flush(h->codecs[id]); + } + // Ensure block sizes are up to date. for (id = 1; id < s->hdr->num_blocks; id++) { if (!s->block[id] || s->block[id] == s->block[0]) @@ -1338,12 +1359,21 @@ static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s, cr->name = BLOCK_SIZE(s->name_blk); if ((cr->cram_flags & CRAM_FLAG_DETACHED) || keep_names) { - BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b)); - cr->name_len = bam_name_len(b); + if (CRAM_MAJOR_VERS(fd->version) >= 4 + && (cr->cram_flags & CRAM_FLAG_MATE_DOWNSTREAM) + && cr->mate_line) { + // Dedup read names in V4 + BLOCK_APPEND(s->name_blk, "\0", 1); + cr->name_len = 1; + } else { + BLOCK_APPEND(s->name_blk, bam_name(b), bam_name_len(b)); + cr->name_len = bam_name_len(b); + } } else { // Can only discard duplicate names if not detached cr->name_len = 0; } + if (cram_stats_add(c->stats[DS_RN], cr->name_len) < 0) goto block_err; } @@ -1367,6 +1397,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { int r1, r2, sn, nref; spare_bams *spares; + if (CRAM_MAJOR_VERS(fd->version) == 1) + goto err; + //#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;} #define goto_err goto err @@ -1491,7 +1524,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // slice can start aggregating them from the start again. 
if (c->tags_used->n_occupied) { int ntags = c->tags_used->n_occupied; - s->aux_block = calloc(ntags, sizeof(*s->aux_block)); + s->aux_block = calloc(ntags*2, sizeof(*s->aux_block)); if (!s->aux_block) return -1; @@ -1507,8 +1540,11 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (!tm->blk) continue; s->aux_block[s->naux_block++] = tm->blk; tm->blk = NULL; + if (!tm->blk2) continue; + s->aux_block[s->naux_block++] = tm->blk2; + tm->blk2 = NULL; } - assert(s->naux_block <= c->tags_used->n_occupied); + assert(s->naux_block <= 2*c->tags_used->n_occupied); } } @@ -1544,6 +1580,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { /* Compute MD5s */ + int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 1 : 0; for (i = 0; i < c->curr_slice; i++) { cram_slice *s = c->slices[i]; @@ -1570,13 +1607,13 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== BF ===\n"); h->codecs[DS_BF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BF]), c->stats[DS_BF], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_BF]->nvals && !h->codecs[DS_BF]) goto_err; //fprintf(stderr, "=== CF ===\n"); h->codecs[DS_CF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_CF]), c->stats[DS_CF], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_CF]->nvals && !h->codecs[DS_CF]) goto_err; //fprintf(stderr, "=== RN ===\n"); @@ -1588,105 +1625,104 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (c->pos_sorted) { h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), c->stats[DS_AP], -#ifdef LARGE_POS - E_LONG, -#else - E_INT, -#endif - NULL, fd->version); + is_v4 ? E_SLONG : E_INT, + NULL, fd->version, &fd->vv); } else { int p[2] = {0, c->max_apos}; - h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, E_INT, p, - fd->version); + h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, + is_v4 ? 
E_SLONG : E_INT, + p, fd->version, &fd->vv); +// cram_xdelta_encoder e; +// e.word_size = is_v4 ? 8 : 4; +// e.sub_encoding = E_EXTERNAL; +// e.sub_codec_dat = (void *)DS_AP; +// +// h->codecs[DS_AP] = cram_encoder_init(E_XDELTA, NULL, +// is_v4 ? E_LONG : E_INT, +// &e, fd->version, &fd->vv); } if (!h->codecs[DS_AP]) goto_err; //fprintf(stderr, "=== RG ===\n"); h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]), - c->stats[DS_RG], E_INT, NULL, - fd->version); + c->stats[DS_RG], + is_v4 ? E_SINT : E_INT, + NULL, + fd->version, &fd->vv); if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err; //fprintf(stderr, "=== MQ ===\n"); h->codecs[DS_MQ] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MQ]), c->stats[DS_MQ], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_MQ]->nvals && !h->codecs[DS_MQ]) goto_err; //fprintf(stderr, "=== NS ===\n"); h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]), c->stats[DS_NS], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err; //fprintf(stderr, "=== MF ===\n"); h->codecs[DS_MF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_MF]), c->stats[DS_MF], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_MF]->nvals && !h->codecs[DS_MF]) goto_err; //fprintf(stderr, "=== TS ===\n"); h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]), c->stats[DS_TS], -#ifdef LARGE_POS - E_LONG, -#else - E_INT, -#endif - NULL, fd->version); + is_v4 ? E_SLONG : E_INT, + NULL, fd->version, &fd->vv); if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err; //fprintf(stderr, "=== NP ===\n"); h->codecs[DS_NP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NP]), c->stats[DS_NP], -#ifdef LARGE_POS - E_LONG, -#else - E_INT, -#endif - NULL, fd->version); + is_v4 ? 
E_LONG : E_INT, + NULL, fd->version, &fd->vv); if (c->stats[DS_NP]->nvals && !h->codecs[DS_NP]) goto_err; //fprintf(stderr, "=== NF ===\n"); h->codecs[DS_NF] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NF]), c->stats[DS_NF], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_NF]->nvals && !h->codecs[DS_NF]) goto_err; //fprintf(stderr, "=== RL ===\n"); h->codecs[DS_RL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RL]), c->stats[DS_RL], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_RL]->nvals && !h->codecs[DS_RL]) goto_err; //fprintf(stderr, "=== FN ===\n"); h->codecs[DS_FN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FN]), c->stats[DS_FN], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_FN]->nvals && !h->codecs[DS_FN]) goto_err; //fprintf(stderr, "=== FC ===\n"); h->codecs[DS_FC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FC]), c->stats[DS_FC], E_BYTE, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_FC]->nvals && !h->codecs[DS_FC]) goto_err; //fprintf(stderr, "=== FP ===\n"); h->codecs[DS_FP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_FP]), c->stats[DS_FP], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_FP]->nvals && !h->codecs[DS_FP]) goto_err; //fprintf(stderr, "=== DL ===\n"); h->codecs[DS_DL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_DL]), c->stats[DS_DL], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_DL]->nvals && !h->codecs[DS_DL]) goto_err; //fprintf(stderr, "=== BA ===\n"); h->codecs[DS_BA] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BA]), c->stats[DS_BA], E_BYTE, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_BA]->nvals && !h->codecs[DS_BA]) goto_err; if (CRAM_MAJOR_VERS(fd->version) >= 3) { @@ -1701,7 +1737,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->codecs[DS_BB] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, 
E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); if (!h->codecs[DS_BB]) goto_err; } else { h->codecs[DS_BB] = NULL; @@ -1710,7 +1746,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== BS ===\n"); h->codecs[DS_BS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_BS]), c->stats[DS_BS], E_BYTE, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_BS]->nvals && !h->codecs[DS_BS]) goto_err; if (CRAM_MAJOR_VERS(fd->version) == 1) { @@ -1724,13 +1760,13 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== TC ===\n"); h->codecs[DS_TC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TC]), c->stats[DS_TC], E_BYTE, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_TC]->nvals && !h->codecs[DS_TC]) goto_err; //fprintf(stderr, "=== TN ===\n"); h->codecs[DS_TN] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TN]), c->stats[DS_TN], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_TN]->nvals && !h->codecs[DS_TN]) goto_err; } else { h->codecs[DS_TC] = NULL; @@ -1739,32 +1775,32 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== TL ===\n"); h->codecs[DS_TL] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TL]), c->stats[DS_TL], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_TL]->nvals && !h->codecs[DS_TL]) goto_err; //fprintf(stderr, "=== RI ===\n"); h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]), c->stats[DS_RI], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err; //fprintf(stderr, "=== RS ===\n"); h->codecs[DS_RS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RS]), c->stats[DS_RS], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_RS]->nvals && !h->codecs[DS_RS]) goto_err; //fprintf(stderr, "=== PD ===\n"); h->codecs[DS_PD] = 
cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_PD]), c->stats[DS_PD], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_PD]->nvals && !h->codecs[DS_PD]) goto_err; //fprintf(stderr, "=== HC ===\n"); h->codecs[DS_HC] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_HC]), c->stats[DS_HC], E_INT, NULL, - fd->version); + fd->version, &fd->vv); if (c->stats[DS_HC]->nvals && !h->codecs[DS_HC]) goto_err; //fprintf(stderr, "=== SC ===\n"); @@ -1773,7 +1809,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, E_BYTE_ARRAY, (void *)i2, - fd->version); + fd->version, &fd->vv); } else { // Appears to be no practical benefit to using this method, // but it may work better if we start mixing SC, IN and BB @@ -1788,7 +1824,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->codecs[DS_SC] = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); } if (!h->codecs[DS_SC]) goto_err; } @@ -1798,19 +1834,19 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { int i2[2] = {0, DS_IN}; h->codecs[DS_IN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, E_BYTE_ARRAY, (void *)i2, - fd->version); + fd->version, &fd->vv); if (!h->codecs[DS_IN]) goto_err; } h->codecs[DS_QS] = cram_encoder_init(E_EXTERNAL, NULL, E_BYTE, (void *)DS_QS, - fd->version); + fd->version, &fd->vv); if (!h->codecs[DS_QS]) goto_err; { int i2[2] = {0, DS_RN}; h->codecs[DS_RN] = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, E_BYTE_ARRAY, (void *)i2, - fd->version); + fd->version, &fd->vv); if (!h->codecs[DS_RN]) goto_err; } @@ -1829,7 +1865,8 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->ref_seq_start = c->ref_seq_start; h->ref_seq_span = c->ref_seq_span; h->num_records = c->num_records; - h->AP_delta = c->pos_sorted; + h->qs_seq_orient = c->qs_seq_orient; + h->AP_delta = c->pos_sorted; memcpy(h->substitution_matrix, 
CRAM_SUBST_MATRIX, 20); if (!(c_hdr = cram_encode_compression_header(fd, c, h))) @@ -1852,9 +1889,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { ? c_hdr->uncomp_size : c_hdr->comp_size; slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(c_hdr->content_id) + - itf8_size(c_hdr->comp_size) + - itf8_size(c_hdr->uncomp_size); + fd->vv.varint_size(c_hdr->content_id) + + fd->vv.varint_size(c_hdr->comp_size) + + fd->vv.varint_size(c_hdr->uncomp_size); } c->ref_seq_id = c->slices[0]->hdr->ref_seq_id; @@ -1877,15 +1914,15 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { : s->hdr_block->comp_size; slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(s->hdr_block->content_id) + - itf8_size(s->hdr_block->comp_size) + - itf8_size(s->hdr_block->uncomp_size); + fd->vv.varint_size(s->hdr_block->content_id) + + fd->vv.varint_size(s->hdr_block->comp_size) + + fd->vv.varint_size(s->hdr_block->uncomp_size); for (j = 0; j < s->hdr->num_blocks; j++) { slice_offset += 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(s->block[j]->content_id) + - itf8_size(s->block[j]->comp_size) + - itf8_size(s->block[j]->uncomp_size); + fd->vv.varint_size(s->block[j]->content_id) + + fd->vv.varint_size(s->block[j]->comp_size) + + fd->vv.varint_size(s->block[j]->uncomp_size); slice_offset += s->block[j]->method == RAW ? s->block[j]->uncomp_size @@ -2127,147 +2164,6 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r, return -1; } -/* - * Encodes auxiliary data, CRAM 1.0 format. 
- * Returns the read-group parsed out of the BAM aux fields on success - * NULL on failure or no rg present (FIXME) - */ -static char *cram_encode_aux_1_0(cram_fd *fd, bam_seq_t *b, cram_container *c, - cram_slice *s, cram_record *cr) { - char *aux, *tmp, *rg = NULL; - int aux_size = bam_blk_size(b) - - ((char *)bam_aux(b) - (char *)&bam_ref(b)); - - /* Worst case is 1 nul char on every ??:Z: string, so +33% */ - BLOCK_GROW(s->aux_blk, aux_size*1.34+1); - tmp = (char *)BLOCK_END(s->aux_blk); - - aux = (char *)bam_aux(b); - cr->TN_idx = s->nTN; - - while (aux[0] != 0) { - int32_t i32; - int r; - - if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { - rg = &aux[3]; - while (*aux++); - continue; - } - if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { - while (*aux++); - continue; - } - if (aux[0] == 'N' && aux[1] == 'M') { - switch(aux[2]) { - case 'A': case 'C': case 'c': aux+=4; break; - case 'I': case 'i': case 'f': aux+=7; break; - default: - hts_log_error("Unhandled type code for NM tag"); - return NULL; - } - continue; - } - - cr->ntags++; - - i32 = (aux[0]<<16) | (aux[1]<<8) | aux[2]; - kh_put(m_tagmap, c->tags_used, i32, &r); - if (-1 == r) - return NULL; - - if (s->nTN >= s->aTN) { - s->aTN = s->aTN ? 
s->aTN*2 : 1024; - if (!(s->TN = realloc(s->TN, s->aTN * sizeof(*s->TN)))) - return NULL; - } - s->TN[s->nTN++] = i32; - if (cram_stats_add(c->stats[DS_TN], i32) < 0) - goto block_err; - - switch(aux[2]) { - case 'A': case 'C': case 'c': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; - break; - - case 'S': case 's': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'I': case 'i': case 'f': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'd': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - break; - - case 'Z': case 'H': - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - while ((*tmp++=*aux++)); - *tmp++ = '\t'; // stop byte - break; - - case 'B': { - int type = aux[3], blen; - uint32_t count = (uint32_t)((((unsigned char *)aux)[4]<< 0) + - (((unsigned char *)aux)[5]<< 8) + - (((unsigned char *)aux)[6]<<16) + - (((unsigned char *)aux)[7]<<24)); - // skip TN field - aux+=3; //*tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - - // We use BYTE_ARRAY_LEN with external length, so store that first - switch (type) { - case 'c': case 'C': - blen = count; - break; - case 's': case 'S': - blen = 2*count; - break; - case 'i': case 'I': case 'f': - blen = 4*count; - break; - default: - hts_log_error("Unknown sub-type '%c' for aux type 'B'", type); - return NULL; - } - - tmp += itf8_put(tmp, blen+5); - - *tmp++=*aux++; // sub-type & length - *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; *tmp++=*aux++; - - // The tag data itself - memcpy(tmp, aux, blen); tmp += blen; aux += blen; - - //cram_stats_add(c->aux_B_stats, blen); - break; - } - default: - hts_log_error("Unknown aux type '%c'", aux[2]); - return NULL; - } - } - if (cram_stats_add(c->stats[DS_TC], cr->ntags) < 0) - 
goto block_err; - - cr->aux = BLOCK_SIZE(s->aux_blk); - cr->aux_size = (uc *)tmp - (BLOCK_DATA(s->aux_blk) + cr->aux); - BLOCK_SIZE(s->aux_blk) = (uc *)tmp - BLOCK_DATA(s->aux_blk); - assert(s->aux_blk->byte <= s->aux_blk->alloc); - - return rg; - - block_err: - return NULL; -} - /* * Encodes auxiliary data. Largely duplicated from above, but done so to * keep it simple and avoid a myriad of version ifs. @@ -2299,6 +2195,8 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { rg = &aux[3]; while (*aux++); + if (CRAM_MAJOR_VERS(fd->version) >= 4) + BLOCK_APPEND(td_b, "RG*", 3); continue; } @@ -2307,6 +2205,8 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) { if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) { while (*aux++); + if (CRAM_MAJOR_VERS(fd->version) >= 4) + BLOCK_APPEND(td_b, "MD*", 3); continue; } } @@ -2325,6 +2225,8 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, hts_log_error("Unhandled type code for NM tag"); return NULL; } + if (CRAM_MAJOR_VERS(fd->version) >= 4) + BLOCK_APPEND(td_b, "NM*", 3); continue; } } @@ -2378,7 +2280,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, // string as byte_array_stop c = cram_encoder_init(E_BYTE_ARRAY_STOP, NULL, E_BYTE_ARRAY, (void *)i2, - fd->version); + fd->version, &fd->vv); break; case 'A': case 'c': case 'C': { @@ -2397,7 +2299,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); break; } @@ -2417,7 +2319,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); break; } case 
'i': case 'I': case 'f': { @@ -2436,7 +2338,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, c = cram_encoder_init(E_BYTE_ARRAY_LEN, &st, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); break; } @@ -2456,7 +2358,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, c = cram_encoder_init(E_BYTE_ARRAY_LEN, NULL, E_BYTE_ARRAY, (void *)&e, - fd->version); + fd->version, &fd->vv); break; } @@ -2561,8 +2463,15 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (!tm->blk) { if (!(tm->blk = cram_new_block(EXTERNAL, key))) return NULL; - codec->u.e_byte_array_len.len_codec->out = tm->blk; - codec->u.e_byte_array_len.val_codec->out = tm->blk; + if (codec->u.e_byte_array_len.val_codec->codec == E_XDELTA) { + if (!(tm->blk2 = cram_new_block(EXTERNAL, key+128))) + return NULL; + codec->u.e_byte_array_len.len_codec->out = tm->blk2; + codec->u.e_byte_array_len.val_codec->u.e_xdelta.sub_codec->out = tm->blk; + } else { + codec->u.e_byte_array_len.len_codec->out = tm->blk; + codec->u.e_byte_array_len.val_codec->out = tm->blk; + } } // skip TN field @@ -2737,6 +2646,12 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { c->s_num_bases = 0; c->n_mapped = 0; + // QO field: 0 implies original orientation, 1 implies sequence orientation + // 1 is often preferable for NovaSeq, but impact is slight. ~0.5% diff. + // Conversely other data sets it's often better than 1% saving for 0. + // Short of trying both and learning, for now we use use 0 for V4, 1 for V3. + c->qs_seq_orient = CRAM_MAJOR_VERS(fd->version) >= 4 ? 
0 : 1; + return c; } @@ -2822,6 +2737,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, qual = cp = (char *)bam_qual(b); + /* Copy and parse */ if (!(cr->flags & BAM_FUNMAP)) { uint32_t *cig_to, *cig_from; @@ -3048,10 +2964,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags); int err = 0; - if (CRAM_MAJOR_VERS(fd->version) == 1) - rg = cram_encode_aux_1_0(fd, b, c, s, cr); - else - rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, &err); + rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, &err); if (err) goto block_err; @@ -3086,7 +2999,19 @@ static int process_one_read(cram_fd *fd, cram_container *c, char *from = (char *)&bam_qual(b)[0]; char *to = &cp[0]; memcpy(to, from, cr->len); - //for (i = 0; i < cr->len; i++) cp[i] = from[i]; + + // Store quality in original orientation for better compression. + if (!c->qs_seq_orient) { + if (cr->flags & BAM_FREVERSE) { + int i, j; + for (i = 0, j = cr->len-1; i < j; i++, j--) { + unsigned char c; + c = to[i]; + to[i] = to[j]; + to[j] = c; + } + } + } } BLOCK_SIZE(s->qual_blk) += cr->len; } else { @@ -3139,11 +3064,6 @@ static int process_one_read(cram_fd *fd, cram_container *c, // This vs p: tlen, matepos, flags. Permit TLEN 0 and/or TLEN +/- // a small amount, if appropriate options set. 
- if ((bam_ins_size(b) && - llabs(bam_ins_size(b) - sign*(aright-aleft+1)) > fd->tlen_approx) || - (!bam_ins_size(b) && !fd->tlen_zero)) - goto detached; - if ((!fd->tlen_zero && MAX(bam_mate_pos(b)+1, 0) != p->apos) && !(fd->tlen_zero && bam_mate_pos(b) == 0)) goto detached; @@ -3162,10 +3082,6 @@ static int process_one_read(cram_fd *fd, cram_container *c, !(fd->tlen_zero && p->ref_id == -1)) goto detached; - if ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1)) > fd->tlen_approx) || - (!p->tlen && !fd->tlen_zero)) - goto detached; - if (p->mate_pos != cr->apos && !(fd->tlen_zero && p->mate_pos == 0)) goto detached; @@ -3191,6 +3107,29 @@ static int process_one_read(cram_fd *fd, cram_container *c, !((p->cram_flags & CRAM_FLAG_DISCARD_NAME)))) goto detached; + // Now check TLEN. We do this last as sometimes it's the + // only thing that differs. In CRAM4 we have a better way + // of handling this that doesn't break detached status + int explicit_tlen = 0; + int tflag1 = ((bam_ins_size(b) && + llabs(bam_ins_size(b) - sign*(aright-aleft+1)) + > fd->tlen_approx) + || (!bam_ins_size(b) && !fd->tlen_zero)); + + int tflag2 = ((p->tlen && llabs(p->tlen - -sign*(aright-aleft+1)) + > fd->tlen_approx) + || (!p->tlen && !fd->tlen_zero)); + + if (tflag1 || tflag2) { + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + explicit_tlen = CRAM_FLAG_EXPLICIT_TLEN; + } else { + // Stil do detached for unmapped data in CRAM4 as this + // also impacts RNEXT calculation. + goto detached; + } + } + /* * The fields below are unused when encoding this read as it is * no longer detached. In theory they may get referred to when @@ -3201,7 +3140,9 @@ static int process_one_read(cram_fd *fd, cram_container *c, * not emitted. */ cr->mate_pos = p->apos; - cr->tlen = sign*(aright-aleft+1); + cram_stats_add(c->stats[DS_NP], cr->mate_pos); + cr->tlen = explicit_tlen ? 
bam_ins_size(b) : sign*(aright-aleft+1); + cram_stats_add(c->stats[DS_TS], cr->tlen); cr->mate_flags = ((p->flags & BAM_FMUNMAP) == BAM_FMUNMAP) * CRAM_M_UNMAP + ((p->flags & BAM_FMREVERSE) == BAM_FMREVERSE) * CRAM_M_REVERSE; @@ -3210,7 +3151,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (p->cram_flags & CRAM_FLAG_STATS_ADDED) { cram_stats_del(c->stats[DS_NP], p->mate_pos); cram_stats_del(c->stats[DS_MF], p->mate_flags); - cram_stats_del(c->stats[DS_TS], p->tlen); + if (!(p->cram_flags & CRAM_FLAG_EXPLICIT_TLEN)) + cram_stats_del(c->stats[DS_TS], p->tlen); cram_stats_del(c->stats[DS_NS], p->mate_ref_id); } @@ -3226,6 +3168,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, // Clear detached from cr flags cr->cram_flags &= ~CRAM_FLAG_DETACHED; + cr->cram_flags |= explicit_tlen; if (cram_stats_add(c->stats[DS_CF], cr->cram_flags & CRAM_FLAG_MASK) < 0) goto block_err; @@ -3236,7 +3179,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, } p->cram_flags &= ~CRAM_FLAG_DETACHED; - p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM; + p->cram_flags |= CRAM_FLAG_MATE_DOWNSTREAM | explicit_tlen;; if (cram_stats_add(c->stats[DS_CF], p->cram_flags & CRAM_FLAG_MASK) < 0) goto block_err; diff --git a/cram/cram_external.c b/cram/cram_external.c index d0fd48cc8..88175103d 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -333,15 +333,17 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out, char *op = cp; char *endp = cp + cram_block_get_uncomp_size(o_blk); //fprintf(stderr, "sz = %d\n", (int)(endp-cp)); - int32_t i32; + int32_t i32, err = 0; - cp += safe_itf8_get(cp, endp, &i32); + i32 = in->vv.varint_get32(&cp, endp, &err); cp += i32; - cp += safe_itf8_get(cp, endp, &i32); + i32 = in->vv.varint_get32(&cp, endp, &err); cp += i32; op = cp; - cp += safe_itf8_get(cp, endp, &i32); + i32 = in->vv.varint_get32(&cp, endp, &err); i32 += (cp-op); + if (err) + return -2; //fprintf(stderr, "remaining %d bytes\n", i32); 
cram_block_set_size(n_blk, cram_block_get_size(n_blk)-2); diff --git a/cram/cram_io.c b/cram/cram_io.c index fa8b3e346..3583021f7 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -80,12 +80,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include #include +#include // CRAM v4.0 variable-size integers #else #include "../htscodecs/htscodecs/rANS_static.h" #include "../htscodecs/htscodecs/rANS_static4x16.h" #include "../htscodecs/htscodecs/arith_dynamic.h" #include "../htscodecs/htscodecs/tokenise_name3.h" #include "../htscodecs/htscodecs/fqzcomp_qual.h" +#include "../htscodecs/htscodecs/varint.h" #endif //#define REF_DEBUG @@ -261,6 +263,112 @@ int itf8_decode_crc(cram_fd *fd, int32_t *val_p, uint32_t *crc) { return 5; } +/* + * Stores a value to memory in ITF-8 format. + * + * Returns the number of bytes required to store the number. + * This is a maximum of 5 bytes. + */ +static inline int itf8_put(char *cp, int32_t val) { + unsigned char *up = (unsigned char *)cp; + if (!(val & ~0x00000007f)) { // 1 byte + *up = val; + return 1; + } else if (!(val & ~0x00003fff)) { // 2 byte + *up++ = (val >> 8 ) | 0x80; + *up = val & 0xff; + return 2; + } else if (!(val & ~0x01fffff)) { // 3 byte + *up++ = (val >> 16) | 0xc0; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 3; + } else if (!(val & ~0x0fffffff)) { // 4 byte + *up++ = (val >> 24) | 0xe0; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 4; + } else { // 5 byte + *up++ = 0xf0 | ((val>>28) & 0xff); + *up++ = (val >> 20) & 0xff; + *up++ = (val >> 12) & 0xff; + *up++ = (val >> 4 ) & 0xff; + *up = val & 0x0f; + return 5; + } +} + + +/* 64-bit itf8 variant */ +static inline int ltf8_put(char *cp, int64_t val) { + unsigned char *up = (unsigned char *)cp; + if (!(val & ~((1LL<<7)-1))) { + *up = val; + return 1; + } else if (!(val & ~((1LL<<(6+8))-1))) { + *up++ = (val >> 8 ) | 0x80; + *up = val & 0xff; + return 2; + } else if (!(val & 
~((1LL<<(5+2*8))-1))) { + *up++ = (val >> 16) | 0xc0; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 3; + } else if (!(val & ~((1LL<<(4+3*8))-1))) { + *up++ = (val >> 24) | 0xe0; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 4; + } else if (!(val & ~((1LL<<(3+4*8))-1))) { + *up++ = (val >> 32) | 0xf0; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 5; + } else if (!(val & ~((1LL<<(2+5*8))-1))) { + *up++ = (val >> 40) | 0xf8; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 6; + } else if (!(val & ~((1LL<<(1+6*8))-1))) { + *up++ = (val >> 48) | 0xfc; + *up++ = (val >> 40) & 0xff; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 7; + } else if (!(val & ~((1LL<<(7*8))-1))) { + *up++ = (val >> 56) | 0xfe; + *up++ = (val >> 48) & 0xff; + *up++ = (val >> 40) & 0xff; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 8; + } else { + *up++ = 0xff; + *up++ = (val >> 56) & 0xff; + *up++ = (val >> 48) & 0xff; + *up++ = (val >> 40) & 0xff; + *up++ = (val >> 32) & 0xff; + *up++ = (val >> 24) & 0xff; + *up++ = (val >> 16) & 0xff; + *up++ = (val >> 8 ) & 0xff; + *up = val & 0xff; + return 9; + } +} + /* * Encodes and writes a single integer in ITF-8 format. 
* Returns 0 on success @@ -528,6 +636,315 @@ int ltf8_put_blk(cram_block *blk, int64_t val) { return -1; } +static int64_t safe_itf8_get(char **cp, const char *endp, int *err) { + const unsigned char *up = (unsigned char *)*cp; + + if (endp && endp - *cp < 5 && + (*cp >= endp || endp - *cp < itf8_bytes[up[0]>>4])) { + if (err) *err = 1; + return 0; + } + + if (up[0] < 0x80) { + (*cp)++; + return up[0]; + } else if (up[0] < 0xc0) { + (*cp)+=2; + return ((up[0] <<8) | up[1]) & 0x3fff; + } else if (up[0] < 0xe0) { + (*cp)+=3; + return ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff; + } else if (up[0] < 0xf0) { + (*cp)+=4; + uint32_t uv = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff; + return (int32_t)uv; + } else { + (*cp)+=5; + uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f); + return (int32_t)uv; + } +} + +static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) { + unsigned char *up = (unsigned char *)*cp; + + if (endp && endp - *cp < 9 && + (*cp >= endp || endp - *cp < ltf8_bytes[up[0]])) { + if (err) *err = 1; + return 0; + } + + if (up[0] < 0x80) { + (*cp)++; + return up[0]; + } else if (up[0] < 0xc0) { + (*cp)+=2; + return (((uint64_t)up[0]<< 8) | + (uint64_t)up[1]) & (((1LL<<(6+8)))-1); + } else if (up[0] < 0xe0) { + (*cp)+=3; + return (((uint64_t)up[0]<<16) | + ((uint64_t)up[1]<< 8) | + (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); + } else if (up[0] < 0xf0) { + (*cp)+=4; + return (((uint64_t)up[0]<<24) | + ((uint64_t)up[1]<<16) | + ((uint64_t)up[2]<< 8) | + (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); + } else if (up[0] < 0xf8) { + (*cp)+=5; + return (((uint64_t)up[0]<<32) | + ((uint64_t)up[1]<<24) | + ((uint64_t)up[2]<<16) | + ((uint64_t)up[3]<< 8) | + (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); + } else if (up[0] < 0xfc) { + (*cp)+=6; + return (((uint64_t)up[0]<<40) | + ((uint64_t)up[1]<<32) | + ((uint64_t)up[2]<<24) | + ((uint64_t)up[3]<<16) | + ((uint64_t)up[4]<< 8) | + 
(uint64_t)up[5]) & ((1LL<<(2+5*8))-1); + } else if (up[0] < 0xfe) { + (*cp)+=7; + return (((uint64_t)up[0]<<48) | + ((uint64_t)up[1]<<40) | + ((uint64_t)up[2]<<32) | + ((uint64_t)up[3]<<24) | + ((uint64_t)up[4]<<16) | + ((uint64_t)up[5]<< 8) | + (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); + } else if (up[0] < 0xff) { + (*cp)+=8; + return (((uint64_t)up[1]<<48) | + ((uint64_t)up[2]<<40) | + ((uint64_t)up[3]<<32) | + ((uint64_t)up[4]<<24) | + ((uint64_t)up[5]<<16) | + ((uint64_t)up[6]<< 8) | + (uint64_t)up[7]) & ((1LL<<(7*8))-1); + } else { + (*cp)+=9; + return (((uint64_t)up[1]<<56) | + ((uint64_t)up[2]<<48) | + ((uint64_t)up[3]<<40) | + ((uint64_t)up[4]<<32) | + ((uint64_t)up[5]<<24) | + ((uint64_t)up[6]<<16) | + ((uint64_t)up[7]<< 8) | + (uint64_t)up[8]); + } +} + +// Wrapper for now +int safe_itf8_put(char *cp, const char *cp_end, int32_t val) { + return itf8_put(cp, val); +} + +int safe_ltf8_put(char *cp, const char *cp_end, int64_t val) { + return ltf8_put(cp, val); +} + +int itf8_size(int64_t v) { + return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5); +} + +//----------------------------------------------------------------------------- + +// CRAM v4.0 onwards uses a different variable sized integer encoding +// that is size agnostic. + +// Local interface to varint.h inline version, so we can use in func ptr. +// Note a lot of these use the unsigned interface but take signed int64_t. +// This is because the old CRAM ITF8 inteface had signed -1 as unsigned +// 0xffffffff. 
+static int uint7_size(int64_t v) { + return var_size_u64(v); +} + +static int64_t uint7_get_32(char **cp, const char *endp, int *err) { + uint32_t val; + int nb = var_get_u32((uint8_t *)(*cp), (const uint8_t *)endp, &val); + (*cp) += nb; + if (!nb && err) *err = 1; + return val; +} + +static int64_t sint7_get_32(char **cp, const char *endp, int *err) { + int32_t val; + int nb = var_get_s32((uint8_t *)(*cp), (const uint8_t *)endp, &val); + (*cp) += nb; + if (!nb && err) *err = 1; + return val; +} + +static int64_t uint7_get_64(char **cp, const char *endp, int *err) { + uint64_t val; + int nb = var_get_u64((uint8_t *)(*cp), (const uint8_t *)endp, &val); + (*cp) += nb; + if (!nb && err) *err = 1; + return val; +} + +static int64_t sint7_get_64(char **cp, const char *endp, int *err) { + int64_t val; + int nb = var_get_s64((uint8_t *)(*cp), (const uint8_t *)endp, &val); + (*cp) += nb; + if (!nb && err) *err = 1; + return val; +} + +static int uint7_put_32(char *cp, const char *endp, int32_t val) { + return var_put_u32((uint8_t *)cp, (const uint8_t *)endp, val); +} + +static int sint7_put_32(char *cp, const char *endp, int32_t val) { + return var_put_s32((uint8_t *)cp, (const uint8_t *)endp, val); +} + +static int uint7_put_64(char *cp, const char *endp, int64_t val) { + return var_put_u64((uint8_t *)cp, (const uint8_t *)endp, val); +} + +static int sint7_put_64(char *cp, const char *endp, int64_t val) { + return var_put_s64((uint8_t *)cp, (const uint8_t *)endp, val); +} + +// Put direct to to cram_block +static int uint7_put_blk_32(cram_block *blk, int32_t v) { + uint8_t buf[10]; + int sz = var_put_u32(buf, buf+10, v); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; +} + +static int sint7_put_blk_32(cram_block *blk, int32_t v) { + uint8_t buf[10]; + int sz = var_put_s32(buf, buf+10, v); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; +} + +static int uint7_put_blk_64(cram_block *blk, int64_t v) { + uint8_t buf[10]; + int sz 
= var_put_u64(buf, buf+10, v); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; +} + +static int sint7_put_blk_64(cram_block *blk, int64_t v) { + uint8_t buf[10]; + int sz = var_put_s64(buf, buf+10, v); + BLOCK_APPEND(blk, buf, sz); + return sz; + + block_err: + return -1; +} + +// Decode 32-bits with CRC update from cram_fd +static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) { + uint8_t b[5], i = 0; + int c; + uint32_t v = 0; + +#ifdef VARINT2 + b[0] = hgetc(fd->fp); + if (b[0] < 177) { + } else if (b[0] < 241) { + b[1] = hgetc(fd->fp); + } else if (b[0] < 249) { + b[1] = hgetc(fd->fp); + b[2] = hgetc(fd->fp); + } else { + int n = b[0]+2, z = 1; + while (n-- >= 249) + b[z++] = hgetc(fd->fp); + } + i = var_get_u32(b, NULL, &v); +#else +// // Little endian +// int s = 0; +// do { +// b[i++] = c = hgetc(fd->fp); +// if (c < 0) +// return -1; +// v |= (c & 0x7f) << s; +// s += 7; +// } while (i < 5 && (c & 0x80)); + + // Big endian, see also htscodecs/varint.h + do { + b[i++] = c = hgetc(fd->fp); + if (c < 0) + return -1; + v = (v<<7) | (c & 0x7f); + } while (i < 5 && (c & 0x80)); +#endif + *crc = crc32(*crc, b, i); + + *val_p = v; + return i; +} + +// Decode 64-bits with CRC update from cram_fd +static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) { + uint8_t b[10], i = 0; + int c; + uint64_t v = 0; + +#ifdef VARINT2 + b[0] = hgetc(fd->fp); + if (b[0] < 177) { + } else if (b[0] < 241) { + b[1] = hgetc(fd->fp); + } else if (b[0] < 249) { + b[1] = hgetc(fd->fp); + b[2] = hgetc(fd->fp); + } else { + int n = b[0]+2, z = 1; + while (n-- >= 249) + b[z++] = hgetc(fd->fp); + } + i = var_get_u64(b, NULL, &v); +#else +// // Little endian +// int s = 0; +// do { +// b[i++] = c = hgetc(fd->fp); +// if (c < 0) +// return -1; +// v |= (c & 0x7f) << s; +// s += 7; +// } while (i < 10 && (c & 0x80)); + + // Big endian, see also htscodecs/varint.h + do { + b[i++] = c = hgetc(fd->fp); + if (c < 0) + return -1; + v = 
(v<<7) | (c & 0x7f); + } while (i < 5 && (c & 0x80)); +#endif + *crc = crc32(*crc, b, i); + + *val_p = v; + return i; +} + +//----------------------------------------------------------------------------- + /* * Decodes a 32-bit little endian value from fd and stores in val. * @@ -926,6 +1343,9 @@ cram_block *cram_new_block(enum cram_content_type content_type, b->alloc = 0; b->byte = 0; b->bit = 7; // MSB + b->crc32 = 0; + b->idx = 0; + b->m = NULL; return b; } @@ -948,9 +1368,9 @@ cram_block *cram_read_block(cram_fd *fd) { c = b->method; crc = crc32(crc, &c, 1); if (-1 == (b->content_type= hgetc(fd->fp))) { free(b); return NULL; } c = b->content_type; crc = crc32(crc, &c, 1); - if (-1 == itf8_decode_crc(fd, &b->content_id, &crc)) { free(b); return NULL; } - if (-1 == itf8_decode_crc(fd, &b->comp_size, &crc)) { free(b); return NULL; } - if (-1 == itf8_decode_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; } + if (-1 == fd->vv.varint_decode32_crc(fd, &b->content_id, &crc)) { free(b); return NULL; } + if (-1 == fd->vv.varint_decode32_crc(fd, &b->comp_size, &crc)) { free(b); return NULL; } + if (-1 == fd->vv.varint_decode32_crc(fd, &b->uncomp_size, &crc)) { free(b); return NULL; } //fprintf(stderr, " method %d, ctype %d, cid %d, csize %d, ucsize %d\n", // b->method, b->content_type, b->content_id, b->comp_size, b->uncomp_size); @@ -1029,13 +1449,18 @@ uint32_t cram_block_size(cram_block *b) { * -1 on failure */ int cram_write_block(cram_fd *fd, cram_block *b) { + char vardata[100]; + int vardata_o = 0; + assert(b->method != RAW || (b->comp_size == b->uncomp_size)); if (hputc(b->method, fd->fp) == EOF) return -1; if (hputc(b->content_type, fd->fp) == EOF) return -1; - if (itf8_encode(fd, b->content_id) == -1) return -1; - if (itf8_encode(fd, b->comp_size) == -1) return -1; - if (itf8_encode(fd, b->uncomp_size) == -1) return -1; + vardata_o += fd->vv.varint_put32(vardata , vardata+100, b->content_id); + vardata_o += fd->vv.varint_put32(vardata+vardata_o, 
vardata+100, b->comp_size); + vardata_o += fd->vv.varint_put32(vardata+vardata_o, vardata+100, b->uncomp_size); + if (vardata_o != hwrite(fd->fp, vardata, vardata_o)) + return -1; if (b->data) { if (b->method == RAW) { @@ -1051,15 +1476,15 @@ int cram_write_block(cram_fd *fd, cram_block *b) { } if (CRAM_MAJOR_VERS(fd->version) >= 3) { - unsigned char dat[100], *cp = dat;; + char dat[100], *cp = (char *)dat; uint32_t crc; *cp++ = b->method; *cp++ = b->content_type; - cp += itf8_put((char*)cp, b->content_id); - cp += itf8_put((char*)cp, b->comp_size); - cp += itf8_put((char*)cp, b->uncomp_size); - crc = crc32(0L, dat, cp-dat); + cp += fd->vv.varint_put32(cp, dat+100, b->content_id); + cp += fd->vv.varint_put32(cp, dat+100, b->comp_size); + cp += fd->vv.varint_put32(cp, dat+100, b->uncomp_size); + crc = crc32(0L, (uc *)dat, cp-dat); if (b->method == RAW) { b->crc32 = crc32(crc, b->data ? b->data : (uc*)"", b->uncomp_size); @@ -1235,10 +1660,10 @@ int cram_uncompress_block(cram_block *b) { break; } - case NAME_TOK3: { + case TOK3: { uint32_t out_len; uint8_t *cp = decode_names(b->data, b->comp_size, &out_len); - b->orig_method = NAME_TOK3; + b->orig_method = TOK3; b->method = RAW; free(b->data); b->data = cp; @@ -1380,11 +1805,11 @@ static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, return (char *)cp; } - case NAME_TOK3: - case NAME_TOKA: { + case TOK3: + case TOKA: { int out_len; int lev = level; - if (method == NAME_TOK3 && lev > 3) + if (method == TOK3 && lev > 3) lev = 3; uint8_t *cp = encode_names(in, in_size, lev, strat, &out_len, NULL); *out_size = out_len; @@ -1416,6 +1841,9 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, cram_block *b, cram_metrics *metrics, int method, int level) { + if (!b) + return 0; + char *comp = NULL; size_t comp_size = 0; int strat; @@ -1508,13 +1936,13 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, method = (method|(1<metrics_lock); for (m = 0; m < CRAM_MAX_METHOD; m++) { - if (method & 
(1<version)+256; break; case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; - case NAME_TOK3:strat = 0; break; - case NAME_TOKA:strat = 1; break; + case TOK3: strat = 0; break; + case TOKA: strat = 1; break; default: strat = 0; } @@ -1547,13 +1975,10 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, sz[m] = b->uncomp_size*2+1000; // arbitrarily worse than raw } } - //fprintf(stderr, "sz_best = %d\n", sz_best); if (c_best) { free(b->data); b->data = (unsigned char *)c_best; - //printf("method_best = %s\n", cram_block_method2str(method_best)); - b->method = method_best; // adjusted to methmap[method_best] later b->comp_size = sz_best; } @@ -1625,7 +2050,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, } // else cost is ignored for (m = 0; m < CRAM_MAX_METHOD; m++) { - if ((!metrics->sz[m]) || (!(method & (1<sz[m]) || (!(method & (1u< metrics->sz[m]) @@ -1669,12 +2094,12 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, int mul = 1+(fd->level>=7); if (++metrics->cnt[m] >= MAXFAILS*mul && (metrics->extra[m] += r) >= MAXDELTA*mul) - method &= ~(1<sz[m] > best_sz) - method &= ~(1<pos_sorted = 1; c->max_apos = 0; c->multi_seq = 0; + c->qs_seq_orient = 1; c->bams = NULL; @@ -3191,7 +3617,7 @@ cram_container *cram_read_container(cram_fd *fd) { memset(&c2, 0, sizeof(c2)); if (CRAM_MAJOR_VERS(fd->version) == 1) { - if ((s = itf8_decode_crc(fd, &c2.length, &crc)) == -1) { + if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) { fd->eof = fd->empty_container ? 1 : 2; return NULL; } else { @@ -3212,54 +3638,53 @@ cram_container *cram_read_container(cram_fd *fd) { len = le_int4(c2.length); crc = crc32(0L, (unsigned char *)&len, 4); } - if ((s = itf8_decode_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; -/* - * LARGE_POS used in this code is purely a debugging mechanism for testing - * whether the htslib API can cope with 64-bit quantities. 
These are - * possible in SAM, but not *yet* in BAM or CRAM. - * - * DO NOT ENABLE LARGE_POS for anything other than debugging / testing. - * - * At some point it is expected these ifdefs will become a version check - * instead. - */ -#ifdef LARGE_POS - if ((s = ltf8_decode_crc(fd, &c2.ref_seq_start, &crc))== -1) return NULL; else rd+=s; - if ((s = ltf8_decode_crc(fd, &c2.ref_seq_span, &crc)) == -1) return NULL; else rd+=s; -#else - int32_t i32; - if ((s = itf8_decode_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s; - c2.ref_seq_start = i32; - if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s; - c2.ref_seq_span = i32; -#endif - if ((s = itf8_decode_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s; + if ((s = fd->vv.varint_decode32_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + int64_t i64; + if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s; + c2.ref_seq_start = i64; + if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc)) == -1) return NULL; else rd+=s; + c2.ref_seq_span = i64; + } else { + int32_t i32; + if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc))== -1) return NULL; else rd+=s; + c2.ref_seq_start = i32; + if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd+=s; + c2.ref_seq_span = i32; + } + if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_records, &crc)) == -1) return NULL; else rd+=s; if (CRAM_MAJOR_VERS(fd->version) == 1) { c2.record_counter = 0; c2.num_bases = 0; } else { if (CRAM_MAJOR_VERS(fd->version) >= 3) { - if ((s = ltf8_decode_crc(fd, &c2.record_counter, &crc)) == -1) + if ((s = fd->vv.varint_decode64_crc(fd, &c2.record_counter, &crc)) == -1) return NULL; else rd += s; } else { int32_t i32; - if ((s = itf8_decode_crc(fd, &i32, &crc)) == -1) + if ((s = fd->vv.varint_decode32_crc(fd, &i32, &crc)) == -1) return NULL; else rd += s; c2.record_counter = i32; } - if ((s = ltf8_decode_crc(fd, 
&c2.num_bases, &crc))== -1) + if ((s = fd->vv.varint_decode64_crc(fd, &c2.num_bases, &crc))== -1) return NULL; else rd += s; } - if ((s = itf8_decode_crc(fd, &c2.num_blocks, &crc)) == -1) return NULL; else rd+=s; - if ((s = itf8_decode_crc(fd, &c2.num_landmarks, &crc))== -1) return NULL; else rd+=s; + if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_blocks, &crc)) == -1) + return NULL; + else + rd+=s; + if ((s = fd->vv.varint_decode32_crc(fd, &c2.num_landmarks, &crc))== -1) + return NULL; + else + rd+=s; if (c2.num_landmarks < 0 || c2.num_landmarks >= SIZE_MAX / sizeof(int32_t)) return NULL; @@ -3275,7 +3700,7 @@ cram_container *cram_read_container(cram_fd *fd) { return NULL; } for (i = 0; i < c->num_landmarks; i++) { - if ((s = itf8_decode_crc(fd, &c->landmark[i], &crc)) == -1) { + if ((s = fd->vv.varint_decode32_crc(fd, &c->landmark[i], &crc)) == -1) { cram_free_container(c); return NULL; } else { @@ -3337,7 +3762,7 @@ int cram_container_size(cram_container *c) { */ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) { - unsigned char *cp = (unsigned char *)dat; + char *cp = (char *)dat; int i; // Check the input buffer is large enough according to our stated @@ -3346,41 +3771,39 @@ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) return -1; if (CRAM_MAJOR_VERS(fd->version) == 1) { - cp += itf8_put((char*)cp, c->length); + cp += itf8_put(cp, c->length); } else { *(int32_t *)cp = le_int4(c->length); cp += 4; } if (c->multi_seq) { - cp += itf8_put((char*)cp, -2); - cp += itf8_put((char*)cp, 0); - cp += itf8_put((char*)cp, 0); + cp += fd->vv.varint_put32(cp, NULL, -2); + cp += fd->vv.varint_put32(cp, NULL, 0); + cp += fd->vv.varint_put32(cp, NULL, 0); } else { - cp += itf8_put((char*)cp, c->ref_seq_id); -#ifdef LARGE_POS - cp += ltf8_put((char*)cp, c->ref_seq_start); - cp += ltf8_put((char*)cp, c->ref_seq_span); -#else - cp += itf8_put((char*)cp, c->ref_seq_start); - cp += itf8_put((char*)cp, c->ref_seq_span); 
-#endif + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_id); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start); + cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span); + } else { + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start); + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span); + } } - cp += itf8_put((char*)cp, c->num_records); + cp += fd->vv.varint_put32(cp, NULL, c->num_records); if (CRAM_MAJOR_VERS(fd->version) == 2) { - cp += itf8_put((char*)cp, c->record_counter); - cp += ltf8_put((char*)cp, c->num_bases); + cp += fd->vv.varint_put64(cp, NULL, c->record_counter); } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { - cp += ltf8_put((char*)cp, c->record_counter); - cp += ltf8_put((char*)cp, c->num_bases); + cp += fd->vv.varint_put32(cp, NULL, c->record_counter); } - - cp += itf8_put((char*)cp, c->num_blocks); - cp += itf8_put((char*)cp, c->num_landmarks); + cp += fd->vv.varint_put64(cp, NULL, c->num_bases); + cp += fd->vv.varint_put32(cp, NULL, c->num_blocks); + cp += fd->vv.varint_put32(cp, NULL, c->num_landmarks); for (i = 0; i < c->num_landmarks; i++) - cp += itf8_put((char*)cp, c->landmark[i]); + cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]); if (CRAM_MAJOR_VERS(fd->version) >= 3) { - c->crc32 = crc32(0L, (uc *)dat, (char*)cp-dat); + c->crc32 = crc32(0L, (uc *)dat, cp-dat); cp[0] = c->crc32 & 0xff; cp[1] = (c->crc32 >> 8) & 0xff; cp[2] = (c->crc32 >> 16) & 0xff; @@ -3388,7 +3811,7 @@ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) cp += 4; } - *size = (char *)cp-dat; // actual used size + *size = cp-dat; // actual used size return 0; } @@ -3401,50 +3824,49 @@ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) * -1 on failure */ int cram_write_container(cram_fd *fd, cram_container *c) { - char buf_a[1024], *buf = buf_a; - unsigned char *cp; + char buf_a[1024], *buf = buf_a, *cp; int i; - if (55 + c->num_landmarks * 5 >= 1024) - buf = 
malloc(55 + c->num_landmarks * 5); - cp = (unsigned char *)buf; + if (61 + c->num_landmarks * 10 >= 1024) { + buf = malloc(61 + c->num_landmarks * 10); + if (!buf) + return -1; + } + cp = buf; if (CRAM_MAJOR_VERS(fd->version) == 1) { - cp += itf8_put((char*)cp, c->length); + cp += itf8_put(cp, c->length); } else { *(int32_t *)cp = le_int4(c->length); cp += 4; } if (c->multi_seq) { - cp += itf8_put((char*)cp, -2); - cp += itf8_put((char*)cp, 0); - cp += itf8_put((char*)cp, 0); + cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2); + cp += fd->vv.varint_put32(cp, NULL, 0); + cp += fd->vv.varint_put32(cp, NULL, 0); } else { - cp += itf8_put((char*)cp, c->ref_seq_id); -#ifdef LARGE_POS - cp += ltf8_put((char*)cp, c->ref_seq_start); - cp += ltf8_put((char*)cp, c->ref_seq_span); -#else - cp += itf8_put((char*)cp, c->ref_seq_start); - cp += itf8_put((char*)cp, c->ref_seq_span); -#endif - } - cp += itf8_put((char*)cp, c->num_records); - if (CRAM_MAJOR_VERS(fd->version) == 2) { - cp += itf8_put((char*)cp, c->record_counter); - cp += ltf8_put((char*)cp, c->num_bases); - } else if (CRAM_MAJOR_VERS(fd->version) >= 3) { - cp += ltf8_put((char*)cp, c->record_counter); - cp += ltf8_put((char*)cp, c->num_bases); + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_id); + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start); + cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span); + } else { + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_start); + cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_span); + } } - - cp += itf8_put((char*)cp, c->num_blocks); - cp += itf8_put((char*)cp, c->num_landmarks); + cp += fd->vv.varint_put32(cp, NULL, c->num_records); + if (CRAM_MAJOR_VERS(fd->version) >= 3) + cp += fd->vv.varint_put64(cp, NULL, c->record_counter); + else + cp += fd->vv.varint_put32(cp, NULL, c->record_counter); + cp += fd->vv.varint_put64(cp, NULL, c->num_bases); + cp += fd->vv.varint_put32(cp, NULL, c->num_blocks); + cp += 
fd->vv.varint_put32(cp, NULL, c->num_landmarks); for (i = 0; i < c->num_landmarks; i++) - cp += itf8_put((char*)cp, c->landmark[i]); + cp += fd->vv.varint_put32(cp, NULL, c->landmark[i]); if (CRAM_MAJOR_VERS(fd->version) >= 3) { - c->crc32 = crc32(0L, (uc *)buf, (char*)cp-buf); + c->crc32 = crc32(0L, (uc *)buf, cp-buf); cp[0] = c->crc32 & 0xff; cp[1] = (c->crc32 >> 8) & 0xff; cp[2] = (c->crc32 >> 16) & 0xff; @@ -3452,7 +3874,7 @@ int cram_write_container(cram_fd *fd, cram_container *c) { cp += 4; } - if ((char*)cp-buf != hwrite(fd->fp, buf, (char*)cp-buf)) { + if (cp-buf != hwrite(fd->fp, buf, cp-buf)) { if (buf != buf_a) free(buf); return -1; @@ -4044,8 +4466,8 @@ cram_file_def *cram_read_file_def(cram_fd *fd) { return NULL; } - if (def->major_version > 3) { - hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x or 3.x, got %d.%d", + if (def->major_version > 4) { + hts_log_error("CRAM version number mismatch. Expected 1.x, 2.x, 3.x or 4.x, got %d.%d", def->major_version, def->minor_version); free(def); return NULL; @@ -4134,9 +4556,9 @@ sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) { } len = b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(b->content_id) + - itf8_size(b->uncomp_size) + - itf8_size(b->comp_size); + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(b->uncomp_size) + + fd->vv.varint_size(b->comp_size); /* Extract header from 1st block */ if (-1 == int32_get_blk(b, &header_len) || @@ -4163,9 +4585,9 @@ sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) { return NULL; } len += b->comp_size + 2 + 4*(CRAM_MAJOR_VERS(fd->version) >= 3) + - itf8_size(b->content_id) + - itf8_size(b->uncomp_size) + - itf8_size(b->comp_size); + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(b->uncomp_size) + + fd->vv.varint_size(b->comp_size); cram_free_block(b); } @@ -4359,9 +4781,9 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { if (blank_block) { c->length = b->comp_size + 2 + 4*is_cram_3 + - itf8_size(b->content_id) + - 
itf8_size(b->uncomp_size) + - itf8_size(b->comp_size); + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(b->uncomp_size) + + fd->vv.varint_size(b->comp_size); c->num_blocks = 2; c->num_landmarks = 2; @@ -4376,8 +4798,8 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { // Plus extra storage for uncompressed secondary blank block padded_length = MIN(c->length*.5, 10000); c->length += padded_length + 2 + 4*is_cram_3 + - itf8_size(b->content_id) + - itf8_size(padded_length)*2; + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(padded_length)*2; } else { // Pad the block instead. c->num_blocks = 1; @@ -4390,9 +4812,9 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { c->length = b->comp_size + padded_length + 2 + 4*is_cram_3 + - itf8_size(b->content_id) + - itf8_size(b->uncomp_size) + - itf8_size(b->comp_size); + fd->vv.varint_size(b->content_id) + + fd->vv.varint_size(b->uncomp_size) + + fd->vv.varint_size(b->comp_size); if (NULL == (pads = calloc(1, padded_length))) { cram_free_block(b); @@ -4453,6 +4875,51 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { * The top-level cram opening, closing and option handling */ +/* + * Sets CRAM variable sized integer decode function tables. + * CRAM 1, 2, and 3.x all used ITF8 for uint32 and UTF8 for uint64. + * CRAM 4.x uses the same encoding mechanism for 32-bit and 64-bit + * (or anything inbetween), but also now supports signed values. + * + * Version is the CRAM major version number. 
+ * vv is the vector table (probably &cram_fd->vv) + */ +static void cram_init_varint(varint_vec *vv, int version) { + if (version >= 4) { + vv->varint_get32 = uint7_get_32; // FIXME: varint.h API should be size agnostic + vv->varint_get32s = sint7_get_32; + vv->varint_get64 = uint7_get_64; + vv->varint_get64s = sint7_get_64; + vv->varint_put32 = uint7_put_32; + vv->varint_put32s = sint7_put_32; + vv->varint_put64 = uint7_put_64; + vv->varint_put64s = sint7_put_64; + vv->varint_put32_blk = uint7_put_blk_32; + vv->varint_put32s_blk = sint7_put_blk_32; + vv->varint_put64_blk = uint7_put_blk_64; + vv->varint_put64s_blk = sint7_put_blk_64; + vv->varint_size = uint7_size; + vv->varint_decode32_crc = uint7_decode_crc32; + vv->varint_decode64_crc = uint7_decode_crc64; + } else { + vv->varint_get32 = safe_itf8_get; + vv->varint_get32s = safe_itf8_get; + vv->varint_get64 = safe_ltf8_get; + vv->varint_get64s = safe_ltf8_get; + vv->varint_put32 = safe_itf8_put; + vv->varint_put32s = safe_itf8_put; + vv->varint_put64 = safe_ltf8_put; + vv->varint_put64s = safe_ltf8_put; + vv->varint_put32_blk = itf8_put_blk; + vv->varint_put32s_blk = itf8_put_blk; + vv->varint_put64_blk = ltf8_put_blk; + vv->varint_put64s_blk = ltf8_put_blk; + vv->varint_size = itf8_size; + vv->varint_decode32_crc = itf8_decode_crc; + vv->varint_decode64_crc = ltf8_decode_crc; + } +} + /* * Initialises the lookup tables. 
These could be global statics, but they're * clumsy to setup in a multi-threaded environment unless we generate @@ -4535,6 +5002,8 @@ static void cram_init_tables(cram_fd *fd) { fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+2]&0x1f]=2; fd->cram_sub_matrix["ACGTN"[i>>2]&0x1f][CRAM_SUBST_MATRIX[i+3]&0x1f]=3; } + + cram_init_varint(&fd->vv, CRAM_MAJOR_VERS(fd->version)); } // Default version numbers for CRAM @@ -4602,6 +5071,8 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { fd->version = fd->file_def->major_version * 256 + fd->file_def->minor_version; + cram_init_tables(fd); + if (!(fd->header = cram_read_SAM_hdr(fd))) { cram_free_file_def(fd->file_def); goto err; @@ -4625,12 +5096,11 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { strncpy(def->file_id, filename, 20); fd->version = major_version * 256 + minor_version; + cram_init_tables(fd); /* SAM header written later along with this file_def */ } - cram_init_tables(fd); - fd->prefix = strdup((cp = strrchr(filename, '/')) ? cp+1 : filename); if (!fd->prefix) goto err; @@ -4754,6 +5224,88 @@ int cram_flush(cram_fd *fd) { return 0; } +/* + * Writes an EOF block to a CRAM file. 
+ * + * Returns 0 on success + * -1 on failure + */ +int cram_write_eof_block(cram_fd *fd) { + // EOF block is a container with special values to aid detection + if (CRAM_MAJOR_VERS(fd->version) >= 2) { + // Empty container with + // ref_seq_id -1 + // start pos 0x454f46 ("EOF") + // span 0 + // nrec 0 + // counter 0 + // nbases 0 + // 1 block (landmark 0) + // (CRC32) + cram_container c; + memset(&c, 0, sizeof(c)); + c.ref_seq_id = -1; + c.ref_seq_start = 0x454f46; // "EOF" + c.ref_seq_span = 0; + c.record_counter = 0; + c.num_bases = 0; + c.num_blocks = 1; + int32_t land[1] = {0}; + c.landmark = land; + + // An empty compression header block with + // method raw (0) + // type comp header (1) + // content id 0 + // block contents size 6 + // raw size 6 + // empty preservation map (01 00) + // empty data series map (01 00) + // empty tag map (01 00) + // block CRC + cram_block_compression_hdr ch; + memset(&ch, 0, sizeof(ch)); + c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch); + + c.length = c.comp_hdr_block->byte // Landmark[0] + + 5 // block struct + + 4*(CRAM_MAJOR_VERS(fd->version) >= 3); // CRC + if (cram_write_container(fd, &c) < 0 || + cram_write_block(fd, c.comp_hdr_block) < 0) { + cram_close(fd); + cram_free_block(c.comp_hdr_block); + return -1; + } + cram_free_block(c.comp_hdr_block); + + // V2.1 bytes + // 0b 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id + // e0 45 4f 46 00 00 00 // Cont HDR: pos, span, nrec, counter + // 00 01 00 // Cont HDR: nbase, nblk, landmark + // 00 01 00 06 06 // Comp.HDR blk + // 01 00 01 00 01 00 // Comp.HDR blk + + // V3.0 bytes: + // 0f 00 00 00 ff ff ff ff 0f // Cont HDR: size, ref seq id + // e0 45 4f 46 00 00 00 // Cont HDR: pos, span, nrec, counter + // 00 01 00 // Cont HDR: nbase, nblk, landmark + // 05 bd d9 4f // CRC32 + // 00 01 00 06 06 // Comp.HDR blk + // 01 00 01 00 01 00 // Comp.HDR blk + // ee 63 01 4b // CRC32 + + // V4.0 bytes: + // 0f 00 00 00 8f ff ff ff // Cont HDR: size, ref seq id + 
// 82 95 9e 46 00 00 00 // Cont HDR: pos, span, nrec, counter + // 00 01 00 // Cont HDR: nbase, nblk, landmark + // ac d6 05 bc // CRC32 + // 00 01 00 06 06 // Comp.HDR blk + // 01 00 01 00 01 00 // Comp.HDR blk + // ee 63 01 4b // CRC32 + } + + return 0; +} /* * Closes a CRAM file. * Returns 0 on success @@ -4797,25 +5349,8 @@ int cram_close(cram_fd *fd) { if (fd->mode == 'w') { /* Write EOF block */ - if (CRAM_MAJOR_VERS(fd->version) == 3) { - if (38 != hwrite(fd->fp, - "\x0f\x00\x00\x00\xff\xff\xff\xff" // Cont HDR - "\x0f\xe0\x45\x4f\x46\x00\x00\x00" // Cont HDR - "\x00\x01\x00" // Cont HDR - "\x05\xbd\xd9\x4f" // CRC32 - "\x00\x01\x00\x06\x06" // Comp.HDR blk - "\x01\x00\x01\x00\x01\x00" // Comp.HDR blk - "\xee\x63\x01\x4b", // CRC32 - 38)) - return -1; - } else { - if (30 != hwrite(fd->fp, - "\x0b\x00\x00\x00\xff\xff\xff\xff" - "\x0f\xe0\x45\x4f\x46\x00\x00\x00" - "\x00\x01\x00\x00\x01\x00\x06\x06" - "\x01\x00\x01\x00\x01\x00", 30)) - return -1; - } + if (0 != cram_write_eof_block(fd)) + return -1; } for (bl = fd->bl; bl; bl = next) { @@ -5048,8 +5583,9 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { } if (!((major == 1 && minor == 0) || (major == 2 && (minor == 0 || minor == 1)) || - (major == 3 && (minor == 0 || minor == 1)))) { - hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0 or 3.1"); + (major == 3 && (minor == 0 || minor == 1)) || + (major == 4 && minor == 0))) { + hts_log_error("Unknown version string; use 1.0, 2.0, 2.1, 3.0, 3.1 or 4.0"); errno = EINVAL; return -1; } @@ -5066,8 +5602,11 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { fd->use_rans = (CRAM_MAJOR_VERS(fd->version) >= 3) ? 1 : 0; - fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) >= 3 && - CRAM_MINOR_VERS(fd->version) >= 1)) ? 1 : 0; + fd->use_tok = ((CRAM_MAJOR_VERS(fd->version) == 3 && + CRAM_MINOR_VERS(fd->version) >= 1) || + CRAM_MAJOR_VERS(fd->version) >= 4) ? 
1 : 0; + cram_init_tables(fd); + break; } diff --git a/cram/cram_io.h b/cram/cram_io.h index 3954a49af..7d787e44c 100644 --- a/cram/cram_io.h +++ b/cram/cram_io.h @@ -68,308 +68,9 @@ extern "C" { */ int itf8_decode(cram_fd *fd, int32_t *val); -static inline int itf8_get(char *cp, int32_t *val_p) { - unsigned char *up = (unsigned char *)cp; - - if (up[0] < 0x80) { - *val_p = up[0]; - return 1; - } else if (up[0] < 0xc0) { - *val_p = ((up[0] <<8) | up[1]) & 0x3fff; - return 2; - } else if (up[0] < 0xe0) { - *val_p = ((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff; - return 3; - } else if (up[0] < 0xf0) { - *val_p = ((up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff; - return 4; - } else { - *val_p = ((up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f); - return 5; - } -} - -/* - * Stores a value to memory in ITF-8 format. - * - * Returns the number of bytes required to store the number. - * This is a maximum of 5 bytes. - */ -static inline int itf8_put(char *cp, int32_t val) { - unsigned char *up = (unsigned char *)cp; - if (!(val & ~0x00000007f)) { // 1 byte - *up = val; - return 1; - } else if (!(val & ~0x00003fff)) { // 2 byte - *up++ = (val >> 8 ) | 0x80; - *up = val & 0xff; - return 2; - } else if (!(val & ~0x01fffff)) { // 3 byte - *up++ = (val >> 16) | 0xc0; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 3; - } else if (!(val & ~0x0fffffff)) { // 4 byte - *up++ = (val >> 24) | 0xe0; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 4; - } else { // 5 byte - *up++ = 0xf0 | ((val>>28) & 0xff); - *up++ = (val >> 20) & 0xff; - *up++ = (val >> 12) & 0xff; - *up++ = (val >> 4 ) & 0xff; - *up = val & 0x0f; - return 5; - } -} - - -/* 64-bit itf8 variant */ -static inline int ltf8_put(char *cp, int64_t val) { - unsigned char *up = (unsigned char *)cp; - if (!(val & ~((1LL<<7)-1))) { - *up = val; - return 1; - } else if (!(val & ~((1LL<<(6+8))-1))) { - *up++ = (val >> 8 ) | 0x80; - 
*up = val & 0xff; - return 2; - } else if (!(val & ~((1LL<<(5+2*8))-1))) { - *up++ = (val >> 16) | 0xc0; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 3; - } else if (!(val & ~((1LL<<(4+3*8))-1))) { - *up++ = (val >> 24) | 0xe0; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 4; - } else if (!(val & ~((1LL<<(3+4*8))-1))) { - *up++ = (val >> 32) | 0xf0; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 5; - } else if (!(val & ~((1LL<<(2+5*8))-1))) { - *up++ = (val >> 40) | 0xf8; - *up++ = (val >> 32) & 0xff; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 6; - } else if (!(val & ~((1LL<<(1+6*8))-1))) { - *up++ = (val >> 48) | 0xfc; - *up++ = (val >> 40) & 0xff; - *up++ = (val >> 32) & 0xff; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 7; - } else if (!(val & ~((1LL<<(7*8))-1))) { - *up++ = (val >> 56) | 0xfe; - *up++ = (val >> 48) & 0xff; - *up++ = (val >> 40) & 0xff; - *up++ = (val >> 32) & 0xff; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 8; - } else { - *up++ = 0xff; - *up++ = (val >> 56) & 0xff; - *up++ = (val >> 48) & 0xff; - *up++ = (val >> 40) & 0xff; - *up++ = (val >> 32) & 0xff; - *up++ = (val >> 24) & 0xff; - *up++ = (val >> 16) & 0xff; - *up++ = (val >> 8 ) & 0xff; - *up = val & 0xff; - return 9; - } -} - -static inline int ltf8_get(char *cp, int64_t *val_p) { - unsigned char *up = (unsigned char *)cp; - - if (up[0] < 0x80) { - *val_p = up[0]; - return 1; - } else if (up[0] < 0xc0) { - *val_p = (((uint64_t)up[0]<< 8) | - (uint64_t)up[1]) & (((1LL<<(6+8)))-1); - return 2; - } else if (up[0] < 0xe0) { - *val_p = (((uint64_t)up[0]<<16) | - ((uint64_t)up[1]<< 8) | - (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); - return 3; - } 
else if (up[0] < 0xf0) { - *val_p = (((uint64_t)up[0]<<24) | - ((uint64_t)up[1]<<16) | - ((uint64_t)up[2]<< 8) | - (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); - return 4; - } else if (up[0] < 0xf8) { - *val_p = (((uint64_t)up[0]<<32) | - ((uint64_t)up[1]<<24) | - ((uint64_t)up[2]<<16) | - ((uint64_t)up[3]<< 8) | - (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); - return 5; - } else if (up[0] < 0xfc) { - *val_p = (((uint64_t)up[0]<<40) | - ((uint64_t)up[1]<<32) | - ((uint64_t)up[2]<<24) | - ((uint64_t)up[3]<<16) | - ((uint64_t)up[4]<< 8) | - (uint64_t)up[5]) & ((1LL<<(2+5*8))-1); - return 6; - } else if (up[0] < 0xfe) { - *val_p = (((uint64_t)up[0]<<48) | - ((uint64_t)up[1]<<40) | - ((uint64_t)up[2]<<32) | - ((uint64_t)up[3]<<24) | - ((uint64_t)up[4]<<16) | - ((uint64_t)up[5]<< 8) | - (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); - return 7; - } else if (up[0] < 0xff) { - *val_p = (((uint64_t)up[1]<<48) | - ((uint64_t)up[2]<<40) | - ((uint64_t)up[3]<<32) | - ((uint64_t)up[4]<<24) | - ((uint64_t)up[5]<<16) | - ((uint64_t)up[6]<< 8) | - (uint64_t)up[7]) & ((1LL<<(7*8))-1); - return 8; - } else { - *val_p = (((uint64_t)up[1]<<56) | - ((uint64_t)up[2]<<48) | - ((uint64_t)up[3]<<40) | - ((uint64_t)up[4]<<32) | - ((uint64_t)up[5]<<24) | - ((uint64_t)up[6]<<16) | - ((uint64_t)up[7]<< 8) | - (uint64_t)up[8]); - return 9; - } -} - -#define itf8_size(v) ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5) - - -/* Version of itf8_get that checks it hasn't run out of input */ - extern const int itf8_bytes[16]; extern const int ltf8_bytes[256]; -static inline int safe_itf8_get(const char *cp, const char *endp, - int32_t *val_p) { - const unsigned char *up = (unsigned char *)cp; - - if (endp - cp < 5 && - (cp >= endp || endp - cp < itf8_bytes[up[0]>>4])) { - *val_p = 0; - return 0; - } - - if (up[0] < 0x80) { - *val_p = up[0]; - return 1; - } else if (up[0] < 0xc0) { - *val_p = ((up[0] <<8) | up[1]) & 0x3fff; - return 2; - } else if (up[0] < 0xe0) { - *val_p = 
((up[0]<<16) | (up[1]<< 8) | up[2]) & 0x1fffff; - return 3; - } else if (up[0] < 0xf0) { - *val_p = (((uint32_t)up[0]<<24) | (up[1]<<16) | (up[2]<<8) | up[3]) & 0x0fffffff; - return 4; - } else { - uint32_t uv = (((uint32_t)up[0] & 0x0f)<<28) | (up[1]<<20) | (up[2]<<12) | (up[3]<<4) | (up[4] & 0x0f); - *val_p = uv < 0x80000000UL ? (int32_t) uv : -((int32_t) (0xffffffffUL - uv)) - 1; - return 5; - } -} - -static inline int safe_ltf8_get(const char *cp, const char *endp, - int64_t *val_p) { - unsigned char *up = (unsigned char *)cp; - - if (endp - cp < 9 && - (cp >= endp || endp - cp < ltf8_bytes[up[0]])) return 0; - - if (up[0] < 0x80) { - *val_p = up[0]; - return 1; - } else if (up[0] < 0xc0) { - *val_p = (((uint64_t)up[0]<< 8) | - (uint64_t)up[1]) & (((1LL<<(6+8)))-1); - return 2; - } else if (up[0] < 0xe0) { - *val_p = (((uint64_t)up[0]<<16) | - ((uint64_t)up[1]<< 8) | - (uint64_t)up[2]) & ((1LL<<(5+2*8))-1); - return 3; - } else if (up[0] < 0xf0) { - *val_p = (((uint64_t)up[0]<<24) | - ((uint64_t)up[1]<<16) | - ((uint64_t)up[2]<< 8) | - (uint64_t)up[3]) & ((1LL<<(4+3*8))-1); - return 4; - } else if (up[0] < 0xf8) { - *val_p = (((uint64_t)up[0]<<32) | - ((uint64_t)up[1]<<24) | - ((uint64_t)up[2]<<16) | - ((uint64_t)up[3]<< 8) | - (uint64_t)up[4]) & ((1LL<<(3+4*8))-1); - return 5; - } else if (up[0] < 0xfc) { - *val_p = (((uint64_t)up[0]<<40) | - ((uint64_t)up[1]<<32) | - ((uint64_t)up[2]<<24) | - ((uint64_t)up[3]<<16) | - ((uint64_t)up[4]<< 8) | - (uint64_t)up[5]) & ((1LL<<(2+5*8))-1); - return 6; - } else if (up[0] < 0xfe) { - *val_p = (((uint64_t)up[0]<<48) | - ((uint64_t)up[1]<<40) | - ((uint64_t)up[2]<<32) | - ((uint64_t)up[3]<<24) | - ((uint64_t)up[4]<<16) | - ((uint64_t)up[5]<< 8) | - (uint64_t)up[6]) & ((1LL<<(1+6*8))-1); - return 7; - } else if (up[0] < 0xff) { - *val_p = (((uint64_t)up[1]<<48) | - ((uint64_t)up[2]<<40) | - ((uint64_t)up[3]<<32) | - ((uint64_t)up[4]<<24) | - ((uint64_t)up[5]<<16) | - ((uint64_t)up[6]<< 8) | - (uint64_t)up[7]) & 
((1LL<<(7*8))-1); - return 8; - } else { - *val_p = (((uint64_t)up[1]<<56) | - ((uint64_t)up[2]<<48) | - ((uint64_t)up[3]<<40) | - ((uint64_t)up[4]<<32) | - ((uint64_t)up[5]<<24) | - ((uint64_t)up[6]<<16) | - ((uint64_t)up[7]<< 8) | - (uint64_t)up[8]); - return 9; - } -} - /*! Pushes a value in ITF8 format onto the end of a block. * * This shouldn't be used for high-volume data as it is not the fastest @@ -544,8 +245,10 @@ static inline int block_append(cram_block *b, const void *s, size_t len) { if (block_grow(b, len) < 0) return -1; - memcpy(BLOCK_END(b), s, len); - BLOCK_SIZE(b) += len; + if (len) { + memcpy(BLOCK_END(b), s, len); + BLOCK_SIZE(b) += len; + } return 0; } diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 1c51b09f7..6a0dc3fba 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -111,7 +111,10 @@ enum cram_encoding { E_SUBEXP = 7, E_GOLOMB_RICE = 8, E_GAMMA = 9, - E_NUM_CODECS = 10, /* Number of codecs, not a real one. */ + E_XPACK = 11, // Transform to sub-codec + E_XRLE = 12, // Transform to sub-codec + E_XDELTA = 13, // Transform to sub-codec + E_NUM_CODECS, /* Total number of codecs, not a real one. */ }; enum cram_external_type { @@ -120,6 +123,8 @@ enum cram_external_type { E_BYTE = 3, E_BYTE_ARRAY = 4, E_BYTE_ARRAY_BLOCK = 5, + E_SINT = 6, // signed INT + E_SLONG = 7, // signed LONG }; /* External IDs used by this implementation (only assumed during writing) */ @@ -283,6 +288,7 @@ struct cram_block_compression_hdr { // indexed by ref-base and subst. code char substitution_matrix[5][4]; int no_ref; + int qs_seq_orient; // 1 => same as seq. 
0 => original orientation // TD Dictionary as a concatenated block cram_block *TD_blk; // Tag Dictionary @@ -299,6 +305,9 @@ struct cram_block_compression_hdr { char *uncomp; // A single block of uncompressed data size_t uncomp_size, uncomp_alloc; + + // Total codec count, used for index to block_by_id for transforms + int ncodecs; }; typedef struct cram_map { @@ -313,6 +322,7 @@ typedef struct cram_map { typedef struct cram_tag_map { struct cram_codec *codec; cram_block *blk; + cram_block *blk2; cram_metrics *m; } cram_tag_map; @@ -378,6 +388,7 @@ struct cram_container { int last_slice; // number of reads in last slice (0 for 1st) int multi_seq; // true if packing multi seqs per cont/slice int unsorted; // true is AP_delta is 0. + int qs_seq_orient; // 1 => same as seq. 0 => original orientation /* Copied from fd before encoding, to allow multi-threading */ int ref_start, first_base, last_base, ref_id, ref_end; @@ -418,6 +429,7 @@ typedef struct cram_record { int32_t mate_ref_id; int64_t mate_pos; // NP int64_t tlen; // TS + int64_t explicit_tlen;// TS, but PNEXT/RNEXT still need auto-computing // Auxiliary data int32_t ntags; // TC @@ -663,6 +675,35 @@ typedef struct spare_bams { struct spare_bams *next; } spare_bams; +struct cram_fd; +typedef struct varint_vec { + // Returns number of bytes decoded from fd, 0 on error + int (*varint_decode32_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc); + int (*varint_decode64_crc)(struct cram_fd *fd, int64_t *val_p, uint32_t *crc); + + // Returns the value and increments *cp. Sets err to 1 iff an error occurs. + // NOTE: Does not set err to 0 on success. + int64_t (*varint_get32) (char **cp, const char *endp, int *err); + int64_t (*varint_get32s)(char **cp, const char *endp, int *err); + int64_t (*varint_get64) (char **cp, const char *endp, int *err); + int64_t (*varint_get64s)(char **cp, const char *endp, int *err); + + // Returns the number of bytes written, <= 0 on error. 
+ int (*varint_put32) (char *cp, const char *endp, int32_t val_p); + int (*varint_put32s)(char *cp, const char *endp, int32_t val_p); + int (*varint_put64) (char *cp, const char *endp, int64_t val_p); + int (*varint_put64s)(char *cp, const char *endp, int64_t val_p); + + // Returns the number of bytes written, <= 0 on error. + int (*varint_put32_blk) (cram_block *blk, int32_t val_p); + int (*varint_put32s_blk)(cram_block *blk, int32_t val_p); + int (*varint_put64_blk) (cram_block *blk, int64_t val_p); + int (*varint_put64s_blk)(cram_block *blk, int64_t val_p); + + // Returns number of bytes needed to encode 'val' + int (*varint_size)(int64_t val); +} varint_vec; + struct cram_fd { struct hFILE *fp; int mode; // 'r' or 'w' @@ -757,6 +798,10 @@ struct cram_fd { int tlen_zero; // If true, permit tlen 0 (=> tlen calculated) BGZF *idxfp; // File pointer for on-the-fly index creation + + // variable integer decoding callbacks. + // This changed in CRAM4.0 to a data-size agnostic encoding. + varint_vec vv; }; // Translation of required fields to cram data series @@ -838,7 +883,8 @@ enum cram_fields { #define CRAM_FLAG_DETACHED (1<<1) #define CRAM_FLAG_MATE_DOWNSTREAM (1<<2) #define CRAM_FLAG_NO_SEQ (1<<3) -#define CRAM_FLAG_MASK ((1<<4)-1) +#define CRAM_FLAG_EXPLICIT_TLEN (1<<4) +#define CRAM_FLAG_MASK ((1<<5)-1) /* Internal only */ #define CRAM_FLAG_STATS_ADDED (1<<30) diff --git a/hts.c b/hts.c index 8e4552f87..69b736121 100644 --- a/hts.c +++ b/hts.c @@ -1169,7 +1169,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) fp->fp.cram = cram_dopen(hfile, fn, simple_mode); if (fp->fp.cram == NULL) goto error; if (!fp->is_write) - cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, 1); + cram_set_option(fp->fp.cram, CRAM_OPT_DECODE_MD, -1); // auto fp->is_cram = 1; break; diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index d947f985f..d40bbd095 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -3,6 +3,7 @@ HTSCODECS_SOURCES = 
$(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ $(HTSPREFIX)htscodecs/htscodecs/pack.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \ + $(HTSPREFIX)htscodecs/htscodecs/rle.c \ $(HTSPREFIX)htscodecs/htscodecs/tokenise_name3.c HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) diff --git a/htscodecs_external.mk b/htscodecs_external.mk index f8d4d7d4a..f1d82faf8 100644 --- a/htscodecs_external.mk +++ b/htscodecs_external.mk @@ -6,6 +6,7 @@ htscodecs_fqzcomp_qual_h = htscodecs_pack_h = htscodecs_rANS_static_h = htscodecs_rANS_static4x16_h = +htscodecs_rle_h = htscodecs_tokenise_name3_h = htscodecs_varint_h = diff --git a/htslib/cram.h b/htslib/cram.h index 890896388..5446945e4 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -48,16 +48,22 @@ extern "C" { #endif enum cram_block_method { + // Public methods as defined in the CRAM spec. BM_ERROR = -1, + + // CRAM 2.x and 3.0 RAW = 0, GZIP = 1, BZIP2 = 2, LZMA = 3, RANS = 4, RANS0 = RANS, + + // CRAM 3.1 onwards RANSPR = 5, RANS_PR0 = RANSPR, ARITH = 6, ARITH_PR0 = ARITH, FQZ = 7, - TOK3 = 8, NAME_TOK3 = TOK3, + TOK3 = 8, + // BSC = 9, ZSTD = 10 // Methods not externalised, but used in metrics. // Externally they become one of the above methods. @@ -78,8 +84,8 @@ enum cram_block_method { RANS_PR192, // O0 + RLE + pack RANS_PR193, // O1 + RLE + pack - //NAME_TOK3, // tok+rans - NAME_TOKA, // tok+arith + //TOK3, // tok+rans + TOKA, // tok+arith //ARITH_PR0, // Order 0 ARITH_PR1, // Order 1 diff --git a/test/test.pl b/test/test.pl index 823a414c5..8def186d9 100755 --- a/test/test.pl +++ b/test/test.pl @@ -610,6 +610,34 @@ sub test_view testv $opts, "./test_view $tv_args $cram > $cram.sam_"; testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + ## Experimental CRAM 4.0 support. 
+ # SAM -> CRAM40u -> SAM + foreach my $profile (qw/fast normal small archive/) { + $cram = "$base.tmp.cram"; + testv $opts, "./test_view $tv_args -t $ref -S -l7 -C -o VERSION=4.0 -o $profile $sam > $cram"; + testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + } + + # BAM -> CRAM40 -> BAM -> SAM + $cram = "$bam.cram"; + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=4.0 $bam > $cram"; + testv $opts, "./test_view $tv_args -b -D $cram > $cram.bam"; + testv $opts, "./test_view $tv_args $cram.bam > $cram.bam.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.bam.sam_"; + + # CRAM40 -> CRAM30 + $cram = "$base.tmp.cram"; + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 $cram > $cram.cram"; + + # CRAM30 -> CRAM40 + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=4.0 $cram.cram > $cram"; + + # CRAM40 -> CRAM40 + multi-slice + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=4.0 -o seqs_per_slice=7 -o slices_per_container=5 $cram.cram > $cram"; + testv $opts, "./test_view $tv_args $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + # Java pre-made CRAM -> SAM my $jcram = "${base}_java.cram"; if (-e $jcram) { From 192c8c0f78c657f68122ab3ed64edff64fda024d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 29 Sep 2020 15:28:34 +0100 Subject: [PATCH 048/488] Added a "pos_delta" option for forcing AP data series delta. Normally sorted data deltas the positions and unsorted does not. This allows us to explicitly reenable the AP=1 container header field to do deltaing, as it's sometimes useful on unsorted data. Eg pairs are normally neighbouring position, so it's still a bit better to compress than leaving them as-is. 
--- cram/cram_decode.c | 10 +++++++++- cram/cram_encode.c | 5 +++-- cram/cram_io.c | 5 +++++ cram/cram_structs.h | 6 ++++++ hts.c | 4 ++++ htslib/hts.h | 1 + 6 files changed, 28 insertions(+), 3 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 06e297331..8735aafa5 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -2565,8 +2565,16 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, cr->apos = i32; } if (r) goto block_err;; - if (c->comp_hdr->AP_delta) + if (c->comp_hdr->AP_delta) { + if (cr->apos < 0 && c->unsorted == 0) { + // cache locally in c->unsorted so we don't have an + // excessive number of locks + pthread_mutex_lock(&fd->ref_lock); + c->unsorted = fd->unsorted = 1; + pthread_mutex_unlock(&fd->ref_lock); + } cr->apos += s->last_apos; + } s->last_apos= cr->apos; } else { cr->apos = c->ref_seq_start; diff --git a/cram/cram_encode.c b/cram/cram_encode.c index cdd73b8a7..bb5aaea76 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1866,6 +1866,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->ref_seq_span = c->ref_seq_span; h->num_records = c->num_records; h->qs_seq_orient = c->qs_seq_orient; + // slight misnomer - sorted or treat as-if sorted (ap_delta force to 1) h->AP_delta = c->pos_sorted; memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20); @@ -2707,7 +2708,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, c->num_bases += cr->len; cr->apos = bam_pos(b)+1; if (c->pos_sorted) { - if (cr->apos < s->last_apos) { + if (cr->apos < s->last_apos && !fd->ap_delta) { c->pos_sorted = 0; } else { if (cram_stats_add(c->stats[DS_AP], cr->apos - s->last_apos) < 0) @@ -3320,7 +3321,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { // We detected we need multi-seq fd->multi_seq = 1; c->multi_seq = 1; - c->pos_sorted = 0; // required atm for multi_seq slices + c->pos_sorted = 0; if (!c->refs_used) { pthread_mutex_lock(&fd->ref_lock); diff --git a/cram/cram_io.c 
b/cram/cram_io.c index 3583021f7..1f118aae4 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -5121,6 +5121,7 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { fd->slices_per_container = SLICE_PER_CNT; fd->embed_ref = 0; fd->no_ref = 0; + fd->ap_delta = 0; fd->ignore_md5 = 0; fd->lossy_read_names = 0; fd->use_bz2 = 0; @@ -5494,6 +5495,10 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { fd->no_ref = va_arg(args, int); break; + case CRAM_OPT_POS_DELTA: + fd->ap_delta = va_arg(args, int); + break; + case CRAM_OPT_IGNORE_MD5: fd->ignore_md5 = va_arg(args, int); break; diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 6a0dc3fba..a54e2ec6f 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -802,6 +802,12 @@ struct cram_fd { // variable integer decoding callbacks. // This changed in CRAM4.0 to a data-size agnostic encoding. varint_vec vv; + + // Force AP delta even on non positional sorted data. + // This can be beneficial for pairs where pairs are nearby each other. + // We suffer with delta to unrelated things (previous pair), but gain + // in delta between them. (Ideal would be a per read setting.) 
+ int ap_delta; }; // Translation of required fields to cram data series diff --git a/hts.c b/hts.c index 69b736121..72041d0d5 100644 --- a/hts.c +++ b/hts.c @@ -745,6 +745,10 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { strcmp(o->arg, "NO_REF") == 0) o->opt = CRAM_OPT_NO_REF, o->val.i = atoi(val); + else if (strcmp(o->arg, "pos_delta") == 0 || + strcmp(o->arg, "POS_DELTA") == 0) + o->opt = CRAM_OPT_POS_DELTA, o->val.i = atoi(val); + else if (strcmp(o->arg, "ignore_md5") == 0 || strcmp(o->arg, "IGNORE_MD5") == 0) o->opt = CRAM_OPT_IGNORE_MD5, o->val.i = atoi(val); diff --git a/htslib/hts.h b/htslib/hts.h index 3c6a3dcb6..0f2fdd021 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -319,6 +319,7 @@ enum hts_fmt_option { CRAM_OPT_USE_TOK, CRAM_OPT_USE_FQZ, CRAM_OPT_USE_ARITH, + CRAM_OPT_POS_DELTA, // force delta for AP, even on non-pos sorted data // General purpose HTS_OPT_COMPRESSION_LEVEL = 100, From b8c3bafffea118eda042383c16cdb4c01fec9f01 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 30 Sep 2020 14:16:51 +0100 Subject: [PATCH 049/488] An alternative (but still commented out) strategy for multi-base vars. This does some basic analysis to spot grouped variants and if and only if long enough and scoring +ve will it switch to 'b'(?) feature. --- cram/cram_encode.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index bb5aaea76..c46460144 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -2804,6 +2804,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (!sp[l]) break; if (0 && CRAM_MAJOR_VERS(fd->version) >= 3) { +#if 0 // Disabled for the time being as it doesn't // seem to gain us much. int ol=l; @@ -2821,6 +2822,43 @@ static int process_one_read(cram_fd *fd, cram_container *c, qp[l], rp[l])) return -1; } +#else + // With urmap pushed to the limit and lots + // of unaligned data (should be soft-clipped) + // this saves ~2-7%. Worth it? 
+ int nl = l; + int max_end = nl, max_score = 0, score = 0; + while (nl < end) { + if (rp[nl] != sp[nl]) { + score += 3; + if (max_score < score) { + max_score = score; + max_end = nl; + } + } else { + score--; + if (score < -2 || + max_score - score > 7) + break; + } + nl++; + } + if (max_score > 20) { + cram_add_bases(fd, c, s, cr, spos+l, + max_end-l, &seq[spos+l]); + l = max_end-1; + } else { + while (l < nl) { + if (rp[l] != sp[l]) + cram_add_substitution(fd, c, s, + cr, spos+l, + sp[l], qp[l], + rp[l]); + l++; + } + l--; + } +#endif } else { if (cram_add_substitution(fd, c, s, cr, spos+l, sp[l], qp[l], rp[l])) From 1a69f852929b667a65b09f0226ca588c6080c118 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 23 Oct 2020 17:53:06 +0100 Subject: [PATCH 050/488] Small cram encoding speed improvements. - We no longer keep GZIP_RLE used when libdeflate is compiled in because it doesn't support strategy in the same manner as zlib. Instead we use GZIP level 1. - Adjusted the fixed small-block compensation code. It's now equiv to a 60 byte difference required before changing up to the next tier in complexity. This had no impact on my novaseq test normally, but with bzip2+lzma added in it saved 6% CPU (at a cost of 0.14% file growth). 
--- cram/cram_io.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 1f118aae4..3cea2ba4f 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1066,6 +1066,9 @@ static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size, if (level >= 8) level += level/8; // 8->10, 9->12 if (level > 12) level = 12; + if (strat == Z_RLE) // not supported by libdeflate + level = 1; + struct libdeflate_compressor *z = libdeflate_alloc_compressor(level); if (!z) { hts_log_error("Call to libdeflate_alloc_compressor failed"); @@ -1939,6 +1942,15 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, if (method & (1u<metrics_lock); for (m = 0; m < CRAM_MAX_METHOD; m++) { @@ -1986,7 +1998,11 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, // Accumulate stats for all methods tried pthread_mutex_lock(&fd->metrics_lock); for (m = 0; m < CRAM_MAX_METHOD; m++) - metrics->sz[m] += sz[m]+50; // don't be overly sure on small blocks + // don't be overly sure on small blocks. + // +2000 means eg bzip2 vs gzip (1.07 to 1.04) or gz vs rans1 + // needs to be at least 60 bytes smaller to overcome the + // fixed size addition. + metrics->sz[m] += sz[m]+2000; // When enough trials performed, find the best on average if (--metrics->trial == 0) { @@ -2058,7 +2074,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, } if (best_method != metrics->method) { - metrics->trial = (NTRIALS+1)/2; // be sure + //metrics->trial = (NTRIALS+1)/2; // be sure //metrics->next_trial /= 1.5; metrics->consistency = 0; } else { From 46ec44cf2184740aaac91aaaa358fda2ee792e67 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 29 Oct 2020 10:33:14 +0000 Subject: [PATCH 051/488] CRAM 4 updates. Same as io_lib's eda7cfe and 16c19ee. 
This replaces the length field in containers with a variable size quantity instead of 32-bit int, for consistency with other fields, and marks a few fields as using a signed encoding strategy (necessary for the next commit). [The CRAM 4 specification moved on slightly since the original date of this PR.] --- .cirrus.yml | 2 +- cram/cram_decode.c | 11 +++++--- cram/cram_encode.c | 10 ++++--- cram/cram_io.c | 67 +++++++++++++++++++++++++++++++++++++++++---- cram/cram_structs.h | 1 + htscodecs | 2 +- 6 files changed, 78 insertions(+), 15 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index d4256b11a..6b9bcd8cf 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -4,7 +4,7 @@ # Sadly though there is still a finite limit to macOS of one instance. # Can we cull our Mac test to just one instance? -timeout_in: 10m +timeout_in: 20m #-------------------------------------------------- # Template: build libdeflate dependency diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 8735aafa5..e4ee8986b 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -392,7 +392,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } else if (key[0] == 'C' && key[1] == 'F') { ds_id = DS_CF; type = E_INT; } else if (key[0] == 'R' && key[1] == 'I') { - ds_id = DS_RI; type = E_INT; + ds_id = DS_RI; type = E_SINT; } else if (key[0] == 'R' && key[1] == 'L') { ds_id = DS_RL; type = E_INT; } else if (key[0] == 'A' && key[1] == 'P') { @@ -404,7 +404,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } else if (key[0] == 'M' && key[1] == 'F') { ds_id = DS_MF; type = E_INT; } else if (key[0] == 'N' && key[1] == 'S') { - ds_id = DS_NS; type = E_INT; + ds_id = DS_NS; type = E_SINT; } else if (key[0] == 'N' && key[1] == 'P') { ds_id = DS_NP; type = is_v4 ? 
E_LONG : E_INT; @@ -968,7 +968,7 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { hdr->content_type = b->content_type; if (b->content_type == MAPPED_SLICE) { - hdr->ref_seq_id = fd->vv.varint_get32((char **)&cp, (char *)cp_end, &err); + hdr->ref_seq_id = fd->vv.varint_get32s((char **)&cp, (char *)cp_end, &err); if (CRAM_MAJOR_VERS(fd->version) >= 4) { hdr->ref_seq_start = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); hdr->ref_seq_span = fd->vv.varint_get64((char **)&cp, (char *)cp_end, &err); @@ -2298,7 +2298,10 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, return -1; ref_id = s->hdr->ref_seq_id; - embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0; + if (CRAM_MAJOR_VERS(fd->version) < 4) + embed_ref = s->hdr->ref_base_id >= 0 ? 1 : 0; + else + embed_ref = s->hdr->ref_base_id > 0 ? 1 : 0; if (ref_id >= 0) { if (embed_ref) { diff --git a/cram/cram_encode.c b/cram/cram_encode.c index c46460144..89c79fa8a 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -508,7 +508,7 @@ cram_block *cram_encode_slice_header(cram_fd *fd, cram_slice *s) { return NULL; } - cp += fd->vv.varint_put32(cp, NULL, s->hdr->ref_seq_id); + cp += fd->vv.varint_put32s(cp, NULL, s->hdr->ref_seq_id); if (CRAM_MAJOR_VERS(fd->version) >= 4) { cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_start); cp += fd->vv.varint_put64(cp, NULL, s->hdr->ref_seq_span); @@ -1084,7 +1084,9 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, */ /* Create cram slice header */ - s->hdr->ref_base_id = embed_ref ? DS_ref : -1; + s->hdr->ref_base_id = embed_ref && s->hdr->ref_seq_span > 0 + ? DS_ref + : (CRAM_MAJOR_VERS(fd->version) >= 4 ? 
0 : -1); s->hdr->record_counter = c->num_records + c->record_counter; c->num_records += s->hdr->num_records; @@ -1659,7 +1661,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== NS ===\n"); h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]), - c->stats[DS_NS], E_INT, NULL, + c->stats[DS_NS], E_SINT, NULL, fd->version, &fd->vv); if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err; @@ -1781,7 +1783,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== RI ===\n"); h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]), - c->stats[DS_RI], E_INT, NULL, + c->stats[DS_RI], E_SINT, NULL, fd->version, &fd->vv); if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err; diff --git a/cram/cram_io.c b/cram/cram_io.c index 3cea2ba4f..7cbd41c05 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -898,6 +898,52 @@ static int uint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) { return i; } +// Decode 32-bits with CRC update from cram_fd +static int sint7_decode_crc32(cram_fd *fd, int32_t *val_p, uint32_t *crc) { + uint8_t b[5], i = 0; + int c; + uint32_t v = 0; + +#ifdef VARINT2 + b[0] = hgetc(fd->fp); + if (b[0] < 177) { + } else if (b[0] < 241) { + b[1] = hgetc(fd->fp); + } else if (b[0] < 249) { + b[1] = hgetc(fd->fp); + b[2] = hgetc(fd->fp); + } else { + int n = b[0]+2, z = 1; + while (n-- >= 249) + b[z++] = hgetc(fd->fp); + } + i = var_get_u32(b, NULL, &v); +#else +// // Little endian +// int s = 0; +// do { +// b[i++] = c = hgetc(fd->fp); +// if (c < 0) +// return -1; +// v |= (c & 0x7f) << s; +// s += 7; +// } while (i < 5 && (c & 0x80)); + + // Big endian, see also htscodecs/varint.h + do { + b[i++] = c = hgetc(fd->fp); + if (c < 0) + return -1; + v = (v<<7) | (c & 0x7f); + } while (i < 5 && (c & 0x80)); +#endif + *crc = crc32(*crc, b, i); + + *val_p = (v>>1) ^ -(v&1); + return i; +} + + // Decode 64-bits with CRC update from cram_fd 
static int uint7_decode_crc64(cram_fd *fd, int64_t *val_p, uint32_t *crc) { uint8_t b[10], i = 0; @@ -3639,7 +3685,7 @@ cram_container *cram_read_container(cram_fd *fd) { } else { rd+=s; } - } else { + } else if (CRAM_MAJOR_VERS(fd->version) < 4) { uint32_t len; if ((s = int32_decode(fd, &c2.length)) == -1) { if (CRAM_MAJOR_VERS(fd->version) == 2 && @@ -3653,8 +3699,15 @@ cram_container *cram_read_container(cram_fd *fd) { } len = le_int4(c2.length); crc = crc32(0L, (unsigned char *)&len, 4); + } else { + if ((s = fd->vv.varint_decode32_crc(fd, &c2.length, &crc)) == -1) { + fd->eof = fd->empty_container ? 1 : 2; + return NULL; + } else { + rd+=s; + } } - if ((s = fd->vv.varint_decode32_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; + if ((s = fd->vv.varint_decode32s_crc(fd, &c2.ref_seq_id, &crc)) == -1) return NULL; else rd+=s; if (CRAM_MAJOR_VERS(fd->version) >= 4) { int64_t i64; if ((s = fd->vv.varint_decode64_crc(fd, &i64, &crc))== -1) return NULL; else rd+=s; @@ -3797,7 +3850,7 @@ int cram_store_container(cram_fd *fd, cram_container *c, char *dat, int *size) cp += fd->vv.varint_put32(cp, NULL, 0); cp += fd->vv.varint_put32(cp, NULL, 0); } else { - cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_id); + cp += fd->vv.varint_put32s(cp, NULL, c->ref_seq_id); if (CRAM_MAJOR_VERS(fd->version) >= 4) { cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start); cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span); @@ -3852,16 +3905,18 @@ int cram_write_container(cram_fd *fd, cram_container *c) { if (CRAM_MAJOR_VERS(fd->version) == 1) { cp += itf8_put(cp, c->length); - } else { + } else if (CRAM_MAJOR_VERS(fd->version) <= 3) { *(int32_t *)cp = le_int4(c->length); cp += 4; + } else { + cp += fd->vv.varint_put32(cp, NULL, c->length); } if (c->multi_seq) { cp += fd->vv.varint_put32(cp, NULL, (uint32_t)-2); cp += fd->vv.varint_put32(cp, NULL, 0); cp += fd->vv.varint_put32(cp, NULL, 0); } else { - cp += fd->vv.varint_put32(cp, NULL, c->ref_seq_id); + cp += 
fd->vv.varint_put32s(cp, NULL, c->ref_seq_id); if (CRAM_MAJOR_VERS(fd->version) >= 4) { cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_start); cp += fd->vv.varint_put64(cp, NULL, c->ref_seq_span); @@ -4916,6 +4971,7 @@ static void cram_init_varint(varint_vec *vv, int version) { vv->varint_put64s_blk = sint7_put_blk_64; vv->varint_size = uint7_size; vv->varint_decode32_crc = uint7_decode_crc32; + vv->varint_decode32s_crc = sint7_decode_crc32; vv->varint_decode64_crc = uint7_decode_crc64; } else { vv->varint_get32 = safe_itf8_get; @@ -4932,6 +4988,7 @@ static void cram_init_varint(varint_vec *vv, int version) { vv->varint_put64s_blk = ltf8_put_blk; vv->varint_size = itf8_size; vv->varint_decode32_crc = itf8_decode_crc; + vv->varint_decode32s_crc = itf8_decode_crc; vv->varint_decode64_crc = ltf8_decode_crc; } } diff --git a/cram/cram_structs.h b/cram/cram_structs.h index a54e2ec6f..5c39eca56 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -679,6 +679,7 @@ struct cram_fd; typedef struct varint_vec { // Returns number of bytes decoded from fd, 0 on error int (*varint_decode32_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc); + int (*varint_decode32s_crc)(struct cram_fd *fd, int32_t *val_p, uint32_t *crc); int (*varint_decode64_crc)(struct cram_fd *fd, int64_t *val_p, uint32_t *crc); // Returns the value and increments *cp. Sets err to 1 iff an error occurs. diff --git a/htscodecs b/htscodecs index 99ed6bcca..dca826bbc 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 99ed6bcca2192a3c210fac04939c7e51a0b15a3c +Subproject commit dca826bbc598d6dcae450e92e60dcf789454999d From 96c64aa1f56fdb724565d7a56798515d27cbb1f9 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 29 Oct 2020 15:26:40 +0000 Subject: [PATCH 052/488] Next update for CRAM 4.0; now matching io_lib. This is the htslib equivalent to io_lib's 090dfd3f and d86b008. It is now in adherence to the draft CRAM 4 spec.
It removes the E_SINT and E_SLONG types and replaces E_EXTERNAL with E_BYTE, E_VARINT_SIGNED and E_VARINT_UNSIGNED. Integer data types aren't explicitly signed or unsigned but simply use the codec that works for them. So if all ref-ids are >= 0 it'll be VARINT_UNSIGNED, otherwise VARINT_SIGNED. This future proofs things by adding introspection to the format. Also removed HUFFMAN from CRAM 4. It was only being used for storing constant values, so we now have CONST_BYTE and CONST_INT instead (which always uses a signed encoding). This should be the final removal of ever encoding negatives as if they were positives (eg -1 as 0xFFFFFFFF) and so in turn removes all size limitations on values. --- cram/cram_codecs.c | 700 ++++++++++++++++++++++++++++++++++++++------ cram/cram_codecs.h | 15 + cram/cram_decode.c | 6 +- cram/cram_encode.c | 77 +++-- cram/cram_stats.c | 16 +- cram/cram_structs.h | 34 ++- 6 files changed, 726 insertions(+), 122 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index a598f3964..783fc393d 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -324,6 +324,24 @@ static char *cram_extract_block(cram_block *b, int size) { /* * --------------------------------------------------------------------------- * EXTERNAL + * + * In CRAM 3.0 and earlier, E_EXTERNAL use the data type to determine the + * size of the object being returned. This type is hard coded in the + * spec document (changing from uint32 to uint64 requires a spec change) + * and there is no data format introspection so implementations have + * to determine which size to use based on version numbers. It also + * doesn't support signed data. + * + * With CRAM 4.0 onwards the size and sign of the data is no longer stated + * explicitly in the specification. Instead EXTERNAL is replaced by three + * new encodings, for bytes and signed / unsigned integers which used a + * variable sized encoding. 
+ * + * For simplicity we use the same encode and decode functions for + * bytes (CRAM4) and external (CRAM3). Given we already had code to + * replace codec + type into a function pointer it makes little + * difference how we ended up at that function. However we disallow + * this codec to operate on integer data for CRAM4 onwards. */ int cram_external_decode_int(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { @@ -345,26 +363,6 @@ int cram_external_decode_int(cram_slice *slice, cram_codec *c, return err ? -1 : 0; } -int cram_external_decode_sint(cram_slice *slice, cram_codec *c, - cram_block *in, char *out, int *out_size) { - char *cp; - cram_block *b; - - /* Find the external block */ - b = cram_get_block_by_id(slice, c->u.external.content_id); - if (!b) - return *out_size?-1:0; - - cp = (char *)b->data + b->idx; - // E_INT and E_LONG are guaranteed single item queries - int err = 0; - *(int32_t *)out = c->vv->varint_get32s(&cp, (char *)b->data + b->uncomp_size, &err); - b->idx = cp - (char *)b->data; - *out_size = 1; - - return err ? -1 : 0; -} - int cram_external_decode_long(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { char *cp; @@ -385,26 +383,6 @@ int cram_external_decode_long(cram_slice *slice, cram_codec *c, return err ? -1 : 0; } -int cram_external_decode_slong(cram_slice *slice, cram_codec *c, - cram_block *in, char *out, int *out_size) { - char *cp; - cram_block *b; - - /* Find the external block */ - b = cram_get_block_by_id(slice, c->u.external.content_id); - if (!b) - return *out_size?-1:0; - - cp = (char *)b->data + b->idx; - // E_INT and E_LONG are guaranteed single item queries - int err = 0; - *(int64_t *)out = c->vv->varint_get64s(&cp, (char *)b->data + b->uncomp_size, &err); - b->idx = cp - (char *)b->data; - *out_size = 1; - - return err ? 
-1 : 0; -} - int cram_external_decode_char(cram_slice *slice, cram_codec *c, cram_block *in, char *out, int *out_size) { @@ -471,6 +449,7 @@ cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) { cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c = NULL; @@ -483,18 +462,35 @@ cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr, return NULL; c->codec = E_EXTERNAL; - if (option == E_INT) - c->decode = cram_external_decode_int; - else if (option == E_SINT) - c->decode = cram_external_decode_sint; - else if (option == E_LONG) - c->decode = cram_external_decode_long; - else if (option == E_SLONG) - c->decode = cram_external_decode_slong; - else if (option == E_BYTE_ARRAY || option == E_BYTE) - c->decode = cram_external_decode_char; - else - c->decode = cram_external_decode_block; + if (CRAM_MAJOR_VERS(version) >= 4) { + // Version 4 does not permit integer data to be encoded as a + // series of bytes. This is used purely for bytes, either + // singular or declared as arrays + switch (codec) { + case E_EXTERNAL: + if (option == E_BYTE_ARRAY_BLOCK) + c->decode = cram_external_decode_block; + else if (option == E_BYTE || option == E_BYTE_ARRAY) + c->decode = cram_external_decode_char; + else + return NULL; + break; + default: + return NULL; + } + } else { + // CRAM 3 and earlier encodes integers as EXTERNAL. We need + // use the option field to indicate the input data format so + // we know which serialisation format to use. 
+ if (option == E_INT) + c->decode = cram_external_decode_int; + else if (option == E_LONG) + c->decode = cram_external_decode_long; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->decode = cram_external_decode_char; + else + c->decode = cram_external_decode_block; + } c->free = cram_external_decode_free; c->size = cram_external_decode_size; c->get_block = cram_external_get_block; @@ -578,6 +574,7 @@ int cram_external_encode_store(cram_codec *c, cram_block *b, char *prefix, } cram_codec *cram_external_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, int version, varint_vec *vv) { @@ -588,18 +585,32 @@ cram_codec *cram_external_encode_init(cram_stats *st, return NULL; c->codec = E_EXTERNAL; c->free = cram_external_encode_free; - if (option == E_INT) - c->encode = cram_external_encode_int; - else if (option == E_SINT) - c->encode = cram_external_encode_sint; - else if (option == E_LONG) - c->encode = cram_external_encode_long; - else if (option == E_SLONG) - c->encode = cram_external_encode_slong; - else if (option == E_BYTE_ARRAY || option == E_BYTE) - c->encode = cram_external_encode_char; - else - abort(); + if (CRAM_MAJOR_VERS(version) >= 4) { + // Version 4 does not permit integer data to be encoded as a + // series of bytes. This is used purely for bytes, either + // singular or declared as arrays + switch (codec) { + case E_EXTERNAL: + if (option != E_BYTE && option != E_BYTE_ARRAY) + return NULL; + c->encode = cram_external_encode_char; + break; + default: + return NULL; + } + } else { + // CRAM 3 and earlier encodes integers as EXTERNAL. We need + // use the option field to indicate the input data format so + // we know which serialisation format to use. 
+ if (option == E_INT) + c->encode = cram_external_encode_int; + else if (option == E_LONG) + c->encode = cram_external_encode_long; + else if (option == E_BYTE_ARRAY || option == E_BYTE) + c->encode = cram_external_encode_char; + else + abort(); + } c->store = cram_external_encode_store; c->flush = NULL; @@ -608,6 +619,410 @@ cram_codec *cram_external_encode_init(cram_stats *st, return c; } +/* + * --------------------------------------------------------------------------- + * VARINT + * + * In CRAM 3.0 and earlier, E_EXTERNAL stored both integers in ITF8 + * format as well as bytes. In CRAM 4 EXTERNAL is only for bytes and + * byte arrays, with two dedicated encodings for integers: + * VARINT_SIGNED and VARINT_UNSIGNED. These also differ a little to + * EXTERNAL with the addition of an offset field, meaning we can store + * values in, say, the range -2 to 1 million without needing to use + * a signed zig-zag transformation. + */ +int cram_varint_decode_int(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int32_t *)out = c->vv->varint_get32(&cp, + (char *)b->data + b->uncomp_size, + &err) + c->u.varint.offset; + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? 
-1 : 0; +} + +int cram_varint_decode_sint(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int32_t *)out = c->vv->varint_get32s(&cp, + (char *)b->data + b->uncomp_size, + &err) + c->u.varint.offset; + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? -1 : 0; +} + +int cram_varint_decode_long(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int64_t *)out = c->vv->varint_get64(&cp, + (char *)b->data + b->uncomp_size, + &err) + c->u.varint.offset; + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? -1 : 0; +} + +int cram_varint_decode_slong(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + char *cp; + cram_block *b; + + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return *out_size?-1:0; + + cp = (char *)b->data + b->idx; + // E_INT and E_LONG are guaranteed single item queries + int err = 0; + *(int64_t *)out = c->vv->varint_get64s(&cp, + (char *)b->data + b->uncomp_size, + &err) + c->u.varint.offset; + b->idx = cp - (char *)b->data; + *out_size = 1; + + return err ? 
-1 : 0; +} + +void cram_varint_decode_free(cram_codec *c) { + if (c) + free(c); +} + +int cram_varint_decode_size(cram_slice *slice, cram_codec *c) { + cram_block *b; + + /* Find the data block */ + b = cram_get_block_by_id(slice, c->u.varint.content_id); + if (!b) + return -1; + + return b->uncomp_size; +} + +cram_block *cram_varint_get_block(cram_slice *slice, cram_codec *c) { + return cram_get_block_by_id(slice, c->u.varint.content_id); +} + +cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = codec; + + // Function pointer choice is theoretically by codec type. + // Given we have some vars as int32 and some as int64 we + // use option too for sizing, although on disk format + // does not change. + switch(codec) { + case E_VARINT_UNSIGNED: + c->decode = (option == E_INT) + ? cram_varint_decode_int + : cram_varint_decode_long; + break; + case E_VARINT_SIGNED: + c->decode = (option == E_INT) + ? cram_varint_decode_sint + : cram_varint_decode_slong; + break; + default: + return NULL; + } + + c->free = cram_varint_decode_free; + c->size = cram_varint_decode_size; + c->get_block = cram_varint_get_block; + + c->u.varint.content_id = vv->varint_get32 (&cp, NULL, NULL); + c->u.varint.offset = vv->varint_get64s(&cp, NULL, NULL); + + if (cp - data != size) { + fprintf(stderr, "Malformed varint header stream\n"); + free(c); + return NULL; + } + + c->u.varint.type = option; + + return c; +} + +int cram_varint_encode_int(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + uint32_t *i32 = (uint32_t *)in; + return c->vv->varint_put32_blk(c->out, *i32 - c->u.varint.offset) >= 0 + ? 
0 : -1; +} + +int cram_varint_encode_sint(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int32_t *i32 = (int32_t *)in; + return c->vv->varint_put32s_blk(c->out, *i32 - c->u.varint.offset) >= 0 + ? 0 : -1; +} + +int cram_varint_encode_long(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + uint64_t *i64 = (uint64_t *)in; + return c->vv->varint_put64_blk(c->out, *i64 - c->u.varint.offset) >= 0 + ? 0 : -1; +} + +int cram_varint_encode_slong(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + int64_t *i64 = (int64_t *)in; + return c->vv->varint_put64s_blk(c->out, *i64 - c->u.varint.offset) >= 0 + ? 0 : -1; +} + +void cram_varint_encode_free(cram_codec *c) { + if (!c) + return; + free(c); +} + +int cram_varint_encode_store(cram_codec *c, cram_block *b, char *prefix, + int version) { + char tmp[99], *tp = tmp; + int len = 0; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + tp += c->vv->varint_put32 (tp, NULL, c->u.e_varint.content_id); + tp += c->vv->varint_put64s(tp, NULL, c->u.e_varint.offset); + len += c->vv->varint_put32_blk(b, c->codec); + len += c->vv->varint_put32_blk(b, tp-tmp); + BLOCK_APPEND(b, tmp, tp-tmp); + len += tp-tmp; + + return len; + + block_err: + return -1; +} + +cram_codec *cram_varint_encode_init(cram_stats *st, + enum cram_encoding codec, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->u.e_varint.offset = 0; + if (st) { + // Marginal difference so far! Not worth the hassle? + if (st->min_val < 0 && st->min_val >= -127 + && st->max_val / -st->min_val > 100) { + c->u.e_varint.offset = -st->min_val; + codec = E_VARINT_UNSIGNED; + } else if (st->min_val > 0) { + c->u.e_varint.offset = -st->min_val; + } + } + + c->codec = codec; + c->free = cram_varint_encode_free; + + // Function pointer choice is theoretically by codec type. 
+ // Given we have some vars as int32 and some as int64 we + // use option too for sizing, although on disk format + // does not change. + switch (codec) { + case E_VARINT_UNSIGNED: + c->encode = (option == E_INT) + ? cram_varint_encode_int + : cram_varint_encode_long; + break; + case E_VARINT_SIGNED: + c->encode = (option == E_INT) + ? cram_varint_encode_sint + : cram_varint_encode_slong; + break; + default: + return NULL; + } + c->store = cram_varint_encode_store; + c->flush = NULL; + + c->u.e_varint.content_id = (size_t)dat; + + return c; +} +/* + * --------------------------------------------------------------------------- + * CONST_BYTE and CONST_INT + */ +int cram_const_decode_byte(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int i, n; + + for (i = 0, n = *out_size; i < n; i++) + out[i] = c->u.xconst.val; + + return 0; +} + +int cram_const_decode_int(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int32_t *out_i = (int32_t *)out; + int i, n; + + for (i = 0, n = *out_size; i < n; i++) + out_i[i] = c->u.xconst.val; + + return 0; +} + +int cram_const_decode_long(cram_slice *slice, cram_codec *c, + cram_block *in, char *out, int *out_size) { + int64_t *out_i = (int64_t *)out; + int i, n; + + for (i = 0, n = *out_size; i < n; i++) + out_i[i] = c->u.xconst.val; + + return 0; +} + +void cram_const_decode_free(cram_codec *c) { + if (c) + free(c); +} + +int cram_const_decode_size(cram_slice *slice, cram_codec *c) { + return 0; +} + +cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr, + char *data, int size, + enum cram_encoding codec, + enum cram_external_type option, + int version, varint_vec *vv) { + cram_codec *c; + char *cp = data; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = codec; + if (codec == E_CONST_BYTE) + c->decode = cram_const_decode_byte; + else if (option == E_INT) + c->decode = cram_const_decode_int; + else + c->decode = cram_const_decode_long; + 
c->free = cram_const_decode_free; + c->size = cram_const_decode_size; + c->get_block = NULL; + + c->u.xconst.val = vv->varint_get64s(&cp, NULL, NULL); + + if (cp - data != size) { + fprintf(stderr, "Malformed const header stream\n"); + free(c); + return NULL; + } + + return c; +} + +int cram_const_encode(cram_slice *slice, cram_codec *c, + char *in, int in_size) { + return 0; +} + +int cram_const_encode_store(cram_codec *c, cram_block *b, char *prefix, + int version) { + char tmp[99], *tp = tmp; + int len = 0; + + if (prefix) { + size_t l = strlen(prefix); + BLOCK_APPEND(b, prefix, l); + len += l; + } + + tp += c->vv->varint_put64s(tp, NULL, c->u.xconst.val); + len += c->vv->varint_put32_blk(b, c->codec); + len += c->vv->varint_put32_blk(b, tp-tmp); + BLOCK_APPEND(b, tmp, tp-tmp); + len += tp-tmp; + + return len; + + block_err: + return -1; +} + +cram_codec *cram_const_encode_init(cram_stats *st, + enum cram_encoding codec, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { + cram_codec *c; + + if (!(c = malloc(sizeof(*c)))) + return NULL; + + c->codec = codec; + c->free = cram_const_decode_free; // as as decode + c->encode = cram_const_encode; // a nop + c->store = cram_const_encode_store; + c->flush = NULL; + c->u.e_xconst.val = st->min_val; + + return c; +} + /* * --------------------------------------------------------------------------- * BETA @@ -678,6 +1093,7 @@ void cram_beta_decode_free(cram_codec *c) { cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c; @@ -780,6 +1196,7 @@ void cram_beta_encode_free(cram_codec *c) { } cram_codec *cram_beta_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, int version, varint_vec *vv) { @@ -964,6 +1381,7 @@ cram_block *cram_xpack_get_block(cram_slice *slice, cram_codec *c) { cram_codec 
*cram_xpack_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c; @@ -1127,6 +1545,7 @@ void cram_xpack_encode_free(cram_codec *c) { } cram_codec *cram_xpack_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, int version, varint_vec *vv) { @@ -1285,6 +1704,7 @@ cram_block *cram_xdelta_get_block(cram_slice *slice, cram_codec *c) { cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c; @@ -1524,9 +1944,10 @@ void cram_xdelta_encode_free(cram_codec *c) { } cram_codec *cram_xdelta_encode_init(cram_stats *st, - enum cram_external_type option, - void *dat, - int version, varint_vec *vv) { + enum cram_encoding codec, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { cram_codec *c; if (!(c = malloc(sizeof(*c)))) @@ -1685,6 +2106,7 @@ void cram_xrle_decode_free(cram_codec *c) { cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c; @@ -1906,9 +2328,10 @@ void cram_xrle_encode_free(cram_codec *c) { } cram_codec *cram_xrle_encode_init(cram_stats *st, - enum cram_external_type option, - void *dat, - int version, varint_vec *vv) { + enum cram_encoding codec, + enum cram_external_type option, + void *dat, + int version, varint_vec *vv) { cram_codec *c; if (!(c = malloc(sizeof(*c)))) @@ -1998,6 +2421,7 @@ void cram_subexp_decode_free(cram_codec *c) { cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c; @@ -2063,6 +2487,7 @@ void cram_gamma_decode_free(cram_codec *c) { 
cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c = NULL; @@ -2279,6 +2704,7 @@ int cram_huffman_decode_long(cram_slice *slice, cram_codec *c, */ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { int32_t ncodes = 0, i, j; @@ -2313,6 +2739,7 @@ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, h->free = cram_huffman_decode_free; h->u.huffman.ncodes = ncodes; + h->u.huffman.option = option; if (ncodes) { codes = h->u.huffman.codes = malloc(ncodes * sizeof(*codes)); if (!codes) { @@ -2327,15 +2754,9 @@ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, if (option == E_LONG) { for (i = 0; i < ncodes; i++) codes[i].symbol = vv->varint_get64(&cp, data_end, &err); - } else if (option == E_SLONG) { - for (i = 0; i < ncodes; i++) - codes[i].symbol = vv->varint_get64s(&cp, data_end, &err); } else if (option == E_INT || option == E_BYTE) { for (i = 0; i < ncodes; i++) codes[i].symbol = vv->varint_get32(&cp, data_end, &err); - } else if (option == E_SINT) { - for (i = 0; i < ncodes; i++) - codes[i].symbol = vv->varint_get32s(&cp, data_end, &err); } else { free(h); return NULL; @@ -2641,6 +3062,7 @@ int cram_huffman_encode_store(cram_codec *c, cram_block *b, char *prefix, } cram_codec *cram_huffman_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, int version, varint_vec *vv) { @@ -2873,6 +3295,7 @@ void cram_byte_array_len_decode_free(cram_codec *c) { cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c; @@ -2989,6 +3412,7 @@ int 
cram_byte_array_len_encode_store(cram_codec *c, cram_block *b, } cram_codec *cram_byte_array_len_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, int version, varint_vec *vv) { @@ -3110,6 +3534,7 @@ void cram_byte_array_stop_decode_free(cram_codec *c) { cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c = NULL; @@ -3210,6 +3635,7 @@ int cram_byte_array_stop_encode_store(cram_codec *c, cram_block *b, } cram_codec *cram_byte_array_stop_encode_init(cram_stats *st, + enum cram_encoding codec, enum cram_external_type option, void *dat, int version, varint_vec *vv) { @@ -3246,6 +3672,12 @@ const char *cram_encoding2str(enum cram_encoding t) { case E_SUBEXP: return "SUBEXP"; case E_GOLOMB_RICE: return "GOLOMB_RICE"; case E_GAMMA: return "GAMMA"; + + case E_VARINT_UNSIGNED: return "VARINT_UNSIGNED"; + case E_VARINT_SIGNED: return "VARINT_SIGNED"; + case E_CONST_BYTE: return "CONST_BYTE"; + case E_CONST_INT: return "CONST_INT"; + case E_NUM_CODECS: default: return "?"; } @@ -3254,18 +3686,39 @@ const char *cram_encoding2str(enum cram_encoding t) { static cram_codec *(*decode_init[])(cram_block_compression_hdr *hdr, char *data, int size, + enum cram_encoding codec, enum cram_external_type option, int version, varint_vec *vv) = { - NULL, + // CRAM 3.0 valid codecs + NULL, // null codec cram_external_decode_init, - NULL, + NULL, // golomb cram_huffman_decode_init, cram_byte_array_len_decode_init, cram_byte_array_stop_decode_init, cram_beta_decode_init, cram_subexp_decode_init, - NULL, + NULL, // golomb rice cram_gamma_decode_init, + + // Gap between CRAM 3 and CRAM 4; 10 to 39 inclusive + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + 
NULL, // was xbyte + cram_varint_decode_init, // varint unsigned + cram_varint_decode_init, // varint signed + cram_const_decode_init, // const byte + cram_const_decode_init, // const int + + // Gap to CRAM 4 transformations; 45 to 49 inclusive + NULL, NULL, NULL, NULL, NULL, + + NULL, // xhuffman + cram_xpack_decode_init, + cram_xrle_decode_init, + cram_xdelta_decode_init, }; cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr, @@ -3274,7 +3727,8 @@ cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr, enum cram_external_type option, int version, varint_vec *vv) { if (codec >= E_NULL && codec < E_NUM_CODECS && decode_init[codec]) { - cram_codec *r = decode_init[codec](hdr, data, size, option, version, vv); + cram_codec *r = decode_init[codec](hdr, data, size, codec, + option, version, vv); if (r) { r->vv = vv; r->codec_id = hdr->ncodecs++; @@ -3287,19 +3741,40 @@ cram_codec *cram_decoder_init(cram_block_compression_hdr *hdr, } static cram_codec *(*encode_init[])(cram_stats *stx, + enum cram_encoding codec, enum cram_external_type option, void *opt, int version, varint_vec *vv) = { - NULL, - cram_external_encode_init, - NULL, + // CRAM 3.0 valid codecs + NULL, // null codec + cram_external_encode_init, // int/bytes in cram 3, byte only in cram 4 + NULL, // golomb cram_huffman_encode_init, cram_byte_array_len_encode_init, cram_byte_array_stop_encode_init, cram_beta_encode_init, - NULL, //cram_subexp_encode_init, - NULL, - NULL, //cram_gamma_encode_init, + NULL, // subexponential (we support decode only) + NULL, // golomb rice + NULL, // gamma (we support decode only) + + // Gap between CRAM 3 and CRAM 4; 10 to 39 inclusive + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + NULL, // was xbyte + cram_varint_encode_init, // varint unsigned + cram_varint_encode_init, // varint signed + cram_const_encode_init, // 
const byte + cram_const_encode_init, // const int + + // Gap to CRAM 4 transformations; 45 to 49 inclusive + NULL, NULL, NULL, NULL, NULL, + + NULL, // xhuffman + cram_xpack_encode_init, + cram_xrle_encode_init, + cram_xdelta_encode_init, }; cram_codec *cram_encoder_init(enum cram_encoding codec, @@ -3310,9 +3785,20 @@ cram_codec *cram_encoder_init(enum cram_encoding codec, if (st && !st->nvals) return NULL; + // cram_stats_encoding assumes integer data, but if option + // is E_BYTE then tweak the requested encoding. This ought + // to be fixed in cram_stats_encoding instead. + if (option == E_BYTE || option == E_BYTE_ARRAY || + option == E_BYTE_ARRAY_BLOCK) { + if (codec == E_VARINT_SIGNED || codec == E_VARINT_UNSIGNED) + codec = E_EXTERNAL; + else if (codec == E_CONST_INT) + codec = E_CONST_BYTE; + } + if (encode_init[codec]) { cram_codec *r; - if ((r = encode_init[codec](st, option, dat, version, vv))) + if ((r = encode_init[codec](st, codec, option, dat, version, vv))) r->out = NULL; if (!r) { hts_log_error("Unable to initialise codec of type %s", cram_encoding2str(codec)); @@ -3335,29 +3821,43 @@ int cram_codec_to_id(cram_codec *c, int *id2) { int bnum1, bnum2 = -2; switch (c->codec) { + case E_CONST_INT: + case E_CONST_BYTE: + bnum1 = -2; // no blocks used + break; // must not fall through: E_HUFFMAN reads u.huffman, not u.xconst + case E_HUFFMAN: bnum1 = c->u.huffman.ncodes == 1 ? 
-2 : -1; break; + case E_GOLOMB: case E_BETA: case E_SUBEXP: case E_GOLOMB_RICE: case E_GAMMA: + // CORE block bnum1 = -1; break; + case E_EXTERNAL: + case E_VARINT_UNSIGNED: + case E_VARINT_SIGNED: bnum1 = c->u.external.content_id; break; + case E_BYTE_ARRAY_LEN: bnum1 = cram_codec_to_id(c->u.byte_array_len.len_codec, NULL); bnum2 = cram_codec_to_id(c->u.byte_array_len.val_codec, NULL); break; + case E_BYTE_ARRAY_STOP: bnum1 = c->u.byte_array_stop.content_id; break; + case E_NULL: bnum1 = -2; break; + default: hts_log_error("Unknown codec type %d", c->codec); bnum1 = -1; @@ -3384,6 +3883,12 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { int j; switch (c->codec) { + case E_CONST_INT: + case E_CONST_BYTE: + // shares struct with decode + c->store = cram_const_encode_store; + break; + case E_EXTERNAL: // shares struct with decode c->free = cram_external_encode_free; @@ -3400,6 +3905,23 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { return -1; break; + case E_VARINT_SIGNED: + case E_VARINT_UNSIGNED: + // shares struct with decode + c->free = cram_varint_encode_free; + c->store = cram_varint_encode_store; + if (c->decode == cram_varint_decode_int) + c->encode = cram_varint_encode_int; + else if (c->decode == cram_varint_decode_sint) + c->encode = cram_varint_encode_sint; + else if (c->decode == cram_varint_decode_long) + c->encode = cram_varint_encode_long; + else if (c->decode == cram_varint_decode_slong) + c->encode = cram_varint_encode_slong; + else + return -1; + break; + case E_HUFFMAN: { // New structure, so switch. 
// FIXME: we huffman and e_huffman structs amended, we could @@ -3411,6 +3933,7 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { t->store = cram_huffman_encode_store; t->u.e_huffman.codes = c->u.huffman.codes; t->u.e_huffman.nvals = c->u.huffman.ncodes; + t->u.e_huffman.option = c->u.huffman.option; for (j = 0; j < t->u.e_huffman.nvals; j++) { int32_t sym = t->u.e_huffman.codes[j].symbol; if (sym >= -1 && sym < MAX_HUFF) @@ -3475,7 +3998,8 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { case E_BYTE_ARRAY_LEN: { cram_codec *t = malloc(sizeof(*t)); if (!t) return -1; - t->codec = E_BYTE_ARRAY_LEN; + t->vv = c->vv; + t->codec = E_BYTE_ARRAY_LEN; t->free = cram_byte_array_len_encode_free; t->store = cram_byte_array_len_encode_store; t->encode = cram_byte_array_len_encode; diff --git a/cram/cram_codecs.h b/cram/cram_codecs.h index 850a2a92e..56b065255 100644 --- a/cram/cram_codecs.h +++ b/cram/cram_codecs.h @@ -58,6 +58,7 @@ typedef struct { typedef struct { int ncodes; cram_huffman_code *codes; + int option; } cram_huffman_decoder; #define MAX_HUFF 128 @@ -127,6 +128,12 @@ typedef struct { enum cram_external_type type; } cram_external_decoder; +typedef struct { + int32_t content_id; + int64_t offset; + enum cram_external_type type; +} cram_varint_decoder; + typedef struct { struct cram_codec *len_codec; struct cram_codec *val_codec; @@ -146,6 +153,10 @@ typedef struct { struct cram_codec *val_codec; } cram_byte_array_len_encoder; +typedef struct { + int64_t val; +} cram_const_codec; + /* * A generic codec structure. 
*/ @@ -176,6 +187,8 @@ typedef struct cram_codec { cram_xpack_decoder xpack; cram_xrle_decoder xrle; cram_xdelta_decoder xdelta; + cram_const_codec xconst; + cram_varint_decoder varint; cram_huffman_encoder e_huffman; cram_external_decoder e_external; @@ -185,6 +198,8 @@ typedef struct cram_codec { cram_xpack_decoder e_xpack; cram_xrle_decoder e_xrle; cram_xdelta_decoder e_xdelta; + cram_const_codec e_xconst; + cram_varint_decoder e_varint; } u; } cram_codec; diff --git a/cram/cram_decode.c b/cram/cram_decode.c index e4ee8986b..41203ec7f 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -392,7 +392,7 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, } else if (key[0] == 'C' && key[1] == 'F') { ds_id = DS_CF; type = E_INT; } else if (key[0] == 'R' && key[1] == 'I') { - ds_id = DS_RI; type = E_SINT; + ds_id = DS_RI; type = E_INT; } else if (key[0] == 'R' && key[1] == 'L') { ds_id = DS_RL; type = E_INT; } else if (key[0] == 'A' && key[1] == 'P') { @@ -400,11 +400,11 @@ cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, type = is_v4 ? E_SLONG : E_INT; } else if (key[0] == 'R' && key[1] == 'G') { ds_id = DS_RG; - type = is_v4 ? E_SINT : E_INT; + type = E_INT; } else if (key[0] == 'M' && key[1] == 'F') { ds_id = DS_MF; type = E_INT; } else if (key[0] == 'N' && key[1] == 'S') { - ds_id = DS_NS; type = E_SINT; + ds_id = DS_NS; type = E_INT; } else if (key[0] == 'N' && key[1] == 'P') { ds_id = DS_NP; type = is_v4 ? 
E_LONG : E_INT; diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 89c79fa8a..3d9235ac8 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -995,8 +995,16 @@ static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) { codec->out = s->block[0]; break; + // Codecs which don't use external blocks + case E_CONST_BYTE: + case E_CONST_INT: + codec->out = NULL; + break; + // Codecs that emit directly to external blocks case E_EXTERNAL: + case E_VARINT_UNSIGNED: + case E_VARINT_SIGNED: if (!(s->block[ds_id] = cram_new_block(EXTERNAL, ds_id))) return -1; codec->u.external.content_id = ds_id; @@ -1624,15 +1632,25 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // fd->version); //fprintf(stderr, "=== AP ===\n"); - if (c->pos_sorted) { - h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), - c->stats[DS_AP], - is_v4 ? E_SLONG : E_INT, - NULL, fd->version, &fd->vv); + if (c->pos_sorted || CRAM_MAJOR_VERS(fd->version) >= 4) { + if (c->pos_sorted) + h->codecs[DS_AP] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_AP]), + c->stats[DS_AP], + is_v4 ? E_LONG : E_INT, + NULL, fd->version, &fd->vv); + else + // Unsorted data has no stats, but hard-code VARINT_SIGNED / EXT. + h->codecs[DS_AP] = cram_encoder_init(is_v4 ? E_VARINT_SIGNED + : E_EXTERNAL, + NULL, + is_v4 ? E_LONG : E_INT, + NULL, fd->version, &fd->vv); } else { + // Removed BETA in v4.0. + // Should we consider dropping use of it for 3.0 too? int p[2] = {0, c->max_apos}; h->codecs[DS_AP] = cram_encoder_init(E_BETA, NULL, - is_v4 ? E_SLONG : E_INT, + is_v4 ? E_LONG : E_INT, p, fd->version, &fd->vv); // cram_xdelta_encoder e; // e.word_size = is_v4 ? 8 : 4; @@ -1648,7 +1666,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== RG ===\n"); h->codecs[DS_RG] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RG]), c->stats[DS_RG], - is_v4 ? 
E_SINT : E_INT, + E_INT, NULL, fd->version, &fd->vv); if (c->stats[DS_RG]->nvals && !h->codecs[DS_RG]) goto_err; @@ -1661,7 +1679,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== NS ===\n"); h->codecs[DS_NS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_NS]), - c->stats[DS_NS], E_SINT, NULL, + c->stats[DS_NS], E_INT, NULL, fd->version, &fd->vv); if (c->stats[DS_NS]->nvals && !h->codecs[DS_NS]) goto_err; @@ -1674,7 +1692,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== TS ===\n"); h->codecs[DS_TS] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_TS]), c->stats[DS_TS], - is_v4 ? E_SLONG : E_INT, + is_v4 ? E_LONG : E_INT, NULL, fd->version, &fd->vv); if (c->stats[DS_TS]->nvals && !h->codecs[DS_TS]) goto_err; @@ -1730,7 +1748,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (CRAM_MAJOR_VERS(fd->version) >= 3) { cram_byte_array_len_encoder e; - e.len_encoding = E_EXTERNAL; + e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4 + ? E_VARINT_UNSIGNED + : E_EXTERNAL; e.len_dat = (void *)DS_BB_len; //e.len_dat = (void *)DS_BB; @@ -1783,7 +1803,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //fprintf(stderr, "=== RI ===\n"); h->codecs[DS_RI] = cram_encoder_init(cram_stats_encoding(fd, c->stats[DS_RI]), - c->stats[DS_RI], E_SINT, NULL, + c->stats[DS_RI], E_INT, NULL, fd->version, &fd->vv); if (c->stats[DS_RI]->nvals && !h->codecs[DS_RI]) goto_err; @@ -1818,7 +1838,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // elements into the same external block. cram_byte_array_len_encoder e; - e.len_encoding = E_EXTERNAL; + e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4 + ? 
E_VARINT_UNSIGNED + : E_EXTERNAL; e.len_dat = (void *)DS_SC_len; e.val_encoding = E_EXTERNAL; @@ -2291,8 +2313,13 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, cram_byte_array_len_encoder e; cram_stats st; - e.len_encoding = E_HUFFMAN; - e.len_dat = NULL; + if (CRAM_MAJOR_VERS(fd->version) <= 3) { + e.len_encoding = E_HUFFMAN; + e.len_dat = NULL; // will get codes from st + } else { + e.len_encoding = E_CONST_INT; + e.len_dat = NULL; // will get codes from st + } memset(&st, 0, sizeof(st)); if (cram_stats_add(&st, 1) < 0) goto block_err; cram_stats_encoding(fd, &st); @@ -2311,8 +2338,13 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, cram_byte_array_len_encoder e; cram_stats st; - e.len_encoding = E_HUFFMAN; - e.len_dat = NULL; + if (CRAM_MAJOR_VERS(fd->version) <= 3) { + e.len_encoding = E_HUFFMAN; + e.len_dat = NULL; // will get codes from st + } else { + e.len_encoding = E_CONST_INT; + e.len_dat = NULL; // will get codes from st + } memset(&st, 0, sizeof(st)); if (cram_stats_add(&st, 2) < 0) goto block_err; cram_stats_encoding(fd, &st); @@ -2330,8 +2362,13 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, cram_byte_array_len_encoder e; cram_stats st; - e.len_encoding = E_HUFFMAN; - e.len_dat = NULL; + if (CRAM_MAJOR_VERS(fd->version) <= 3) { + e.len_encoding = E_HUFFMAN; + e.len_dat = NULL; // will get codes from st + } else { + e.len_encoding = E_CONST_INT; + e.len_dat = NULL; // will get codes from st + } memset(&st, 0, sizeof(st)); if (cram_stats_add(&st, 4) < 0) goto block_err; cram_stats_encoding(fd, &st); @@ -2353,7 +2390,9 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, // too. cram_byte_array_len_encoder e; - e.len_encoding = E_EXTERNAL; + e.len_encoding = CRAM_MAJOR_VERS(fd->version) >= 4 + ? E_VARINT_UNSIGNED + : E_EXTERNAL; e.len_dat = (void *)sk; // or key+128 for len? 
e.val_encoding = E_EXTERNAL; diff --git a/cram/cram_stats.c b/cram/cram_stats.c index 20ef51dc7..3ceda0db1 100644 --- a/cram/cram_stats.c +++ b/cram/cram_stats.c @@ -193,6 +193,8 @@ enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) { } st->nvals = nvals; + st->min_val = min_val; + st->max_val = max_val; assert(ntot == st->nsamp); free(vals); @@ -202,7 +204,19 @@ enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) { * Simple policy that everything is external unless it can be * encoded using zero bits as a unary item huffman table. */ - return nvals <= 1 ? E_HUFFMAN : E_EXTERNAL; + if (CRAM_MAJOR_VERS(fd->version) >= 4) { + // Note, we're assuming integer data here as we don't have the + // type passed in. Cram_encoder_init does know the type and + // will convert to E_CONST_BYTE or E_EXTERNAL as appropriate. + if (nvals == 1) + return E_CONST_INT; + else if (nvals == 0 || min_val < 0) + return E_VARINT_SIGNED; + else + return E_VARINT_UNSIGNED; + } else { + return nvals <= 1 ? E_HUFFMAN : E_EXTERNAL; + } } void cram_stats_free(cram_stats *st) { diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 5c39eca56..b327a06cf 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -97,24 +97,36 @@ typedef struct cram_stats { khash_t(m_i2i) *h; int nsamp; // total number of values added int nvals; // total number of unique values added + int64_t min_val, max_val; } cram_stats; /* NB: matches java impl, not the spec */ enum cram_encoding { E_NULL = 0, - E_EXTERNAL = 1, - E_GOLOMB = 2, - E_HUFFMAN = 3, + E_EXTERNAL = 1, // Only for BYTE type in CRAM 4 + E_GOLOMB = 2, // Not in CRAM 4 + E_HUFFMAN = 3, // Not in CRAM 4 E_BYTE_ARRAY_LEN = 4, E_BYTE_ARRAY_STOP = 5, - E_BETA = 6, - E_SUBEXP = 7, - E_GOLOMB_RICE = 8, - E_GAMMA = 9, - E_XPACK = 11, // Transform to sub-codec - E_XRLE = 12, // Transform to sub-codec - E_XDELTA = 13, // Transform to sub-codec - E_NUM_CODECS, /* Total number of codecs, not a real one. 
*/ + E_BETA = 6, // Not in CRAM 4 + E_SUBEXP = 7, // Not in CRAM 4 + E_GOLOMB_RICE = 8, // Not in CRAM 4 + E_GAMMA = 9, // Not in CRAM 4 + + // CRAM 4 specific codecs + E_VARINT_UNSIGNED = 41, // Specialisation of EXTERNAL + E_VARINT_SIGNED = 42, // Specialisation of EXTERNAL + E_CONST_BYTE = 43, // Alternative to HUFFMAN with 1 symbol + E_CONST_INT = 44, // Alternative to HUFFMAN with 1 symbol + + // More experimental ideas, not documented in spec yet + E_XHUFFMAN = 50, // To external block + E_XPACK = 51, // Transform to sub-codec + E_XRLE = 52, // Transform to sub-codec + E_XDELTA = 53, // Transform to sub-codec + + // Total number of codecs, not a real one. + E_NUM_CODECS, }; enum cram_external_type { From 7068ac84636da735e919173d65d021aeb1afd0e0 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 11 Nov 2020 14:27:06 +0000 Subject: [PATCH 053/488] Remove metrics->stats pointer. This has been replaced by an "unpackable" flag. This avoids cram_compress_block2 from needing to access metrics->stats, which is a pointer to the container->stats struct. In rare cases this could be freed prior to the compression completing. 
--- cram/cram_encode.c | 3 ++- cram/cram_io.c | 7 +++++-- cram/cram_structs.h | 3 ++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 3d9235ac8..315279dae 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -855,7 +855,8 @@ static int cram_compress_slice(cram_fd *fd, cram_container *c, cram_slice *s) { pthread_mutex_lock(&fd->metrics_lock); for (i = 0; i < DS_END; i++) - fd->m[i]->stats = c->stats[i]; + if (c->stats[i] && c->stats[i]->nvals > 16) + fd->m[i]->unpackable = 1; pthread_mutex_unlock(&fd->metrics_lock); /* Specific compression methods for certain block types */ diff --git a/cram/cram_io.c b/cram/cram_io.c index 7cbd41c05..65ff7980e 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1949,7 +1949,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, if (metrics) { pthread_mutex_lock(&fd->metrics_lock); if (metrics->trial > 0 || --metrics->next_trial <= 0) { - int m; + int m, unpackable = metrics->unpackable; size_t sz_best = b->uncomp_size; size_t sz[CRAM_MAX_METHOD] = {0}; int method_best = 0; // RAW @@ -1965,10 +1965,11 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, metrics->trial = NTRIALS; for (m = 0; m < CRAM_MAX_METHOD; m++) metrics->sz[m] /= 2; + metrics->unpackable = 0; } // Compress this block using the best method - if (metrics->stats && metrics->stats->nvals > 16) { + if (unpackable && CRAM_MAJOR_VERS(fd->version) > 3) { // No point trying bit-pack if 17+ symbols. 
if (method & (1<method = RAW; m->strat = 0; m->revised_method = 0; + m->unpackable = 0; return m; } @@ -4138,6 +4140,7 @@ void reset_metrics(cram_fd *fd) { m->trial = NTRIALS; m->next_trial = TRIAL_SPAN; m->revised_method = 0; + m->unpackable = 0; memset(m->sz, 0, sizeof(m->sz)); } diff --git a/cram/cram_structs.h b/cram/cram_structs.h index b327a06cf..0ede383f5 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -249,7 +249,8 @@ struct cram_metrics { double extra[CRAM_MAX_METHOD]; - cram_stats *stats; + // Not amenable to rANS bit-packing techniques; cardinality > 16 + int unpackable; }; // Hash aux key (XX:i) to cram_metrics From 85240bae84f49c05656b5ffd5b23485c2559f25b Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 1 Dec 2020 13:50:54 +0000 Subject: [PATCH 054/488] Further optimisations to probaln_glocal. The changes are: - Replace a lot of the repeated set_u macro usages that go from k to {u, v01, v10, v11} with pointer increments instead. This rather obfuscates things as the difference between v01/u and v11/v10 (+/-3) is now hidden (but see PROBALN_ORIG comment below). - An attempt at manual ordering of code to avoid as many instruction latency issues as possible. Mostly compilers do this already, but some combinations benefit here. - Cache and reuse of variables computed during the previous loop iterations. This can have a big impact, but again whether or not it was already done for us is compiler specific. - Removal of some loop invariants (ideally already done for us, but safer to do it ourselves). - Perform the bi (backwards) rescaling during the backwards calculation loop instead of in a second pass. - Reduce conditionals in the map stage. - Plus some code formatting tidyups, in particular removal of many;statements;on;same;long;line. The original code wasn't exactly easy to understand, but the new code is possibly even worse if that's possible. 
Hence I've kept the old code in place guarded by a #ifdef PROBALN_ORIG (technically not original as bdf85e4 already did some optimisations) to act as a bit of a translation guide. If we prefer to simply keep this in git history then feel free to nuke the ifdefed bits. Benchmarks vs PROBALN_ORIG (time in this function only) 35% quicker with gcc 7 -O2 (NEW: 1168742 usec, ORIG: 1575843) 14% quicker with gcc 7 -O3 (NEW: 1165683 ORIG: 1333712) 33% quicker with gcc 9 -O2 (NEW: 1169097 ORIG: 1556956) 29% quicker with gcc 9 -O3 (NEW: 1157547 ORIG: 1491839) 24% quicker with clang 7.0 -O2 (NEW: 1207968 ORIG: 1501549) 24% quicker with clang 7.0 -O3 (NEW: 1211660 ORIG: 1504905) On samtools mpileup of 10 million reads this reduced the time from 6m58 to 5m27 (28% faster throughput). This is because 86% of all CPU time was spent in this one function! It's still 81.5% even after optimisation. Note I initially attempted a SIMD implementation, which I think would still be possible, but it cannot be faster without majorly changing the data layout and order of evaluation, plus realistically it'd need AVX or above as SSE4 can only do SIMD on 2 doubles so the overhead is significant. --- probaln.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 152 insertions(+), 16 deletions(-) diff --git a/probaln.c b/probaln.c index 9b9442c5b..73df1a8f1 100644 --- a/probaln.c +++ b/probaln.c @@ -74,8 +74,9 @@ static float g_qual2prob[256]; Returns phred-scaled likelihood score, or INT_MIN on failure. 
*/ -int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_query, - const uint8_t *iqual, const probaln_par_t *c, int *state, uint8_t *q) +int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, + int l_query, const uint8_t *iqual, const probaln_par_t *c, + int *state, uint8_t *q) { double *f = NULL, *b = NULL, *s = NULL, m[9], sI, sM, bI, bM; float *qual = NULL; @@ -96,8 +97,10 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu bw2 = bw * 2 + 1; size_t i_dim = bw2 < l_ref ? (size_t) bw2*3+6 : (size_t) l_ref*3+6; - // allocate the forward and backward matrices f[][] and b[][] and the scaling array s[] - // Ideally these callocs would be mallocs + initialisation of the few bits needed. + // allocate the forward and backward matrices f[][] and b[][] + // and the scaling array s[] + // Ideally these callocs would be mallocs + initialisation of + // the few bits needed. if (SIZE_MAX / (l_query+1) / i_dim < sizeof(double)) { errno = ENOMEM; // Allocation would fail return INT_MIN; @@ -108,7 +111,9 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu b = calloc((l_query+1)*i_dim, sizeof(double)); if (!b) goto fail; } - s = malloc((l_query+2) * sizeof(double)); // s[] is the scaling factor to avoid underflow + + // s[] is the scaling factor to avoid underflow + s = malloc((l_query+2) * sizeof(double)); if (!s) goto fail; // initialize qual @@ -122,11 +127,19 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu qual[i] = g_qual2prob[iqual? iqual[i] : 30]; // initialize transition probability - sM = sI = 1. 
/ (2 * l_query + 2); // the value here seems not to affect results; FIXME: need proof - m[0*3+0] = (1 - c->d - c->d) * (1 - sM); m[0*3+1] = m[0*3+2] = c->d * (1 - sM); - m[1*3+0] = (1 - c->e) * (1 - sI); m[1*3+1] = c->e * (1 - sI); m[1*3+2] = 0.; - m[2*3+0] = 1 - c->e; m[2*3+1] = 0.; m[2*3+2] = c->e; - bM = (1 - c->d) / l_ref; bI = c->d / l_ref; // (bM+bI)*l_ref==1 + // the value here seems not to affect results; FIXME: need proof + sM = sI = 1. / (2 * l_query + 2); + m[0*3+0] = (1 - c->d - c->d) * (1 - sM); + m[0*3+1] = m[0*3+2] = c->d * (1 - sM); + m[1*3+0] = (1 - c->e) * (1 - sI); + m[1*3+1] = c->e * (1 - sI); + m[1*3+2] = 0.; + m[2*3+0] = 1 - c->e; + m[2*3+1] = 0.; + m[2*3+2] = c->e; + bM = (1 - c->d) / l_ref; // (bM+bI)*l_ref==1 + bI = c->d / l_ref; + /*** forward ***/ // f[0] set_u(k, bw, 0, 0); @@ -150,6 +163,15 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu uint8_t qyi = query[i - 1]; x = i - bw; beg = beg > x? beg : x; // band start x = i + bw; end = end < x? end : x; // band end + + // NB end-beg is almost always 14 (99.9% of the time) + // Hence not a large volume to parallelise. + // + // Maybe stripe in diagonal doing 14 lines together? + // + // Consider rotation? 150x14 vs 14x150 so inner loop + // takes longer. + double E[] = { qli * EM, // 00 1. - qli, // 01 @@ -157,19 +179,69 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu 1., // 11 }; double M = 1./s[i-1]; + + // Note this code has the original version listed here (albeit + // with improved formatting), but we do not compile using + // -DPROBALN_ORIG. The purpose of this code is to act as an + // easier(?) to understand version of the heavily optimised + // version following it and as an easy validation path in case + // of any differences in results. 
+#ifdef PROBALN_ORIG for (k = beg, sum = 0.; k <= end; ++k) { int u, v11, v01, v10; double e; e = E[(ref[k - 1] > 3 || qyi > 3)*2 + (ref[k - 1] == qyi)]; - set_u(u, bw, i, k); set_u(v11, bw, i-1, k-1); set_u(v10, bw, i-1, k); set_u(v01, bw, i, k-1); + set_u(u, bw, i, k); + set_u(v11, bw, i-1, k-1); + set_u(v10, bw, i-1, k); + set_u(v01, bw, i, k-1); fi[u+0] = e * (m[0] * M*fi1[v11+0] + m[3] * M*fi1[v11+1] + m[6] * M*fi1[v11+2]); fi[u+1] = EI * (m[1] * M*fi1[v10+0] + m[4] * M*fi1[v10+1]); fi[u+2] = m[2] * fi[v01+0] + m[8] * fi[v01+2]; sum += fi[u] + fi[u+1] + fi[u+2]; -// fprintf(stderr, "F (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, fi[u], fi[u+1], fi[u+2]); // DEBUG } +#else + // We use EI*(M*m[1]*? + M*m[4]*?) a lot. So factor it out here. + double xm[5]; + xm[0] = M*m[0]; + xm[1] = M*m[3]; + xm[2] = M*m[6]; + xm[3] = EI*M*m[1]; + xm[4] = EI*M*m[4]; + + { + int u, v11; + set_u(u, bw, i, beg); + set_u(v11, bw, i-1, beg-1); + // Rather than recompute k->{u,v01,v10,v11} each loop + // we just increment the pointers. + double *xi = &fi[u]; + double *yi = &fi1[v11]; + // Derived from xi[0,2] in previous loop iter. 
+ double l_x0 = m[2]*xi[0]; + double l_x2 = m[8]*xi[2]; + for (k = beg, sum = 0.; k <= end; ++k, xi+=3, yi+=3) { + int cond = (ref[k-1] > 3 || qyi > 3)*2 + (ref[k-1] == qyi); + + double z0 = xm[0]*yi[0]; + double z1 = xm[1]*yi[1]; + double z2 = xm[2]*yi[2]; + double z3 = xm[3]*yi[3]; + double z4 = xm[4]*yi[4]; + + xi[0] = E[cond] * (z0+z1+z2); + xi[1] = z3 + z4; + xi[2] = l_x0 + l_x2; + sum += xi[0] + xi[1] + xi[2]; + + l_x0 = m[2]*xi[0]; + l_x2 = m[8]*xi[2]; + } + } +#endif s[i] = sum; } + { // f[l_query+1] double sum; double M = 1./s[l_query]; @@ -205,7 +277,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu } // b[l_query-1..1] for (i = l_query - 1; i >= 1; --i) { - int beg = 1, end = l_ref, x, _beg, _end; + int beg = 1, end = l_ref, x; double *bi = &b[i*i_dim], *bi1 = &b[(i+1)*i_dim], y = (i > 1), qli1 = qual[i]; uint8_t qyi1 = query[i]; x = i - bw; beg = beg > x? beg : x; @@ -217,10 +289,15 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu 1., //011 //0,0,0,0 //1xx }; + +#ifdef PROBALN_ORIG for (k = end; k >= beg; --k) { int u, v11, v01, v10; double e; - set_u(u, bw, i, k); set_u(v11, bw, i+1, k+1); set_u(v10, bw, i+1, k); set_u(v01, bw, i, k+1); + set_u(u, bw, i, k); + set_u(v11, bw, i+1, k+1); + set_u(v10, bw, i+1, k); + set_u(v01, bw, i, k+1); e = (k>=l_ref)?0 :E[(ref[k] > 3 || qyi1 > 3)*2 + (ref[k] == qyi1)] * bi1[v11]; bi[u+0] = e * m[0] + EI * m[1] * bi1[v10+1] + m[2] * bi[v01+2]; // bi1[v11] has been foled into e. 
bi[u+1] = e * m[3] + EI * m[4] * bi1[v10+1]; @@ -228,8 +305,43 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu // fprintf(stderr, "B (%d,%d;%d): %lg,%lg,%lg\n", i, k, u, bi[u], bi[u+1], bi[u+2]); // DEBUG } // rescale + int _beg, _end; set_u(_beg, bw, i, beg); set_u(_end, bw, i, end); _end += 2; for (k = _beg, y = 1./s[i]; k <= _end; ++k) bi[k] *= y; +#else + { + int u, v10; + set_u(u, bw, i, end); + set_u(v10, bw, i+1, end); + // Rather than recompute k->{u,v01,v10,v11} each loop + // we just increment the pointers. + double *xi = &bi[u]; + double *yi = &bi1[v10]; + // NB xi[5] is equiv to v01+2. + double xi_5 = xi[5]; + // Manual loop invariant removal + double e1 = EI*m[1]; + double e4 = EI*m[4]; + // Do renorm too in the same pass. + double n = 1./s[i]; + for (k = end; k >= beg; --k, xi -= 3, yi -= 3) { + double e = (k>=l_ref) + ? 0 + : E[(ref[k]>3 || qyi1>3)*2 + (ref[k] == qyi1)] * yi[3]; + + xi[1] = e * m[3] + e4 * yi[1]; + xi[0] = e * m[0] + e1 * yi[1] + m[2] * xi_5; + xi[2] = (e * m[6] + m[8] * xi_5) * y; + // bi[u+2] from this iter becomes bi[v01+2] in next iter + xi_5 = xi[2]; + + // rescale + xi[1] *= n; + xi[0] *= n; + xi[2] *= n; + } + } +#endif } { // b[0] int beg = 1, end = l_ref < bw + 1? l_ref : bw + 1; @@ -251,13 +363,36 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu x = i - bw; beg = beg > x? beg : x; x = i + bw; end = end < x? 
end : x; double M = 1./s[i]; +#ifdef PROBALN_ORIG for (k = beg; k <= end; ++k) { int u; double z; set_u(u, bw, i, k); - z = M*fi[u+0] * bi[u+0]; if (z > max) max = z, max_k = (k-1)<<2 | 0; sum += z; - z = M*fi[u+1] * bi[u+1]; if (z > max) max = z, max_k = (k-1)<<2 | 1; sum += z; + z = M*fi[u+0] * bi[u+0]; + if (z > max) max = z, max_k = (k-1)<<2 | 0; + sum += z; + z = M*fi[u+1] * bi[u+1]; + if (z > max) max = z, max_k = (k-1)<<2 | 1; + sum += z; + } +#else + { + int u; + set_u(u, bw, i, beg); + for (k = beg; k <= end; ++k, u+=3) { + double z1, z2; + z1 = M*fi[u+0] * bi[u+0]; + z2 = M*fi[u+1] * bi[u+1]; + int which = z2 > z1; // strictly z2 >= z1 matches old code + double zm = which ? z2 : z1; + if (zm > max) { + max = zm; + max_k = (k-1)<<2 | which; + } + sum += z1 + z2; + } } +#endif max /= sum; sum *= s[i]; // if everything works as is expected, sum == 1.0 if (state) state[i-1] = max_k; if (q) k = (int)(-4.343 * log(1. - max) + .499), q[i-1] = k > 100? 99 : k; @@ -268,6 +403,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int l_qu "ACGT"[query[i - 1]], "ACGT"[ref[(max_k>>2)]], max_k&3, max); // DEBUG #endif } + /*** free ***/ free(f); free(b); free(s); free(qual); return Pr; From c1245117caa14a7fb05b385e9d981c2ea950edfc Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 15 Dec 2020 17:15:24 +0000 Subject: [PATCH 055/488] Split cram_block_method enum into public and private. Also the public version has had the enum members renamed to include an HTS_ prefix. At present they appear to be unused, although it's hard to know for sure with some of them due to the common hits you get all over github. Leaving them as they are though is just waiting for a hard problem to arrive. This commit introduces the notion of HTS_COMPAT. The ABI hasn't changed, but the API has (albeit in a way we believe shouldn't affect people). It's a 3+ digit number with the last two being minor version and earlier being major. So htslib version 1.12 would be 112.
The intention is simply to permit packages with dependencies on specific earlier API names to still compile with the minimal of changes (even if it's just by overriding CFLAGS). --- cram/cram_io.c | 11 ++++---- cram/cram_io.h | 2 +- cram/cram_structs.h | 60 ++++++++++++++++++++++++++++++++++++++---- htslib/cram.h | 63 ++++++++++++++------------------------------- 4 files changed, 82 insertions(+), 54 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 65ff7980e..4f95f940a 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1730,7 +1730,7 @@ int cram_uncompress_block(cram_block *b) { static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, int content_id, size_t *out_size, - enum cram_block_method method, + enum cram_block_method_int method, int level, int strat) { switch (method) { case GZIP: @@ -1899,7 +1899,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, // Internally we have parameterised methods that externally map // to the same CRAM method value. - // See enum_cram_block_method. + // See enum_cram_block_method_int in cram_structs.h. int methmap[] = { // Externally defined values RAW, GZIP, BZIP2, LZMA, RANS, RANSPR, ARITH, FQZ, TOK3, @@ -2056,7 +2056,8 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, int best_method = RAW; int best_sz = INT_MAX; - // Relative costs of methods. See enum_cram_block_method and methmap + // Relative costs of methods. 
See enum_cram_block_method_int + // and methmap double meth_cost[32] = { // Externally defined methods 1, // 0 raw @@ -2090,7 +2091,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, 1.04, // arith_pr1 1.04, // arith_pr64 - 1.04, // arith_pr65 + 1.04, // arith_pr9 1.03, // arith_pr128 1.04, // arith_pr129 1.04, // arith_pr192 @@ -2242,7 +2243,7 @@ cram_metrics *cram_new_metrics(void) { return m; } -char *cram_block_method2str(enum cram_block_method m) { +char *cram_block_method2str(enum cram_block_method_int m) { switch(m) { case RAW: return "RAW"; case GZIP: return "GZIP"; diff --git a/cram/cram_io.h b/cram/cram_io.h index 7d787e44c..8cc59be51 100644 --- a/cram/cram_io.h +++ b/cram/cram_io.h @@ -173,7 +173,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, int method, int level); cram_metrics *cram_new_metrics(void); -char *cram_block_method2str(enum cram_block_method m); +char *cram_block_method2str(enum cram_block_method_int m); char *cram_content_type2str(enum cram_content_type t); /* diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 0ede383f5..0c3ae8542 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -209,11 +209,61 @@ struct cram_file_def { struct cram_slice; -/* Now in htslib/cram.h -enum cram_block_method { - ... +// Internal version of htslib/cram.h enum. +// Note these have to match the layout of methmap and meth_cost in +// cram_io.c:cram_compress_block2 +enum cram_block_method_int { + // Public methods as defined in the CRAM spec. + BM_ERROR = -1, + + // CRAM 2.x and 3.0 + RAW = 0, + GZIP = 1, + BZIP2 = 2, + LZMA = 3, + RANS = 4, RANS0 = RANS, + + // CRAM 3.1 onwards + RANSPR = 5, RANS_PR0 = RANSPR, + ARITH = 6, ARITH_PR0 = ARITH, + FQZ = 7, + TOK3 = 8, + // BSC = 9, ZSTD = 10 + + // Methods not externalised, but used in metrics. + // Externally they become one of the above methods.
+ GZIP_RLE = 11, + GZIP_1, // Z_DEFAULT_STRATEGY level 1, NB: not externalised in CRAM + + FQZ_b, FQZ_c, FQZ_d, // Various preset FQZ methods + + //RANS0, // Order 0 + RANS1, + + //RANS_PR0, // Order 0 + RANS_PR1, // Order 1 + RANS_PR64, // O0 + RLE + RANS_PR9, // O1 + X4 + RANS_PR128, // O0 + Pack + RANS_PR129, // O1 + Pack + RANS_PR192, // O0 + RLE + pack + RANS_PR193, // O1 + RLE + pack + + //TOK3, // tok+rans + TOKA, // tok+arith + + //ARITH_PR0, // Order 0 + ARITH_PR1, // Order 1 + ARITH_PR64, // O0 + RLE + ARITH_PR9, // O1 + X4 + ARITH_PR128, // O0 + Pack + ARITH_PR129, // O1 + Pack + ARITH_PR192, // O0 + RLE + pack + ARITH_PR193, // O1 + RLE + pack + + // NB: must end on no more than 31 unless we change to a + // 64-bit method type. }; -*/ /* Now in htslib/cram.h enum cram_content_type { @@ -259,7 +309,7 @@ KHASH_MAP_INIT_INT(m_metrics, cram_metrics*) /* Block */ struct cram_block { - enum cram_block_method method, orig_method; + enum cram_block_method_int method, orig_method; enum cram_content_type content_type; int32_t content_id; int32_t comp_size; diff --git a/htslib/cram.h b/htslib/cram.h index 5446945e4..607cf8bcf 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -47,6 +47,17 @@ DEALINGS IN THE SOFTWARE. */ extern "C" { #endif +// see cram/cram_structs.h for a more complete internal copy of this enum + +// Htslib 1.11 had these listed without any hts prefix, and included +// some internal values such as RANS1 and GZIP_RLE (which shouldn't have ever +// been public). +// +// We can't find evidence of these being used and the data type occurs +// nowhere in functions or structures meaning using it would be pointless. +// However for safety, if you absolutely need the API to not change then +// define HTS_COMPAT to 101100 (XYYYZZ for X.Y[.Z], meaning 1.11). +#if defined(HTS_COMPAT) && HTS_COMPAT <= 101100 enum cram_block_method { // Public methods as defined in the CRAM spec.
BM_ERROR = -1, @@ -56,50 +67,16 @@ enum cram_block_method { GZIP = 1, BZIP2 = 2, LZMA = 3, - RANS = 4, RANS0 = RANS, - - // CRAM 3.1 onwards - RANSPR = 5, RANS_PR0 = RANSPR, - ARITH = 6, ARITH_PR0 = ARITH, - FQZ = 7, - TOK3 = 8, - // BSC = 9, ZSTD = 10 - - // Methods not externalised, but used in metrics. - // Externally they become one of the above methods. - GZIP_RLE = 11, // NB: not externalised in CRAM - GZIP_1, // Z_DEFAULT_STRATEGY level 1, NB: not externalised in CRAM - - FQZ_b, FQZ_c, FQZ_d, // Various preset FQZ methods - - //RANS0, // Order 0 - RANS1, - - //RANS_PR0, // Order 0 - RANS_PR1, // Order 1 - RANS_PR64, // O0 + RLE - RANS_PR9, // O1 + X4 - RANS_PR128, // O0 + Pack - RANS_PR129, // O1 + Pack - RANS_PR192, // O0 + RLE + pack - RANS_PR193, // O1 + RLE + pack - - //TOK3, // tok+rans - TOKA, // tok+arith - - //ARITH_PR0, // Order 0 - ARITH_PR1, // Order 1 - ARITH_PR64, // O0 + RLE - ARITH_PR9, // O1 + X4 - ARITH_PR128, // O0 + Pack - ARITH_PR129, // O1 + Pack - ARITH_PR192, // O0 + RLE + pack - ARITH_PR193, // O1 + RLE + pack - - // NB: must end on no more than 31 unless we change to a - // 64-bit method type. - + RANS = 4, + + // NB: the subsequent numbers may change. They're simply here for + // compatibility with the old API, but may have no bearing on the + // internal way htslib works. DO NOT USE + RANS0 = 4, + RANS1 = 10, + GZIP_RLE = 11, }; +#endif enum cram_content_type { CT_ERROR = -1, From 878c71bf0ec52ed66666a37e84241a62c474fecd Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 16 Dec 2020 20:40:20 +0000 Subject: [PATCH 056/488] Get htslib's "make test" to run the htscodecs tests They are included by adding extra dependencies to the "test" and "check" targets when htscodecs is bundled via the submodule. 
--- Makefile | 57 ++++++++++++++++++++++++++++++++++++++++++- htscodecs_bundled.mk | 6 +++++ htscodecs_external.mk | 1 + 3 files changed, 63 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 4168c0463..5b447cbce 100644 --- a/Makefile +++ b/Makefile @@ -426,7 +426,7 @@ maintainer-check: # # If using MSYS, avoid poor shell expansion via: # MSYS2_ARG_CONV_EXCL="*" make check -check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) +check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) $(HTSCODECS_TEST_TARGETS) test/hts_endian test/test_expr test/test_kfunc @@ -508,6 +508,56 @@ test/test-bcf-sr: test/test-bcf-sr.o libhts.a test/test-bcf-translate: test/test-bcf-translate.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a -lz $(LIBS) -lpthread +# Extra tests for bundled htscodecs +test_htscodecs_rans4x8: htscodecs/tests/rans4x8 + cd htscodecs/tests && srcdir=. && export srcdir && ./rans4x8.test + +test_htscodecs_rans4x16: htscodecs/tests/rans4x16pr + cd htscodecs/tests && srcdir=. && export srcdir && ./rans4x16.test + +test_htscodecs_arith: htscodecs/tests/arith_dynamic + cd htscodecs/tests && srcdir=. && export srcdir && ./arith.test + +test_htscodecs_tok3: htscodecs/tests/tokenise_name3 + cd htscodecs/tests && srcdir=. && export srcdir && ./tok3.test + +test_htscodecs_fqzcomp: htscodecs/tests/fqzcomp_qual + cd htscodecs/tests && srcdir=. 
&& export srcdir && ./fqzcomp.test + +test_htscodecs_varint: htscodecs/tests/varint + cd htscodecs/tests && ./varint + +htscodecs/tests/arith_dynamic: htscodecs/tests/arith_dynamic_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/fqzcomp_qual: htscodecs/tests/fqzcomp_qual_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/rans4x16pr: htscodecs/tests/rANS_static4x16pr_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/rans4x8: htscodecs/tests/rANS_static_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/tokenise_name3: htscodecs/tests/tokenise_name3_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/varint: htscodecs/tests/varint_test.o $(HTSCODECS_OBJS) + $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread + +htscodecs/tests/arith_dynamic_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/arith_dynamic_test.o: htscodecs/tests/arith_dynamic_test.c $(htscodecs_arith_dynamic_h) +htscodecs/tests/fqzcomp_qual_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/fqzcomp_qual_test.o: htscodecs/tests/fqzcomp_qual_test.c $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) +htscodecs/tests/rANS_static4x16pr_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/rANS_static4x16pr_test.o: htscodecs/tests/rANS_static4x16pr_test.c $(htscodecs_rANS_static4x16_h) +htscodecs/tests/rANS_static_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/rANS_static_test.o: htscodecs/tests/rANS_static_test.c $(htscodecs_rANS_static_h) +htscodecs/tests/tokenise_name3_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/tokenise_name3_test.o: htscodecs/tests/tokenise_name3_test.c $(htscodecs_tokenise_name3_h) +htscodecs/tests/varint_test.o: CPPFLAGS += -Ihtscodecs 
-D_POSIX_C_SOURCE=200112L +htscodecs/tests/varint_test.o: htscodecs/tests/varint_test.c $(htscodecs_varint_h) + test/hts_endian.o: test/hts_endian.c config.h $(htslib_hts_endian_h) test/fuzz/hts_open_fuzzer.o: test/fuzz/hts_open_fuzzer.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) test/fieldarith.o: test/fieldarith.c config.h $(htslib_sam_h) @@ -629,14 +679,17 @@ htslib-uninstalled.pc: htslib.pc.tmp testclean: -rm -f test/*.tmp test/*.tmp.* test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt + -rm -rf htscodecs/tests/test.out mostlyclean: testclean -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM version.h -rm -f htscodecs/htscodecs/*.o htscodecs/htscodecs/*.pico -rm -f hts-object-files + -rm -f htscodecs/tests/*.o clean: mostlyclean clean-$(SHLIB_FLAVOUR) -rm -f libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) $(BUILT_TEST_PROGRAMS) $(BUILT_THRASH_PROGRAMS) + -rm -f htscodecs/tests/rans4x8 htscodecs/tests/rans4x16pr htscodecs/tests/arith_dynamic htscodecs/tests/tokenise_name3 htscodecs/tests/fqzcomp_qual htscodecs/tests/varint distclean maintainer-clean: clean -rm -f config.cache config.h config.log config.mk config.status @@ -683,3 +736,5 @@ force: .PHONY: clean-cygdll install-cygdll .PHONY: clean-dll install-dll .PHONY: clean-dylib install-dylib +.PHONY: test_htscodecs_rans4x8 test_htscodecs_rans4x16 test_htscodecs_arith +.PHONY: test_htscodecs_tok3 test_htscodecs_fqzcomp test_htscodecs_varint diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index d40bbd095..5a8328e0e 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -7,3 +7,9 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ $(HTSPREFIX)htscodecs/htscodecs/tokenise_name3.c HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) + +# Add htscodecs tests into the HTSlib test framework + +HTSCODECS_TEST_TARGETS = test_htscodecs_rans4x8 \ + test_htscodecs_rans4x16 test_htscodecs_arith 
test_htscodecs_tok3 \ + test_htscodecs_fqzcomp test_htscodecs_varint diff --git a/htscodecs_external.mk b/htscodecs_external.mk index f1d82faf8..7ee47402c 100644 --- a/htscodecs_external.mk +++ b/htscodecs_external.mk @@ -1,5 +1,6 @@ HTSCODECS_SOURCES = HTSCODECS_OBJS = +HTSCODECS_TEST_TARGETS = htscodecs_arith_dynamic_h = htscodecs_fqzcomp_qual_h = From 1fc55428dfffbc36609798bb823be51f733bb55a Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 5 Jan 2021 15:52:12 +0000 Subject: [PATCH 057/488] Add configure check for the htscodecs submodule So that anyone who tries to build the library without setting up the submodule first will get a hint on how to make the build work. The check isn't really needed for anyone building from a release tarball, but it will be harmless as it should always succeed (as long as the tarball was built correctly and includes the htscodecs files). --- configure.ac | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 890d7aa31..f990c370b 100644 --- a/configure.ac +++ b/configure.ac @@ -306,7 +306,26 @@ if necessary set CPPFLAGS and LDFLAGS so the compiler can find them; or configure using --without-external-htscodecs to build the required functions from the htscodecs submodule. ])])], - [selected_htscodecs_mk="htscodecs_bundled.mk"]) + [AC_MSG_CHECKING([whether htscodecs files are present]) + AS_IF([test -e "$srcdir/htscodecs/htscodecs/rANS_static4x16.h"], + [AC_MSG_RESULT([yes]) + selected_htscodecs_mk="htscodecs_bundled.mk"], + [AC_MSG_RESULT([no]) + AS_IF([test -e "$srcdir/.git"], + [MSG_ERROR([htscodecs submodule files not present. + +HTSlib uses some functions from the htscodecs project, which is normally +included as a submodule. Try running: + + git submodule update --init --recursive + +to update it, and then re-run configure. +])], + [MSG_ERROR([htscodecs submodule files not present. + +You have an incomplete distribution. 
Please try downloading one of the +official releases from https://www.htslib.org +])])])]) AC_SUBST([selected_htscodecs_mk]) AS_IF([test "x$with_libdeflate" != "xno"], From f895c674f5f946f291d6c9f8d3fe956bab485800 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 5 Jan 2021 18:54:25 +0000 Subject: [PATCH 058/488] Add Makefile check for the htscodecs submodule Currently if the submodule files are missing it asks the user to run "git submodule update" to install them. It's possible that the Makefile run this itself and continue, although I'm not sure if that would be completely safe in parallel builds. The variables lising dependencies for htscodecs header files are moved to htscodecs_bundled.mk to ensure they are only set for submodule builds. --- Makefile | 34 ++++++++++++++++++++-------------- htscodecs_bundled.mk | 14 ++++++++++++++ 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/Makefile b/Makefile index 5b447cbce..5558799cb 100644 --- a/Makefile +++ b/Makefile @@ -212,20 +212,6 @@ sam_internal_h = sam_internal.h $(htslib_sam_h) textutils_internal_h = textutils_internal.h $(htslib_kstring_h) thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) -htscodecs_arith_dynamic_h = htscodecs/htscodecs/arith_dynamic.h -htscodecs_fqzcomp_qual_h = htscodecs/htscodecs/fqzcomp_qual.h -htscodecs_pack_h = htscodecs/htscodecs/pack.h -htscodecs_rANS_static_h = htscodecs/htscodecs/rANS_static.h -htscodecs_rANS_static4x16_h = htscodecs/htscodecs/rANS_static4x16.h -htscodecs_rle_h = htscodecs/htscodecs/rle.h -htscodecs_tokenise_name3_h = htscodecs/htscodecs/tokenise_name3.h -htscodecs_varint_h = htscodecs/htscodecs/varint.h - -htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h -htscodecs_c_range_coder_h = htscodecs/htscodecs/c_range_coder.h -htscodecs_c_simple_model_h = htscodecs/htscodecs/c_simple_model.h $(htscodecs_c_range_coder_h) -htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h - # To be effective, config.mk needs to 
appear after most Makefile variables are # set but before most rules appear, so that it can both use previously-set # variables in its own rules' prerequisites and also update variables for use @@ -414,6 +400,26 @@ bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h) +# Runes to check that the htscodecs submodule is present +ifdef HTSCODECS_SOURCES +htscodecs/htscodecs/%.c: | htscodecs/htscodecs + ; +htscodecs/htscodecs/%.h: | htscodecs/htscodecs + ; +htscodecs/htscodecs: + @if test -e .git ; then \ + printf "\\n\\nError: htscodecs submodule files not present.\\n\ + Try running: \\n\ + git submodule update --init --recursive\\n\ + and then re-run make.\\n\\n\\n" ; \ + else \ + printf "\\n\\nError: htscodecs submodule files not present and this is not a git checkout.\\n\ + You have an incomplete distribution. 
Please try downloading one of the\\n\ + official releases from https://www.htslib.org/\\n" ; \ + fi + @false +endif + # Maintainer source code checks # - copyright boilerplate presence # - tab and trailing space detection diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index 5a8328e0e..d270773f7 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -8,6 +8,20 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) +htscodecs_arith_dynamic_h = htscodecs/htscodecs/arith_dynamic.h +htscodecs_fqzcomp_qual_h = htscodecs/htscodecs/fqzcomp_qual.h +htscodecs_pack_h = htscodecs/htscodecs/pack.h +htscodecs_rANS_static_h = htscodecs/htscodecs/rANS_static.h +htscodecs_rANS_static4x16_h = htscodecs/htscodecs/rANS_static4x16.h +htscodecs_rle_h = htscodecs/htscodecs/rle.h +htscodecs_tokenise_name3_h = htscodecs/htscodecs/tokenise_name3.h +htscodecs_varint_h = htscodecs/htscodecs/varint.h + +htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h +htscodecs_c_range_coder_h = htscodecs/htscodecs/c_range_coder.h +htscodecs_c_simple_model_h = htscodecs/htscodecs/c_simple_model.h $(htscodecs_c_range_coder_h) +htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h + # Add htscodecs tests into the HTSlib test framework HTSCODECS_TEST_TARGETS = test_htscodecs_rans4x8 \ From 67805c7811a758da6a1e3ceef06004378de23460 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 2 Nov 2020 12:36:56 +0000 Subject: [PATCH 059/488] Added some simple introspection functions. These permit testing of explicit features, e.g. if (htslib_test_feature(HTS_FEATURE_PLUGINS)) { ... } or capturing the entire feature bit-field via htslib_features(). (This function may be redundant so perhaps is a candidate for culling?). This also permits querying compilation details: CC, CFLAGS, LDFLAGS and CPPFLAGS. 
Finally there is a htslib_feature_string() function which is mainly for verbose feature printing rather than programmatic parsing, but it conveniently also permits us to get features from a library simply by running "strings" on the binary. Note this doesn't actually do any testing so it's not in the Makefile. It's hard to know quite what testing it could do given by design the output would differ based on how the user built it. We could maybe call it to make sure it doesn't crash, but that's not likely to be something we need regression testing on. --- Makefile | 9 ++- hts_internal.h | 1 + hts_os.c | 147 ++++++++++++++++++++++++++++++++++++++ htslib/hts_os.h | 39 ++++++++++ plugin.c | 27 +++++++ test/test_introspection.c | 62 ++++++++++++++++ 6 files changed, 284 insertions(+), 1 deletion(-) create mode 100644 test/test_introspection.c diff --git a/Makefile b/Makefile index eac4b809d..f598f66cc 100644 --- a/Makefile +++ b/Makefile @@ -90,7 +90,8 @@ BUILT_TEST_PROGRAMS = \ test/test-bcf-sr \ test/fuzz/hts_open_fuzzer.o \ test/test-bcf-translate \ - test/test-parse-reg + test/test-parse-reg \ + test/test_introspection BUILT_THRASH_PROGRAMS = \ test/thrash_threads1 \ @@ -341,6 +342,8 @@ hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c +hts_os.o hts_os.pico: TMP_CPPFLAGS := $(CPPFLAGS) +hts_os.o hts_os.pico: CPPFLAGS += -DHTS_CPPFLAGS=\"$(TMP_CPPFLAGS)\" -DHTS_CFLAGS="\"$(CFLAGS)\"" -DHTS_LDFLAGS="\"$(LDFLAGS)\"" -DHTS_CC="\"$(CC)\"" vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) 
$(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h) @@ -481,6 +484,9 @@ test/test-bcf-sr: test/test-bcf-sr.o libhts.a test/test-bcf-translate: test/test-bcf-translate.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test-bcf-translate.o libhts.a -lz $(LIBS) -lpthread +test/test_introspection: test/test_introspection.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_introspection.o libhts.a $(LIBS) -lpthread + test/hts_endian.o: test/hts_endian.c config.h $(htslib_hts_endian_h) test/fuzz/hts_open_fuzzer.o: test/fuzz/hts_open_fuzzer.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) test/fieldarith.o: test/fieldarith.c config.h $(htslib_sam_h) @@ -502,6 +508,7 @@ test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h) test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h) test/test-bcf-translate.o: test/test-bcf-translate.c config.h $(htslib_vcf_h) +test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hts_os_h) test/thrash_threads1: test/thrash_threads1.o libhts.a diff --git a/hts_internal.h b/hts_internal.h index 4e5e20151..5ee88b7d0 100644 --- a/hts_internal.h +++ b/hts_internal.h @@ -104,6 +104,7 @@ plugin_void_func *load_plugin(void **pluginp, const char *filename, const char * void *plugin_sym(void *plugin, const char *name, const char **errmsg); plugin_void_func *plugin_func(void 
*plugin, const char *name, const char **errmsg); void close_plugin(void *plugin); +const char *htslib_plugin_path(void); /* * Buffers up arguments to hts_idx_push for later use, once we've written all bar diff --git a/hts_os.c b/hts_os.c index c26700975..c533c8cd9 100644 --- a/hts_os.c +++ b/hts_os.c @@ -57,3 +57,150 @@ long hts_lrand48(void) { return lrand48(); } // #define USE_FILEEXTD // #include "os/iscygpty.c" // #endif + + +#include +#include +#include "hts_internal.h" +#include "htslib/hts.h" +#include "htslib/hts_os.h" +#include "htslib/kstring.h" + +unsigned int htslib_features(void) { + unsigned int feat = 0; + +#ifdef PACKAGE_URL + feat |= HTS_FEATURE_CONFIGURE; +#endif + +#ifdef ENABLE_PLUGINS + feat |= HTS_FEATURE_PLUGINS; +#endif + +#ifdef HAVE_LIBCURL + feat |= HTS_FEATURE_LIBCURL; +#endif + +#ifdef ENABLE_S3 + feat |= HTS_FEATURE_S3; +#endif + +#ifdef ENABLE_GCS + feat |= HTS_FEATURE_GCS; +#endif + +#ifdef HAVE_LIBDEFLATE + feat |= HTS_FEATURE_LIBDEFLATE; +#endif + +#ifdef HAVE_LIBLZMA + feat |= HTS_FEATURE_LZMA; +#endif + +#ifdef HAVE_LIBBZ2 + feat |= HTS_FEATURE_BZIP2; +#endif + + return feat; +} + +const char *htslib_test_feature(int id) { + int feat = htslib_features(); + + switch (id) { + case HTS_FEATURE_CONFIGURE: + return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL; + case HTS_FEATURE_PLUGINS: + return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL; + case HTS_FEATURE_LIBCURL: + return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL; + case HTS_FEATURE_S3: + return feat & HTS_FEATURE_S3 ? "yes" : NULL; + case HTS_FEATURE_GCS: + return feat & HTS_FEATURE_GCS ? "yes" : NULL; + case HTS_FEATURE_LIBDEFLATE: + return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL; + case HTS_FEATURE_BZIP2: + return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL; + case HTS_FEATURE_LZMA: + return feat & HTS_FEATURE_LZMA ? 
"yes" : NULL; + + case HTS_FEATURE_CC: + return HTS_CC; + case HTS_FEATURE_CFLAGS: + return HTS_CFLAGS; + case HTS_FEATURE_LDFLAGS: + return HTS_LDFLAGS; + case HTS_FEATURE_CPPFLAGS: + return HTS_CPPFLAGS; + + default: + fprintf(stderr, "Unknown feature code: %d\n", id); + } + + return NULL; +} + +// Note this implementation also means we can just "strings" the library +// to find the configuration parameters. +const char *htslib_feature_string(void) { + const char *fmt= + +#ifdef PACKAGE_URL + "build=configure " +#else + "build=Makefile " +#endif + +#ifdef ENABLE_PLUGINS + "plugins=yes, plugin-path=%.1000s " +#else + "plugins=no " +#endif + +#ifdef HAVE_LIBCURL + "libcurl=yes " +#else + "libcurl=no " +#endif + +#ifdef ENABLE_S3 + "S3=yes " +#else + "S3=no " +#endif + +#ifdef ENABLE_GCS + "GCS=yes " +#else + "GCS=no " +#endif + +#ifdef HAVE_LIBDEFLATE + "libdeflate=yes " +#else + "libdeflate=no " +#endif + +#ifdef HAVE_LIBLZMA + "lzma=yes " +#else + "lzma=no " +#endif + +#ifdef HAVE_LIBBZ2 + "bzip2=yes "; +#else + "bzip2=no "; +#endif + +#ifdef ENABLE_PLUGINS + static char config[1200]; + sprintf(config, fmt, htslib_plugin_path()); + return config; +#else + return fmt; +#endif +} + +// Plus hts_version here? diff --git a/htslib/hts_os.h b/htslib/hts_os.h index 2f988c3c9..eab438b54 100644 --- a/htslib/hts_os.h +++ b/htslib/hts_os.h @@ -77,4 +77,43 @@ extern int is_cygpty(int fd); #define random rand #endif +/*! @abstract Introspection on the features enabled in htslib + * + * @return a bitfield of HTS_FEATURE_* macros. + */ +HTSLIB_EXPORT +unsigned int htslib_features(void); + +HTSLIB_EXPORT +const char *htslib_test_feature(int id); + +/*! 
@abstract Introspection on the features enabled in htslib, string form + * + * @return a string describing htslib build features + */ +HTSLIB_EXPORT +const char *htslib_feature_string(void); + +// Whether ./configure was used or vanilla Makefile +#define HTS_FEATURE_CONFIGURE 1 + +// Also see htslib_plugin_path function +#define HTS_FEATURE_PLUGINS 2 + +// Transport specific +#define HTS_FEATURE_LIBCURL 4 +#define HTS_FEATURE_S3 8 +#define HTS_FEATURE_GCS 16 + +// Compression options +#define HTS_FEATURE_LIBDEFLATE 32 +#define HTS_FEATURE_LZMA 64 +#define HTS_FEATURE_BZIP2 128 + +// Build params +#define HTS_FEATURE_CC (1<<28) +#define HTS_FEATURE_CFLAGS (1<<29) +#define HTS_FEATURE_LDFLAGS (1<<30) +#define HTS_FEATURE_CPPFLAGS (1<<31) + #endif diff --git a/plugin.c b/plugin.c index d5c1981ca..fccc83793 100644 --- a/plugin.c +++ b/plugin.c @@ -191,3 +191,30 @@ void close_plugin(void *plugin) __func__, dlerror()); } } + +const char *htslib_plugin_path(void) { +#ifdef ENABLE_PLUGINS + char *path = getenv("HTS_PATH"); + if (!path) path = ""; + + kstring_t ks = {0}; + while(1) { + size_t len = strcspn(path, HTS_PATH_SEPARATOR_STR); + if (len == 0) kputs(PLUGINPATH, &ks); + else kputsn(path, len, &ks); + kputc(HTS_PATH_SEPARATOR_CHAR, &ks); + + path += len; + if (*path == HTS_PATH_SEPARATOR_CHAR) path++; + else break; + } + + static char s_path[1024]; + sprintf(s_path, "%.1023s", ks.s ? ks.s : ""); + free(ks.s); + + return s_path; +#else + return NULL; +#endif +} diff --git a/test/test_introspection.c b/test/test_introspection.c new file mode 100644 index 000000000..6bce199dc --- /dev/null +++ b/test/test_introspection.c @@ -0,0 +1,62 @@ +/* test/test_introspection.c -- demonstration of introspection function usage + + Copyright (C) 2020 Genome Research Ltd. 
+ + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#include +#include + +#include "../htslib/hts.h" +#include "../htslib/hts_os.h" + +int main(void) { + printf("Version string: %s\n", hts_version()); + printf("Version number: %d\n", HTS_VERSION); + + printf("\nCC: %s\n", htslib_test_feature(HTS_FEATURE_CC)); + printf("CPPFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_CPPFLAGS)); + printf("CFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_CFLAGS)); + printf("LDFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_LDFLAGS)); + + unsigned int feat = htslib_features(); + printf("\nFeature number: 0x%x\n", feat); + if (feat & HTS_FEATURE_CONFIGURE) + printf(" HTS_FEATURE_CONFIGURE\n"); + if (feat & HTS_FEATURE_PLUGINS) + printf(" HTS_FEATURE_PLUGINS\n"); + if (feat & HTS_FEATURE_LIBCURL) + printf(" HTS_FEATURE_LIBCURL\n"); + if (feat & HTS_FEATURE_S3) + printf(" HTS_FEATURE_S3\n"); + if (feat & HTS_FEATURE_GCS) + printf(" HTS_FEATURE_GCS\n"); + if (feat & HTS_FEATURE_LIBDEFLATE) + printf(" HTS_FEATURE_LIBDEFLATE\n"); + if (feat & HTS_FEATURE_LZMA) + printf(" HTS_FEATURE_LZMA\n"); + if (feat & HTS_FEATURE_BZIP2) + printf(" HTS_FEATURE_BZIP2\n"); + + printf("\nFeature string: %s\n", htslib_feature_string()); + + return 0; +} From cd48384237cd63d5b1e1a42e1946e6e59143f790 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 24 Nov 2020 15:24:06 +0000 Subject: [PATCH 060/488] Added a plugin and scheme query API --- hfile.c | 105 ++++++++++++++++++++++++++++++++++++++ hts_os.c | 2 +- htslib/hfile.h | 48 +++++++++++++++++ test/test_introspection.c | 24 ++++++++- 4 files changed, 177 insertions(+), 2 deletions(-) diff --git a/hfile.c b/hfile.c index 218520a7c..35b4adac7 100644 --- a/hfile.c +++ b/hfile.c @@ -1142,6 +1142,111 @@ static const struct hFILE_scheme_handler *find_scheme_handler(const char *s) return (k != kh_end(schemes))? kh_value(schemes, k) : &unknown_scheme; } + +/* + * Fills out sc_list[] with the list of known schemes. + * This can be restricted to just ones from a specific plugin, + * or all (plugin == NULL). 
+ * + * Returns number of schemes found on success; + * -1 on failure. + */ +HTSLIB_EXPORT +int hts_list_schemes(const char *plugin, const char *sc_list[], int *nschemes) +{ + pthread_mutex_lock(&plugins_lock); + if (!schemes && load_hfile_plugins() < 0) { + pthread_mutex_unlock(&plugins_lock); + return -1; + } + pthread_mutex_unlock(&plugins_lock); + + khiter_t k; + int ns = 0; + + for (k = kh_begin(schemes); k != kh_end(schemes); k++) { + if (!kh_exist(schemes, k)) + continue; + + const struct hFILE_scheme_handler *s = kh_value(schemes, k); + if (plugin && strcmp(s->provider, plugin) != 0) + continue; + + if (ns < *nschemes) + sc_list[ns] = kh_key(schemes, k); + ns++; + } + + if (*nschemes > ns) + *nschemes = ns; + + return ns; +} + + +/* + * Fills out plist[] with the list of known plugins. + * + * Returns number of schemes found on success; + * -1 on failure + */ +HTSLIB_EXPORT +int hts_list_plugins(const char *plist[], int *nplugins) +{ + pthread_mutex_lock(&plugins_lock); + if (!schemes && load_hfile_plugins() < 0) { + pthread_mutex_unlock(&plugins_lock); + return -1; + } + pthread_mutex_unlock(&plugins_lock); + + int np = 0; + if (*nplugins) + plist[np++] = "built-in"; + + struct hFILE_plugin_list *p = plugins; + while (p) { + if (np < *nplugins) + plist[np] = p->plugin.name; + + p = p->next; + np++; + } + + if (*nplugins > np) + *nplugins = np; + + return np; +} + + +/* + * Tests for the presence of a specific plugin. + * + * Returns 1 if true + * 0 otherwise + */ +HTSLIB_EXPORT +int htslib_has_plugin(const char *name) +{ + pthread_mutex_lock(&plugins_lock); + if (!schemes && load_hfile_plugins() < 0) { + pthread_mutex_unlock(&plugins_lock); + return -1; + } + pthread_mutex_unlock(&plugins_lock); + + struct hFILE_plugin_list *p = plugins; + while (p) { + if (strcmp(p->plugin.name, name) == 0) + return 1; + p = p->next; + } + + return 0; +} + + hFILE *hopen(const char *fname, const char *mode, ...) 
{ const struct hFILE_scheme_handler *handler = find_scheme_handler(fname); diff --git a/hts_os.c b/hts_os.c index c533c8cd9..adcf222c9 100644 --- a/hts_os.c +++ b/hts_os.c @@ -1,7 +1,7 @@ /// @file hts_os.c /// Operating System specific tweaks, for compatibility with POSIX. /* - Copyright (C) 2017, 2019 Genome Research Ltd. + Copyright (C) 2017, 2019, 2020 Genome Research Ltd. Author: James Bonfield diff --git a/htslib/hfile.h b/htslib/hfile.h index 08d3edf7e..9ca489d2d 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -316,6 +316,54 @@ purpose other than closing. HTSLIB_EXPORT char *hfile_mem_steal_buffer(hFILE *file, size_t *length); +/// Fills out sc_list[] with the list of known schemes. +/** + * @param plugin [in] Restricts schemes to only those from 'plugin. + * @param sc_list [out] Filled out with the scheme names + * @param nschemes [in/out] Size of sc_list (in) and number returned (out) + * + * Plugin may be passed in as NULL in which case all schemes are returned. + * Use plugin "built-in" to list the built in schemes. + * The size of sc_list is determined by the input value of *nschemes. + * This is updated to return the output size. It is up to the caller to + * determine whether to call again with a larger number if this is too small. + * + * The return value represents the total number found matching plugin, which + * may be larger than *nschemes if too small a value was specified. + * + * @return the number of schemes found on success. + * -1 on failure + */ +HTSLIB_EXPORT +int hts_list_schemes(const char *plugin, const char *sc_list[], int *nschemes); + +/// Fills out plist[] with the list of known plugins. +/* + * @param plist [out] Filled out with the plugin names + * @param nplugins [in/out] Size of plist (in) and number returned (out) + * + * The size of plist is determined by the input value of *nplugins. + * This is updated to return the output size. 
It is up to the caller to + * determine whether to call again with a larger number if this is too small. + * + * The return value represents the total number found, which may be + * larger than *nplugins if too small a value was specified. + * + * @return the number of plugins found on success. + * -1 on failure + */ +HTSLIB_EXPORT +int hts_list_plugins(const char *plist[], int *nplugins); + +/// Tests for the presence of a specific plugin. +/* + * @param name The name of the plugin to query. + * + * @return 1 if found, 0 otherwise. + */ +HTSLIB_EXPORT +int htslib_has_plugin(const char *name); + #ifdef __cplusplus } #endif diff --git a/test/test_introspection.c b/test/test_introspection.c index 6bce199dc..ea90430c4 100644 --- a/test/test_introspection.c +++ b/test/test_introspection.c @@ -26,6 +26,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include "../htslib/hts.h" +#include "../htslib/hfile.h" #include "../htslib/hts_os.h" int main(void) { @@ -36,7 +37,7 @@ int main(void) { printf("CPPFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_CPPFLAGS)); printf("CFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_CFLAGS)); printf("LDFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_LDFLAGS)); - + unsigned int feat = htslib_features(); printf("\nFeature number: 0x%x\n", feat); if (feat & HTS_FEATURE_CONFIGURE) @@ -58,5 +59,26 @@ int main(void) { printf("\nFeature string: %s\n", htslib_feature_string()); + + // Plugins and schemes + printf("\nPlugins present:\n"); + const char *plugins[100]; + int np = 100, i, j; + + if (hts_list_plugins(plugins, &np) < 0) + return 1; + + for (i = 0; i < np; i++) { + const char *sc_list[100]; + int nschemes = 100; + if (hts_list_schemes(plugins[i], sc_list, &nschemes) < 0) + return 1; + + printf(" %s:\n", plugins[i]); + for (j = 0; j < nschemes; j++) + printf("\t%s\n", sc_list[j]); + puts(""); + } + return 0; } From b5ed0101b6d4d8345cc0211ea880537c003c461b Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 24 Nov 2020 16:18:44 +0000 Subject: 
[PATCH 061/488] Moved the introspection code from hts_os.[ch] to hfile.[ch]. Some of it needs to be there anyway for the scheme and plugin APIs. --- Makefile | 6 +- hfile.c | 147 +++++++++++++++++++++++++++++++++++++ hts_os.c | 149 +------------------------------------- htslib/hfile.h | 39 ++++++++++ htslib/hts_os.h | 41 +---------- test/test_introspection.c | 1 - 6 files changed, 191 insertions(+), 192 deletions(-) diff --git a/Makefile b/Makefile index f598f66cc..b02cbf0a9 100644 --- a/Makefile +++ b/Makefile @@ -335,6 +335,8 @@ errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htsl kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(htslib_kstring_h) $(hts_internal_h) $(htslib_khash_h) +hfile.o hfile.pico: TMP_CPPFLAGS := $(CPPFLAGS) +hfile.o hfile.pico: CPPFLAGS += -DHTS_CPPFLAGS=\"$(TMP_CPPFLAGS)\" -DHTS_CFLAGS="\"$(CFLAGS)\"" -DHTS_LDFLAGS="\"$(LDFLAGS)\"" -DHTS_CC="\"$(CC)\"" hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) @@ -342,8 +344,6 @@ hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h 
$(htslib_hts_defs_h) os/rand.c -hts_os.o hts_os.pico: TMP_CPPFLAGS := $(CPPFLAGS) -hts_os.o hts_os.pico: CPPFLAGS += -DHTS_CPPFLAGS=\"$(TMP_CPPFLAGS)\" -DHTS_CFLAGS="\"$(CFLAGS)\"" -DHTS_LDFLAGS="\"$(LDFLAGS)\"" -DHTS_CC="\"$(CC)\"" vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h) @@ -508,7 +508,7 @@ test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h) test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h) test/test-bcf-translate.o: test/test-bcf-translate.c config.h $(htslib_vcf_h) -test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hts_os_h) +test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hfile_h) test/thrash_threads1: test/thrash_threads1.o libhts.a diff --git a/hfile.c b/hfile.c index 35b4adac7..e5861c25c 100644 --- a/hfile.c +++ b/hfile.c @@ -1143,6 +1143,10 @@ static const struct hFILE_scheme_handler *find_scheme_handler(const char *s) } +/*************************** + * Library introspection functions + ***************************/ + /* * Fills out sc_list[] with the list of known schemes. 
* This can be restricted to just ones from a specific plugin, @@ -1246,6 +1250,149 @@ int htslib_has_plugin(const char *name) return 0; } +HTSLIB_EXPORT +unsigned int htslib_features(void) { + unsigned int feat = 0; + +#ifdef PACKAGE_URL + feat |= HTS_FEATURE_CONFIGURE; +#endif + +#ifdef ENABLE_PLUGINS + feat |= HTS_FEATURE_PLUGINS; +#endif + +#ifdef HAVE_LIBCURL + feat |= HTS_FEATURE_LIBCURL; +#endif + +#ifdef ENABLE_S3 + feat |= HTS_FEATURE_S3; +#endif + +#ifdef ENABLE_GCS + feat |= HTS_FEATURE_GCS; +#endif + +#ifdef HAVE_LIBDEFLATE + feat |= HTS_FEATURE_LIBDEFLATE; +#endif + +#ifdef HAVE_LIBLZMA + feat |= HTS_FEATURE_LZMA; +#endif + +#ifdef HAVE_LIBBZ2 + feat |= HTS_FEATURE_BZIP2; +#endif + + return feat; +} + +HTSLIB_EXPORT +const char *htslib_test_feature(int id) { + int feat = htslib_features(); + + switch (id) { + case HTS_FEATURE_CONFIGURE: + return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL; + case HTS_FEATURE_PLUGINS: + return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL; + case HTS_FEATURE_LIBCURL: + return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL; + case HTS_FEATURE_S3: + return feat & HTS_FEATURE_S3 ? "yes" : NULL; + case HTS_FEATURE_GCS: + return feat & HTS_FEATURE_GCS ? "yes" : NULL; + case HTS_FEATURE_LIBDEFLATE: + return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL; + case HTS_FEATURE_BZIP2: + return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL; + case HTS_FEATURE_LZMA: + return feat & HTS_FEATURE_LZMA ? "yes" : NULL; + + case HTS_FEATURE_CC: + return HTS_CC; + case HTS_FEATURE_CFLAGS: + return HTS_CFLAGS; + case HTS_FEATURE_LDFLAGS: + return HTS_LDFLAGS; + case HTS_FEATURE_CPPFLAGS: + return HTS_CPPFLAGS; + + default: + fprintf(stderr, "Unknown feature code: %d\n", id); + } + + return NULL; +} + +// Note this implementation also means we can just "strings" the library +// to find the configuration parameters. 
+HTSLIB_EXPORT +const char *htslib_feature_string(void) { + const char *fmt= + +#ifdef PACKAGE_URL + "build=configure " +#else + "build=Makefile " +#endif + +#ifdef ENABLE_PLUGINS + "plugins=yes, plugin-path=%.1000s " +#else + "plugins=no " +#endif + +#ifdef HAVE_LIBCURL + "libcurl=yes " +#else + "libcurl=no " +#endif + +#ifdef ENABLE_S3 + "S3=yes " +#else + "S3=no " +#endif + +#ifdef ENABLE_GCS + "GCS=yes " +#else + "GCS=no " +#endif + +#ifdef HAVE_LIBDEFLATE + "libdeflate=yes " +#else + "libdeflate=no " +#endif + +#ifdef HAVE_LIBLZMA + "lzma=yes " +#else + "lzma=no " +#endif + +#ifdef HAVE_LIBBZ2 + "bzip2=yes "; +#else + "bzip2=no "; +#endif + +#ifdef ENABLE_PLUGINS + static char config[1200]; + sprintf(config, fmt, htslib_plugin_path()); + return config; +#else + return fmt; +#endif +} + +/*************************** + * hFILE interface proper + ***************************/ hFILE *hopen(const char *fname, const char *mode, ...) { diff --git a/hts_os.c b/hts_os.c index adcf222c9..c26700975 100644 --- a/hts_os.c +++ b/hts_os.c @@ -1,7 +1,7 @@ /// @file hts_os.c /// Operating System specific tweaks, for compatibility with POSIX. /* - Copyright (C) 2017, 2019, 2020 Genome Research Ltd. + Copyright (C) 2017, 2019 Genome Research Ltd. 
Author: James Bonfield @@ -57,150 +57,3 @@ long hts_lrand48(void) { return lrand48(); } // #define USE_FILEEXTD // #include "os/iscygpty.c" // #endif - - -#include -#include -#include "hts_internal.h" -#include "htslib/hts.h" -#include "htslib/hts_os.h" -#include "htslib/kstring.h" - -unsigned int htslib_features(void) { - unsigned int feat = 0; - -#ifdef PACKAGE_URL - feat |= HTS_FEATURE_CONFIGURE; -#endif - -#ifdef ENABLE_PLUGINS - feat |= HTS_FEATURE_PLUGINS; -#endif - -#ifdef HAVE_LIBCURL - feat |= HTS_FEATURE_LIBCURL; -#endif - -#ifdef ENABLE_S3 - feat |= HTS_FEATURE_S3; -#endif - -#ifdef ENABLE_GCS - feat |= HTS_FEATURE_GCS; -#endif - -#ifdef HAVE_LIBDEFLATE - feat |= HTS_FEATURE_LIBDEFLATE; -#endif - -#ifdef HAVE_LIBLZMA - feat |= HTS_FEATURE_LZMA; -#endif - -#ifdef HAVE_LIBBZ2 - feat |= HTS_FEATURE_BZIP2; -#endif - - return feat; -} - -const char *htslib_test_feature(int id) { - int feat = htslib_features(); - - switch (id) { - case HTS_FEATURE_CONFIGURE: - return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL; - case HTS_FEATURE_PLUGINS: - return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL; - case HTS_FEATURE_LIBCURL: - return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL; - case HTS_FEATURE_S3: - return feat & HTS_FEATURE_S3 ? "yes" : NULL; - case HTS_FEATURE_GCS: - return feat & HTS_FEATURE_GCS ? "yes" : NULL; - case HTS_FEATURE_LIBDEFLATE: - return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL; - case HTS_FEATURE_BZIP2: - return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL; - case HTS_FEATURE_LZMA: - return feat & HTS_FEATURE_LZMA ? "yes" : NULL; - - case HTS_FEATURE_CC: - return HTS_CC; - case HTS_FEATURE_CFLAGS: - return HTS_CFLAGS; - case HTS_FEATURE_LDFLAGS: - return HTS_LDFLAGS; - case HTS_FEATURE_CPPFLAGS: - return HTS_CPPFLAGS; - - default: - fprintf(stderr, "Unknown feature code: %d\n", id); - } - - return NULL; -} - -// Note this implementation also means we can just "strings" the library -// to find the configuration parameters. 
-const char *htslib_feature_string(void) { - const char *fmt= - -#ifdef PACKAGE_URL - "build=configure " -#else - "build=Makefile " -#endif - -#ifdef ENABLE_PLUGINS - "plugins=yes, plugin-path=%.1000s " -#else - "plugins=no " -#endif - -#ifdef HAVE_LIBCURL - "libcurl=yes " -#else - "libcurl=no " -#endif - -#ifdef ENABLE_S3 - "S3=yes " -#else - "S3=no " -#endif - -#ifdef ENABLE_GCS - "GCS=yes " -#else - "GCS=no " -#endif - -#ifdef HAVE_LIBDEFLATE - "libdeflate=yes " -#else - "libdeflate=no " -#endif - -#ifdef HAVE_LIBLZMA - "lzma=yes " -#else - "lzma=no " -#endif - -#ifdef HAVE_LIBBZ2 - "bzip2=yes "; -#else - "bzip2=no "; -#endif - -#ifdef ENABLE_PLUGINS - static char config[1200]; - sprintf(config, fmt, htslib_plugin_path()); - return config; -#else - return fmt; -#endif -} - -// Plus hts_version here? diff --git a/htslib/hfile.h b/htslib/hfile.h index 9ca489d2d..d960ab3db 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -364,6 +364,45 @@ int hts_list_plugins(const char *plist[], int *nplugins); HTSLIB_EXPORT int htslib_has_plugin(const char *name); +/*! @abstract Introspection on the features enabled in htslib + * + * @return a bitfield of HTS_FEATURE_* macros. + */ +HTSLIB_EXPORT +unsigned int htslib_features(void); + +HTSLIB_EXPORT +const char *htslib_test_feature(int id); + +/*! 
@abstract Introspection on the features enabled in htslib, string form + * + * @return a string describing htslib build features + */ +HTSLIB_EXPORT +const char *htslib_feature_string(void); + +// Whether ./configure was used or vanilla Makefile +#define HTS_FEATURE_CONFIGURE 1 + +// Also see htslib_plugin_path function +#define HTS_FEATURE_PLUGINS 2 + +// Transport specific +#define HTS_FEATURE_LIBCURL 4 +#define HTS_FEATURE_S3 8 +#define HTS_FEATURE_GCS 16 + +// Compression options +#define HTS_FEATURE_LIBDEFLATE 32 +#define HTS_FEATURE_LZMA 64 +#define HTS_FEATURE_BZIP2 128 + +// Build params +#define HTS_FEATURE_CC (1<<28) +#define HTS_FEATURE_CFLAGS (1<<29) +#define HTS_FEATURE_LDFLAGS (1<<30) +#define HTS_FEATURE_CPPFLAGS (1<<31) + #ifdef __cplusplus } #endif diff --git a/htslib/hts_os.h b/htslib/hts_os.h index eab438b54..3461df512 100644 --- a/htslib/hts_os.h +++ b/htslib/hts_os.h @@ -77,43 +77,4 @@ extern int is_cygpty(int fd); #define random rand #endif -/*! @abstract Introspection on the features enabled in htslib - * - * @return a bitfield of HTS_FEATURE_* macros. - */ -HTSLIB_EXPORT -unsigned int htslib_features(void); - -HTSLIB_EXPORT -const char *htslib_test_feature(int id); - -/*! 
@abstract Introspection on the features enabled in htslib, string form - * - * @return a string describing htslib build features - */ -HTSLIB_EXPORT -const char *htslib_feature_string(void); - -// Whether ./configure was used or vanilla Makefile -#define HTS_FEATURE_CONFIGURE 1 - -// Also see htslib_plugin_path function -#define HTS_FEATURE_PLUGINS 2 - -// Transport specific -#define HTS_FEATURE_LIBCURL 4 -#define HTS_FEATURE_S3 8 -#define HTS_FEATURE_GCS 16 - -// Compression options -#define HTS_FEATURE_LIBDEFLATE 32 -#define HTS_FEATURE_LZMA 64 -#define HTS_FEATURE_BZIP2 128 - -// Build params -#define HTS_FEATURE_CC (1<<28) -#define HTS_FEATURE_CFLAGS (1<<29) -#define HTS_FEATURE_LDFLAGS (1<<30) -#define HTS_FEATURE_CPPFLAGS (1<<31) - -#endif +#endif // HTSLIB_HTS_OS_H diff --git a/test/test_introspection.c b/test/test_introspection.c index ea90430c4..527a22aa7 100644 --- a/test/test_introspection.c +++ b/test/test_introspection.c @@ -27,7 +27,6 @@ DEALINGS IN THE SOFTWARE. */ #include "../htslib/hts.h" #include "../htslib/hfile.h" -#include "../htslib/hts_os.h" int main(void) { printf("Version string: %s\n", hts_version()); From 410f2b2e4174772d504f3d288c085359878a38e9 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 24 Nov 2020 16:26:11 +0000 Subject: [PATCH 062/488] Untabify and remove gcc -fpedantic pedantry. --- hfile.c | 2 +- htslib/hfile.h | 4 ++-- test/test_introspection.c | 16 ++++++++-------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/hfile.c b/hfile.c index e5861c25c..01bace09d 100644 --- a/hfile.c +++ b/hfile.c @@ -1219,7 +1219,7 @@ int hts_list_plugins(const char *plist[], int *nplugins) if (*nplugins > np) *nplugins = np; - + return np; } diff --git a/htslib/hfile.h b/htslib/hfile.h index d960ab3db..e0aea9008 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -316,7 +316,7 @@ purpose other than closing. 
HTSLIB_EXPORT char *hfile_mem_steal_buffer(hFILE *file, size_t *length); -/// Fills out sc_list[] with the list of known schemes. +/// Fills out sc_list[] with the list of known schemes. /** * @param plugin [in] Restricts schemes to only those from 'plugin. * @param sc_list [out] Filled out with the scheme names @@ -401,7 +401,7 @@ const char *htslib_feature_string(void); #define HTS_FEATURE_CC (1<<28) #define HTS_FEATURE_CFLAGS (1<<29) #define HTS_FEATURE_LDFLAGS (1<<30) -#define HTS_FEATURE_CPPFLAGS (1<<31) +#define HTS_FEATURE_CPPFLAGS (1u<<31) #ifdef __cplusplus } diff --git a/test/test_introspection.c b/test/test_introspection.c index 527a22aa7..9ee7d1aa3 100644 --- a/test/test_introspection.c +++ b/test/test_introspection.c @@ -40,21 +40,21 @@ int main(void) { unsigned int feat = htslib_features(); printf("\nFeature number: 0x%x\n", feat); if (feat & HTS_FEATURE_CONFIGURE) - printf(" HTS_FEATURE_CONFIGURE\n"); + printf(" HTS_FEATURE_CONFIGURE\n"); if (feat & HTS_FEATURE_PLUGINS) - printf(" HTS_FEATURE_PLUGINS\n"); + printf(" HTS_FEATURE_PLUGINS\n"); if (feat & HTS_FEATURE_LIBCURL) - printf(" HTS_FEATURE_LIBCURL\n"); + printf(" HTS_FEATURE_LIBCURL\n"); if (feat & HTS_FEATURE_S3) - printf(" HTS_FEATURE_S3\n"); + printf(" HTS_FEATURE_S3\n"); if (feat & HTS_FEATURE_GCS) - printf(" HTS_FEATURE_GCS\n"); + printf(" HTS_FEATURE_GCS\n"); if (feat & HTS_FEATURE_LIBDEFLATE) - printf(" HTS_FEATURE_LIBDEFLATE\n"); + printf(" HTS_FEATURE_LIBDEFLATE\n"); if (feat & HTS_FEATURE_LZMA) - printf(" HTS_FEATURE_LZMA\n"); + printf(" HTS_FEATURE_LZMA\n"); if (feat & HTS_FEATURE_BZIP2) - printf(" HTS_FEATURE_BZIP2\n"); + printf(" HTS_FEATURE_BZIP2\n"); printf("\nFeature string: %s\n", htslib_feature_string()); From 481bd2226195d3d6f9bc79f9b80d9750b2b43b76 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Fri, 15 Jan 2021 21:15:42 +0200 Subject: [PATCH 063/488] Fix iteration over CIGARs with indels in tweak_overlap_quality (PR #1202) Reset icig to -1 at the end of a CIGAR matching 
segment or in case of indels, clips and skips, so that the next MATCH starts from 0 index. Co-authored-by: Valeriu Ohan Co-authored-by: wulj2 --- sam.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sam.c b/sam.c index dbfc42e56..ea5b968f1 100644 --- a/sam.c +++ b/sam.c @@ -4816,14 +4816,14 @@ static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, ht if ( cig==BAM_CMATCH || cig==BAM_CEQUAL || cig==BAM_CDIFF ) { - if ( *icig >= ncig - 1 ) { *icig = 0; (*cigar)++; continue; } + if ( *icig >= ncig - 1 ) { *icig = -1; (*cigar)++; continue; } (*iseq)++; (*icig)++; (*iref)++; return BAM_CMATCH; } - if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = 0; continue; } - if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } - if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = 0; continue; } - if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = 0; continue; } + if ( cig==BAM_CDEL || cig==BAM_CREF_SKIP ) { (*cigar)++; (*iref) += ncig; *icig = -1; continue; } + if ( cig==BAM_CINS ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; } + if ( cig==BAM_CSOFT_CLIP ) { (*cigar)++; *iseq += ncig; *icig = -1; continue; } + if ( cig==BAM_CHARD_CLIP || cig==BAM_CPAD ) { (*cigar)++; *icig = -1; continue; } hts_log_error("Unexpected cigar %d", cig); return -2; } From 13fe974f75458edd490ed841f60bbcd0cef58ee1 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 14 Jan 2021 11:47:37 +0000 Subject: [PATCH 064/488] Re-enable changing CRAM compression level via hts_set_opt. This bug was introduced during #1181 where the additional HTS_OPT_FILTER meant the automatic fall-through from HTS_OPT_COMPRESSION_LEVEL no longer applied. I decided against continuing the previous obscurity of moving HTS_OPT_COMPRESSION_LEVEL to the end of the switch and having no return (thus falling into the cram_set_voption call), instead favouring an explicit call.
An example command that demonstrated the problem was: samtools view -O cram,level=1 in.bam -o out.cram --- hts.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hts.c b/hts.c index 9e50290cf..07c9d604a 100644 --- a/hts.c +++ b/hts.c @@ -1340,6 +1340,8 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { va_end(args); if (fp->is_bgzf) fp->fp.bgzf->compress_level = level; + else if (fp->format.format == cram) + return cram_set_option(fp->fp.cram, opt, level); return 0; } From f39dae751dae8c1abf8477074c5fbaea66d229df Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 13 Jan 2021 17:34:40 +0000 Subject: [PATCH 065/488] Fix incorrect setting of end position in overlap_push() Commit a2fdf3b changed bam_endpos() so that it would consider alignments which consume no bases (e.g. fully soft clipped) as one base long. This had the side effect of making overlap_push() change the position stored in a->end, causing reads that would previously have been ignored to be added to the pileup. As resolve_cigar2() was not designed to handle such reads it resulted in nonsensical values being set in the pileup structures for these reads. The update to a->end in overlap_push() is not necessary anyway as it will already have been set correctly in bam_plp_push(), so the simplest solution is to remove the call to bam_endpos() in overlap_push(). Pointers to the cigar data in tweak_overlap_quality() have been changed to const to ensure it makes no changes that would change the alignment end position. Fixes samtools/bcftools#1362 (bcftools mpileup seg fault for reads with only 1 match). --- sam.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/sam.c b/sam.c index ea5b968f1..884eecd6a 100644 --- a/sam.c +++ b/sam.c @@ -4772,7 +4772,11 @@ void bam_plp_destructor(bam_plp_t plp, * Returns BAM_CMATCH, -1 when there is no more cigar to process or the requested position is not covered, * or -2 on error.
*/ -static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref) +static inline int cigar_iref2iseq_set(const uint32_t **cigar, + const uint32_t *cigar_max, + hts_pos_t *icig, + hts_pos_t *iseq, + hts_pos_t *iref) { hts_pos_t pos = *iref; if ( pos < 0 ) return -1; @@ -4807,7 +4811,11 @@ static inline int cigar_iref2iseq_set(uint32_t **cigar, uint32_t *cigar_max, hts *iseq = -1; return -1; } -static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, hts_pos_t *icig, hts_pos_t *iseq, hts_pos_t *iref) +static inline int cigar_iref2iseq_next(const uint32_t **cigar, + const uint32_t *cigar_max, + hts_pos_t *icig, + hts_pos_t *iseq, + hts_pos_t *iref) { while ( *cigar < cigar_max ) { @@ -4834,8 +4842,8 @@ static inline int cigar_iref2iseq_next(uint32_t **cigar, uint32_t *cigar_max, ht static int tweak_overlap_quality(bam1_t *a, bam1_t *b) { - uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar; - uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar; + const uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar; + const uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar; hts_pos_t a_icig = 0, a_iseq = 0; hts_pos_t b_icig = 0, b_iseq = 0; uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); @@ -4945,8 +4953,6 @@ static int overlap_push(bam_plp_t iter, lbnode_t *node) int err = tweak_overlap_quality(&a->b, &node->b); kh_del(olap_hash, iter->overlaps, kitr); assert(a->end-1 == a->s.end); - a->end = bam_endpos(&a->b); - a->s.end = a->end - 1; return err; } return 0; From 1c930612f2049639cf832526752f80415125930a Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sat, 23 Jan 2021 05:05:50 +0000 Subject: [PATCH 066/488] Update links to curl project website See https://daniel.haxx.se/blog/2020/11/04/the-journey-to-a-curl-domain/ --- INSTALL | 2 +- configure.ac | 2 +- 2 files 
changed, 2 insertions(+), 2 deletions(-) diff --git a/INSTALL b/INSTALL index 92a358a73..582f9b69e 100644 --- a/INSTALL +++ b/INSTALL @@ -145,7 +145,7 @@ various features and specify further optional external requirements: on to cause additional directories to be searched. --enable-libcurl - Use libcurl () to implement network access to + Use libcurl () to implement network access to remote files via FTP, HTTP, HTTPS, etc. --enable-gcs diff --git a/configure.ac b/configure.ac index f990c370b..6b76c00a1 100644 --- a/configure.ac +++ b/configure.ac @@ -362,7 +362,7 @@ if test "$enable_libcurl" != no; then *) MSG_ERROR([libcurl $message Support for HTTPS and other SSL-based URLs requires routines from the libcurl -library . Building HTSlib with libcurl enabled +library . Building HTSlib with libcurl enabled requires libcurl development files to be installed on the build machine; you may need to ensure a package such as libcurl4-{gnutls,nss,openssl}-dev (on Debian or Ubuntu Linux) or libcurl-devel (on RPM-based Linux distributions From 79d8c948e6aaec343e6f07d9bab9f7edcf1fa368 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 21 Jan 2021 16:42:34 +0000 Subject: [PATCH 067/488] Fix some fuzz issues in CRAM 3.1. Credit to OSS-Fuzz - Check for return value from RANS4x16 and ARITH codecs so a valid stream that is the wrong size still frees up the memory. 
Fixes oss-fuzz 29786 Fixes oss-fuzz 29787 Fixes oss-fuzz 29796 - Check for return value from TOK3 codec so error is bubbled up Fixes oss-fuzz 29789 --- cram/cram_io.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 4f95f940a..30f109d6e 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1680,8 +1680,12 @@ int cram_uncompress_block(cram_block *b) { case RANS_PR0: { unsigned int usize = b->uncomp_size, usize2; uncomp = (char *)rans_uncompress_4x16(b->data, b->comp_size, &usize2); - if (!uncomp || usize != usize2) + if (!uncomp) + return -1; + if (usize != usize2) { + free(uncomp); return -1; + } b->orig_method = RANS_PR0 + (b->data[0]&1) + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); free(b->data); @@ -1696,8 +1700,12 @@ int cram_uncompress_block(cram_block *b) { case ARITH_PR0: { unsigned int usize = b->uncomp_size, usize2; uncomp = (char *)arith_uncompress_to(b->data, b->comp_size, NULL, &usize2); - if (!uncomp || usize != usize2) + if (!uncomp) return -1; + if (usize != usize2) { + free(uncomp); + return -1; + } b->orig_method = ARITH_PR0 + (b->data[0]&1) + 2*((b->data[0]&0x40)>0) + 4*((b->data[0]&0x80)>0); free(b->data); @@ -1712,6 +1720,8 @@ int cram_uncompress_block(cram_block *b) { case TOK3: { uint32_t out_len; uint8_t *cp = decode_names(b->data, b->comp_size, &out_len); + if (!cp) + return -1; b->orig_method = TOK3; b->method = RAW; free(b->data); From f4235784d7feb75ed0a15afa3f624029172c39bb Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 21 Jan 2021 17:00:03 +0000 Subject: [PATCH 068/488] Fix memory leak with malformed huffman data. 
Credit to OSS-Fuzz Fixes oss-fuzz 29780 --- cram/cram_codecs.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 783fc393d..2fcce4ed1 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -2758,8 +2758,7 @@ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, for (i = 0; i < ncodes; i++) codes[i].symbol = vv->varint_get32(&cp, data_end, &err); } else { - free(h); - return NULL; + goto malformed; } if (err) @@ -2794,9 +2793,7 @@ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, if (max_len > max_code_bits) { hts_log_error("Huffman code length (%d) is greater " "than maximum supported (%d)", max_len, max_code_bits); - free(h); - free(codes); - return NULL; + goto malformed; } /* Sort by bit length and then by symbol value */ From 23a67495c5372bacca90a6e1f5ab6b79afc314fc Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 21 Jan 2021 17:10:37 +0000 Subject: [PATCH 069/488] Fix missing rANS_word.h dependency. --- htscodecs_bundled.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index d270773f7..328a86b0c 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -18,6 +18,7 @@ htscodecs_tokenise_name3_h = htscodecs/htscodecs/tokenise_name3.h htscodecs_varint_h = htscodecs/htscodecs/varint.h htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h +htscodecs_rANS_word_h = htscodecs/htscodecs/rANS_word.h htscodecs_c_range_coder_h = htscodecs/htscodecs/c_range_coder.h htscodecs_c_simple_model_h = htscodecs/htscodecs/c_simple_model.h $(htscodecs_c_range_coder_h) htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h From 40afaaa5bf7e3f9cd6d6d9d9d8ae26496df82aea Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 22 Jan 2021 14:37:37 +0000 Subject: [PATCH 070/488] Fixed some more malformed data bugs, this time XPACK. This is an experimental encoding for CRAM 4.0 draft. 
We protect against nbits and nval being out of range. Fixes a buffer overrun. Credit to OSS-Fuzz Fixes oss-fuzz 29827 --- cram/cram_codecs.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 2fcce4ed1..067319fc6 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1400,7 +1400,7 @@ cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr, c->decode = cram_xpack_decode_char; else { fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n"); - return NULL; + goto malformed; } c->free = cram_xpack_decode_free; c->size = cram_xpack_decode_size; @@ -1408,6 +1408,9 @@ cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr, c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL); c->u.xpack.nval = vv->varint_get32(&cp, endp, NULL); + if (c->u.xpack.nbits >= 8 || c->u.xpack.nbits < 0 || + c->u.xpack.nval > 256 || c->u.xpack.nval < 0) + goto malformed; int i; for (i = 0; i < c->u.xpack.nval; i++) { uint32_t v = vv->varint_get32(&cp, endp, NULL); From a01a139ac1fa365a688ec8734d62ef859a8bc416 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 22 Jan 2021 18:49:40 +0000 Subject: [PATCH 071/488] Pull in htscodecs bug fixes * Fix 1-byte buffer overruns in arith_uncompress_to() and rans_uncompress_to_4x16() * Fixes various cases of undefined behaviour. * Adds a limit on max_names in the name tokeniser to prevent excess memory use. 
Credit to OSS-Fuzz Fixes oss-fuzz 29785 Fixes oss-fuzz 29789 Fixes oss-fuzz 29792 Fixes oss-fuzz 29795 Fixes oss-fuzz 29802 --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index dca826bbc..e8d8597e7 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit dca826bbc598d6dcae450e92e60dcf789454999d +Subproject commit e8d8597e7fa7fd27a4cd5c9ff00ff97c37a8b456 From 90fef7300c75ca6b8834da0dcb710e27d599324e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 25 Jan 2021 09:32:50 +0000 Subject: [PATCH 072/488] More OSS-Fuzz detected fixes Credit to OSS-Fuzz - Clean up properly when XPACK output values are out of the accepted range. Fixes oss-fuzz 29827 - Add "cp_end" arg to varint_get* funcs in cram_varint_decode_init so reading beyond the buffer can be spotted. Fixes oss-fuzz 29837 - Set b->uncomp_size for FQZ decoder. Already done in other codecs, just accidentally missed here. Without it, it invalidates the boundary check in int32_get_blk. Fixes oss-fuzz 29855 Fixes oss-fuzz 29908 - Fixed memory leak in XDELTA when recovering from malformed data stream. 
Fixes oss-fuzz 29858 --- cram/cram_codecs.c | 13 +++++++------ cram/cram_io.c | 1 + 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 067319fc6..63c6ea730 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -745,7 +745,7 @@ cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, enum cram_external_type option, int version, varint_vec *vv) { cram_codec *c; - char *cp = data; + char *cp = data, *cp_end = data+size; if (!(c = malloc(sizeof(*c)))) return NULL; @@ -775,8 +775,8 @@ cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, c->size = cram_varint_decode_size; c->get_block = cram_varint_get_block; - c->u.varint.content_id = vv->varint_get32 (&cp, NULL, NULL); - c->u.varint.offset = vv->varint_get64s(&cp, NULL, NULL); + c->u.varint.content_id = vv->varint_get32 (&cp, cp_end, NULL); + c->u.varint.offset = vv->varint_get64s(&cp, cp_end, NULL); if (cp - data != size) { fprintf(stderr, "Malformed varint header stream\n"); @@ -1414,7 +1414,8 @@ cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr, int i; for (i = 0; i < c->u.xpack.nval; i++) { uint32_t v = vv->varint_get32(&cp, endp, NULL); - if (v >= 256) return NULL; + if (v >= 256) + goto malformed; c->u.xpack.rmap[i] = v; // reverse map: e.g 0-3 to P,A,C,K } @@ -1714,7 +1715,7 @@ cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr, char *cp = data; char *endp = data+size; - if (!(c = malloc(sizeof(*c)))) + if (!(c = calloc(1, sizeof(*c)))) return NULL; c->codec = E_XDELTA; @@ -1749,7 +1750,7 @@ cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr, if (cp - data != size) { malformed: fprintf(stderr, "Malformed xdelta header stream\n"); - free(c); + cram_xdelta_decode_free(c); return NULL; } diff --git a/cram/cram_io.c b/cram/cram_io.c index 30f109d6e..8d257accc 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1674,6 +1674,7 @@ int cram_uncompress_block(cram_block *b) { 
b->data = (unsigned char *)uncomp; b->alloc = uncomp_size; b->method = RAW; + b->uncomp_size = uncomp_size; break; } From 10815ba355e0fe386cbe63d72405a40d3675893d Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 25 Jan 2021 17:36:10 +0000 Subject: [PATCH 073/488] Spell out `git clone --recurse-submodules` option [minor] Use the current canonical option rather than the --recursive alias; there doesn't appear to have ever been a -r short form for this. --- INSTALL | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL b/INSTALL index 582f9b69e..42bffb85b 100644 --- a/INSTALL +++ b/INSTALL @@ -66,7 +66,7 @@ from a release tar file, you can skip this section. Some parts of HTSlib are provided by the external "htscodecs" project. This is included as a submodule. When building from the git repository, -either clone the project using "git clone -r", or run: +either clone the project using "git clone --recurse-submodules", or run: git submodule update --init --recursive From a64c6c8e4f4056c1f7dca3335d239fa392e2d1b3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 19 Jan 2021 17:17:52 +0000 Subject: [PATCH 074/488] Reject VCF/BCF records with no REF allele The specification says REF must be present, so reject reads that do not have one. Fixes a crash reported in bcf_sr_sort_set() which expects REF to be present. 
Fixes samtools/bcftools#1361 (bcftools merge segfault when REF and ALT are unset) --- vcf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vcf.c b/vcf.c index a1aa55525..e9868cadf 100644 --- a/vcf.c +++ b/vcf.c @@ -1368,6 +1368,12 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { ptr += bytes; // Check REF and ALT + if (rec->n_allele < 1) { + hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": No REF allele", + bcf_seqname_safe(hdr,rec), rec->pos+1); + err |= BCF_ERR_TAG_UNDEF; + } + reports = 0; for (i = 0; i < rec->n_allele; i++) { if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared; From 42b365fb5ed155b963c07eb16b4538b7bd8daf55 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 26 Jan 2021 17:26:47 +0000 Subject: [PATCH 075/488] remove a strange gcc10 warning. Gcc 10 when using -O and -fsanitize=address produces a warning about mallocing some insanely large amount of memory. This is an impossibility given the limits on the sizes of some data values, but gcc doesn't know this and is assuming variables may wrap around and go negative. Using an unsigned type stops it from exploring the inaccessible data ranges. 
--- cram/cram_codecs.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 63c6ea730..e581ba3cf 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -3067,9 +3067,10 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, enum cram_external_type option, void *dat, int version, varint_vec *vv) { - int *vals = NULL, *freqs = NULL, vals_alloc = 0, *lens = NULL, code, len; + int *vals = NULL, *freqs = NULL, *lens = NULL, code, len; int *new_vals, *new_freqs; - int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k; + int i, ntot = 0, max_val = 0, min_val = INT_MAX, k; + size_t nvals, vals_alloc = 0; cram_codec *c; cram_huffman_code *codes; From 9d9e60c83487a8ac4644079e9636e34a9dd4bb08 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 27 Jan 2021 11:32:04 +0000 Subject: [PATCH 076/488] Update htscodecs to pull in more bug fixes Fixes various issues found by fuzzing. Credit to OSS-Fuzz - Removed undefined shift (malformed input data) Fixes oss-fuzz 29817 - Fix overflow in filling out sfb[] array Fixes oss-fuzz 29931 - Fix RLE with zero-byte meta-data block Fixes oss-fuzz 29939 - Fix rle_decode memory tidyup Fixes oss-fuzz 29935 - Fix undefined shift in tokenise_name3.c Fixes oss-fuzz 29956 --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index e8d8597e7..144bda6ac 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit e8d8597e7fa7fd27a4cd5c9ff00ff97c37a8b456 +Subproject commit 144bda6ac96ea2d0535e7e305077b3fcaea7b5f0 From b8dcbd12b2c535fb792a5dfb275e875b626e929d Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Wed, 27 Jan 2021 13:50:04 +0000 Subject: [PATCH 077/488] Added missing quotes. Also converted to using ALL_CPPFLAGS from #1187. 
--- Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 21f510579..ad1353f10 100644 --- a/Makefile +++ b/Makefile @@ -341,8 +341,7 @@ errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htsl kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(htslib_kstring_h) $(hts_internal_h) $(htslib_khash_h) -hfile.o hfile.pico: TMP_CPPFLAGS := $(CPPFLAGS) -hfile.o hfile.pico: CPPFLAGS += -DHTS_CPPFLAGS=\"$(TMP_CPPFLAGS)\" -DHTS_CFLAGS="\"$(CFLAGS)\"" -DHTS_LDFLAGS="\"$(LDFLAGS)\"" -DHTS_CC="\"$(CC)\"" +hfile.o hfile.pico: ALL_CPPFLAGS += -DHTS_CPPFLAGS="\"$(CPPFLAGS)\"" -DHTS_CFLAGS="\"$(CFLAGS)\"" -DHTS_LDFLAGS="\"$(LDFLAGS)\"" -DHTS_CC="\"$(CC)\"" hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) From 64808c2f898782cc50b690969265f01adeeb978d Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 27 Jan 2021 17:21:10 +0000 Subject: [PATCH 078/488] Disable travis tests travis.org is closing down, and cirrus-ci works as a replacement. Unfortunately this removes the only big-endian platform (s390x). It will be substituted in the future if an alternative can be found, but meanwhile big-endian tests will need to be run manually. 
--- .travis.yml | 109 ---------------------------------------------------- README.md | 2 +- 2 files changed, 1 insertion(+), 110 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index d85224133..000000000 --- a/.travis.yml +++ /dev/null @@ -1,109 +0,0 @@ -# Control file for continuous integration testing at http://travis-ci.org/ - -language: c - -matrix: - include: - - compiler: gcc - os: linux - env: DO_MAINTAINER_CHECKS=yes USE_CONFIG=no - - - compiler: gcc-8 - os: linux - env: USE_CONFIG=yes CC=gcc-8 AR=gcc-ar-8 - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - gcc-8 - - # An optimised build with address and leak checking, also using libdeflate - - compiler: gcc-8 - os: linux - dist: xenial - env: USE_CONFIG=yes USE_LIBDEFLATE=yes CC=gcc-8 AR=gcc-ar-8 CFLAGS="-g -Wall -O3 -fsanitize=address" LDFLAGS="-fsanitize=address" - addons: - apt: - sources: - - ubuntu-toolchain-r-test - packages: - - gcc-8 - - - compiler: clang - os: osx - env: USE_CONFIG=no - - - compiler: clang - os: osx - env: USE_CONFIG=yes - - - compiler: clang - os: osx - env: USE_CONFIG=yes USE_LIBDEFLATE=yes - - - compiler: gcc - os: linux - env: USE_CONFIG=yes - - - compiler: clang - os: linux - env: USE_CONFIG=yes - - - compiler: gcc - os: linux - env: CFLAGS="-std=c99 -pedantic" USE_CONFIG=yes - - # Big-endian - - compiler: gcc - arch: s390x - os: linux - env: USE_CONFIG=yes - addons: - apt: - packages: - - libbz2-dev - - liblzma-dev - - -# For MacOSX systems -before_install: - - | - if [[ "$TRAVIS_OS_NAME" == "osx" && "$USE_CONFIG" == "no" ]]; then - HOMEBREW_NO_AUTO_UPDATE=1 brew install xz || ( brew update && brew install xz ) - fi - -before_script: - - | - if test "x$USE_LIBDEFLATE" == "xyes"; then - pushd "$HOME" && \ - git clone --depth 1 https://github.com/ebiggers/libdeflate.git && \ - pushd libdeflate && \ - make -j 2 CFLAGS='-fPIC -O3' libdeflate.a && \ - popd && \ - popd - fi - git submodule update 
--init --recursive - -script: - - | - if test "x$USE_LIBDEFLATE" = "xyes"; then - CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate" --with-libdeflate' - else - CONFIG_OPTS='--without-libdeflate' - fi - - | - if test "$USE_CONFIG" = "yes"; then - MAKE_OPTS= ; - autoreconf && \ - eval ./configure --enable-plugins --enable-werror $CONFIG_OPTS CFLAGS=\"-g -O3 $CFLAGS\" || \ - ( cat config.log; false ) - else - MAKE_OPTS=-e - fi && \ - if test "x$DO_MAINTAINER_CHECKS" = "xyes"; then - make maintainer-check - fi && \ - make -j 2 $MAKE_OPTS && \ - make test-shlib-exports && \ - make test diff --git a/README.md b/README.md index 1b01d7271..e28f4a415 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![Build Status](https://travis-ci.org/samtools/htslib.svg?branch=develop)](https://travis-ci.org/samtools/htslib) +[![Build Status](https://api.cirrus-ci.com/github/samtools/htslib.svg?branch=develop)](https://api.cirrus-ci.com/github/samtools/htslib) [![Build status](https://ci.appveyor.com/api/projects/status/v46hkwyfjp3l8nd3/branch/develop?svg=true)](https://ci.appveyor.com/project/samtools/htslib/branch/develop) [![Github All Releases](https://img.shields.io/github/downloads/samtools/htslib/total.svg)](https://github.com/samtools/htslib) From 8510a0d2a5ac878f1a4b9a6683580bcdfa9196d3 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 28 Jan 2021 17:06:25 +0000 Subject: [PATCH 079/488] Fix more OSS fuzz reported issues. Credit to OSS-Fuzz - Add bounds check in cram_const_decode_init. I've checked and this is the only use of the varint decoders that didn't have a bounds check. Suitably sniffed out by the fuzzer. Good fuzzer! Fixes oss-fuzz 30012 - Free the sub-encoding when initialising XRLE encoding fails due to malformed streams. This removes a tiny memory leak. 
Fixes oss-fuzz 30014 --- cram/cram_codecs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index e581ba3cf..ab46fc293 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -964,7 +964,7 @@ cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr, c->size = cram_const_decode_size; c->get_block = NULL; - c->u.xconst.val = vv->varint_get64s(&cp, NULL, NULL); + c->u.xconst.val = vv->varint_get64s(&cp, data+size, NULL); if (cp - data != size) { fprintf(stderr, "Malformed const header stream\n"); @@ -2118,7 +2118,7 @@ cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr, char *endp = data+size; int err = 0; - if (!(c = malloc(sizeof(*c)))) + if (!(c = calloc(1, sizeof(*c)))) return NULL; c->codec = E_XRLE; @@ -2175,7 +2175,7 @@ cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr, malformed: fprintf(stderr, "Malformed xrle header stream\n"); - free(c); + cram_xrle_decode_free(c); return NULL; } From a6e89c59e976023afb66536e5162a697d298308c Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 29 Jan 2021 09:07:25 +0000 Subject: [PATCH 080/488] Pull in htscodecs fuzzing bug fixes Credit to OSS-Fuzz - Fix undefined shifts Fixes oss-fuzz 29995 Fixes oss-fuzz 30017 - Protect against no tokens present in tok_name3 codec Fixes oss-fuzz 30008 --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 144bda6ac..cf0e9611d 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 144bda6ac96ea2d0535e7e305077b3fcaea7b5f0 +Subproject commit cf0e9611dc88e351148a19568c858101ed31d0fa From c56cfaa31ba6a15a226ec6e3e0b25eea53b255f8 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 1 Feb 2021 09:40:49 +0000 Subject: [PATCH 081/488] Remove small memory leak when decoding malformed slice header. This is an old bug, but only recently picked up by the fuzzer. 
Credit to OSS-Fuzz Fixes oss-fuzz 30105 --- cram/cram_decode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 41203ec7f..26c9c592b 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1032,6 +1032,7 @@ cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b) { if (!err) return hdr; + free(hdr->block_content_ids); free(hdr); return NULL; } From 53d7277ad2feb71e2e65f6c71107d854e89c7440 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 2 Feb 2021 09:27:50 +0000 Subject: [PATCH 082/488] Remove small memory leak when decoding malformed XPACK encoding. As with recent changes to XPACK and XRLE, it now ensures the sub-codec is also properly freed too. Credit to OSS-Fuzz Fixes oss-fuzz 30164 --- cram/cram_codecs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index ab46fc293..1a357fdbe 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1388,7 +1388,7 @@ cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr, char *cp = data; char *endp = data+size; - if (!(c = malloc(sizeof(*c)))) + if (!(c = calloc(1, sizeof(*c)))) return NULL; c->codec = E_XPACK; @@ -1433,7 +1433,7 @@ cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr, || c->u.xpack.nbits < 0 || c->u.xpack.nbits > 8 * sizeof(int64_t)) { malformed: fprintf(stderr, "Malformed xpack header stream\n"); - free(c); + cram_xpack_decode_free(c); return NULL; } From 6568f13bf7b36f280a8f0e4777fe05c85a9dcf06 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 2 Feb 2021 09:56:55 +0000 Subject: [PATCH 083/488] Pull in htscodecs changes and update Makefile - Fix an overflow bug in the unstripe function Credit to OSS-Fuzz Fixes oss-fuzz 30087 - Move some duplicated code to a new htscodecs/utils.h header - Updates htslib Makefile infrastructure for the new htscodecs header --- Makefile | 6 +++--- htscodecs | 2 +- htscodecs_bundled.mk | 8 ++++++-- 
htscodecs_external.mk | 5 ++++- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/Makefile b/Makefile index ad1353f10..ef65eb68d 100644 --- a/Makefile +++ b/Makefile @@ -380,11 +380,11 @@ cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c config.h cram/po cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/string_alloc.h thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) $(htslib_hts_log_h) -htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_c_simple_model.h) +htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htsodecs_utils_h) $(htscodecs_c_simple_model.h) htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model.h) htscodecs/htscodecs/pack.o htscodecs/htscodecs/pack.pico: htscodecs/htscodecs/pack.c config.h $(htscodecs_pack_h) -htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) -htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_rANS_static_h) +htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static.o 
htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_utils_h) $(htscodecs_rANS_static_h) htscodecs/htscodecs/rle.o htscodecs/htscodecs/rle.pico: htscodecs/htscodecs/rle.c config.h $(htscodecs_varint_h) $(htscodecs_rle_h) htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h) diff --git a/htscodecs b/htscodecs index cf0e9611d..0ffb50be0 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit cf0e9611dc88e351148a19568c858101ed31d0fa +Subproject commit 0ffb50be07eab4ff40c8e6b7dc346266c6a23421 diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index 328a86b0c..8b41f76b4 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -8,6 +8,7 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) +# htscodecs public headers htscodecs_arith_dynamic_h = htscodecs/htscodecs/arith_dynamic.h htscodecs_fqzcomp_qual_h = htscodecs/htscodecs/fqzcomp_qual.h htscodecs_pack_h = htscodecs/htscodecs/pack.h @@ -17,11 +18,14 @@ htscodecs_rle_h = htscodecs/htscodecs/rle.h htscodecs_tokenise_name3_h = htscodecs/htscodecs/tokenise_name3.h htscodecs_varint_h = htscodecs/htscodecs/varint.h -htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h -htscodecs_rANS_word_h = htscodecs/htscodecs/rANS_word.h +# htscodecs internal headers +htscodecs_htscodecs_endian_h = htscodecs/htscodecs/htscodecs_endian.h htscodecs_c_range_coder_h = htscodecs/htscodecs/c_range_coder.h htscodecs_c_simple_model_h = htscodecs/htscodecs/c_simple_model.h $(htscodecs_c_range_coder_h) htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h +htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h +htscodecs_rANS_word_h = htscodecs/htscodecs/rANS_word.h 
$(htscodecs_htscodecs_endian_h) +htscodecs_utils_h = htscodecs/htscodecs/utils.h # Add htscodecs tests into the HTSlib test framework diff --git a/htscodecs_external.mk b/htscodecs_external.mk index 7ee47402c..7ac6944e9 100644 --- a/htscodecs_external.mk +++ b/htscodecs_external.mk @@ -11,7 +11,10 @@ htscodecs_rle_h = htscodecs_tokenise_name3_h = htscodecs_varint_h = -htscodecs_rANS_byte_h = +htscodecs_htscodecs_endian_h = htscodecs_c_range_coder_h = htscodecs_c_simple_model_h = htscodecs_pooled_alloc_h = +htscodecs_rANS_byte_h = +htscodecs_rANS_word_h = +htscodecs_utils_h = From 6ad5626c8e01e4eac71360dd9303d33f4c6267d2 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 2 Feb 2021 11:40:46 +0000 Subject: [PATCH 084/488] Add / update copyright boilerplate --- Makefile | 2 +- htscodecs_bundled.mk | 25 +++++++++++++++++++++++++ htscodecs_external.mk | 24 ++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ef65eb68d..0f2fddb47 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile for htslib, a C library for high-throughput sequencing data formats. # -# Copyright (C) 2013-2020 Genome Research Ltd. +# Copyright (C) 2013-2021 Genome Research Ltd. # # Author: John Marshall # diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index 8b41f76b4..de4d5db8d 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -1,3 +1,28 @@ +# Makefile fragment to add settings needed when bundling htscodecs functions +# +# Copyright (C) 2021 Genome Research Ltd. 
+# +# Author: Rob Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + + HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ $(HTSPREFIX)htscodecs/htscodecs/fqzcomp_qual.c \ $(HTSPREFIX)htscodecs/htscodecs/pack.c \ diff --git a/htscodecs_external.mk b/htscodecs_external.mk index 7ac6944e9..ce24dd6f5 100644 --- a/htscodecs_external.mk +++ b/htscodecs_external.mk @@ -1,3 +1,27 @@ +# Makefile fragment for use when linking to an external libhtscodecs +# +# Copyright (C) 2021 Genome Research Ltd. 
+# +# Author: Rob Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + HTSCODECS_SOURCES = HTSCODECS_OBJS = HTSCODECS_TEST_TARGETS = From 2d1e9549f45881a014a387dd2c3287508a863ff9 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Wed, 3 Feb 2021 10:22:26 +0000 Subject: [PATCH 085/488] Updated copyright messages prior to release. 
--- bgzip.c | 2 +- configure.ac | 4 ++-- cram/cram_codecs.c | 2 +- cram/cram_external.c | 2 +- cram/cram_io.c | 2 +- cram/cram_samtools.h | 2 +- hfile.c | 2 +- hts_os.c | 2 +- htsfile.c | 2 +- htslib/cram.h | 2 +- htslib/hfile.h | 2 +- htslib/hts_os.h | 2 +- htslib/knetfile.h | 2 +- htslib/synced_bcf_reader.h | 2 +- htslib_vars.mk | 2 +- kfunc.c | 2 +- probaln.c | 2 +- sam.c | 2 +- tabix.c | 2 +- test/fuzz/hts_open_fuzzer.c | 2 +- test/test.pl | 2 +- test/test_view.c | 2 +- 22 files changed, 23 insertions(+), 23 deletions(-) diff --git a/bgzip.c b/bgzip.c index 097e3f80c..22eb0404a 100644 --- a/bgzip.c +++ b/bgzip.c @@ -171,7 +171,7 @@ int main(int argc, char **argv) case 1: printf( "bgzip (htslib) %s\n" -"Copyright (C) 2020 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2021 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 'h': return bgzip_main_usage(stdout, EXIT_SUCCESS); case '?': return bgzip_main_usage(stderr, EXIT_FAILURE); diff --git a/configure.ac b/configure.ac index 6b76c00a1..d79947ae4 100644 --- a/configure.ac +++ b/configure.ac @@ -1,6 +1,6 @@ # Configure script for htslib, a C library for high-throughput sequencing data. # -# Copyright (C) 2015-2020 Genome Research Ltd. +# Copyright (C) 2015-2021 Genome Research Ltd. # # Author: John Marshall # @@ -34,7 +34,7 @@ m4_include([m4/hts_hide_dynamic_syms.m4]) m4_include([m4/pkg.m4]) dnl Copyright notice to be copied into the generated configure script -AC_COPYRIGHT([Portions copyright (C) 2020 Genome Research Ltd. +AC_COPYRIGHT([Portions copyright (C) 2020-2021 Genome Research Ltd. This configure script is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law.]) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 1a357fdbe..55030c814 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020 Genome Research Ltd. +Copyright (c) 2012-2021 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/cram/cram_external.c b/cram/cram_external.c index 88175103d..314826932 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2015, 2018-2019 Genome Research Ltd. +Copyright (c) 2015, 2018-2020 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/cram/cram_io.c b/cram/cram_io.c index 8d257accc..3a6b04c42 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020 Genome Research Ltd. +Copyright (c) 2012-2021 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/cram/cram_samtools.h b/cram/cram_samtools.h index 34c1db40e..a4c9bf5cc 100644 --- a/cram/cram_samtools.h +++ b/cram/cram_samtools.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2010-2013, 2018 Genome Research Ltd. +Copyright (c) 2010-2013, 2018, 2020 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/hfile.c b/hfile.c index 01bace09d..dba8b6f6c 100644 --- a/hfile.c +++ b/hfile.c @@ -1,6 +1,6 @@ /* hfile.c -- buffered low-level input/output streams. - Copyright (C) 2013-2020 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: John Marshall diff --git a/hts_os.c b/hts_os.c index c26700975..b391a41e0 100644 --- a/hts_os.c +++ b/hts_os.c @@ -1,7 +1,7 @@ /// @file hts_os.c /// Operating System specific tweaks, for compatibility with POSIX. /* - Copyright (C) 2017, 2019 Genome Research Ltd. + Copyright (C) 2017, 2019-2020 Genome Research Ltd. 
Author: James Bonfield diff --git a/htsfile.c b/htsfile.c index e37eb7215..d6d6b4e69 100644 --- a/htsfile.c +++ b/htsfile.c @@ -258,7 +258,7 @@ int main(int argc, char **argv) case 1: printf( "htsfile (htslib) %s\n" -"Copyright (C) 2020 Genome Research Ltd.\n", +"Copyright (C) 2021 Genome Research Ltd.\n", hts_version()); exit(EXIT_SUCCESS); break; diff --git a/htslib/cram.h b/htslib/cram.h index 607cf8bcf..dab666345 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -1,7 +1,7 @@ /// @file htslib/cram.h /// CRAM format-specific API functions. /* - Copyright (C) 2015, 2016, 2018-2019 Genome Research Ltd. + Copyright (C) 2015, 2016, 2018-2020 Genome Research Ltd. Author: James Bonfield diff --git a/htslib/hfile.h b/htslib/hfile.h index e0aea9008..829e3b12b 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -1,7 +1,7 @@ /// @file htslib/hfile.h /// Buffered low-level input/output streams. /* - Copyright (C) 2013-2019 Genome Research Ltd. + Copyright (C) 2013-2020 Genome Research Ltd. Author: John Marshall diff --git a/htslib/hts_os.h b/htslib/hts_os.h index 3461df512..b71cb89e7 100644 --- a/htslib/hts_os.h +++ b/htslib/hts_os.h @@ -1,7 +1,7 @@ /// @file hts_os.h /// Operating System specific tweaks, for compatibility with POSIX. /* - Copyright (C) 2017, 2019 Genome Research Ltd. + Copyright (C) 2017, 2019-2020 Genome Research Ltd. Author: James Bonfield diff --git a/htslib/knetfile.h b/htslib/knetfile.h index 598ed379a..da9cdc5e8 100644 --- a/htslib/knetfile.h +++ b/htslib/knetfile.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, 2012, 2014 Genome Research Ltd (GRL). + Copyright (c) 2008, 2012, 2014, 2021 Genome Research Ltd (GRL). 
2010 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index 8d3554445..76f889b30 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -1,7 +1,7 @@ /// @file htslib/synced_bcf_reader.h /// Stream through multiple VCF files. /* - Copyright (C) 2012-2017, 2019 Genome Research Ltd. + Copyright (C) 2012-2017, 2019-2020 Genome Research Ltd. Author: Petr Danecek diff --git a/htslib_vars.mk b/htslib_vars.mk index 85835f872..1f4c0905a 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -1,6 +1,6 @@ # Makefile variables useful for third-party code using htslib's public API. # -# Copyright (C) 2013-2017, 2019 Genome Research Ltd. +# Copyright (C) 2013-2017, 2019-2020 Genome Research Ltd. # # Author: John Marshall # diff --git a/kfunc.c b/kfunc.c index b7d7d521a..bf15cdf33 100644 --- a/kfunc.c +++ b/kfunc.c @@ -1,6 +1,6 @@ /* The MIT License - Copyright (C) 2010, 2013-2014 Genome Research Ltd. + Copyright (C) 2010, 2013-2014, 2020 Genome Research Ltd. Copyright (C) 2011 Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining diff --git a/probaln.c b/probaln.c index 73df1a8f1..192f4b751 100644 --- a/probaln.c +++ b/probaln.c @@ -1,7 +1,7 @@ /* The MIT License Copyright (C) 2003-2006, 2008-2010 by Heng Li - Copyright (C) 2016-2017 Genome Research Ltd. + Copyright (C) 2016-2017, 2020 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/sam.c b/sam.c index 884eecd6a..8bda92384 100644 --- a/sam.c +++ b/sam.c @@ -1,6 +1,6 @@ /* sam.c -- SAM and BAM file I/O and manipulation. - Copyright (C) 2008-2010, 2012-2020 Genome Research Ltd. + Copyright (C) 2008-2010, 2012-2021 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. 
Author: Heng Li diff --git a/tabix.c b/tabix.c index 36c414798..1471b09c5 100644 --- a/tabix.c +++ b/tabix.c @@ -581,7 +581,7 @@ int main(int argc, char *argv[]) case 1: printf( "tabix (htslib) %s\n" -"Copyright (C) 2020 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2021 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 2: return usage(stdout, EXIT_SUCCESS); diff --git a/test/fuzz/hts_open_fuzzer.c b/test/fuzz/hts_open_fuzzer.c index 5c239d4f1..355f790a5 100644 --- a/test/fuzz/hts_open_fuzzer.c +++ b/test/fuzz/hts_open_fuzzer.c @@ -1,7 +1,7 @@ /* test/fuzz/hts_open_fuzzer.c -- Fuzz driver for hts_open. Copyright (C) 2018 Google LLC. - Copyright (C) 2019 Genome Research Ltd. + Copyright (C) 2019-2020 Genome Research Ltd. Author: Markus Kusano diff --git a/test/test.pl b/test/test.pl index 8def186d9..a33ead381 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1,6 +1,6 @@ #!/usr/bin/env perl # -# Copyright (C) 2012-2019 Genome Research Ltd. +# Copyright (C) 2012-2020 Genome Research Ltd. # # Author: Petr Danecek # diff --git a/test/test_view.c b/test/test_view.c index 30e604610..dc644610b 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -1,7 +1,7 @@ /* test/test_view.c -- simple view tool, purely for use in a test harness. Copyright (C) 2012 Broad Institute. - Copyright (C) 2013-2019 Genome Research Ltd. + Copyright (C) 2013-2020 Genome Research Ltd. Author: Heng Li From ab75b9ed97e397ec351a769a2a6d303fce357386 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 21 Jan 2021 18:10:43 +0000 Subject: [PATCH 086/488] Move non-plugin introspection API to htslib/hts.h, hts.c It'll be convenient for API users to find the HTS_FEATURE_* functionality in htslib/hts.h alongside hts_version() etc. Fix the missing quote marks for -DHTS_CPPFLAGS by recoding to implement this via a new generated config_vars.h header instead. 
Similarly to the version.h and config.h rules, this avoids the fragile quoting needed to add possibly-whitespace-containing string values via -D options. Ignore config_vars.h and some recently-added test executables. --- .gitignore | 3 ++ Makefile | 12 +++-- hfile.c | 140 ------------------------------------------------- hts.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++ htslib/hfile.h | 39 -------------- htslib/hts.h | 40 ++++++++++++++ 6 files changed, 191 insertions(+), 182 deletions(-) diff --git a/.gitignore b/.gitignore index 0855b6a88..16a6b288e 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ *.dll.a *.pc.tmp *-uninstalled.pc +config_vars.h /version.h autom4te.cache @@ -55,7 +56,9 @@ shlib-exports-*.txt /test/test-bcf-sr /test/test-bcf-translate /test/test_bgzf +/test/test_expr /test/test_index +/test/test_introspection /test/test_kfunc /test/test_kstring /test/test-parse-reg diff --git a/Makefile b/Makefile index 0f2fddb47..ce256a629 100644 --- a/Makefile +++ b/Makefile @@ -141,6 +141,13 @@ show-version: @echo PACKAGE_VERSION = $(PACKAGE_VERSION) @echo NUMERIC_VERSION = $(NUMERIC_VERSION) +config_vars.h: + echo '#define HTS_CC "$(CC)"' > $@ + echo '#define HTS_CPPFLAGS "$(CPPFLAGS)"' >> $@ + echo '#define HTS_CFLAGS "$(CFLAGS)"' >> $@ + echo '#define HTS_LDFLAGS "$(LDFLAGS)"' >> $@ + echo '#define HTS_LIBS "$(LIBS)"' >> $@ + .SUFFIXES: .bundle .c .cygdll .dll .o .pico .so .c.o: @@ -341,12 +348,11 @@ errmod.o errmod.pico: errmod.c config.h $(htslib_hts_h) $(htslib_ksort_h) $(htsl kstring.o kstring.pico: kstring.c config.h $(htslib_kstring_h) header.o header.pico: header.c config.h $(textutils_internal_h) $(header_h) hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(htslib_kstring_h) $(hts_internal_h) $(htslib_khash_h) -hfile.o hfile.pico: ALL_CPPFLAGS += -DHTS_CPPFLAGS="\"$(CPPFLAGS)\"" -DHTS_CFLAGS="\"$(CFLAGS)\"" -DHTS_LDFLAGS="\"$(LDFLAGS)\"" -DHTS_CC="\"$(CC)\"" hfile_gcs.o hfile_gcs.pico: hfile_gcs.c 
config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) -hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) +hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) @@ -694,7 +700,7 @@ testclean: -rm -rf htscodecs/tests/test.out mostlyclean: testclean - -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM version.h + -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM config_vars.h version.h -rm -f htscodecs/htscodecs/*.o htscodecs/htscodecs/*.pico -rm -f hts-object-files -rm -f htscodecs/tests/*.o diff --git a/hfile.c b/hfile.c index dba8b6f6c..2799c89dc 100644 --- a/hfile.c +++ b/hfile.c @@ -1250,146 +1250,6 @@ int htslib_has_plugin(const char *name) return 0; } -HTSLIB_EXPORT -unsigned int 
htslib_features(void) { - unsigned int feat = 0; - -#ifdef PACKAGE_URL - feat |= HTS_FEATURE_CONFIGURE; -#endif - -#ifdef ENABLE_PLUGINS - feat |= HTS_FEATURE_PLUGINS; -#endif - -#ifdef HAVE_LIBCURL - feat |= HTS_FEATURE_LIBCURL; -#endif - -#ifdef ENABLE_S3 - feat |= HTS_FEATURE_S3; -#endif - -#ifdef ENABLE_GCS - feat |= HTS_FEATURE_GCS; -#endif - -#ifdef HAVE_LIBDEFLATE - feat |= HTS_FEATURE_LIBDEFLATE; -#endif - -#ifdef HAVE_LIBLZMA - feat |= HTS_FEATURE_LZMA; -#endif - -#ifdef HAVE_LIBBZ2 - feat |= HTS_FEATURE_BZIP2; -#endif - - return feat; -} - -HTSLIB_EXPORT -const char *htslib_test_feature(int id) { - int feat = htslib_features(); - - switch (id) { - case HTS_FEATURE_CONFIGURE: - return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL; - case HTS_FEATURE_PLUGINS: - return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL; - case HTS_FEATURE_LIBCURL: - return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL; - case HTS_FEATURE_S3: - return feat & HTS_FEATURE_S3 ? "yes" : NULL; - case HTS_FEATURE_GCS: - return feat & HTS_FEATURE_GCS ? "yes" : NULL; - case HTS_FEATURE_LIBDEFLATE: - return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL; - case HTS_FEATURE_BZIP2: - return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL; - case HTS_FEATURE_LZMA: - return feat & HTS_FEATURE_LZMA ? "yes" : NULL; - - case HTS_FEATURE_CC: - return HTS_CC; - case HTS_FEATURE_CFLAGS: - return HTS_CFLAGS; - case HTS_FEATURE_LDFLAGS: - return HTS_LDFLAGS; - case HTS_FEATURE_CPPFLAGS: - return HTS_CPPFLAGS; - - default: - fprintf(stderr, "Unknown feature code: %d\n", id); - } - - return NULL; -} - -// Note this implementation also means we can just "strings" the library -// to find the configuration parameters. 
-HTSLIB_EXPORT -const char *htslib_feature_string(void) { - const char *fmt= - -#ifdef PACKAGE_URL - "build=configure " -#else - "build=Makefile " -#endif - -#ifdef ENABLE_PLUGINS - "plugins=yes, plugin-path=%.1000s " -#else - "plugins=no " -#endif - -#ifdef HAVE_LIBCURL - "libcurl=yes " -#else - "libcurl=no " -#endif - -#ifdef ENABLE_S3 - "S3=yes " -#else - "S3=no " -#endif - -#ifdef ENABLE_GCS - "GCS=yes " -#else - "GCS=no " -#endif - -#ifdef HAVE_LIBDEFLATE - "libdeflate=yes " -#else - "libdeflate=no " -#endif - -#ifdef HAVE_LIBLZMA - "lzma=yes " -#else - "lzma=no " -#endif - -#ifdef HAVE_LIBBZ2 - "bzip2=yes "; -#else - "bzip2=no "; -#endif - -#ifdef ENABLE_PLUGINS - static char config[1200]; - sprintf(config, fmt, htslib_plugin_path()); - return config; -#else - return fmt; -#endif -} - /*************************** * hFILE interface proper ***************************/ diff --git a/hts.c b/hts.c index a59431f56..cb0a07cfb 100644 --- a/hts.c +++ b/hts.c @@ -47,6 +47,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/hfile.h" #include "htslib/hts_endian.h" #include "version.h" +#include "config_vars.h" #include "hts_internal.h" #include "hfile_internal.h" #include "sam_internal.h" @@ -72,6 +73,144 @@ const char *hts_version() return HTS_VERSION_TEXT; } +unsigned int htslib_features(void) { + unsigned int feat = 0; + +#ifdef PACKAGE_URL + feat |= HTS_FEATURE_CONFIGURE; +#endif + +#ifdef ENABLE_PLUGINS + feat |= HTS_FEATURE_PLUGINS; +#endif + +#ifdef HAVE_LIBCURL + feat |= HTS_FEATURE_LIBCURL; +#endif + +#ifdef ENABLE_S3 + feat |= HTS_FEATURE_S3; +#endif + +#ifdef ENABLE_GCS + feat |= HTS_FEATURE_GCS; +#endif + +#ifdef HAVE_LIBDEFLATE + feat |= HTS_FEATURE_LIBDEFLATE; +#endif + +#ifdef HAVE_LIBLZMA + feat |= HTS_FEATURE_LZMA; +#endif + +#ifdef HAVE_LIBBZ2 + feat |= HTS_FEATURE_BZIP2; +#endif + + return feat; +} + +const char *htslib_test_feature(int id) { + int feat = htslib_features(); + + switch (id) { + case HTS_FEATURE_CONFIGURE: + return feat & HTS_FEATURE_CONFIGURE ? "yes" : NULL; + case HTS_FEATURE_PLUGINS: + return feat & HTS_FEATURE_PLUGINS ? "yes" : NULL; + case HTS_FEATURE_LIBCURL: + return feat & HTS_FEATURE_LIBCURL ? "yes" : NULL; + case HTS_FEATURE_S3: + return feat & HTS_FEATURE_S3 ? "yes" : NULL; + case HTS_FEATURE_GCS: + return feat & HTS_FEATURE_GCS ? "yes" : NULL; + case HTS_FEATURE_LIBDEFLATE: + return feat & HTS_FEATURE_LIBDEFLATE ? "yes" : NULL; + case HTS_FEATURE_BZIP2: + return feat & HTS_FEATURE_BZIP2 ? "yes" : NULL; + case HTS_FEATURE_LZMA: + return feat & HTS_FEATURE_LZMA ? "yes" : NULL; + + case HTS_FEATURE_CC: + return HTS_CC; + case HTS_FEATURE_CFLAGS: + return HTS_CFLAGS; + case HTS_FEATURE_LDFLAGS: + return HTS_LDFLAGS; + case HTS_FEATURE_CPPFLAGS: + return HTS_CPPFLAGS; + + default: + fprintf(stderr, "Unknown feature code: %d\n", id); + } + + return NULL; +} + +// Note this implementation also means we can just "strings" the library +// to find the configuration parameters. 
+const char *htslib_feature_string(void) { + const char *fmt= + +#ifdef PACKAGE_URL + "build=configure " +#else + "build=Makefile " +#endif + +#ifdef ENABLE_PLUGINS + "plugins=yes, plugin-path=%.1000s " +#else + "plugins=no " +#endif + +#ifdef HAVE_LIBCURL + "libcurl=yes " +#else + "libcurl=no " +#endif + +#ifdef ENABLE_S3 + "S3=yes " +#else + "S3=no " +#endif + +#ifdef ENABLE_GCS + "GCS=yes " +#else + "GCS=no " +#endif + +#ifdef HAVE_LIBDEFLATE + "libdeflate=yes " +#else + "libdeflate=no " +#endif + +#ifdef HAVE_LIBLZMA + "lzma=yes " +#else + "lzma=no " +#endif + +#ifdef HAVE_LIBBZ2 + "bzip2=yes "; +#else + "bzip2=no "; +#endif + +#ifdef ENABLE_PLUGINS + static char config[1200]; + sprintf(config, fmt, htslib_plugin_path()); + return config; +#else + return fmt; +#endif +} + + HTSLIB_EXPORT const unsigned char seq_nt16_table[256] = { 15,15,15,15, 15,15,15,15, 15,15,15,15, 15,15,15,15, diff --git a/htslib/hfile.h b/htslib/hfile.h index 829e3b12b..0a54a7d8f 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -364,45 +364,6 @@ int hts_list_plugins(const char *plist[], int *nplugins); HTSLIB_EXPORT int htslib_has_plugin(const char *name); -/*! @abstract Introspection on the features enabled in htslib - * - * @return a bitfield of HTS_FEATURE_* macros. - */ -HTSLIB_EXPORT -unsigned int htslib_features(void); - -HTSLIB_EXPORT -const char *htslib_test_feature(int id); - -/*! 
@abstract Introspection on the features enabled in htslib, string form - * - * @return a string describing htslib build features - */ -HTSLIB_EXPORT -const char *htslib_feature_string(void); - -// Whether ./configure was used or vanilla Makefile -#define HTS_FEATURE_CONFIGURE 1 - -// Also see htslib_plugin_path function -#define HTS_FEATURE_PLUGINS 2 - -// Transport specific -#define HTS_FEATURE_LIBCURL 4 -#define HTS_FEATURE_S3 8 -#define HTS_FEATURE_GCS 16 - -// Compression options -#define HTS_FEATURE_LIBDEFLATE 32 -#define HTS_FEATURE_LZMA 64 -#define HTS_FEATURE_BZIP2 128 - -// Build params -#define HTS_FEATURE_CC (1<<28) -#define HTS_FEATURE_CFLAGS (1<<29) -#define HTS_FEATURE_LDFLAGS (1<<30) -#define HTS_FEATURE_CPPFLAGS (1u<<31) - #ifdef __cplusplus } #endif diff --git a/htslib/hts.h b/htslib/hts.h index 0f2fdd021..65315d16d 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -451,6 +451,46 @@ const char *hts_version(void); // further when significant features are merged. #define HTS_VERSION 101190 +/*! @abstract Introspection on the features enabled in htslib + * + * @return a bitfield of HTS_FEATURE_* macros. + */ +HTSLIB_EXPORT +unsigned int htslib_features(void); + +HTSLIB_EXPORT +const char *htslib_test_feature(int id); + +/*! 
@abstract Introspection on the features enabled in htslib, string form + * + * @return a string describing htslib build features + */ +HTSLIB_EXPORT +const char *htslib_feature_string(void); + +// Whether ./configure was used or vanilla Makefile +#define HTS_FEATURE_CONFIGURE 1 + +// Also see htslib_plugin_path function +#define HTS_FEATURE_PLUGINS 2 + +// Transport specific +#define HTS_FEATURE_LIBCURL 4 +#define HTS_FEATURE_S3 8 +#define HTS_FEATURE_GCS 16 + +// Compression options +#define HTS_FEATURE_LIBDEFLATE 32 +#define HTS_FEATURE_LZMA 64 +#define HTS_FEATURE_BZIP2 128 + +// Build params +#define HTS_FEATURE_CC (1<<28) +#define HTS_FEATURE_CFLAGS (1<<29) +#define HTS_FEATURE_LDFLAGS (1<<30) +#define HTS_FEATURE_CPPFLAGS (1u<<31) + + /*! @abstract Determine format by peeking at the start of a file @param fp File opened for reading, positioned at the beginning From 89db11228c6ae6503ff75f930d9b3fda86fc34d1 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 21 Jan 2021 18:34:12 +0000 Subject: [PATCH 087/488] Renumber introspection feature codes (and they are unsigned) Leave gaps in the allocated bits between the different feature categories. Leave a gap at the end for HTS_FEATURE_LIBS if it is added in future. Don't mention htslib_plugin_path() in the public header as it is an internal function. Change htslib_test_feature() to use unsigned int, as per htslib_features(). 
--- hts.c | 6 +++--- htslib/hts.h | 24 ++++++++++++------------ 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/hts.c b/hts.c index cb0a07cfb..68b166892 100644 --- a/hts.c +++ b/hts.c @@ -111,8 +111,8 @@ unsigned int htslib_features(void) { return feat; } -const char *htslib_test_feature(int id) { - int feat = htslib_features(); +const char *htslib_test_feature(unsigned int id) { + unsigned int feat = htslib_features(); switch (id) { case HTS_FEATURE_CONFIGURE: @@ -142,7 +142,7 @@ const char *htslib_test_feature(int id) { return HTS_CPPFLAGS; default: - fprintf(stderr, "Unknown feature code: %d\n", id); + fprintf(stderr, "Unknown feature code: %u\n", id); } return NULL; diff --git a/htslib/hts.h b/htslib/hts.h index 65315d16d..6337306b8 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -459,7 +459,7 @@ HTSLIB_EXPORT unsigned int htslib_features(void); HTSLIB_EXPORT -const char *htslib_test_feature(int id); +const char *htslib_test_feature(unsigned int id); /*! @abstract Introspection on the features enabled in htslib, string form * @@ -471,24 +471,24 @@ const char *htslib_feature_string(void); // Whether ./configure was used or vanilla Makefile #define HTS_FEATURE_CONFIGURE 1 -// Also see htslib_plugin_path function +// Whether --enable-plugins was used #define HTS_FEATURE_PLUGINS 2 // Transport specific -#define HTS_FEATURE_LIBCURL 4 -#define HTS_FEATURE_S3 8 -#define HTS_FEATURE_GCS 16 +#define HTS_FEATURE_LIBCURL (1u<<10) +#define HTS_FEATURE_S3 (1u<<11) +#define HTS_FEATURE_GCS (1u<<12) // Compression options -#define HTS_FEATURE_LIBDEFLATE 32 -#define HTS_FEATURE_LZMA 64 -#define HTS_FEATURE_BZIP2 128 +#define HTS_FEATURE_LIBDEFLATE (1u<<20) +#define HTS_FEATURE_LZMA (1u<<21) +#define HTS_FEATURE_BZIP2 (1u<<22) // Build params -#define HTS_FEATURE_CC (1<<28) -#define HTS_FEATURE_CFLAGS (1<<29) -#define HTS_FEATURE_LDFLAGS (1<<30) -#define HTS_FEATURE_CPPFLAGS (1u<<31) +#define HTS_FEATURE_CC (1u<<27) +#define HTS_FEATURE_CFLAGS (1u<<28) +#define 
HTS_FEATURE_CPPFLAGS (1u<<29) +#define HTS_FEATURE_LDFLAGS (1u<<30) /*! From 1683c53aece32424248f2a23074aee90ab66f911 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 3 Feb 2021 11:32:44 +0000 Subject: [PATCH 088/488] Rename hFILE plugin introspection functions to hfile_* The hFILE plugin mechanism is the only current plugin endpoint in HTSlib, but the facility is general so that there could be other plugin endpoints added in future, e.g., for other compression methods. Rename these introspection functions as they pertain to hFILE plugins only. --- hfile.c | 12 ++++++------ htslib/hfile.h | 12 ++++++------ test/test_introspection.c | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/hfile.c b/hfile.c index 2799c89dc..708957863 100644 --- a/hfile.c +++ b/hfile.c @@ -1148,7 +1148,7 @@ static const struct hFILE_scheme_handler *find_scheme_handler(const char *s) ***************************/ /* - * Fills out sc_list[] with the list of known schemes. + * Fills out sc_list[] with the list of known URL schemes. * This can be restricted to just ones from a specific plugin, * or all (plugin == NULL). * @@ -1156,7 +1156,7 @@ static const struct hFILE_scheme_handler *find_scheme_handler(const char *s) * -1 on failure. */ HTSLIB_EXPORT -int hts_list_schemes(const char *plugin, const char *sc_list[], int *nschemes) +int hfile_list_schemes(const char *plugin, const char *sc_list[], int *nschemes) { pthread_mutex_lock(&plugins_lock); if (!schemes && load_hfile_plugins() < 0) { @@ -1189,13 +1189,13 @@ int hts_list_schemes(const char *plugin, const char *sc_list[], int *nschemes) /* - * Fills out plist[] with the list of known plugins. + * Fills out plist[] with the list of known hFILE plugins. 
* * Returns number of schemes found on success; * -1 on failure */ HTSLIB_EXPORT -int hts_list_plugins(const char *plist[], int *nplugins) +int hfile_list_plugins(const char *plist[], int *nplugins) { pthread_mutex_lock(&plugins_lock); if (!schemes && load_hfile_plugins() < 0) { @@ -1225,13 +1225,13 @@ int hts_list_plugins(const char *plist[], int *nplugins) /* - * Tests for the presence of a specific plugin. + * Tests for the presence of a specific hFILE plugin. * * Returns 1 if true * 0 otherwise */ HTSLIB_EXPORT -int htslib_has_plugin(const char *name) +int hfile_has_plugin(const char *name) { pthread_mutex_lock(&plugins_lock); if (!schemes && load_hfile_plugins() < 0) { diff --git a/htslib/hfile.h b/htslib/hfile.h index 0a54a7d8f..55cae244e 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -316,7 +316,7 @@ purpose other than closing. HTSLIB_EXPORT char *hfile_mem_steal_buffer(hFILE *file, size_t *length); -/// Fills out sc_list[] with the list of known schemes. +/// Fills out sc_list[] with the list of known URL schemes. /** * @param plugin [in] Restricts schemes to only those from 'plugin. * @param sc_list [out] Filled out with the scheme names @@ -335,9 +335,9 @@ char *hfile_mem_steal_buffer(hFILE *file, size_t *length); * -1 on failure */ HTSLIB_EXPORT -int hts_list_schemes(const char *plugin, const char *sc_list[], int *nschemes); +int hfile_list_schemes(const char *plugin, const char *sc_list[], int *nschemes); -/// Fills out plist[] with the list of known plugins. +/// Fills out plist[] with the list of known hFILE plugins. /* * @param plist [out] Filled out with the plugin names * @param nplugins [in/out] Size of plist (in) and number returned (out) @@ -353,16 +353,16 @@ int hts_list_schemes(const char *plugin, const char *sc_list[], int *nschemes); * -1 on failure */ HTSLIB_EXPORT -int hts_list_plugins(const char *plist[], int *nplugins); +int hfile_list_plugins(const char *plist[], int *nplugins); -/// Tests for the presence of a specific plugin. 
+/// Tests for the presence of a specific hFILE plugin. /* * @param name The name of the plugin to query. * * @return 1 if found, 0 otherwise. */ HTSLIB_EXPORT -int htslib_has_plugin(const char *name); +int hfile_has_plugin(const char *name); #ifdef __cplusplus } diff --git a/test/test_introspection.c b/test/test_introspection.c index 9ee7d1aa3..0958a962f 100644 --- a/test/test_introspection.c +++ b/test/test_introspection.c @@ -64,13 +64,13 @@ int main(void) { const char *plugins[100]; int np = 100, i, j; - if (hts_list_plugins(plugins, &np) < 0) + if (hfile_list_plugins(plugins, &np) < 0) return 1; for (i = 0; i < np; i++) { const char *sc_list[100]; int nschemes = 100; - if (hts_list_schemes(plugins[i], sc_list, &nschemes) < 0) + if (hfile_list_schemes(plugins[i], sc_list, &nschemes) < 0) return 1; printf(" %s:\n", plugins[i]); From d9890a964368e2c300ae3d4258603df3fe534256 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 3 Feb 2021 16:33:40 +0000 Subject: [PATCH 089/488] Rename htslib_* introspection functions to hts_*. This maintains consistency with the existing hts_version() function. Note this also renames the internal htslib_plugin_path function too, again simply for consistency. Note it queries the HTS_PATH and not HTSLIB_PATH environment variable, so the new name is better.
--- hts.c | 10 +++++----- hts_internal.h | 2 +- htslib/hts.h | 6 +++--- plugin.c | 2 +- test/test_introspection.c | 12 ++++++------ 5 files changed, 16 insertions(+), 16 deletions(-) diff --git a/hts.c b/hts.c index 68b166892..57ee4d1d6 100644 --- a/hts.c +++ b/hts.c @@ -73,7 +73,7 @@ const char *hts_version() return HTS_VERSION_TEXT; } -unsigned int htslib_features(void) { +unsigned int hts_features(void) { unsigned int feat = 0; #ifdef PACKAGE_URL @@ -111,8 +111,8 @@ unsigned int htslib_features(void) { return feat; } -const char *htslib_test_feature(unsigned int id) { - unsigned int feat = htslib_features(); +const char *hts_test_feature(unsigned int id) { + unsigned int feat = hts_features(); switch (id) { case HTS_FEATURE_CONFIGURE: @@ -150,7 +150,7 @@ const char *htslib_test_feature(unsigned int id) { // Note this implementation also means we can just "strings" the library // to find the configuration parameters. -const char *htslib_feature_string(void) { +const char *hts_feature_string(void) { const char *fmt= #ifdef PACKAGE_URL @@ -203,7 +203,7 @@ const char *htslib_feature_string(void) { #ifdef ENABLE_PLUGINS static char config[1200]; - sprintf(config, fmt, htslib_plugin_path()); + sprintf(config, fmt, hts_plugin_path()); return config; #else return fmt; diff --git a/hts_internal.h b/hts_internal.h index 5ee88b7d0..602348618 100644 --- a/hts_internal.h +++ b/hts_internal.h @@ -104,7 +104,7 @@ plugin_void_func *load_plugin(void **pluginp, const char *filename, const char * void *plugin_sym(void *plugin, const char *name, const char **errmsg); plugin_void_func *plugin_func(void *plugin, const char *name, const char **errmsg); void close_plugin(void *plugin); -const char *htslib_plugin_path(void); +const char *hts_plugin_path(void); /* * Buffers up arguments to hts_idx_push for later use, once we've written all bar diff --git a/htslib/hts.h b/htslib/hts.h index 6337306b8..9488a9f9b 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -456,17 +456,17 @@ const char 
*hts_version(void); * @return a bitfield of HTS_FEATURE_* macros. */ HTSLIB_EXPORT -unsigned int htslib_features(void); +unsigned int hts_features(void); HTSLIB_EXPORT -const char *htslib_test_feature(unsigned int id); +const char *hts_test_feature(unsigned int id); /*! @abstract Introspection on the features enabled in htslib, string form * * @return a string describing htslib build features */ HTSLIB_EXPORT -const char *htslib_feature_string(void); +const char *hts_feature_string(void); // Whether ./configure was used or vanilla Makefile #define HTS_FEATURE_CONFIGURE 1 diff --git a/plugin.c b/plugin.c index fccc83793..cec5beefd 100644 --- a/plugin.c +++ b/plugin.c @@ -192,7 +192,7 @@ void close_plugin(void *plugin) } } -const char *htslib_plugin_path(void) { +const char *hts_plugin_path(void) { #ifdef ENABLE_PLUGINS char *path = getenv("HTS_PATH"); if (!path) path = ""; diff --git a/test/test_introspection.c b/test/test_introspection.c index 0958a962f..cc8ceb6e7 100644 --- a/test/test_introspection.c +++ b/test/test_introspection.c @@ -32,12 +32,12 @@ int main(void) { printf("Version string: %s\n", hts_version()); printf("Version number: %d\n", HTS_VERSION); - printf("\nCC: %s\n", htslib_test_feature(HTS_FEATURE_CC)); - printf("CPPFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_CPPFLAGS)); - printf("CFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_CFLAGS)); - printf("LDFLAGS: %s\n", htslib_test_feature(HTS_FEATURE_LDFLAGS)); + printf("\nCC: %s\n", hts_test_feature(HTS_FEATURE_CC)); + printf("CPPFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CPPFLAGS)); + printf("CFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CFLAGS)); + printf("LDFLAGS: %s\n", hts_test_feature(HTS_FEATURE_LDFLAGS)); - unsigned int feat = htslib_features(); + unsigned int feat = hts_features(); printf("\nFeature number: 0x%x\n", feat); if (feat & HTS_FEATURE_CONFIGURE) printf(" HTS_FEATURE_CONFIGURE\n"); @@ -56,7 +56,7 @@ int main(void) { if (feat & HTS_FEATURE_BZIP2) printf(" HTS_FEATURE_BZIP2\n"); - 
printf("\nFeature string: %s\n", htslib_feature_string()); + printf("\nFeature string: %s\n", hts_feature_string()); // Plugins and schemes From 8aa9bcd10c782c8a03adde978f9ef169ebaf4573 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 4 Feb 2021 11:58:43 +0000 Subject: [PATCH 090/488] Fix a minor memory leak in XRLE encoding. Credit to OSS-Fuzz Fixes oss-fuzz 30260 --- cram/cram_codecs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 55030c814..6a4ee7821 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -2130,6 +2130,7 @@ cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr, c->decode = cram_xrle_decode_char; else { fprintf(stderr, "BYTE_ARRAYs not supported by this codec\n"); + free(c); return NULL; } c->free = cram_xrle_decode_free; From dbac2d17ca4d2c8dc0df770e4b168c2c768548b7 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 9 Feb 2021 13:12:37 +0000 Subject: [PATCH 091/488] Fix htscodecs dependency typos --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index ce256a629..7b162f196 100644 --- a/Makefile +++ b/Makefile @@ -386,8 +386,8 @@ cram/pooled_alloc.o cram/pooled_alloc.pico: cram/pooled_alloc.c config.h cram/po cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/string_alloc.h thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) $(htslib_hts_log_h) -htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htsodecs_utils_h) $(htscodecs_c_simple_model.h) -htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model.h) +htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: 
htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_utils_h) $(htscodecs_c_simple_model_h) +htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model_h) htscodecs/htscodecs/pack.o htscodecs/htscodecs/pack.pico: htscodecs/htscodecs/pack.c config.h $(htscodecs_pack_h) htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(htscodecs_utils_h) htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_utils_h) $(htscodecs_rANS_static_h) From 222387d9d4ed776cf16d50f32a1170ec659c2f29 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 29 Jan 2021 17:58:06 +0000 Subject: [PATCH 092/488] Fix undefined behaviour warning in kstring Fixes a "runtime error: applying zero offset to null pointer" warning from clang 10.0.1 undefined behaviour sanitizer. This happened when the storage for the string had not been allocated before calling kvsprintf(). Fix by making it allocate a buffer if this is the case, so vsnprintf() has something to write into. --- kstring.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kstring.c b/kstring.c index 55626745d..9b2d60c1f 100644 --- a/kstring.c +++ b/kstring.c @@ -1,7 +1,7 @@ /* The MIT License Copyright (C) 2011 by Attractive Chaos - Copyright (C) 2013-2018, 2020 Genome Research Ltd. + Copyright (C) 2013-2018, 2020-2021 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the @@ -153,6 +153,15 @@ int kvsprintf(kstring_t *s, const char *fmt, va_list ap) return l; } + if (!s->s) { + const size_t sz = 64; + s->s = malloc(sz); + if (!s->s) + return -1; + s->m = sz; + s->l = 0; + } + l = vsnprintf(s->s + s->l, s->m - s->l, fmt, args); // This line does not work with glibc 2.0. See `man snprintf'. va_end(args); if (l + 1 > s->m - s->l) { From d40cad32b85186e7d760707c61984d3b6fc422b5 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 29 Jan 2021 18:11:31 +0000 Subject: [PATCH 093/488] Fix CRAM undefined behaviour warnings Fixes various warnings from clang 10.0.1 undefined behaviour sanitizer: - A "runtime error: applying zero offset to null pointer" warning in cram_to_bam() on CRAM slices with no CIGAR data. Fix by always allocating memory when creating a slice data structure. For aligned data this simply moves the allocation forward in time. For unaligned data, the allocated memory is not really needed, but it's assumed that it is quicker to make one allocation per slice than to check for a NULL pointer on every call to cram_to_bam(). - A "runtime error: applying zero offset to null pointer" warning in cram_byte_array_stop_decode_block() when it is called before the `out` block has been allocated. Fix by moving the calculation of `out_cp`. Also changes the type of some variables to `unsigned char` enabling removal of a number of (char *) casts. - An integer overflow in the cram_metrics::sz array. Entries 9 and 10 had a high multiplier to ensure they would never win, as they correspond to reserved method codes. Under certain conditions the value in them could build up to the point where they overflowed. The methods are masked out elsewhere, so the numbers would not have been used, but it did cause an overflow warning.
Fix by changing the multiplier to 1, and (in a belt-and-braces approach) setting the values for these entries to INT_MAX to ensure they won't be chosen. --- cram/cram_codecs.c | 18 +++++++++--------- cram/cram_io.c | 13 ++++++++----- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 6a4ee7821..0c6cb654e 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -3494,8 +3494,8 @@ int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, int *out_size) { cram_block *b; cram_block *out = (cram_block *)out_; - char *cp, *out_cp, *cp_end; - char stop; + unsigned char *cp, *cp_end; + unsigned char stop; b = cram_get_block_by_id(slice, c->u.byte_array_stop.content_id); if (!b) @@ -3503,25 +3503,25 @@ int cram_byte_array_stop_decode_block(cram_slice *slice, cram_codec *c, if (b->idx >= b->uncomp_size) return -1; - cp = (char *)b->data + b->idx; - cp_end = (char *)b->data + b->uncomp_size; - out_cp = (char *)BLOCK_END(out); + cp = b->data + b->idx; + cp_end = b->data + b->uncomp_size; stop = c->u.byte_array_stop.stop; if (cp_end - cp < out->alloc - out->byte) { + unsigned char *out_cp = BLOCK_END(out); while (cp != cp_end && *cp != stop) *out_cp++ = *cp++; - BLOCK_SIZE(out) = out_cp - (char *)BLOCK_DATA(out); + BLOCK_SIZE(out) = out_cp - BLOCK_DATA(out); } else { - char *cp_start; + unsigned char *cp_start; for (cp_start = cp; cp != cp_end && *cp != stop; cp++) ; BLOCK_APPEND(out, cp_start, cp - cp_start); BLOCK_GROW(out, cp - cp_start); } - *out_size = cp - (char *)(b->data + b->idx); - b->idx = cp - (char *)b->data + 1; + *out_size = cp - (b->data + b->idx); + b->idx = cp - b->data + 1; return 0; diff --git a/cram/cram_io.c b/cram/cram_io.c index 3a6b04c42..8099a9287 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -2080,7 +2080,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, 1.04, // 6 arithpr (O0) 1.05, // 7 fqz 1.05, // 8 tok3 (rans) - 9, 9, // 9,10 reserved + 1.00, 1.00, // 9,10 
reserved // Paramterised versions of above 1.01, // gzip rle @@ -2124,6 +2124,9 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, metrics->sz[m] *= 1+(meth_cost[m]-1)/3; } // else cost is ignored + // Ensure these are never used + metrics->sz[9] = metrics->sz[10] = INT_MAX; + for (m = 0; m < CRAM_MAX_METHOD; m++) { if ((!metrics->sz[m]) || (!(method & (1u<block_by_id = NULL; s->last_apos = 0; if (!(s->crecs = malloc(nrecs * sizeof(cram_record)))) goto err; - s->cigar = NULL; - s->cigar_alloc = 0; + s->cigar_alloc = 1024; + if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err; s->ncigar = 0; if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; @@ -4499,8 +4502,8 @@ cram_slice *cram_read_slice(cram_fd *fd) { } /* Initialise encoding/decoding tables */ - s->cigar = NULL; - s->cigar_alloc = 0; + s->cigar_alloc = 1024; + if (!(s->cigar = malloc(s->cigar_alloc * sizeof(*s->cigar)))) goto err; s->ncigar = 0; if (!(s->seqs_blk = cram_new_block(EXTERNAL, 0))) goto err; From 1b66f6e2d10504856a13b5cc2f0d250a410298fd Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 29 Jan 2021 18:34:36 +0000 Subject: [PATCH 094/488] Fix undefined behaviour warning in test-vcf-sweep If the input file had no PL entry in the header, bcf_get_format_int32() returns -1. Not checking for return values < 0 led to the loop over the samples running, and an attempt to do arithmetic on a NULL pointer (ptr). Fix by changing the condition on the nPLs test. 
--- test/test-vcf-sweep.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test-vcf-sweep.c b/test/test-vcf-sweep.c index 4b8e3d70b..40ee4e417 100644 --- a/test/test-vcf-sweep.c +++ b/test/test-vcf-sweep.c @@ -63,7 +63,7 @@ int main(int argc, char **argv) { // get copy of the PL vectors nPLs = bcf_get_format_int32(hdr, rec, "PL", &PLs, &mPLs); - if ( !nPLs ) continue; // PL not present + if ( nPLs <= 0 ) continue; // PL not present // how many values are there per sample int nvals = nPLs / bcf_hdr_nsamples(hdr); @@ -91,7 +91,7 @@ int main(int argc, char **argv) while ( (rec = bcf_sweep_bwd(sw)) ) { nPLs = bcf_get_format_int32(hdr, rec, "PL", &PLs, &mPLs); - if ( !nPLs ) continue; + if ( nPLs <= 0 ) continue; int nvals = nPLs / bcf_hdr_nsamples(hdr); int32_t *ptr = PLs; for (i=0; i Date: Tue, 2 Feb 2021 16:35:15 +0000 Subject: [PATCH 095/488] Fix undefined behaviour warning in bcf_record_check() Fixes a "runtime error: applying zero offset to null pointer" warning from clang 10.0.1 undefined behaviour sanitizer on lines with no FORMAT values. This happened when bcf_record_check() tried to calculate the end of the (NULL) rec->indiv.s buffer. Instead of trying to handle this in bcf_record_check(), fix by ensuring bcf_read1_core() allocates at least one byte for `indiv`, and for good measure do the same for `shared` as well. As well as fixing the warning in bcf_record_check(), this prevents NULL pointers from being passed to other functions, for example bgzf_read(). --- vcf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vcf.c b/vcf.c index e9868cadf..a0c440202 100644 --- a/vcf.c +++ b/vcf.c @@ -1236,9 +1236,9 @@ static inline int bcf_read1_core(BGZF *fp, bcf1_t *v) shared_len = le_to_u32(x); if (shared_len < 24) return -2; shared_len -= 24; // to exclude six 32-bit integers - if (ks_resize(&v->shared, shared_len) != 0) return -2; + if (ks_resize(&v->shared, shared_len ?
shared_len : 1) != 0) return -2; indiv_len = le_to_u32(x + 4); - if (ks_resize(&v->indiv, indiv_len) != 0) return -2; + if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2; v->rid = le_to_i32(x + 8); v->pos = le_to_u32(x + 12); v->rlen = le_to_i32(x + 16); From 9f8ac0365e455676af096ec81f83cf01dfb309da Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 2 Feb 2021 17:29:11 +0000 Subject: [PATCH 096/488] Fix truncation bug in filtering bitwise expressions When setting `res->is_true` on the result of bitwise expressions, only the lowest eight bits were used which meant it could be set incorrectly depending on the result. Fix by explicitly checking the result against zero. --- hts_expr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/hts_expr.c b/hts_expr.c index 1a1b9c52d..599d7a54a 100644 --- a/hts_expr.c +++ b/hts_expr.c @@ -1,6 +1,6 @@ /* hts_expr.c -- filter expression parsing and processing. - Copyright (C) 2020 Genome Research Ltd. + Copyright (C) 2020-2021 Genome Research Ltd. 
Author: James Bonfield @@ -413,7 +413,7 @@ static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, hts_expr_val_free(&val); return -1; } - res->is_true = res->d = (int64_t)res->d & (int64_t)val.d; + res->is_true = (res->d = ((int64_t)res->d & (int64_t)val.d)) != 0; } else { break; } @@ -441,7 +441,7 @@ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, hts_expr_val_free(&val); return -1; } - res->is_true = res->d = (int64_t)res->d ^ (int64_t)val.d; + res->is_true = (res->d = ((int64_t)res->d ^ (int64_t)val.d)) != 0; } else { break; } @@ -469,7 +469,7 @@ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, hts_expr_val_free(&val); return -1; } - res->is_true = res->d = (int64_t)res->d | (int64_t)val.d; + res->is_true = (res->d = ((int64_t)res->d | (int64_t)val.d)) != 0; } else { break; } From 7152cb40f1b5404ff9a0523aa3d4b6c060b1ac51 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 29 Jan 2021 18:41:11 +0000 Subject: [PATCH 097/488] Fix undefined behaviour in bcf_update_alleles() bcf_update_alleles() checked to see if any of the allele pointers passed in were pointing to the existing allele data, so it could avoid clobbering them during the update. This caused a "runtime error: applying zero offset to null pointer" from undefined behaviour sanitizer when it was called with a new BCF structure where line->d.als had not yet been allocated. However, even when it had been allocated, the test relied on undefined behaviour as it compared pointers to different memory regions. Fix by rewriting the function so that it always copies the input data to a new location. Where the new alleles are short enough (which should be the usual case) and can fit in the existing allocation, they are first copied into a stack buffer, and then back to line->d.als; otherwise they go into a newly allocated buffer which is switched for the original. 
This avoids the undefined behaviour issues, and also allows the line->d.als memory to be reused even when some of the input alleles point into it. Includes unit tests. --- test/test-vcf-api.c | 63 ++++++++++++++++++++++++++++++++++++++-- vcf.c | 70 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 114 insertions(+), 19 deletions(-) diff --git a/test/test-vcf-api.c b/test/test-vcf-api.c index 87bce4aab..22886e7af 100644 --- a/test/test-vcf-api.c +++ b/test/test-vcf-api.c @@ -46,6 +46,65 @@ void error(const char *format, ...) #define STRINGIFY(x) #x #define check0(x) ((x) == 0 ? (void) 0 : error("Failed: %s", STRINGIFY(x))) +static int check_alleles(bcf1_t *rec, const char **alleles, int num) { + int i; + if (rec->n_allele != num) { + fprintf(stderr, "Wrong number of alleles - expected %d, got %d\n", + num, rec->n_allele); + return -1; + } + if (bcf_unpack(rec, BCF_UN_STR) != 0) + return -1; + for (i = 0; i < num; i++) { + if (0 != strcmp(alleles[i], rec->d.allele[i])) { + fprintf(stderr, + "Mismatch for allele %d : expected '%s' got '%s'\n", + i, alleles[i], rec->d.allele[i]); + return -1; + } + } + return 0; +} + +static void test_update_alleles(bcf_hdr_t *hdr, bcf1_t *rec) +{ + // Exercise bcf_update_alleles() a bit + const char *alleles1[2] = { "G", "A" }; + const char *alleles2[3] = { "C", "TGCA", "CATG" }; +#define rep10(x) x x x x x x x x x x + const char *alleles3[3] = { rep10("ATTCTAGATC"), "TGCA", + rep10("CTATTATCTCTAATGACATG") }; +#undef rep10 + const char *alleles4[3] = { alleles3[2], NULL, alleles3[0] }; + // Add some alleles + check0(bcf_update_alleles(hdr, rec, alleles1, 2)); + check0(check_alleles(rec, alleles1, 2)); + // Erase them + check0(bcf_update_alleles(hdr, rec, NULL, 0)); + check0(check_alleles(rec, NULL, 0)); + // Expand to three + check0(bcf_update_alleles(hdr, rec, alleles2, 3)); + check0(check_alleles(rec, alleles2, 3)); + // Now try some bigger ones (should force a realloc) + check0(bcf_update_alleles(hdr, rec, alleles3, 
3)); + check0(check_alleles(rec, alleles3, 3)); + // Ensure it works even if one of the alleles points into the + // existing structure + alleles4[1] = rec->d.allele[1]; + check0(bcf_update_alleles(hdr, rec, alleles4, 3)); + alleles4[1] = alleles3[1]; // Will have been clobbered by the update + check0(check_alleles(rec, alleles4, 3)); + // Ensure it works when the alleles point into the existing data, + // rec->d.allele is used to define the input array and the + // order of the entries is changed. The result of this should + // be the same as alleles2. + char *tmp = rec->d.allele[0] + strlen(rec->d.allele[0]) - 4; + rec->d.allele[0] = rec->d.allele[2] + strlen(rec->d.allele[2]) - 1; + rec->d.allele[2] = tmp; + check0(bcf_update_alleles(hdr, rec, (const char **) rec->d.allele, 3)); + check0(check_alleles(rec, alleles2, 3)); +} + void write_bcf(char *fname) { // Init @@ -114,10 +173,10 @@ void write_bcf(char *fname) // .. ID check0(bcf_update_id(hdr, rec, "rs6054257")); // .. REF and ALT + test_update_alleles(hdr, rec); const char *alleles[2] = { "G", "A" }; - check0(bcf_update_alleles(hdr, rec, alleles, 2)); - check0(bcf_update_alleles(hdr, rec, NULL, 0)); check0(bcf_update_alleles_str(hdr, rec, "G,A")); + check0(check_alleles(rec, alleles, 2)); // .. QUAL rec->qual = 29; // .. FILTER diff --git a/vcf.c b/vcf.c index a0c440202..980f8bff8 100644 --- a/vcf.c +++ b/vcf.c @@ -4443,28 +4443,64 @@ static inline int _bcf1_sync_alleles(const bcf_hdr_t *hdr, bcf1_t *line, int nal int bcf_update_alleles(const bcf_hdr_t *hdr, bcf1_t *line, const char **alleles, int nals) { if ( !(line->unpacked & BCF_UN_STR) ) bcf_unpack(line, BCF_UN_STR); - kstring_t tmp = {0,0,0}; char *free_old = NULL; - - // If the supplied alleles are not pointers to line->d.als, the existing block can be reused. + char buffer[256]; + size_t used = 0; + + // The pointers in alleles may point into the existing line->d.als memory, + // so care needs to be taken not to clobber them while updating. 
Usually + // they will be short so we can copy through an intermediate buffer. + // If they're longer, or won't fit in the existing allocation we + // can allocate a new buffer to write into. Note that in either case + // pointers to line->d.als memory in alleles may not be valid when we've + // finished. int i; - for (i=0; i=line->d.als && alleles[i]d.als+line->d.m_als ) break; - if ( i==nals ) - { - // all alleles point elsewhere, reuse the existing block - tmp.l = 0; tmp.s = line->d.als; tmp.m = line->d.m_als; - } - else + size_t avail = line->d.m_als < sizeof(buffer) ? line->d.m_als : sizeof(buffer); + for (i=0; id.m_als) // Don't shrink the buffer + needed = line->d.m_als; + if (needed > INT_MAX) { + hts_log_error("REF + alleles too long to fit in a BCF record"); + return -1; + } + new_als = malloc(needed); + if (!new_als) + return -1; free_old = line->d.als; + line->d.als = new_als; + line->d.m_als = needed; + } - for (i=0; id.m_als); + memcpy(line->d.als, buffer, used); } - line->d.als = tmp.s; line->d.m_als = tmp.m; - free(free_old); + + // Add in any remaining entries - if this happens we will always be + // writing to a newly-allocated buffer. + for (; i < nals; i++) { + size_t sz = strlen(alleles[i]) + 1; + memcpy(line->d.als + used, alleles[i], sz); + used += sz; + } + + if (free_old) + free(free_old); return _bcf1_sync_alleles(hdr,line,nals); } From 473f8d86411548b24819597a75d8f7f6cdb27629 Mon Sep 17 00:00:00 2001 From: Robert Davies Date: Mon, 8 Feb 2021 17:36:46 +0000 Subject: [PATCH 098/488] Change MACH_O compatibility and current versions When building libhts.dylib, the compatibility_version and current_version were not being set in the way documented by Apple - notably they expect current_version to be from the same sequence as, and greater than compatibility_version. 
See: https://developer.apple.com/library/archive/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html Change the values so current_version is related to the current HTSlib version, and compatibility_version the last one that introduced a backwards compatible change. As earlier HTSlib releases set compatibility_version to 3, we prepend '3.' to these numbers so programs linked to earlier versions will still work with libraries built after this change. This will be removed on the next ABI-changing update to the library, which will allow the compatibility_version and current_version sequences to be reset. Thanks to John Marshall for help with this issue. --- Makefile | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 7b162f196..d2bdaf07f 100644 --- a/Makefile +++ b/Makefile @@ -122,7 +122,13 @@ include htscodecs.mk PACKAGE_VERSION := $(shell ./version.sh) LIBHTS_SOVERSION = 3 -MACH_O_COMPATIBILITY_VERSION = $(LIBHTS_SOVERSION) + +# Version numbers for the Mac dynamic library. Note that the leading 3 +# is not strictly necessary and should be removed the next time +# LIBHTS_SOVERSION is bumped (see #1144 and +# https://developer.apple.com/library/archive/documentation/DeveloperTools/Conceptual/DynamicLibraries/100-Articles/DynamicLibraryDesignGuidelines.html#//apple_ref/doc/uid/TP40002013-SW23) +MACH_O_COMPATIBILITY_VERSION = 3.1.11 +MACH_O_CURRENT_VERSION = 3.1.11 # $(NUMERIC_VERSION) is for items that must have a numeric X.Y.Z string # even if this is a dirty or untagged Git working tree. @@ -308,7 +314,7 @@ libhts.so: $(LIBHTS_OBJS:.o=.pico) # includes this project's build directory). 
libhts.dylib: $(LIBHTS_OBJS) - $(CC) -dynamiclib -install_name $(libdir)/libhts.$(LIBHTS_SOVERSION).dylib -current_version $(NUMERIC_VERSION) -compatibility_version $(MACH_O_COMPATIBILITY_VERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS) $(LIBS) + $(CC) -dynamiclib -install_name $(libdir)/libhts.$(LIBHTS_SOVERSION).dylib -current_version $(MACH_O_CURRENT_VERSION) -compatibility_version $(MACH_O_COMPATIBILITY_VERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS) $(LIBS) ln -sf $@ libhts.$(LIBHTS_SOVERSION).dylib cyghts-$(LIBHTS_SOVERSION).dll libhts.dll.a: $(LIBHTS_OBJS) From 26c9da239bc50bba2de0ad571ad3279c4dd8dd99 Mon Sep 17 00:00:00 2001 From: Alberto Date: Wed, 10 Feb 2021 11:27:22 -0700 Subject: [PATCH 099/488] Update vcf.h Fixed typo in example of bcf_get_format_*() functions. --- htslib/vcf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 6e476187a..732f25607 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -1023,7 +1023,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). * int max_ploidy = ngt/nsmpl; * for (i=0; i Date: Wed, 10 Feb 2021 10:13:37 +0000 Subject: [PATCH 100/488] Update htscodecs submodule - Fix off-by-one error in array bounds checking.
Credit to OSS-Fuzz Fixes oss-fuzz 30381 Fixes oss-fuzz 30395 - Use printf in shell scripts instead of echo -n --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 0ffb50be0..306b3ebf9 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 0ffb50be07eab4ff40c8e6b7dc346266c6a23421 +Subproject commit 306b3ebf96890e71ca6e376f09ae6a1a0765ed99 From 449cfe1cdabef00a9533a44f7bda921038395924 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 11 Feb 2021 16:44:43 +0000 Subject: [PATCH 101/488] Change travis to cirrus in .gitattributes --- .gitattributes | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitattributes b/.gitattributes index e46cc5cb4..a14bb82b1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -9,7 +9,7 @@ # Omit these files from release tarballs. /.appveyor.yml export-ignore .git* export-ignore -/.travis.yml export-ignore +/.cirrus.yml export-ignore README.md export-ignore # Remove the text attribute from reference files, so that git doesn't convert From 086881b291c827ad67e3d95ff605ed8094190bb3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 11 Feb 2021 17:45:45 +0000 Subject: [PATCH 102/488] Add {check,distclean,test}-htslib phony targets to htslib.mk So distclean-all, check-all and test-all targets can be added to samtools/bcftools build systems. 
--- htslib.mk | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/htslib.mk b/htslib.mk index 4d0fa0205..0c000a1a3 100644 --- a/htslib.mk +++ b/htslib.mk @@ -188,7 +188,8 @@ $(HTSDIR)/htslib.pc.tmp: # # clean: clean-htslib -all-htslib clean-htslib install-htslib plugins-htslib: +all-htslib check-htslib clean-htslib distclean-htslib install-htslib plugins-htslib test-htslib: +cd $(HTSDIR) && $(MAKE) $(@:-htslib=) -.PHONY: all-htslib clean-htslib install-htslib plugins-htslib +.PHONY: all-htslib check-htslib clean-htslib distclean-htslib install-htslib +.PHONY: plugins-htslib test-htslib From 550c6b1b98414ef60eeb665cbfda9f6350d8907c Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 11 Feb 2021 17:53:55 +0000 Subject: [PATCH 103/488] Clarify that the htscodecs submodule is part of htslib The original message printed when the submodule files are not present could be confusing if HTSlib was being built as an embedded part of SAMtools. Alter the message slightly to make it clearer that the submodule needs to be updated in the htslib checkout. Fixes samtools/samtools#1364 --- Makefile | 4 ++-- configure.ac | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index d2bdaf07f..c61a18491 100644 --- a/Makefile +++ b/Makefile @@ -422,10 +422,10 @@ htscodecs/htscodecs/%.h: | htscodecs/htscodecs ; htscodecs/htscodecs: @if test -e .git ; then \ - printf "\\n\\nError: htscodecs submodule files not present.\\n\ + printf "\\n\\nError: htscodecs submodule files not present for htslib.\\n\ Try running: \\n\ git submodule update --init --recursive\\n\ - and then re-run make.\\n\\n\\n" ; \ + in the top-level htslib directory and then re-run make.\\n\\n\\n" ; \ else \ printf "\\n\\nError: htscodecs submodule files not present and this is not a git checkout.\\n\ You have an incomplete distribution. 
Please try downloading one of the\\n\ diff --git a/configure.ac b/configure.ac index d79947ae4..4d777e137 100644 --- a/configure.ac +++ b/configure.ac @@ -319,7 +319,7 @@ included as a submodule. Try running: git submodule update --init --recursive -to update it, and then re-run configure. +in the top-level htslib directory to update it, and then re-run configure. ])], [MSG_ERROR([htscodecs submodule files not present. From 680c0b8ef0ff133d3b572abc80fe66fc2ea965f0 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 16 Feb 2021 14:01:58 +0000 Subject: [PATCH 104/488] Change CI scripts to use "autoreconf -i" and update docs autoconf 2.70 requires use of `autoreconf -i` for the AC_CANONICAL_HOST macro, which is pulled in by AC_FUNC_MMAP. Update CI configuration to use this, and change the instructions in the INSTALL and README.md files to match. --- .appveyor.yml | 2 +- .cirrus.yml | 2 +- INSTALL | 8 ++------ README.md | 3 +-- 4 files changed, 5 insertions(+), 10 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index f8944daef..45550bade 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -34,7 +34,7 @@ build_script: - set MSYSTEM=MINGW64 - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% - git submodule update --init --recursive - - "sh -lc \"autoheader && autoconf && ./configure --enable-werror CFLAGS='-g -O3' && make -j2\"" + - "sh -lc \"autoreconf -i && ./configure --enable-werror CFLAGS='-g -O3' && make -j2\"" #build_script: # - make diff --git a/.cirrus.yml b/.cirrus.yml index 6b9bcd8cf..4df3dfd08 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -33,7 +33,7 @@ compile_template: &COMPILE fi if test "$USE_CONFIG" = "yes"; then MAKE_OPTS= - autoreconf + autoreconf -i eval ./configure --enable-plugins --enable-werror $CONFIG_OPTS CFLAGS=\"-g -O3 $CFLAGS\" || \ ( cat config.log; false ) else diff --git a/INSTALL b/INSTALL index 42bffb85b..fbe591d5a 100644 --- a/INSTALL +++ b/INSTALL @@ -17,6 +17,7 @@ In addition, building the configure script requires: 
autoheader autoconf + autoreconf Running the configure script uses awk, along with a number of standard UNIX tools (cat, cp, grep, mv, rm, sed, among others). Almost @@ -79,12 +80,7 @@ This step is only needed if configure.ac has been changed, or if configure does not exist (for example, when building from a git clone). The configure script and config.h.in can be built by running: - autoheader - autoconf - -If you have a full GNU autotools install, you can alternatively run: - - autoreconf + autoreconf -i Basic Installation ================== diff --git a/README.md b/README.md index e28f4a415..fccfa7775 100644 --- a/README.md +++ b/README.md @@ -28,8 +28,7 @@ committed to this repository, so building the code from a Git repository requires extra steps: ```sh -autoheader # If using configure, generate the header template... -autoconf # ...and configure script (or use autoreconf to do both) +autoreconf -i # Build the configure script and install files it uses ./configure # Optional but recommended, for choosing extra functionality make make install From 10a6a8b869d7d58aa658fc5ca01f613321928944 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 16 Feb 2021 13:32:55 +0000 Subject: [PATCH 105/488] Allow a "remove tag" operation followed by "add the same tag" Fixes https://github.com/samtools/bcftools/issues/1414 (bcftools annotate can segfault when --remove and --annotations used at once) --- vcf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vcf.c b/vcf.c index 980f8bff8..565c28edd 100644 --- a/vcf.c +++ b/vcf.c @@ -4165,7 +4165,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v if ( inf ) { // Is it big enough to accommodate new block? 
- if ( str.l <= inf->vptr_len + inf->vptr_off ) + if ( inf->vptr && str.l <= inf->vptr_len + inf->vptr_off ) { if ( str.l != inf->vptr_len + inf->vptr_off ) line->d.shared_dirty |= BCF1_DIRTY_INF; uint8_t *ptr = inf->vptr - inf->vptr_off; @@ -4322,7 +4322,7 @@ int bcf_update_format(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const else { // The tag is already present, check if it is big enough to accommodate the new block - if ( str.l <= fmt->p_len + fmt->p_off ) + if ( fmt->p && str.l <= fmt->p_len + fmt->p_off ) { // good, the block is big enough if ( str.l != fmt->p_len + fmt->p_off ) line->d.indiv_dirty = 1; From 8127bfc98e9b4361dca2423fd42a59ad7c25dda7 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 16 Feb 2021 16:49:59 +0000 Subject: [PATCH 106/488] Fix reading UTF-8 encoded sample names when char is signed The trick used in bcf_hdr_parse_sample_line() to rapidly find tabs and newlines could be defeated by UTF-8 characters outside the Basic Latin range on platforms where "char" is signed (like x86). It's currently not clear if VCF intends to allow these, but the 4.3 specification does allow UTF-8 and it's easy enough to support. Fix by casting to unsigned when making the comparison. Modifies formatcols.vcf to include a UTF-8 character for a round-trip test. Fixes samtools/bcftools#1408 --- test/formatcols.vcf | 2 +- vcf.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/test/formatcols.vcf b/test/formatcols.vcf index c46cf46fe..6bbdb3af6 100644 --- a/test/formatcols.vcf +++ b/test/formatcols.vcf @@ -2,5 +2,5 @@ ##FILTER= ##contig= ##FORMAT= -#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S2 S3 +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT S1 S² S3 1 100 a A T . . . 
S a bbbbbbb ccccccccc diff --git a/vcf.c b/vcf.c index 565c28edd..81fc70c66 100644 --- a/vcf.c +++ b/vcf.c @@ -150,7 +150,7 @@ int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str) const char *p, *q; // add samples for (p = q = str;; ++q) { - if (*q > '\n') continue; + if ((unsigned char) *q > '\n') continue; if (++i > 9) { if ( bcf_hdr_add_sample_len(h, p, q - p) < 0 ) ret = -1; } From a117153ee56eb6fd0a4f8933320591451c3395c9 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 17 Feb 2021 17:04:19 +0000 Subject: [PATCH 107/488] Add mostlyclean-htslib and testclean-htslib targets to htslib.mk --- htslib.mk | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/htslib.mk b/htslib.mk index 0c000a1a3..6b199e382 100644 --- a/htslib.mk +++ b/htslib.mk @@ -188,8 +188,8 @@ $(HTSDIR)/htslib.pc.tmp: # # clean: clean-htslib -all-htslib check-htslib clean-htslib distclean-htslib install-htslib plugins-htslib test-htslib: +all-htslib check-htslib clean-htslib distclean-htslib install-htslib mostlyclean-htslib plugins-htslib test-htslib testclean-htslib: +cd $(HTSDIR) && $(MAKE) $(@:-htslib=) .PHONY: all-htslib check-htslib clean-htslib distclean-htslib install-htslib -.PHONY: plugins-htslib test-htslib +.PHONY: mostlyclean-htslib plugins-htslib test-htslib testclean-htslib From c6713293758bd8a852203bf08df0396a33d9a23c Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 22 Feb 2021 15:23:30 +0000 Subject: [PATCH 108/488] Pull in htscodecs name tokeniser uninitialised memory access fix Credit to OSS-Fuzz Fixes oss-fuzz 31210 Fixes oss-fuzz 31215 --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 306b3ebf9..2f481c822 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 306b3ebf96890e71ca6e376f09ae6a1a0765ed99 +Subproject commit 2f481c8227724c408fa7e7b2087457f74c041de9 From ee32bfb430dd15adf1291f2d46c26d2462c129ee Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 
1 Mar 2021 12:34:56 +0000 Subject: [PATCH 109/488] Recognise legacy RAZF compression RAZF is an obsolete predecessor to BGZF, and is similarly a variant of GZIP using an extra header field. It also adds a trailing index table. Adding this htsCompression value does not affect bgzf_read_init()'s detection of BGZF vs plain-GZIP; RAZF remains treated as is_gzip and the trailing index table is not handled well, leading to problems if you try to decompress such a legacy file with e.g. bgzip -d. --- hts.c | 10 ++++++++-- htslib/hts.h | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/hts.c b/hts.c index 57ee4d1d6..fe5d971ad 100644 --- a/hts.c +++ b/hts.c @@ -468,8 +468,13 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) if (len >= 2 && s[0] == 0x1f && s[1] == 0x8b) { // The stream is either gzip-compressed or BGZF-compressed. // Determine which, and decompress the first few records or lines. - fmt->compression = (len >= 18 && (s[3] & 4) && - memcmp(&s[12], "BC\2\0", 4) == 0)? bgzf : gzip; + fmt->compression = gzip; + if (len >= 18 && (s[3] & 4)) { + if (memcmp(&s[12], "BC\2\0", 4) == 0) + fmt->compression = bgzf; + else if (memcmp(&s[12], "RAZF", 4) == 0) + fmt->compression = razf_compression; + } if (len >= 9 && s[2] == 8) fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 
1 : -1; @@ -664,6 +669,7 @@ char *hts_format_description(const htsFormat *format) switch (format->compression) { case bzip2_compression: kputs(" bzip2-compressed", &str); break; + case razf_compression: kputs(" legacy-RAZF-compressed", &str); break; case custom: kputs(" compressed", &str); break; case gzip: kputs(" gzip-compressed", &str); break; case bgzf: diff --git a/htslib/hts.h b/htslib/hts.h index 9488a9f9b..a0bb7e472 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -209,7 +209,7 @@ enum htsExactFormat { }; enum htsCompression { - no_compression, gzip, bgzf, custom, bzip2_compression, + no_compression, gzip, bgzf, custom, bzip2_compression, razf_compression, compression_maximum = 32767 }; From 2abfea362eab837500b2c492dcbc47f5688d318c Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 2 Mar 2021 12:15:47 +0000 Subject: [PATCH 110/488] Refuse to read the legacy RAZF compression format Instead emit an error message recommending the use of gunzip to decompress the file, in the unlikely event a RAZF file is encountered. If seeking is available, attempt to read the sizes stored at the end of the RAZF trailing index table so that the message can show a truncate command to remove the index table before gunzipping the file. 
--- bgzf.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 46 insertions(+), 4 deletions(-) diff --git a/bgzf.c b/bgzf.c index ded3884e4..6fa6105fe 100644 --- a/bgzf.c +++ b/bgzf.c @@ -49,6 +49,10 @@ #include "cram/pooled_alloc.h" #include "hts_internal.h" +#ifndef EFTYPE +#define EFTYPE ENOEXEC +#endif + #define BGZF_CACHE #define BGZF_MT @@ -315,6 +319,37 @@ static inline void packInt32(uint8_t *buffer, uint32_t value) buffer[3] = value >> 24; } +static void razf_info(hFILE *hfp, const char *filename) +{ + uint64_t usize, csize; + off_t sizes_pos; + + if (filename == NULL || strcmp(filename, "-") == 0) filename = "FILE"; + + // RAZF files end with USIZE,CSIZE stored as big-endian uint64_t + if ((sizes_pos = hseek(hfp, -16, SEEK_END)) < 0) goto no_sizes; + if (hread(hfp, &usize, 8) != 8 || hread(hfp, &csize, 8) != 8) goto no_sizes; + if (!ed_is_big()) ed_swap_8p(&usize), ed_swap_8p(&csize); + if (csize >= sizes_pos) goto no_sizes; // Very basic validity check + + hts_log_error( +"To decompress this file, use the following commands:\n" +" truncate -s %" PRIu64 " %s\n" +" gunzip %s\n" +"The resulting uncompressed file should be %" PRIu64 " bytes in length.\n" +"If you do not have a truncate command, skip that step (though gunzip will\n" +"likely produce a \"trailing garbage ignored\" message, which can be ignored).", + csize, filename, filename, usize); + return; + +no_sizes: + hts_log_error( +"To decompress this file, use the following command:\n" +" gunzip %s\n" +"This will likely produce a \"trailing garbage ignored\" message, which can\n" +"usually be safely ignored.", filename); +} + static const char *bgzf_zerr(int errnum, z_stream *zs) { static char buffer[32]; @@ -352,7 +387,7 @@ static const char *bgzf_zerr(int errnum, z_stream *zs) } } -static BGZF *bgzf_read_init(hFILE *hfpr) +static BGZF *bgzf_read_init(hFILE *hfpr, const char *filename) { BGZF *fp; uint8_t magic[18]; @@ -368,6 +403,13 @@ static BGZF *bgzf_read_init(hFILE *hfpr) 
fp->compressed_block = (char *)fp->uncompressed_block + BGZF_MAX_BLOCK_SIZE; fp->is_compressed = (n==18 && magic[0]==0x1f && magic[1]==0x8b); fp->is_gzip = ( !fp->is_compressed || ((magic[3]&4) && memcmp(&magic[12], "BC\2\0",4)==0) ) ? 0 : 1; + if (fp->is_compressed && (magic[3]&4) && memcmp(&magic[12], "RAZF", 4)==0) { + hts_log_error("Cannot decompress legacy RAZF format"); + razf_info(hfpr, filename); + free(fp); + errno = EFTYPE; + return NULL; + } #ifdef BGZF_CACHE if (!(fp->cache = malloc(sizeof(*fp->cache)))) { free(fp); @@ -450,7 +492,7 @@ BGZF *bgzf_open(const char *path, const char *mode) if (strchr(mode, 'r')) { hFILE *fpr; if ((fpr = hopen(path, mode)) == 0) return 0; - fp = bgzf_read_init(fpr); + fp = bgzf_read_init(fpr, path); if (fp == 0) { hclose_abruptly(fpr); return NULL; } fp->fp = fpr; } else if (strchr(mode, 'w') || strchr(mode, 'a')) { @@ -473,7 +515,7 @@ BGZF *bgzf_dopen(int fd, const char *mode) if (strchr(mode, 'r')) { hFILE *fpr; if ((fpr = hdopen(fd, mode)) == 0) return 0; - fp = bgzf_read_init(fpr); + fp = bgzf_read_init(fpr, NULL); if (fp == 0) { hclose_abruptly(fpr); return NULL; } // FIXME this closes fd fp->fp = fpr; } else if (strchr(mode, 'w') || strchr(mode, 'a')) { @@ -494,7 +536,7 @@ BGZF *bgzf_hopen(hFILE *hfp, const char *mode) BGZF *fp = NULL; assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); if (strchr(mode, 'r')) { - fp = bgzf_read_init(hfp); + fp = bgzf_read_init(hfp, NULL); if (fp == NULL) return NULL; } else if (strchr(mode, 'w') || strchr(mode, 'a')) { fp = bgzf_write_init(mode); From b6ccfa641431523a4ee583b34f8a9817f44535a9 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 25 Feb 2021 16:36:05 +0000 Subject: [PATCH 111/488] Update to htscodecs release 1.0 Updates the htscodecs submodule and updates dependencies for the new header files. Adds Makefile infrastructure to build and clean up the htscodecs/htscodecs/version.h file, for git checkouts only. 
Release tarballs will ship with a pre-built copy of this file. Fixes issue where the Makefile rules to test if the submodule is missing would cause missing source files to be silently ignored. They now run a recipe to check for the target, and suggest updating the submodule if it's missing. Adds documentation for the '--with-external-htscodecs' configure option to INSTALL. The hts_version() function is unchanged reporting pure htslib only. Use hts_test_feature(HTS_FEATURE_HTSCODECS) to query htscodecs version. --- INSTALL | 8 ++++++++ Makefile | 37 +++++++++++++++++++++++++++++++++---- hts.c | 25 ++++++++++++++++++------- htscodecs | 2 +- htscodecs_bundled.mk | 3 +++ htscodecs_external.mk | 2 ++ htslib/hts.h | 1 + test/test_introspection.c | 4 ++++ 8 files changed, 70 insertions(+), 12 deletions(-) diff --git a/INSTALL b/INSTALL index fbe591d5a..277be56d2 100644 --- a/INSTALL +++ b/INSTALL @@ -73,6 +73,10 @@ either clone the project using "git clone --recurse-submodules", or run: to ensure the correct version of the submodule is present. +It is also possible to link against an external libhtscodecs library +by using the '--with-external-htscodecs' configure option. When +this is used, the submodule files will be ignored. + Building Configure ================== @@ -140,6 +144,10 @@ various features and specify further optional external requirements: searched; you can use --with-plugin-path='DIR:$(plugindir):DIR' and so on to cause additional directories to be searched. +--with-external-htscodecs + Build and link against an external copy of the htscodecs library + instead of using the source files in the htscodecs directory. + --enable-libcurl Use libcurl () to implement network access to remote files via FTP, HTTP, HTTPS, etc. 
diff --git a/Makefile b/Makefile index c61a18491..24bb2295d 100644 --- a/Makefile +++ b/Makefile @@ -358,7 +358,7 @@ hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstrin hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) -hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) +hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h) hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) @@ -394,6 +394,7 @@ thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_utils_h) $(htscodecs_c_simple_model_h) htscodecs/htscodecs/fqzcomp_qual.o 
htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model_h) +htscodecs/htscodecs/htscodecs.o htscodecs/htscodecs/htscodecs.pico: htscodecs/htscodecs/htscodecs.c $(htscodecs_htscodecs_h) $(htscodecs_version_h) htscodecs/htscodecs/pack.o htscodecs/htscodecs/pack.pico: htscodecs/htscodecs/pack.c config.h $(htscodecs_pack_h) htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(htscodecs_utils_h) htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_utils_h) $(htscodecs_rANS_static_h) @@ -417,9 +418,19 @@ tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htsl # Runes to check that the htscodecs submodule is present ifdef HTSCODECS_SOURCES htscodecs/htscodecs/%.c: | htscodecs/htscodecs - ; + @if test -e htscodecs/.git && test ! -e "$@" ; then \ + echo "Missing file '$@'" ; \ + echo " - Do you need to update the htscodecs submodule?" ; \ + false ; \ + fi + htscodecs/htscodecs/%.h: | htscodecs/htscodecs - ; + @if test -e htscodecs/.git && test ! -e "$@" ; then \ + echo "Missing file '$@'" ; \ + echo " - Do you need to update the htscodecs submodule?" 
; \ + false ; \ + fi + htscodecs/htscodecs: @if test -e .git ; then \ printf "\\n\\nError: htscodecs submodule files not present for htslib.\\n\ @@ -432,6 +443,21 @@ htscodecs/htscodecs: official releases from https://www.htslib.org/\\n" ; \ fi @false + +# Build the htscodecs/htscodecs/version.h file if necessary +htscodecs/htscodecs/version.h: force + @if test -e htscodecs/.git && test -e htscodecs/configure.ac ; then \ + cd htscodecs && \ + vers=`git describe --always --dirty --match 'v[0-9]\.[0-9]*'` && \ + case "$$vers" in \ + v*) vers=$${vers#v} ;; \ + *) iv=`awk '/^AC_INIT/ { match($$0, /^AC_INIT\(htscodecs, *([0-9](\.[0-9])*)\)/, m); print substr($$0, m[1, "start"], m[1, "length"]) }' configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \ + esac ; \ + if ! grep -s -q '"'"$$vers"'"' htscodecs/version.h ; then \ + echo 'Updating $@ : #define HTSCODECS_VERSION_TEXT "'"$$vers"'"' ; \ + echo '#define HTSCODECS_VERSION_TEXT "'"$$vers"'"' > htscodecs/version.h ; \ + fi ; \ + fi endif # Maintainer source code checks @@ -705,9 +731,12 @@ testclean: -rm -f test/*.tmp test/*.tmp.* test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt -rm -rf htscodecs/tests/test.out +# Only remove this in git checkouts +DEL_HTSCODECS_VERSION := $(if $(wildcard htscodecs/.git),htscodecs/htscodecs/version.h) + mostlyclean: testclean -rm -f *.o *.pico cram/*.o cram/*.pico test/*.o test/*.dSYM config_vars.h version.h - -rm -f htscodecs/htscodecs/*.o htscodecs/htscodecs/*.pico + -rm -f htscodecs/htscodecs/*.o htscodecs/htscodecs/*.pico $(DEL_HTSCODECS_VERSION) -rm -f hts-object-files -rm -f htscodecs/tests/*.o diff --git a/hts.c b/hts.c index fe5d971ad..7842d92f3 100644 --- a/hts.c +++ b/hts.c @@ -58,6 +58,11 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/kseq.h" #include "htslib/ksort.h" #include "htslib/tbx.h" +#if defined(HAVE_EXTERNAL_LIBHTSCODECS) +#include +#else +#include "htscodecs/htscodecs/htscodecs.h" +#endif #ifndef EFTYPE #define EFTYPE ENOEXEC @@ -74,7 +79,7 @@ const char *hts_version() } unsigned int hts_features(void) { - unsigned int feat = 0; + unsigned int feat = HTS_FEATURE_HTSCODECS; // Always present #ifdef PACKAGE_URL feat |= HTS_FEATURE_CONFIGURE; @@ -132,6 +137,9 @@ const char *hts_test_feature(unsigned int id) { case HTS_FEATURE_LZMA: return feat & HTS_FEATURE_LZMA ? "yes" : NULL; + case HTS_FEATURE_HTSCODECS: + return htscodecs_version(); + case HTS_FEATURE_CC: return HTS_CC; case HTS_FEATURE_CFLAGS: @@ -151,6 +159,7 @@ const char *hts_test_feature(unsigned int id) { // Note this implementation also means we can just "strings" the library // to find the configuration parameters. const char *hts_feature_string(void) { + static char config[1200]; const char *fmt= #ifdef PACKAGE_URL @@ -196,18 +205,20 @@ const char *hts_feature_string(void) { #endif #ifdef HAVE_LIBBZ2 - "bzip2=yes "; + "bzip2=yes " #else - "bzip2=no "; + "bzip2=no " #endif + "htscodecs=%.40s"; + #ifdef ENABLE_PLUGINS - static char config[1200]; - sprintf(config, fmt, hts_plugin_path()); - return config; + snprintf(config, sizeof(config), fmt, + hts_plugin_path(), htscodecs_version()); #else - return fmt; + snprintf(config, sizeof(config), fmt, htscodecs_version()); #endif + return config; } diff --git a/htscodecs b/htscodecs index 2f481c822..30bc9fdca 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 2f481c8227724c408fa7e7b2087457f74c041de9 +Subproject commit 30bc9fdca45e144bd975eb2a2563c1cac43c2ec5 diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index de4d5db8d..7242e210b 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -25,6 +25,7 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ $(HTSPREFIX)htscodecs/htscodecs/fqzcomp_qual.c \ + 
$(HTSPREFIX)htscodecs/htscodecs/htscodecs.c \ $(HTSPREFIX)htscodecs/htscodecs/pack.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \ @@ -36,6 +37,7 @@ HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) # htscodecs public headers htscodecs_arith_dynamic_h = htscodecs/htscodecs/arith_dynamic.h htscodecs_fqzcomp_qual_h = htscodecs/htscodecs/fqzcomp_qual.h +htscodecs_htscodecs_h = htscodecs/htscodecs/htscodecs.h $(htscodecs_version_h) htscodecs_pack_h = htscodecs/htscodecs/pack.h htscodecs_rANS_static_h = htscodecs/htscodecs/rANS_static.h htscodecs_rANS_static4x16_h = htscodecs/htscodecs/rANS_static4x16.h @@ -51,6 +53,7 @@ htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h htscodecs_rANS_word_h = htscodecs/htscodecs/rANS_word.h $(htscodecs_htscodecs_endian_h) htscodecs_utils_h = htscodecs/htscodecs/utils.h +htscodecs_version_h = htscodecs/htscodecs/version.h # Add htscodecs tests into the HTSlib test framework diff --git a/htscodecs_external.mk b/htscodecs_external.mk index ce24dd6f5..3f86811f5 100644 --- a/htscodecs_external.mk +++ b/htscodecs_external.mk @@ -28,6 +28,7 @@ HTSCODECS_TEST_TARGETS = htscodecs_arith_dynamic_h = htscodecs_fqzcomp_qual_h = +htscodecs_htscodecs_h = htscodecs_pack_h = htscodecs_rANS_static_h = htscodecs_rANS_static4x16_h = @@ -42,3 +43,4 @@ htscodecs_pooled_alloc_h = htscodecs_rANS_byte_h = htscodecs_rANS_word_h = htscodecs_utils_h = +htscodecs_version_h = diff --git a/htslib/hts.h b/htslib/hts.h index a0bb7e472..d4365e6e3 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -483,6 +483,7 @@ const char *hts_feature_string(void); #define HTS_FEATURE_LIBDEFLATE (1u<<20) #define HTS_FEATURE_LZMA (1u<<21) #define HTS_FEATURE_BZIP2 (1u<<22) +#define HTS_FEATURE_HTSCODECS (1u<<23) // htscodecs library version // Build params #define HTS_FEATURE_CC (1u<<27) diff --git a/test/test_introspection.c b/test/test_introspection.c index 
cc8ceb6e7..658d92f92 100644 --- a/test/test_introspection.c +++ b/test/test_introspection.c @@ -31,6 +31,8 @@ DEALINGS IN THE SOFTWARE. */ int main(void) { printf("Version string: %s\n", hts_version()); printf("Version number: %d\n", HTS_VERSION); + printf("\nhtscodecs version: %s\n", + hts_test_feature(HTS_FEATURE_HTSCODECS)); printf("\nCC: %s\n", hts_test_feature(HTS_FEATURE_CC)); printf("CPPFLAGS: %s\n", hts_test_feature(HTS_FEATURE_CPPFLAGS)); @@ -55,6 +57,8 @@ int main(void) { printf(" HTS_FEATURE_LZMA\n"); if (feat & HTS_FEATURE_BZIP2) printf(" HTS_FEATURE_BZIP2\n"); + if (feat & HTS_FEATURE_HTSCODECS) + printf(" HTS_FEATURE_HTSCODECS\n"); printf("\nFeature string: %s\n", hts_feature_string()); From 0380c7b4f35f576e3d9b94a27dfe10f22ecdb620 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 3 Mar 2021 23:19:51 +0000 Subject: [PATCH 112/488] When not using configure, define _XOPEN_SOURCE (PR #1246) Non-configure -std=c99 builds (as opposed to -std=gnu99 builds), e.g., `make CC='gcc -std=c99'`, previously failed as glibc (and maybe others) suppresses non-Standard-C functions in standard headers in this mode. This reactivates them. In particular: Rhtslib, Rsamtools, and other R-based builds do not use configure and don't supply their own config.h, and may specify -std=c99. (500 suffices for glibc, but macOS's headers require 600 to have them provide declarations for strdup() and snprintf(). This appears to be a bug related to their expected _C99_SOURCE define, which isn't defined.) --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index 24bb2295d..a3c0748b4 100644 --- a/Makefile +++ b/Makefile @@ -239,6 +239,9 @@ thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) # but if those aren't used create a default config.h here. 
config.h: echo '/* Default config.h generated by Makefile */' > $@ + echo '#ifndef _XOPEN_SOURCE' >> $@ + echo '#define _XOPEN_SOURCE 600' >> $@ + echo '#endif' >> $@ echo '#define HAVE_LIBBZ2 1' >> $@ echo '#define HAVE_LIBLZMA 1' >> $@ echo '#ifndef __APPLE__' >> $@ From 848b301d6f04a44fa49b08987085eb040bbe1fc3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 3 Mar 2021 23:35:01 +0000 Subject: [PATCH 113/488] Fix leak of uncompressed_block on bgzf_read_init() error Detected by making an RAZF file, but it could also leak on failure to create the cache. Credit to OSS-Fuzz Fixes oss-fuzz 31665 --- bgzf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/bgzf.c b/bgzf.c index 6fa6105fe..b2ee7c869 100644 --- a/bgzf.c +++ b/bgzf.c @@ -406,16 +406,19 @@ static BGZF *bgzf_read_init(hFILE *hfpr, const char *filename) if (fp->is_compressed && (magic[3]&4) && memcmp(&magic[12], "RAZF", 4)==0) { hts_log_error("Cannot decompress legacy RAZF format"); razf_info(hfpr, filename); + free(fp->uncompressed_block); free(fp); errno = EFTYPE; return NULL; } #ifdef BGZF_CACHE if (!(fp->cache = malloc(sizeof(*fp->cache)))) { + free(fp->uncompressed_block); free(fp); return NULL; } if (!(fp->cache->h = kh_init(cache))) { + free(fp->uncompressed_block); free(fp->cache); free(fp); return NULL; From 1949a15a544ce5fb1ea48b53d7860df035a09f93 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Thu, 4 Mar 2021 17:33:29 +0000 Subject: [PATCH 114/488] Add options to force S3 address style. (PR #1249) Add options to force S3 address style. (PR #1249) Allow S3 configuration options to force path-style URLs, for local set-ups that do not support virtual hosts. Fixes samtools/samtools#1377 --- hfile_s3.c | 54 +++++++++++++++++++++++++++++++++++++++++++--- htslib-s3-plugin.7 | 19 +++++++++++++++- 2 files changed, 69 insertions(+), 4 deletions(-) diff --git a/hfile_s3.c b/hfile_s3.c index eeb355065..4f1d536dc 100644 --- a/hfile_s3.c +++ b/hfile_s3.c @@ -29,6 +29,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include #include +#include #include #include @@ -503,6 +504,7 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, ptrdiff_t bucket_len; int is_https = 1, dns_compliant; char *query_start; + enum {s3_auto, s3_virtual, s3_path} address_style = s3_auto; if (!ad) return NULL; @@ -555,29 +557,75 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, if ((v = getenv("AWS_DEFAULT_PROFILE")) != NULL) kputs(v, &profile); else if ((v = getenv("AWS_PROFILE")) != NULL) kputs(v, &profile); else kputs("default", &profile); + + if ((v = getenv("HTS_S3_ADDRESS_STYLE")) != NULL) { + if (strcasecmp(v, "virtual") == 0) { + address_style = s3_virtual; + } else if (strcasecmp(v, "path") == 0) { + address_style = s3_path; + } + } } if (ad->id.l == 0) { + kstring_t url_style = KS_INITIALIZE; const char *v = getenv("AWS_SHARED_CREDENTIALS_FILE"); parse_ini(v? v : "~/.aws/credentials", profile.s, "aws_access_key_id", &ad->id, "aws_secret_access_key", &ad->secret, "aws_session_token", &ad->token, - "region", &ad->region, NULL); + "region", &ad->region, + "addressing_style", &url_style, + NULL); + + if (url_style.l) { + if (strcmp(url_style.s, "virtual") == 0) { + address_style = s3_virtual; + } else if (strcmp(url_style.s, "path") == 0) { + address_style = s3_path; + } else { + address_style = s3_auto; + } + } + + ks_free(&url_style); } if (ad->id.l == 0) { + kstring_t url_style = KS_INITIALIZE; const char *v = getenv("HTS_S3_S3CFG"); parse_ini(v? v : "~/.s3cfg", profile.s, "access_key", &ad->id, "secret_key", &ad->secret, "access_token", &ad->token, "host_base", &ad->host, - "bucket_location", &ad->region, NULL); + "bucket_location", &ad->region, + "host_bucket", &url_style, + NULL); + + if (url_style.l) { + // Conforming to s3cmd's GitHub PR#416, host_bucket without the "%(bucket)s" string + // indicates use of path style addressing.
+ if (strstr(url_style.s, "%(bucket)s") == NULL) { + address_style = s3_path; + } else { + address_style = s3_auto; + } + } + + ks_free(&url_style); } if (ad->id.l == 0) parse_simple("~/.awssecret", &ad->id, &ad->secret); - dns_compliant = is_dns_compliant(bucket, path, is_https); + + // if address_style is set, force the dns_compliant setting + if (address_style == s3_virtual) { + dns_compliant = 1; + } else if (address_style == s3_path) { + dns_compliant = 0; + } else { + dns_compliant = is_dns_compliant(bucket, path, is_https); + } if (ad->host.l == 0) kputs("s3.amazonaws.com", &ad->host); diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7 index ea8eefa39..022003afb 100644 --- a/htslib-s3-plugin.7 +++ b/htslib-s3-plugin.7 @@ -2,7 +2,7 @@ .SH NAME s3 plugin \- htslib AWS S3 plugin .\" -.\" Copyright (C) 2019 Genome Research Ltd. +.\" Copyright (C) 2021 Genome Research Ltd. .\" .\" Author: Andrew Whitwham .\" @@ -105,16 +105,33 @@ Sets the upload part size in Mb, the minimum being 5Mb. By default the part size starts at 5Mb and expands at regular intervals to accommodate bigger files (up to 2.5 Tbytes with the current rate). Using this setting disables the automatic part size expansion. +.TP +.B HTS_S3_ADDRESS_STYLE +Sets the URL style. Options are auto (default), virtual or path. .LP In the absence of an ID from the previous two methods the credential/config files will be used. The default file locations are either \fI~/.aws/credentials\fR or \fI~/.s3cfg\fR (in that order). + +Entries used in aws style credentials file are aws_access_key_id, +aws_secret_access_key, aws_session_token, region and addressing_style. Only the +first two are usually needed. + +Entries used in s3cmd style config files are access_key, secret_key, +access_token, host_base, bucket_location and host_bucket. Again only the first +two are usually needed. The host_bucket option is only used to set a path-style +URL, see below. 
+ .SH NOTES In most cases this plugin transforms the given URL into a virtual host-style format e.g. \fIhttps://bucket.host/path/to/file\fR. A path-style format is used where the URL is not DNS compliant or the bucket name contains a dot e.g. \fIhttps://host/bu.cket/path/to/file\fR. +Path-style can be forced by setting one of HTS_S3_ADDRESS_STYLE, +addressing_style or host_bucket. The first two can be set to \fBpath\fR while +host_bucket must \fBnot\fR include the \fB%(bucket)s\fR string. + .SH "SEE ALSO" .BR htsfile (1) .BR samtools (1) From f518a265cfeb575c0d93f6090eea863e5ad4a914 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Tue, 2 Feb 2021 13:59:33 +0000 Subject: [PATCH 115/488] News update for the next release (1.12). --- NEWS | 141 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 141 insertions(+) diff --git a/NEWS b/NEWS index 658b7cad2..170bacee6 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,36 @@ Noteworthy changes in release a.b ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Features and Updates +-------------------- + +* Added experimental CRAM 3.1 and 4.0 support. (#929) + + These should not be used for long term data storage as the + specification still needs to be ratified by GA4GH and may be subject + to changes in format. (This is highly likely for 4.0). However it + may be tested using: + + test/test_view -t ref.fa -C -o version=3.1 in.bam -p out31.cram + + For smaller but slower files, try varying the compression profile + with an additional "-o small". Profile choices are fast, normal, + small and archive, and can be applied to all CRAM versions. + +* Added a general filtering syntax for alignment records in SAM/BAM/CRAM + readers. (#1181, #1203) + + An example to find chromosome spanning read-pairs with high mapping + quality: 'mqual >= 30 && mrname != rname' + + To find significant sized deletions: + 'cigar =~ "[0-9]{2}D"' or 'rlen - qlen > 10'.
+ + To report duplicates that aren't part of a "proper pair": + 'flag.dup && !flag.proper_pair' + + More details are in the samtools.1 man page under "FILTER EXPRESSIONS". + * The knet networking code has been removed. It only supported the http and ftp protocols, and a better and safer alternative using libcurl has been available since release 1.3. If you need access to ftp:// and @@ -9,6 +39,117 @@ Noteworthy changes in release a.b * The old htslib/knetfile.h interfaces have been marked as deprecated. Any code still using them should be updated to use hFILE instead. (#1200) +* Added an introspection API for checking some of the capabilities provided + by HTSlib. (#1170) Thanks also to John Marshall for contributions. (#1222) + - `hfile_list_schemes`: returns the number of schemes found + - `hfile_list_plugins`: returns the number of plugins found + - `hfile_has_plugin`: checks if a specific plugin is available + - `hts_features`: returns a bit mask with all available features + - `hts_test_feature`: test if a feature is available + - `hts_feature_string`: return a string summary of enabled features + +* Made performance improvements to `probaln_glocal` method, which + speeds up mpileup BAQ calculations. (#1188) + - Caching of reused loop variables and removal of loop invariants + - Code reordering to remove instruction latency. + - Other refactoring and tidyups. + +* Added a public method for constructing a BAM record from the + component pieces. Thanks to Anders Kaplan. (#1159, #1164) + +* Added two public methods, `sam_parse_cigar` and `bam_parse_cigar`, as part of + a small CIGAR API (#1169, #1182). Thanks to Daniel Cameron for input. (#1147) + +* HTSlib, and the included htsfile program, will now recognise the old + RAZF compressed file format. Note that while the format is detected, + HTSlib is unable to read it. It is recommended that RAZF files are + uncompressed with `gunzip` before using them with HTSlib. Thanks to + John Marshall (#1244); and Matthew J. 
Oldach who reported problems + with uncompressing some RAZF files (samtools/samtools#1387). + +* The S3 plugin now has options to force the address style. It will recognise + the addressing_style and host_bucket entries in the respective aws + .credentials and s3cmd .s3cfg files. There is also a new HTS_S3_ADDRESS_STYLE + environment variable. Details are in the htslib-s3-plugin.7 man file (#1249). + +Build changes +------------- + +These are compiler, configuration and makefile based changes. + +* Added new Makefile targets for the applications that embed HTSlib and + want to run its test suite or clean its generated artefacts. (#1230, #1238) + +* The CRAM codecs are now obtained via the htscodecs submodule, hence + when cloning it is now best to use "git clone --recursive". In an + existing clone, you may use "git submodule update --init" to obtain + the htscodecs submodule checkout. + +* Updated CI test configuration to recurse HTSlib submodules. (#1359) + +* Added Cirrus-CI integration as a replacement for Travis, which was + phased out. (#1175; #1212) + +* Updated the Windows image used by Appveyor to 'Visual Studio 2019'. (#1172; + fixed #1166) + +* Fixed a buglet in configure.ac, exposed by the release 2.70 of autoconf. + Thanks to John Marshall. (#1198) + +* Fixed plugin linking on macOS, to prevent symbol conflict when linking + with a static HTSlib. Thanks to John Marshall. (#1184) + +* Fixed a clang++9 error in `cram_io.h`. Thanks to Pjotr Prins. (#1190) + +* Introduced $(ALL_CPPFLAGS) to allow for more flexibility in setting the + compiler flags. Thanks to John Marshall. (#1187) + +* Added 'fall through' comments to prevent warnings issued by Clang on + intentional fall through case statements, when building with + `-Wextra flag`. Thanks to John Marshall. (#1163) + +* Non-configure builds now define _XOPEN_SOURCE=600 to allow them to work + when the `gcc -std=c99` option is used. Thanks to John Marshall. 
(#1246) + +Bug fixes +--------- + +* Fixed VCF `#CHROM` header parsing to only separate columns at tab characters. + Thanks to Sam Morris for reporting the issue. + (#1237; fixed samtools/bcftools#1408) + +* Fixed a crash reported in `bcf_sr_sort_set`, which expects REF to be present. + (#1204; fixed samtools/bcftools#1361) + +* Fixed a bug in the overlapping logic of mpileup, dealing with iterating over + CIGAR segments. Thanks to `@wulj2` for the analysis. (#1202; fixed #1196) + +* Fixed a tabix bug that prevented setting the correct number of lines to be + skipped in a region file. Thanks to Jim Robinson for reporting it. (#1189; + fixed #1186) + +* Made `bam_itr_next` an alias for `sam_itr_next`, to prevent it from crashing + when working with htsFile pointers. Thanks to Torbjörn Klatt for + reporting it. (#1180; fixed #1179) + +* Fixed once per outgoing multi-threaded block `bgzf_idx_flush` assertion, to + accommodate situations when a single record could span multiple blocks. + Thanks to `@lacek`. (#1168; fixed samtools/samtools#1328) + +* Fixed assumption of pthread_t being a non-structure, as permitted by POSIX. + Thanks also to John Marshall and Anders Kaplan. (#1167, #1153, #1153) + +* Fixed the minimum offset of a BAI index bin, to account for unmapped reads. + Thanks to John Marshall for spotting the issue. (#1158; fixed #1142) + +* Fixed the CRLF handling in `sam_parse_worker` method. Thanks to + Anders Kaplan. (#1149; fixed #1148) + +* Included unistd.h and erro.h directly in HTSlib files, as opposed to + including them indirectly, via third party code. Thanks to + Andrew Patterson (#1143) and John Marshall (#1145). 
+ + Noteworthy changes in release 1.11 (22nd September 2020) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 6038f97e901cc2be832cc7749200faf375b287e4 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 5 Mar 2021 17:46:17 +0000 Subject: [PATCH 116/488] Fix NEWS typo --- NEWS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS b/NEWS index 170bacee6..b70533915 100644 --- a/NEWS +++ b/NEWS @@ -145,7 +145,7 @@ Bug fixes * Fixed the CRLF handling in `sam_parse_worker` method. Thanks to Anders Kaplan. (#1149; fixed #1148) -* Included unistd.h and erro.h directly in HTSlib files, as opposed to +* Included unistd.h and errno.h directly in HTSlib files, as opposed to including them indirectly, via third party code. Thanks to Andrew Patterson (#1143) and John Marshall (#1145). From 061ef366d54b4165777553023a76922b54bf236f Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 11 Mar 2021 10:06:51 +0000 Subject: [PATCH 117/488] Fix a bug where target regions could miss the first record This could happen e.g. with `bcftools view -t` when the last record in a chromosome block has the same coordinate as the first record in the chromosome block that follows.
Fixes https://github.com/samtools/bcftools/issues/1441 --- synced_bcf_reader.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index f51723300..10604b16c 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -614,7 +614,7 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) } reader->nbuffer++; - if ( files->require_index==ALLOW_NO_IDX_ && reader->buffer[reader->nbuffer]->rid != reader->buffer[1]->rid ) break; + if ( reader->buffer[reader->nbuffer]->rid != reader->buffer[1]->rid ) break; if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full } if ( ret<0 ) @@ -638,11 +638,12 @@ static void _reader_shift_buffer(bcf_sr_t *reader) { int i; for (i=2; i<=reader->nbuffer; i++) - if ( reader->buffer[i]->pos!=reader->buffer[1]->pos ) break; + if ( reader->buffer[i]->rid!=reader->buffer[1]->rid || reader->buffer[i]->pos!=reader->buffer[1]->pos ) break; if ( i<=reader->nbuffer ) { // A record with a different position follows, swap it. Because of the reader's logic, // only one such line can be present. + assert( i==reader->nbuffer ); bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[i]; reader->buffer[i] = tmp; reader->nbuffer = 1; } From 41e11f96486189fce8f6c7b8064d83476ab7a2bc Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 12 Mar 2021 11:53:23 +0000 Subject: [PATCH 118/488] NEWS item for #1254 --- NEWS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/NEWS b/NEWS index b70533915..fd240f132 100644 --- a/NEWS +++ b/NEWS @@ -121,6 +121,10 @@ Bug fixes * Fixed a crash reported in `bcf_sr_sort_set`, which expects REF to be present. (#1204; fixed samtools/bcftools#1361) +* Fixed a bcf synced reader bug when filtering with a region list, and + the first record for a chromosome had the same position as the last + record for the previous chromosome. 
(#1254; fixed samtools/bcftools#1441) + * Fixed a bug in the overlapping logic of mpileup, dealing with iterating over CIGAR segments. Thanks to `@wulj2` for the analysis. (#1202; fixed #1196) From 718695931de92f3e5f8659db46d15a16481972a9 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 9 Mar 2021 13:58:37 +0000 Subject: [PATCH 119/488] Improve "CRAM version 4.0 is still draft" wording In particular, shorten to two lines and improve the line break. --- cram/cram_io.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 8099a9287..05f7c7b06 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -5688,10 +5688,9 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { if (major > 3 || (major == 3 && minor > 0)) { hts_log_warning( - "CRAM version %s is still in draft and is subject to\n" - "change. Please consider this a technology demonstration " - "and do not use for\n" - "long term archival of data.", s); + "CRAM version %s is still a draft and subject to change.\n" + "This is a technology demonstration that should not be " + "used for archival data.", s); } fd->version = major*256 + minor; From 385d2489cffaec72768a191bc57c0443905b8fbe Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Fri, 19 Mar 2021 15:41:17 +0000 Subject: [PATCH 120/488] Prevent the corruption of the ID value of a PG line when parsing the header and issue a warning when multiple ID tags are encountered on the same line. This change also enforces consistency across the header API methods, which use the ID tag as an argument, by making them agree to always return the first encountered ID value. 
--- NEWS | 7 +++++++ header.c | 11 ++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 08830f77a..5acdf4877 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,13 @@ Noteworthy changes in release a.b ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Features and Updates +-------------------- + +* In case a PG header line has multiple ID tags supplied by other applications, + the header API now selects the first one encountered as the identifying tag + and issues a warning when detecting subsequent ID tags. + Noteworthy changes in release 1.12 (17th March 2021) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/header.c b/header.c index 148268cc6..1d2fee491 100644 --- a/header.c +++ b/header.c @@ -330,9 +330,14 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs, while (tag) { if (tag->str[0] == 'I' && tag->str[1] == 'D') { - assert(tag->len >= 3); - hrecs->pg[npg].name = tag->str + 3; - hrecs->pg[npg].name_len = tag->len - 3; + /* Avoid duplicate ID tags coming from other applications */ + if (!hrecs->pg[npg].name) { + assert(tag->len >= 3); + hrecs->pg[npg].name = tag->str + 3; + hrecs->pg[npg].name_len = tag->len - 3; + } else { + hts_log_warning("PG line with multiple ID tags. The first encountered was preferred - ID:%s", hrecs->pg[npg].name); + } } else if (tag->str[0] == 'P' && tag->str[1] == 'P') { // Resolve later if needed khint_t k; From 1ee61df502a10daf552418ee974fcf8715f4bd93 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Thu, 1 Apr 2021 08:22:17 +0100 Subject: [PATCH 121/488] Minor corrections in comments. --- htslib/sam.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index 97aa4acf2..c3481808f 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -239,7 +239,7 @@ typedef struct bam1_core_t { See the bam_cigar_* macros for manipulation. 4. seq is nibble-encoded according to bam_nt16_table. See the bam_seqi macro for retrieving individual bases. - 5. 
Per base qualilties are stored in the Phred scale with no +33 offset. + 5. Per base qualities are stored in the Phred scale with no +33 offset. Ie as per the BAM specification and not the SAM ASCII printable method. */ typedef struct bam1_t { @@ -904,7 +904,7 @@ void bam_destroy1(bam1_t *b); if (!recs || !buffer) goto cleanup; for (nrecs = 0; nrecs < MAX_RECS; nrecs++) { - bam_set_mempolicy(BAM_USER_OWNS_STRUCT|BAM_USER_OWNS_DATA); + bam_set_mempolicy(&recs[nrecs], BAM_USER_OWNS_STRUCT|BAM_USER_OWNS_DATA); // Set data pointer to unused part of buffer recs[nrecs].data = &buffer[buff_used]; @@ -1159,7 +1159,7 @@ int sam_idx_init(htsFile *fp, sam_hdr_t *h, int min_shift, const char *fnidx); /// Writes the index initialised with sam_idx_init to disk. /** @param fp File handle for the data file being written. - @return 0 on success, <0 on filaure. + @return 0 on success, <0 on failure. */ HTSLIB_EXPORT int sam_idx_save(htsFile *fp) HTS_RESULT_USED; @@ -1432,7 +1432,7 @@ int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, * @param type Single letter type code: ACcSsIifHZB. * @param tag Tag data pointer, in BAM format * @param end Pointer to end of bam record (largest extent of tag) - * @param ks Kstring to write the formatted tag to + * @param ks kstring to write the formatted tag to * * @return pointer to end of tag on success, * NULL on failure. From f53cc428025cfefa54f5ba6e6d825bf07622c818 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Wed, 31 Mar 2021 16:29:37 +0100 Subject: [PATCH 122/488] Display only the duplicate sample name, not the entire list. 
--- vcf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf.c b/vcf.c index 81fc70c66..464dba772 100644 --- a/vcf.c +++ b/vcf.c @@ -129,7 +129,7 @@ static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len) kh_val(d, k) = bcf_idinfo_def; kh_val(d, k).id = n; } else { - hts_log_error("Duplicated sample name '%s'", s); + hts_log_error("Duplicated sample name '%s'", sdup); free(sdup); return -1; } From 515f6df8f7f7dab6c80d0e7aede6e60826ef5374 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 7 Apr 2021 16:39:59 +0100 Subject: [PATCH 123/488] Remove compressBound assertions. (PR #1258) These trip with zlib-ng, as the worst-case expansion is 9-bits per byte (at level 1 only). The test is done on opening a file, which seems strange. We could do the check on opening for write only, but it seems more productive to error only when we actually can't fit the data rather than just in case it might not fit when given truly random input. Our real data doesn't seem to trigger this. Random data with zlib-ng can cause data expansion beyond 64k. However bgzf_compress already has code to handle uncompressed blocks, so we just fall back to that (thanks to John Marshall for the idea). 
Fixes #1257 --- bgzf.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/bgzf.c b/bgzf.c index b2ee7c869..89f441c1b 100644 --- a/bgzf.c +++ b/bgzf.c @@ -491,7 +491,6 @@ static BGZF *bgzf_write_init(const char *mode) BGZF *bgzf_open(const char *path, const char *mode) { BGZF *fp = 0; - assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); if (strchr(mode, 'r')) { hFILE *fpr; if ((fpr = hopen(path, mode)) == 0) return 0; @@ -514,7 +513,6 @@ BGZF *bgzf_open(const char *path, const char *mode) BGZF *bgzf_dopen(int fd, const char *mode) { BGZF *fp = 0; - assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); if (strchr(mode, 'r')) { hFILE *fpr; if ((fpr = hdopen(fd, mode)) == 0) return 0; @@ -537,7 +535,6 @@ BGZF *bgzf_dopen(int fd, const char *mode) BGZF *bgzf_hopen(hFILE *hfp, const char *mode) { BGZF *fp = NULL; - assert(compressBound(BGZF_BLOCK_SIZE) < BGZF_MAX_BLOCK_SIZE); if (strchr(mode, 'r')) { fp = bgzf_read_init(hfp, NULL); if (fp == NULL) return NULL; @@ -617,6 +614,7 @@ int bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int le uint8_t *dst = (uint8_t*)_dst; if (level == 0) { + uncomp: // Uncompressed data if (*dlen < slen+5 + BLOCK_HEADER_LENGTH + BLOCK_FOOTER_LENGTH) return -1; dst[BLOCK_HEADER_LENGTH] = 1; // BFINAL=1, BTYPE=00; see RFC1951 @@ -638,9 +636,21 @@ int bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int le return -1; } if ((ret = deflate(&zs, Z_FINISH)) != Z_STREAM_END) { - hts_log_error("Deflate operation failed: %s", bgzf_zerr(ret, ret == Z_DATA_ERROR ? &zs : NULL)); + if (ret == Z_OK && zs.avail_out == 0) { + deflateEnd(&zs); + goto uncomp; + } else { + hts_log_error("Deflate operation failed: %s", bgzf_zerr(ret, ret == Z_DATA_ERROR ? &zs : NULL)); + } return -1; } + // If we used up the entire output buffer, then we either ran out of + // room or we *just* fitted, but either way we may as well store + // uncompressed for faster decode. 
+ if (zs.avail_out == 0) { + deflateEnd(&zs); + goto uncomp; + } if ((ret = deflateEnd(&zs)) != Z_OK) { hts_log_error("Call to deflateEnd failed: %s", bgzf_zerr(ret, NULL)); return -1; From 8044091e4a6d5151bf2cd7dd96d241347ceffabe Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sat, 3 Apr 2021 09:34:36 +0100 Subject: [PATCH 124/488] Add libcurl AC_CHECK_HEADER([curl/curl.h]) configure check As for zlib, bz2, lzma, and libdeflate, check that both headers and link library are present. In particular, this aids diagnosis in scenarios in which the compiler is made aware of the library via $CPPFLAGS/$LDFLAGS settings and only one of them is correct. --- configure.ac | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/configure.ac b/configure.ac index 4d777e137..53d03c16d 100644 --- a/configure.ac +++ b/configure.ac @@ -351,15 +351,20 @@ HTSlib.])])])]) libcurl=disabled if test "$enable_libcurl" != no; then - AC_CHECK_LIB([curl], [curl_easy_pause], - [AC_DEFINE([HAVE_LIBCURL], 1, [Define if libcurl file access is enabled.]) - libcurl=enabled], + libcurl_devel=ok + AC_CHECK_HEADER([curl/curl.h], [], [libcurl_devel="headers not found"], [;]) + AC_CHECK_LIB([curl], [curl_easy_pause], [:], [AC_CHECK_LIB([curl], [curl_easy_init], - [message="library is too old (7.18+ required)"], - [message="library not found"]) - case "$enable_libcurl" in - check) AC_MSG_WARN([libcurl not enabled: $message]) ;; - *) MSG_ERROR([libcurl $message + [libcurl_devel="library is too old (7.18+ required)"], + [libcurl_devel="library not found"])]) + + if test "$libcurl_devel" = ok; then + AC_DEFINE([HAVE_LIBCURL], 1, [Define if libcurl file access is enabled.]) + libcurl=enabled + elif test "$enable_libcurl" = check; then + AC_MSG_WARN([libcurl not enabled: $libcurl_devel]) + else + MSG_ERROR([libcurl $libcurl_devel Support for HTTPS and other SSL-based URLs requires routines from the libcurl library . 
Building HTSlib with libcurl enabled @@ -369,8 +374,8 @@ Debian or Ubuntu Linux) or libcurl-devel (on RPM-based Linux distributions or Cygwin) is installed. Either configure with --disable-libcurl or resolve this error to build HTSlib.]) - ;; - esac]) + fi + dnl -lcurl is only needed for static linking if hfile_libcurl is not a plugin if test "$libcurl" = enabled ; then if test "$enable_plugins" != yes ; then From 088096f7d20bd36d7c8b4338571055ca94c9eea4 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 5 Apr 2021 22:52:36 +0100 Subject: [PATCH 125/488] Document tri-state (enable/check/disable) configure options As for libdeflate, the (current) default for libcurl is to probe for the library and enable the facility accordingly. Document this, and mention --disable-libcurl. Also mention that --disable-FOO and --without-BAR options are also available but usually unneeded as they just reinforce the default. --- INSTALL | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/INSTALL b/INSTALL index 277be56d2..e8d0bea6b 100644 --- a/INSTALL +++ b/INSTALL @@ -150,7 +150,10 @@ various features and specify further optional external requirements: --enable-libcurl Use libcurl () to implement network access to - remote files via FTP, HTTP, HTTPS, etc. + remote files via FTP, HTTP, HTTPS, etc. By default or with + --enable-libcurl=check, configure will probe for libcurl and include + this functionality if libcurl is available. Use --disable-libcurl + to prevent this. --enable-gcs Implement network access to Google Cloud Storage. By default or with @@ -176,6 +179,12 @@ various features and specify further optional external requirements: By default, ./configure will probe for libdeflate and use it if available. To prevent this, use --without-libdeflate. +Each --enable-FEATURE/--disable-FEATURE/--with-PACKAGE/--without-PACKAGE +option listed also has an opposite, e.g., --without-external-htscodecs +or --disable-plugins. 
However, apart from those options for which the +default is to probe for related facilities, using these opposite options +is mostly unnecessary as they just select the default configure behaviour. + The configure script also accepts the usual options and environment variables for tuning installation locations and compilers: type './configure --help' for details. For example, From a680900bb277cb36d10325a09c99af5d1cf3c7c8 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Fri, 9 Apr 2021 08:03:40 +0100 Subject: [PATCH 126/488] Do not download the index file by default, when reading the VCF header. --- vcf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf.c b/vcf.c index 464dba772..9b9e4d36a 100644 --- a/vcf.c +++ b/vcf.c @@ -1916,7 +1916,7 @@ bcf_hdr_t *vcf_hdr_read(htsFile *fp) if ( bcf_hdr_parse(h, txt.s) < 0 ) goto error; // check tabix index, are all contigs listed in the header? add the missing ones - idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL); + idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SILENT_FAIL); if ( idx ) { int i, n, need_sync = 0; From 83fd8374ddb174bec91be3041d272639b6bd4192 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Mon, 12 Apr 2021 10:37:56 +0100 Subject: [PATCH 127/488] Add citation section. --- README | 22 ++++++++++++++++++++++ README.md | 26 ++++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/README b/README index 4225bec20..db368afdf 100644 --- a/README +++ b/README @@ -3,3 +3,25 @@ formats, such as SAM, CRAM, VCF, and BCF, used for high-throughput sequencing data. It is the core library used by samtools and bcftools. See INSTALL for building and installation instructions. 
+ +Please cite this paper when using HTSlib for your publications: + +HTSlib: C library for reading/writing high-throughput sequencing data +James K Bonfield, John Marshall, Petr Danecek, Heng Li, Valeriu Ohan, Andrew Whitwham, Thomas Keane, Robert M Davies +GigaScience, Volume 10, Issue 2, February 2021, giab007, https://doi.org/10.1093/gigascience/giab007 + +@article{10.1093/gigascience/giab007, + author = {Bonfield, James K and Marshall, John and Danecek, Petr and Li, Heng and Ohan, Valeriu and Whitwham, Andrew and Keane, Thomas and Davies, Robert M}, + title = "{HTSlib: C library for reading/writing high-throughput sequencing data}", + journal = {GigaScience}, + volume = {10}, + number = {2}, + year = {2021}, + month = {02}, + abstract = "{Since the original publication of the VCF and SAM formats, an explosion of software tools have been created to process these data files. To facilitate this a library was produced out of the original SAMtools implementation, with a focus on performance and robustness. The file formats themselves have become international standards under the jurisdiction of the Global Alliance for Genomics and Health.We present a software library for providing programmatic access to sequencing alignment and variant formats. It was born out of the widely used SAMtools and BCFtools applications. Considerable improvements have been made to the original code plus many new features including newer access protocols, the addition of the CRAM file format, better indexing and iterators, and better use of threading.Since the original Samtools release, performance has been considerably improved, with a BAM read-write loop running 5 times faster and BAM to SAM conversion 13 times faster (both using 16 threads, compared to Samtools 0.1.19). Widespread adoption has seen HTSlib downloaded \\>1 million times from GitHub and conda. 
The C library has been used directly by an estimated 900 GitHub projects and has been incorporated into Perl, Python, Rust, and R, significantly expanding the number of uses via other languages. HTSlib is open source and is freely available from htslib.org under MIT/BSD license.}", + issn = {2047-217X}, + doi = {10.1093/gigascience/giab007}, + url = {https://doi.org/10.1093/gigascience/giab007}, + note = {giab007}, + eprint = {https://academic.oup.com/gigascience/article-pdf/10/2/giab007/36332285/giab007.pdf}, +} diff --git a/README.md b/README.md index fccfa7775..f90f2a30f 100644 --- a/README.md +++ b/README.md @@ -35,3 +35,29 @@ make install ``` [download]: http://www.htslib.org/download/ + +### Citing + +Please cite this paper when using HTSlib for your publications. + +> HTSlib: C library for reading/writing high-throughput sequencing data
+> James K Bonfield, John Marshall, Petr Danecek, Heng Li, Valeriu Ohan, Andrew Whitwham, Thomas Keane, Robert M Davies
+> _GigaScience_, Volume 10, Issue 2, February 2021, giab007, https://doi.org/10.1093/gigascience/giab007 + +``` +@article{10.1093/gigascience/giab007, + author = {Bonfield, James K and Marshall, John and Danecek, Petr and Li, Heng and Ohan, Valeriu and Whitwham, Andrew and Keane, Thomas and Davies, Robert M}, + title = "{HTSlib: C library for reading/writing high-throughput sequencing data}", + journal = {GigaScience}, + volume = {10}, + number = {2}, + year = {2021}, + month = {02}, + abstract = "{Since the original publication of the VCF and SAM formats, an explosion of software tools have been created to process these data files. To facilitate this a library was produced out of the original SAMtools implementation, with a focus on performance and robustness. The file formats themselves have become international standards under the jurisdiction of the Global Alliance for Genomics and Health.We present a software library for providing programmatic access to sequencing alignment and variant formats. It was born out of the widely used SAMtools and BCFtools applications. Considerable improvements have been made to the original code plus many new features including newer access protocols, the addition of the CRAM file format, better indexing and iterators, and better use of threading.Since the original Samtools release, performance has been considerably improved, with a BAM read-write loop running 5 times faster and BAM to SAM conversion 13 times faster (both using 16 threads, compared to Samtools 0.1.19). Widespread adoption has seen HTSlib downloaded \\>1 million times from GitHub and conda. The C library has been used directly by an estimated 900 GitHub projects and has been incorporated into Perl, Python, Rust, and R, significantly expanding the number of uses via other languages. 
HTSlib is open source and is freely available from htslib.org under MIT/BSD license.}", + issn = {2047-217X}, + doi = {10.1093/gigascience/giab007}, + url = {https://doi.org/10.1093/gigascience/giab007}, + note = {giab007}, + eprint = {https://academic.oup.com/gigascience/article-pdf/10/2/giab007/36332285/giab007.pdf}, +} +``` From c3ba3026d6cebc30bb6c50e6171473db3cd8d445 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 14 Apr 2021 17:29:57 +0100 Subject: [PATCH 128/488] Update htscodecs to fix uninitialised access in name tokeniser Credit to OSS-Fuzz Fixes oss-fuzz 32688 --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 30bc9fdca..d7e357946 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 30bc9fdca45e144bd975eb2a2563c1cac43c2ec5 +Subproject commit d7e357946ead219b81cc1becbe0de8a99d96ca84 From d7308a3b0aa01382ef25f86cea6ef3950eb4680c Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 1 Oct 2020 16:16:18 +0100 Subject: [PATCH 129/488] Add fastq support, to read directly into unmapped BAM. /1 and /2 suffixes on read names can be spotted and turned into READ1/READ2 and PAIRED flags. With the FASTQ_OPT_CASAVA option it can also extract this data from CASAVA tags, along with setting the BC tag. Similarly the FASTQ_OPT_AUX option will extract SAM aux data in the fastq comment fields (silently glossing over non SAM formatted data). Possibly we should break out the aux tag parsing from sam_parse1 and use the shared code so we gain all the tag types, but this would need a flag to permit glossing over non aux tag strings in there too. 
--- Makefile | 1 + hts.c | 20 ++- htslib/hts.h | 4 + sam.c | 214 ++++++++++++++++++++++++++++++- sam_internal.h | 4 + test/fastq/fastq.tst | 58 +++++++++ test/fastq/filter_casava.fq | 16 +++ test/fastq/filter_casava.sam | 4 + test/fastq/inter_aux.sam | 10 ++ test/fastq/inter_casava.sam | 10 ++ test/fastq/inter_noaux.sam | 10 ++ test/fastq/interleaved.fq | 40 ++++++ test/fastq/interleaved_casava.fq | 40 ++++++ test/fastq/r1.fq | 20 +++ test/fastq/r1.sam | 5 + test/fastq/r2.fq | 20 +++ test/fastq/r2.sam | 5 + test/fastq/single.fq | 20 +++ test/fastq/single_aux.sam | 5 + test/fastq/single_noaux.sam | 5 + test/fastq/test-fastq.sh | 31 +++++ 21 files changed, 537 insertions(+), 5 deletions(-) create mode 100644 test/fastq/fastq.tst create mode 100644 test/fastq/filter_casava.fq create mode 100644 test/fastq/filter_casava.sam create mode 100644 test/fastq/inter_aux.sam create mode 100644 test/fastq/inter_casava.sam create mode 100644 test/fastq/inter_noaux.sam create mode 100644 test/fastq/interleaved.fq create mode 100644 test/fastq/interleaved_casava.fq create mode 100644 test/fastq/r1.fq create mode 100644 test/fastq/r1.sam create mode 100644 test/fastq/r2.fq create mode 100644 test/fastq/r2.sam create mode 100644 test/fastq/single.fq create mode 100644 test/fastq/single_aux.sam create mode 100644 test/fastq/single_noaux.sam create mode 100755 test/fastq/test-fastq.sh diff --git a/Makefile b/Makefile index f0a70da88..e69ae7320 100644 --- a/Makefile +++ b/Makefile @@ -490,6 +490,7 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) $(HTSCODEC cd test/sam_filter && ./filter.sh filter.tst cd test/tabix && ./test-tabix.sh tabix.tst cd test/mpileup && ./test-pileup.sh mpileup.tst + cd test/fastq && ./test-fastq.sh REF_PATH=: test/sam test/ce.fa test/faidx.fa test/fastqs.fq test/test-regidx cd test && REF_PATH=: ./test.pl $${TEST_OPTS:-} diff --git a/hts.c b/hts.c index 7842d92f3..a26568488 100644 --- a/hts.c +++ b/hts.c @@ -1017,6 +1017,14 @@ int 
hts_opt_add(hts_opt **opts, const char *c_arg) { strcmp(o->arg, "FILTER") == 0) o->opt = HTS_OPT_FILTER, o->val.s = val; + else if (strcmp(o->arg, "fastq_aux") == 0 || + strcmp(o->arg, "FASTQ_AUX") == 0) + o->opt = FASTQ_OPT_AUX, o->val.i = 1; + + else if (strcmp(o->arg, "fastq_casava") == 0 || + strcmp(o->arg, "FASTQ_CASAVA") == 0) + o->opt = FASTQ_OPT_CASAVA, o->val.i = 1; + else { hts_log_error("Unknown option '%s'", o->arg); free(o->arg); @@ -1379,7 +1387,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) int hts_close(htsFile *fp) { - int ret, save; + int ret = 0, save; switch (fp->format.format) { case binary_format: @@ -1409,7 +1417,10 @@ int hts_close(htsFile *fp) case fastq_format: case sam: case vcf: - ret = sam_state_destroy(fp); + if (fp->format.format == sam) + ret = sam_state_destroy(fp); + else if (fp->format.format == fastq_format) + fastq_state_destroy(fp); if (fp->format.compression != no_compression) ret |= bgzf_close(fp->fp.bgzf); @@ -1562,6 +1573,11 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { break; } + case FASTQ_OPT_CASAVA: + case FASTQ_OPT_AUX: + fastq_state_set(fp, opt); + return 0; + default: break; } diff --git a/htslib/hts.h b/htslib/hts.h index 4b1a414e4..4a05c9eea 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -329,6 +329,10 @@ enum hts_fmt_option { HTS_OPT_BLOCK_SIZE, HTS_OPT_FILTER, HTS_OPT_PROFILE, + + // Fastq + FASTQ_OPT_CASAVA = 1000, + FASTQ_OPT_AUX, }; // Profile options for encoding; primarily used at present in CRAM diff --git a/sam.c b/sam.c index 8bda92384..a0f5ba3e8 100644 --- a/sam.c +++ b/sam.c @@ -55,6 +55,20 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/khash.h" KHASH_DECLARE(s2i, kh_cstr_t, int64_t) +// Surprising though it is, we've apparently never needed to read data +// directly from htsFile before! 
Everything else that supports both +// compressed and uncompressed data instead bypasses htsFile and goes +// direct to bgzf, but hts_open doesn't use the bgzf layer for +// uncompressed data so we have to switch functions on the fly instead. +static size_t hts_read(htsFile *fp, void *data, size_t length) { + return fp->is_bgzf + ? bgzf_read(fp->fp.bgzf, data, length) + : hread(fp->fp.hfile, data, length); +} + +#include "htslib/kseq.h" +KSEQ_INIT(htsFile*, hts_read) + #ifndef EFTYPE #define EFTYPE ENOEXEC #endif @@ -2027,6 +2041,9 @@ sam_hdr_t *sam_hdr_read(htsFile *fp) case sam: return sam_hdr_create(fp); + case fastq_format: + return sam_hdr_init(); + case empty_format: errno = EPIPE; return NULL; @@ -2044,9 +2061,6 @@ int sam_hdr_write(htsFile *fp, const sam_hdr_t *h) return -1; } - if (!h->hrecs && !h->text) - return 0; - switch (fp->format.format) { case binary_format: fp->format.category = sequence_data; @@ -2070,6 +2084,8 @@ int sam_hdr_write(htsFile *fp, const sam_hdr_t *h) fp->format.format = sam; /* fall-through */ case sam: { + if (!h->hrecs && !h->text) + return 0; char *text; kstring_t hdr_ks = { 0, 0, NULL }; size_t l_text; @@ -3488,6 +3504,184 @@ int sam_set_threads(htsFile *fp, int nthreads) { return 0; } +typedef struct { + kseq_t *seq; + int parse_casava; + int parse_aux; +} fastq_state; + +void fastq_state_set(samFile *fp, enum hts_fmt_option opt) { + if (!fp) + return; + if (!fp->state) + if (!(fp->state = calloc(1, sizeof(fastq_state)))) + return; + + fastq_state *x = (fastq_state *)fp->state; + + switch (opt) { + case FASTQ_OPT_CASAVA: + x->parse_casava = 1; + break; + + case FASTQ_OPT_AUX: + x->parse_aux = 1; + break; + + default: + break; + } +} + +int fastq_parse1(htsFile *fp, bam1_t *b) { + fastq_state *x = (fastq_state *)fp->state; + kseq_t *seq = x->seq; + int i, l = kseq_read(seq); + int ret = 0; + + if (l <= 0) { + ret = l; + goto err; + } + + // Decr qual + for (i = 0; i < seq->qual.l; i++) + seq->qual.s[i] -= '!'; + + int flag = 
BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED; + if (seq->name.l > 2 && + seq->name.s[seq->name.l-2] == '/' && + isdigit(seq->name.s[seq->name.l-1])) { + switch(seq->name.s[seq->name.l-1]) { + case '1': flag |= BAM_FREAD1 | pflag; break; + case '2': flag |= BAM_FREAD2 | pflag; break; + default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break; + } + seq->name.s[seq->name.l-=2] = 0; + } + + // Convert to BAM + ret = bam_set1(b, + seq->name.l, seq->name.s, + flag, + -1, -1, 0, // ref '*', pos, mapq, + 0, NULL, // no cigar, + -1, -1, 0, // mate + seq->seq.l, seq->seq.s, seq->qual.s, + 0); + + // FIXME: this code could be more efficient by appending all tags + // to a kstring and then supplying that as extra_len to bam_construct_seq + // and a memcpy. That avoids needless reallocs. + // + // The use of bam_aux_update_* is inefficient too as we don't need to + // be checking what tags we've already set, but we lack an alternative API. + + // Identify Illumina CASAVA strings. + // ::: + char *barcode = NULL; + int barcode_len = 0; + kstring_t *kc = &seq->comment; + if (x->parse_casava && + kc->l > 6 && (kc->s[1] | kc->s[3] | kc->s[5]) == ':' && + isdigit(kc->s[0]) && isdigit(kc->s[4])) { + + // read num + switch(kc->s[0]) { + case '1': b->core.flag |= BAM_FREAD1 | pflag; break; + case '2': b->core.flag |= BAM_FREAD2 | pflag; break; + default : b->core.flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break; + } + + if (kc->s[2] == 'Y') + b->core.flag |= BAM_FQCFAIL; + + // Barcode + barcode = kc->s + 6; + for (i = 6; i < kc->l; i++) + if (isspace(kc->s[i])) + break; + + kc->s[i] = 0; + barcode_len = i+1-(barcode - kc->s); + } + + if (ret >= 0 && barcode_len) + if (bam_aux_append(b, "BC", 'Z', barcode_len, (uint8_t *)barcode) < 0) + ret = -1; + + if (!x->parse_aux) + return ret; + + // Identify any SAM style aux tags in comments too. 
+ i = barcode_len; + + do { + int j; + while (i < kc->l && isspace(kc->s[i])) + i++; + + if (i+5 /*XX:Z:*/ < kc->l) { + if ((kc->s[i+2] | kc->s[i+4]) == ':' && + isalnum(kc->s[i]) && isalnum(kc->s[i+1])) { + switch (kc->s[i+3]) { + case 'Z': + j = i+5; + while (j < kc->l && kc->s[j++] > '\t') + ; + if (j < kc->l) + kc->s[j-1]=0; // tab to nul + bam_aux_update_str(b, &kc->s[i], j - (i+5), &kc->s[i+5]); + break; + + case 'i': { + char *end; + int err = 0; + uint64_t i64 = hts_str2int(&kc->s[i+5], &end, 63, &err); + if (!err) + bam_aux_update_int(b, &kc->s[i], i64); + j = end - kc->s; + break; + } + + case 'f': { + char *end; + int err = 0; + double d = hts_str2dbl(&kc->s[i+5], &end, &err); + if (!err) + bam_aux_update_float(b, &kc->s[i], d); + j = end - kc->s; + break; + } + + default: + j = i+5; + while (j < kc->l && kc->s[j++] != '\t') + ; + break; + } + i = j; + } else { + while (i < kc->l && !isspace(kc->s[i])) + i++; + } + } else { + break; + } + + } while(i < kc->l); + + err: + return ret; +} + +void fastq_state_destroy(htsFile *fp) { + if (fp->state) { + kseq_destroy(((fastq_state *)fp->state)->seq); + free(fp->state); + } +} + // Internal component of sam_read1 below static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { int ret = bam_read1(fp->fp.bgzf, b); @@ -3628,6 +3822,20 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) ret = sam_read1_sam(fp, h, b); break; + case fastq_format: { + fastq_state *x = (fastq_state *)fp->state; + if (!x) { + if (!(x = calloc(1, sizeof(fastq_state)))) + return -2; + fp->state = x; + } + + if (!x->seq) + x->seq = kseq_init(fp); + + return fastq_parse1(fp, b); + } + case empty_format: errno = EPIPE; return -3; diff --git a/sam_internal.h b/sam_internal.h index 630b57003..e15cbe7e3 100644 --- a/sam_internal.h +++ b/sam_internal.h @@ -36,6 +36,10 @@ int sam_state_destroy(samFile *fp); int sam_set_thread_pool(htsFile *fp, htsThreadPool *p); int sam_set_threads(htsFile *fp, int nthreads); +// Fastq state 
+void fastq_state_set(samFile *fp, enum hts_fmt_option opt); +void fastq_state_destroy(samFile *fp); + // bam1_t data (re)allocation int sam_realloc_bam_data(bam1_t *b, size_t desired); diff --git a/test/fastq/fastq.tst b/test/fastq/fastq.tst new file mode 100644 index 000000000..185db37dd --- /dev/null +++ b/test/fastq/fastq.tst @@ -0,0 +1,58 @@ +# Copyright (C) 2020 Genome Research Ltd. +# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# First field: +# INIT = initialisation, not counted in testing +# P = expected to pass (zero return; expected output matches, if present) +# N = expected to return non-zero +# F = expected to fail +# +# Second field (P/N/F only): +# Filename of expected output. If '.', output is not checked +# +# Rest: +# Command to execute. 
$pileup is replaced with the path to the pileup test +# program + +# Single file, unpaired data, with / without aux tags +P single_noaux.sam $tview single.fq +P single_aux.sam $tview -i fastq_aux single.fq + +# Single file, interleaved paired data, no aux +P inter_noaux.sam $tview interleaved.fq + +# Single file, interleaved paired data, with aux +P inter_aux.sam $tview -i fastq_aux interleaved.fq + +# Single file, interleaved paired data, using CASAVA +P inter_casava.sam $tview -i fastq_casava interleaved_casava.fq + +# CASAVA with filtering +P filter_casava.sam $tview -i fastq_casava filter_casava.fq + + +# Paired data is mainly tested by the Samtools test harness. +# Basically though it's just reading two files and relying on either +# this code or explicit overloading of READ1/READ2. +# We simply test here we can read r1 and r2 as separate files +P r1.sam $tview -i fastq_aux r1.fq +P r2.sam $tview -i fastq_aux r2.fq diff --git a/test/fastq/filter_casava.fq b/test/fastq/filter_casava.fq new file mode 100644 index 000000000..3f69a7702 --- /dev/null +++ b/test/fastq/filter_casava.fq @@ -0,0 +1,16 @@ +@HS25_09827:2:1201:1505:59795#49 1:N:0:NGTCTATC +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG ++ +CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE +@HS25_09827:2:1201:1559:70726#49 2:Y:0:NGTCTATC +TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ++ +;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? 
diff --git a/test/fastq/filter_casava.sam b/test/fastq/filter_casava.sam new file mode 100644 index 000000000..9a278537f --- /dev/null +++ b/test/fastq/filter_casava.sam @@ -0,0 +1,4 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE BC:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 653 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? BC:Z:NGTCTATC diff --git a/test/fastq/inter_aux.sam b/test/fastq/inter_aux.sam new file mode 100644 index 000000000..91e4acaca --- /dev/null +++ b/test/fastq/inter_aux.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DBDFD +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? 
RG:Z:1#49 +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DDDFE +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG RG:Z:1#49 +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG RG:Z:1#49 +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG RG:Z:1#49 diff --git a/test/fastq/inter_casava.sam b/test/fastq/inter_casava.sam new file mode 100644 index 000000000..823d78b19 --- /dev/null +++ b/test/fastq/inter_casava.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE BC:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? BC:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E BC:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG BC:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE BC:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG BC:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF BC:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 
TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG BC:Z:NGTCTATC diff --git a/test/fastq/inter_noaux.sam b/test/fastq/inter_noaux.sam new file mode 100644 index 000000000..bc7edff0e --- /dev/null +++ b/test/fastq/inter_noaux.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA 
;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG diff --git a/test/fastq/interleaved.fq b/test/fastq/interleaved.fq new file mode 100644 index 000000000..5aa88e129 --- /dev/null +++ b/test/fastq/interleaved.fq @@ -0,0 +1,40 @@ +@HS25_09827:2:1201:1505:59795#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=BDDDF +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG ++ +CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE +@HS25_09827:2:1201:1559:70726#49/2 RG:Z:1#49 +TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ++ +;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? 
+@HS25_09827:2:1201:1564:39627#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DDDFE +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA ++ +BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +@HS25_09827:2:1201:1564:39627#49/2 RG:Z:1#49 +AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ++ +;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG +@HS25_09827:2:1201:1565:91731#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA ++ +CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE +@HS25_09827:2:1201:1565:91731#49/2 RG:Z:1#49 +AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ++ +;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG +@HS25_09827:2:1201:1624:69925#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA ++ +CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF +@HS25_09827:2:1201:1624:69925#49/2 RG:Z:1#49 +TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ++ +;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG diff --git a/test/fastq/interleaved_casava.fq b/test/fastq/interleaved_casava.fq new file mode 100644 index 000000000..0e0e1ac9c --- /dev/null +++ b/test/fastq/interleaved_casava.fq @@ -0,0 +1,40 @@ +@HS25_09827:2:1201:1505:59795#49 1:N:0:NGTCTATC +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG ++ 
+CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE +@HS25_09827:2:1201:1559:70726#49 2:N:0:NGTCTATC +TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ++ +;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? +@HS25_09827:2:1201:1564:39627#49 1:N:0:NGTCTATC +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA ++ +BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +@HS25_09827:2:1201:1564:39627#49 2:N:0:NGTCTATC +AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ++ +;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG +@HS25_09827:2:1201:1565:91731#49 1:N:0:NGTCTATC +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA ++ +CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE +@HS25_09827:2:1201:1565:91731#49 2:N:0:NGTCTATC +AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ++ +;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG +@HS25_09827:2:1201:1624:69925#49 1:N:0:NGTCTATC +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA ++ +CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF +@HS25_09827:2:1201:1624:69925#49 2:N:0:NGTCTATC +TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ++ +;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG diff --git 
a/test/fastq/r1.fq b/test/fastq/r1.fq new file mode 100644 index 000000000..3728b9a98 --- /dev/null +++ b/test/fastq/r1.fq @@ -0,0 +1,20 @@ +@HS25_09827:2:1201:1505:59795#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=BDDDF +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG ++ +CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGEFCFHGGGHEJEAJE +@HS25_09827:2:1201:1564:39627#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DDDFE +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA ++ +BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +@HS25_09827:2:1201:1565:91731#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA ++ +CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE +@HS25_09827:2:1201:1624:69925#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA ++ +CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF diff --git a/test/fastq/r1.sam b/test/fastq/r1.sam new file mode 100644 index 000000000..ffc06191a --- /dev/null +++ b/test/fastq/r1.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGEFCFHGGGHEJEAJE RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DBDFD +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E RG:Z:1#49 BC:Z:NGTCTATC 
QT:Z:!1:DDDFE +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB diff --git a/test/fastq/r2.fq b/test/fastq/r2.fq new file mode 100644 index 000000000..a766bf9d5 --- /dev/null +++ b/test/fastq/r2.fq @@ -0,0 +1,20 @@ +@HS25_09827:2:1201:1505:59795#49/2 RG:Z:1#49 +AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT ++ +DCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHGCFCFFGGCEBF? 
+@HS25_09827:2:1201:1564:39627#49/2 RG:Z:1#49 +AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ++ +;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG +@HS25_09827:2:1201:1565:91731#49/2 RG:Z:1#49 +AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ++ +;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG +@HS25_09827:2:1201:1624:69925#49/2 RG:Z:1#49 +TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ++ +;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG diff --git a/test/fastq/r2.sam b/test/fastq/r2.sam new file mode 100644 index 000000000..f75edc9d1 --- /dev/null +++ b/test/fastq/r2.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 141 * 0 0 * * 0 0 AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT DCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHGCFCFFGGCEBF? 
RG:Z:1#49 +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG RG:Z:1#49 +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG RG:Z:1#49 +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG RG:Z:1#49 diff --git a/test/fastq/single.fq b/test/fastq/single.fq new file mode 100644 index 000000000..c6ad15dd6 --- /dev/null +++ b/test/fastq/single.fq @@ -0,0 +1,20 @@ +@HS25_09827:2:1201:1505:59795#49 RG:Z:1#49 +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG ++ +CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGEFCFHGGGHEJEAJE +@HS25_09827:2:1201:1564:39627#49 RG:Z:1#49 +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA ++ +BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +@HS25_09827:2:1201:1565:91731#49 RG:Z:1#49 +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA ++ +CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE +@HS25_09827:2:1201:1624:69925#49 RG:Z:1#49 +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA ++ +CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF diff --git 
a/test/fastq/single_aux.sam b/test/fastq/single_aux.sam new file mode 100644 index 000000000..bfe355e0f --- /dev/null +++ b/test/fastq/single_aux.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGEFCFHGGGHEJEAJE RG:Z:1#49 +HS25_09827:2:1201:1564:39627#49 4 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E RG:Z:1#49 +HS25_09827:2:1201:1565:91731#49 4 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE RG:Z:1#49 +HS25_09827:2:1201:1624:69925#49 4 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF RG:Z:1#49 diff --git a/test/fastq/single_noaux.sam b/test/fastq/single_noaux.sam new file mode 100644 index 000000000..9a98989e8 --- /dev/null +++ b/test/fastq/single_noaux.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGEFCFHGGGHEJEAJE +HS25_09827:2:1201:1564:39627#49 4 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +HS25_09827:2:1201:1565:91731#49 4 * 0 0 * * 0 0 
GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE +HS25_09827:2:1201:1624:69925#49 4 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF diff --git a/test/fastq/test-fastq.sh b/test/fastq/test-fastq.sh new file mode 100755 index 000000000..6362c7866 --- /dev/null +++ b/test/fastq/test-fastq.sh @@ -0,0 +1,31 @@ +#!/bin/sh +# +# Copyright (C) 2020 Genome Research Ltd. +# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +. ../simple_test_driver.sh + +echo "Testing fastq support" + +tview=../test_view +test_driver fastq.tst +exit $? 
From 3a56a8f167238eeea9a781cb95690950f6910263 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 13 Oct 2020 09:55:45 +0100 Subject: [PATCH 130/488] FASTQ support code merged from draft in PR #1157 This contains a subset of the code in #1157, as a basis for adding fastq write support onto the existing read support. It also moves the aux tag parsing code into a new function so it can be used by both the fastq and sam parsers. Co-authored-by: James Bonfield --- hts.c | 15 +- htslib/hts.h | 3 +- sam.c | 362 +++++++++++++++++++++++++++-------------------- test/test_view.c | 6 +- 4 files changed, 231 insertions(+), 155 deletions(-) diff --git a/hts.c b/hts.c index a26568488..82bbcee68 100644 --- a/hts.c +++ b/hts.c @@ -742,7 +742,8 @@ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) htsFile *fp = NULL; hFILE *hfile = NULL; char fmt_code = '\0'; - const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0"; + // see enum htsExactFormat in htslib/hts.h + const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0\0\0\0\0f\0\0"; strncpy(smode, mode, 99); smode[99]=0; @@ -1149,7 +1150,7 @@ int hts_parse_format(htsFormat *format, const char *str) { if (strcmp(fmt, "sam") == 0) { format->category = sequence_data; format->format = sam; - format->compression = no_compression;; + format->compression = no_compression; format->compression_level = 0; } else if (strcmp(fmt, "sam.gz") == 0) { format->category = sequence_data; @@ -1169,13 +1170,18 @@ int hts_parse_format(htsFormat *format, const char *str) { } else if (strcmp(fmt, "vcf") == 0) { format->category = variant_data; format->format = vcf; - format->compression = no_compression;; + format->compression = no_compression; format->compression_level = 0; } else if (strcmp(fmt, "bcf") == 0) { format->category = variant_data; format->format = bcf; format->compression = bgzf; format->compression_level = -1; + } else if (strcmp(fmt, "fastq") == 0 || strcmp(fmt, "fq") == 0) { + format->category = 
sequence_data; + format->format = fastq_format; + format->compression = no_compression; + format->compression_level = 0; } else { return -1; } @@ -1300,6 +1306,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) if (strchr(simple_mode, 'b')) fmt->format = binary_format; else if (strchr(simple_mode, 'c')) fmt->format = cram; + else if (strchr(simple_mode, 'f')) fmt->format = fastq_format; else fmt->format = text_format; if (strchr(simple_mode, 'z')) fmt->compression = bgzf; @@ -1310,6 +1317,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) switch (fmt->format) { case binary_format: fmt->compression = bgzf; break; case cram: fmt->compression = custom; break; + case fastq_format: fmt->compression = no_compression; break; case text_format: fmt->compression = no_compression; break; default: abort(); } @@ -1482,6 +1490,7 @@ static hFILE *hts_hfile(htsFile *fp) { case cram: return cram_hfile(fp->fp.cram); case text_format: return fp->fp.hfile; case vcf: // fall through + case fastq_format: // fall through case sam: return fp->format.compression != no_compression ? bgzf_hfile(fp->fp.bgzf) : fp->fp.hfile; diff --git a/htslib/hts.h b/htslib/hts.h index 4a05c9eea..b56045f12 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -519,7 +519,7 @@ char *hts_format_description(const htsFormat *format); @param fn The file name or "-" for stdin/stdout. 
For indexed files with a non-standard naming, the file name can include the name of the index file delimited with HTS_IDX_DELIM - @param mode Mode matching / [rwa][bceguxz0-9]* / + @param mode Mode matching / [rwa][bcefguxz0-9]* / @discussion With 'r' opens for reading; any further format mode letters are ignored as the format is detected by checking the first few bytes or BGZF blocks @@ -527,6 +527,7 @@ char *hts_format_description(const htsFormat *format); specifier letters: b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc) c CRAM format + f FASTQ format g gzip compressed u uncompressed z bgzf compressed diff --git a/sam.c b/sam.c index a0f5ba3e8..3e94b40e1 100644 --- a/sam.c +++ b/sam.c @@ -2146,6 +2146,10 @@ int sam_hdr_write(htsFile *fp, const sam_hdr_t *h) } break; + case fastq_format: + // Nothing to output; FASTQ has no file headers. + break; + default: errno = EBADF; return -1; @@ -2403,6 +2407,142 @@ static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) { } } +// Parse tag line and append to bam object b. +// Shared by both SAM and FASTQ parsers. +// +// The difference between the two is how lenient we are to recognising +// non-compliant strings. The FASTQ parser glosses over arbitrary +// non-SAM looking strings. +static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient) { + int overflow = 0; + char logbuf[40]; + char *q = start, *p = end; + +#define _parse_err(cond, ...) \ + do { \ + if (cond) { \ + if (lenient) { \ + while (q < p && !isspace(*q)) \ + q++; \ + while (q < p && isspace(*q)) \ + q++; \ + goto loop; \ + } else { \ + hts_log_error(__VA_ARGS__); \ + goto err_ret; \ + } \ + } \ + } while (0) + + while (q < p) loop: { + char type; + if (p - q < 5) { + if (lenient) { + break; + } else { + hts_log_error("Incomplete aux field"); + goto err_ret; + } + } + _parse_err(q[0] < '!' 
|| q[1] < '!', "invalid aux tag id"); + + if (lenient && (q[2] | q[4]) != ':') { + while (q < p && !isspace(*q)) + q++; + while (q < p && isspace(*q)) + q++; + continue; + } + + // Copy over id + if (possibly_expand_bam_data(b, 2) < 0) goto err_ret; + memcpy(b->data + b->l_data, q, 2); b->l_data += 2; + q += 3; type = *q++; ++q; // q points to value + if (type != 'Z' && type != 'H') // the only zero length acceptable fields + _parse_err(*q <= '\t', "incomplete aux field"); + + // Ensure enough space for a double + type allocated. + if (possibly_expand_bam_data(b, 16) < 0) goto err_ret; + + if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { + b->data[b->l_data++] = 'A'; + b->data[b->l_data++] = *q++; + } else if (type == 'i' || type == 'I') { + if (*q == '-') { + int32_t x = hts_str2int(q, &q, 32, &overflow); + if (x >= INT8_MIN) { + b->data[b->l_data++] = 'c'; + b->data[b->l_data++] = x; + } else if (x >= INT16_MIN) { + b->data[b->l_data++] = 's'; + i16_to_le(x, b->data + b->l_data); + b->l_data += 2; + } else { + b->data[b->l_data++] = 'i'; + i32_to_le(x, b->data + b->l_data); + b->l_data += 4; + } + } else { + uint32_t x = hts_str2uint(q, &q, 32, &overflow); + if (x <= UINT8_MAX) { + b->data[b->l_data++] = 'C'; + b->data[b->l_data++] = x; + } else if (x <= UINT16_MAX) { + b->data[b->l_data++] = 'S'; + u16_to_le(x, b->data + b->l_data); + b->l_data += 2; + } else { + b->data[b->l_data++] = 'I'; + u32_to_le(x, b->data + b->l_data); + b->l_data += 4; + } + } + } else if (type == 'f') { + b->data[b->l_data++] = 'f'; + float_to_le(strtod(q, &q), b->data + b->l_data); + b->l_data += sizeof(float); + } else if (type == 'd') { + b->data[b->l_data++] = 'd'; + double_to_le(strtod(q, &q), b->data + b->l_data); + b->l_data += sizeof(double); + } else if (type == 'Z' || type == 'H') { + char *end = strchr(q, '\t'); + if (!end) end = q + strlen(q); + _parse_err(type == 'H' && ((end-q)&1) != 0, + "hex field does not have an even number of digits"); + 
b->data[b->l_data++] = type; + if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret; + memcpy(b->data + b->l_data, q, end - q); + b->l_data += end - q; + b->data[b->l_data++] = '\0'; + q = end; + } else if (type == 'B') { + uint32_t n; + char *r; + type = *q++; // q points to the first ',' following the typing byte + _parse_err(*q && *q != ',' && *q != '\t', + "B aux field type not followed by ','"); + + for (r = q, n = 0; *r > '\t'; ++r) + if (*r == ',') ++n; + + if (sam_parse_B_vals(type, n, q, &q, r, b) < 0) + goto err_ret; + } else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1)); + + while (*q > '\t') { q++; } // Skip any junk to next tab + q++; + } + + _parse_err(overflow != 0, "numeric value out of allowed range"); +#undef _parse_err + + return 0; + +err_ret: + return -2; +} + int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) { #define _read_token(_p) (_p); do { char *tab = strchr((_p), '\t'); if (!tab) goto err_ret; *tab = '\0'; (_p) = tab + 1; } while (0) @@ -2568,94 +2708,10 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) _parse_err(failed, "invalid QUAL character"); p += c->l_qseq + 1; } - // aux - q = p; - p = s->s + s->l; - while (q < p) { - char type; - _parse_err(p - q < 5, "incomplete aux field"); - _parse_err(q[0] < '!' || q[1] < '!', "invalid aux tag id"); - // Copy over id - if (possibly_expand_bam_data(b, 2) < 0) goto err_ret; - memcpy(b->data + b->l_data, q, 2); b->l_data += 2; - q += 3; type = *q++; ++q; // q points to value - if (type != 'Z' && type != 'H') // the only zero length acceptable fields - _parse_err(*q <= '\t', "incomplete aux field"); - - // Ensure enough space for a double + type allocated. 
- if (possibly_expand_bam_data(b, 16) < 0) goto err_ret; - - if (type == 'A' || type == 'a' || type == 'c' || type == 'C') { - b->data[b->l_data++] = 'A'; - b->data[b->l_data++] = *q++; - } else if (type == 'i' || type == 'I') { - if (*q == '-') { - int32_t x = hts_str2int(q, &q, 32, &overflow); - if (x >= INT8_MIN) { - b->data[b->l_data++] = 'c'; - b->data[b->l_data++] = x; - } else if (x >= INT16_MIN) { - b->data[b->l_data++] = 's'; - i16_to_le(x, b->data + b->l_data); - b->l_data += 2; - } else { - b->data[b->l_data++] = 'i'; - i32_to_le(x, b->data + b->l_data); - b->l_data += 4; - } - } else { - uint32_t x = hts_str2uint(q, &q, 32, &overflow); - if (x <= UINT8_MAX) { - b->data[b->l_data++] = 'C'; - b->data[b->l_data++] = x; - } else if (x <= UINT16_MAX) { - b->data[b->l_data++] = 'S'; - u16_to_le(x, b->data + b->l_data); - b->l_data += 2; - } else { - b->data[b->l_data++] = 'I'; - u32_to_le(x, b->data + b->l_data); - b->l_data += 4; - } - } - } else if (type == 'f') { - b->data[b->l_data++] = 'f'; - float_to_le(strtod(q, &q), b->data + b->l_data); - b->l_data += sizeof(float); - } else if (type == 'd') { - b->data[b->l_data++] = 'd'; - double_to_le(strtod(q, &q), b->data + b->l_data); - b->l_data += sizeof(double); - } else if (type == 'Z' || type == 'H') { - char *end = strchr(q, '\t'); - if (!end) end = q + strlen(q); - _parse_err(type == 'H' && ((end-q)&1) != 0, - "hex field does not have an even number of digits"); - b->data[b->l_data++] = type; - if (possibly_expand_bam_data(b, end - q + 1) < 0) goto err_ret; - memcpy(b->data + b->l_data, q, end - q); - b->l_data += end - q; - b->data[b->l_data++] = '\0'; - q = end; - } else if (type == 'B') { - uint32_t n; - char *r; - type = *q++; // q points to the first ',' following the typing byte - _parse_err(*q && *q != ',' && *q != '\t', - "B aux field type not followed by ','"); - - for (r = q, n = 0; *r > '\t'; ++r) - if (*r == ',') ++n; - - if (sam_parse_B_vals(type, n, q, &q, r, b) < 0) - goto err_ret; - } 
else _parse_err(1, "unrecognized type %s", hts_strprint(logbuf, sizeof logbuf, '\'', &type, 1)); - - while (*q > '\t') { q++; } // Skip any junk to next tab - q++; - } - _parse_err(overflow != 0, "numeric value out of allowed range"); + // aux + if (aux_parse(p, s->s + s->l, b, 0) < 0) + goto err_ret; if (bam_tag2cigar(b, 1, 1) < 0) return -2; @@ -3570,13 +3626,6 @@ int fastq_parse1(htsFile *fp, bam1_t *b) { seq->seq.l, seq->seq.s, seq->qual.s, 0); - // FIXME: this code could be more efficient by appending all tags - // to a kstring and then supplying that as extra_len to bam_construct_seq - // and a memcpy. That avoids needless reallocs. - // - // The use of bam_aux_update_* is inefficient too as we don't need to - // be checking what tags we've already set, but we lack an alternative API. - // Identify Illumina CASAVA strings. // ::: char *barcode = NULL; @@ -3614,62 +3663,8 @@ int fastq_parse1(htsFile *fp, bam1_t *b) { return ret; // Identify any SAM style aux tags in comments too. - i = barcode_len; - - do { - int j; - while (i < kc->l && isspace(kc->s[i])) - i++; - - if (i+5 /*XX:Z:*/ < kc->l) { - if ((kc->s[i+2] | kc->s[i+4]) == ':' && - isalnum(kc->s[i]) && isalnum(kc->s[i+1])) { - switch (kc->s[i+3]) { - case 'Z': - j = i+5; - while (j < kc->l && kc->s[j++] > '\t') - ; - if (j < kc->l) - kc->s[j-1]=0; // tab to nul - bam_aux_update_str(b, &kc->s[i], j - (i+5), &kc->s[i+5]); - break; - - case 'i': { - char *end; - int err = 0; - uint64_t i64 = hts_str2int(&kc->s[i+5], &end, 63, &err); - if (!err) - bam_aux_update_int(b, &kc->s[i], i64); - j = end - kc->s; - break; - } - - case 'f': { - char *end; - int err = 0; - double d = hts_str2dbl(&kc->s[i+5], &end, &err); - if (!err) - bam_aux_update_float(b, &kc->s[i], d); - j = end - kc->s; - break; - } - - default: - j = i+5; - while (j < kc->l && kc->s[j++] != '\t') - ; - break; - } - i = j; - } else { - while (i < kc->l && !isspace(kc->s[i])) - i++; - } - } else { - break; - } - - } while(i < kc->l); + if 
(aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1) < 0) + ret = -1; err: return ret; @@ -3944,6 +3939,64 @@ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) return sam_format1_append(h, b, str); } +int fastq_format1(const bam1_t *b, kstring_t *str) +{ + unsigned flag = b->core.flag; + int i, len = b->core.l_qseq; + uint8_t *seq, *qual; + int mode_slashes = 1, mode_colons = 1, mode_aux = 0; + + str->l = 0; + + if (len == 0) return 0; + + if (kputc('@', str) == EOF || kputs(bam_get_qname(b), str) == EOF) + return -1; + + if (mode_slashes && (flag & BAM_FPAIRED)) { + const char *suffix = (flag & BAM_FREAD1)? "/1" : (flag & BAM_FREAD2)? "/2" : ""; + if (kputs(suffix, str) == EOF) return -1; + } + + if (mode_colons) { + int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 2 : 0; + char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N'; + if (ksprintf(str, " %d:%c:0:0", rnum, filtered) < 0) return -1; + } + + if (mode_aux) { + // ... FIXME + } + + if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1; + + kputc_('\n', str); + + seq = bam_get_seq(b); + if (flag & BAM_FREVERSE) + for (i = len-1; i >= 0; i--) + kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str); + else + for (i = 0; i < len; i++) + kputc_(seq_nt16_str[bam_seqi(seq, i)], str); + + kputsn("\n+\n", 3, str); + + qual = bam_get_qual(b); + if (qual[0] == 0xff) + for (i = 0; i < len; i++) + kputc_('B', str); + else if (flag & BAM_FREVERSE) + for (i = len-1; i >= 0; i--) + kputc_(33 + qual[i], str); + else + for (i = 0; i < len; i++) + kputc_(33 + qual[i], str); + + kputc('\n', str); + return str->l; +} + // Sadly we need to be able to modify the bam_hdr here so we can // reference count the structure. 
int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) @@ -4072,6 +4125,13 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) return fp->line.l; } + + case fastq_format: + if (fastq_format1(b, &fp->line) < 0 || + hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l) + return -1; + return fp->line.l; + default: errno = EBADF; return -1; @@ -4538,6 +4598,8 @@ int sam_open_mode(char *mode, const char *fn, const char *format) else if (strcasecmp(format, "cram") == 0) strcpy(mode, "c"); else if (strcasecmp(format, "sam") == 0) strcpy(mode, ""); else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z"); + else if (strcasecmp(format, "fastq") == 0 || + strcasecmp(format, "fq") == 0) strcpy(mode, "f"); else return -1; return 0; diff --git a/test/test_view.c b/test/test_view.c index dc644610b..547ffc6c4 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -57,6 +57,7 @@ enum test_op { WRITE_CRAM = 8, WRITE_UNCOMPRESSED = 16, WRITE_COMPRESSED = 32, // eg vcf.gz, sam.gz + WRITE_FASTQ = 64, }; int sam_loop(int argc, char **argv, int optind, struct opts *opts, htsFile *in, htsFile *out) { @@ -294,7 +295,7 @@ int main(int argc, char *argv[]) opts.index = NULL; opts.min_shift = 0; - while ((c = getopt(argc, argv, "DSIt:i:bzCul:o:N:BZ:@:Mx:m:p:v")) >= 0) { + while ((c = getopt(argc, argv, "DSIt:i:bzCful:o:N:BZ:@:Mx:m:p:v")) >= 0) { switch (c) { case 'D': opts.flag |= READ_CRAM; break; case 'S': opts.flag |= READ_COMPRESSED; break; @@ -304,6 +305,7 @@ int main(int argc, char *argv[]) case 'b': opts.flag |= WRITE_BINARY_COMP; break; case 'z': opts.flag |= WRITE_COMPRESSED; break; case 'C': opts.flag |= WRITE_CRAM; break; + case 'f': opts.flag |= WRITE_FASTQ; break; case 'u': opts.flag |= WRITE_UNCOMPRESSED; break; // eg u-BAM not SAM case 'l': opts.clevel = atoi(optarg); break; case 'o': if (hts_opt_add(&out_opts, optarg)) return 1; break; @@ -330,6 +332,7 @@ int main(int argc, char *argv[]) fprintf(stderr, "-b: write binary compressed BCF, 
BAM, FAI (mode 'b')\n"); fprintf(stderr, "-z: write text compressed VCF.gz, SAM.gz (mode 'z')\n"); fprintf(stderr, "-C: write CRAM format (mode 'c')\n"); + fprintf(stderr, "-f: write FASTQ format (mode 'f')\n"); fprintf(stderr, "-l 0-9: set zlib compression level\n"); fprintf(stderr, "-o option=value: set an option for CRAM output\n"); fprintf(stderr, "-N: num_reads: limit the output to the first num_reads reads\n"); @@ -361,6 +364,7 @@ int main(int argc, char *argv[]) else if (opts.flag & WRITE_BINARY_COMP) strcat(modew, "b"); else if (opts.flag & WRITE_COMPRESSED) strcat(modew, "z"); else if (opts.flag & WRITE_UNCOMPRESSED) strcat(modew, "bu"); + else if (opts.flag & WRITE_FASTQ) strcat(modew, "f"); out = hts_open(out_fn, modew); if (out == NULL) { fprintf(stderr, "Error opening standard output\n"); From 3bae24ff7b36692c7cc25ed0f0c398e2e01eb520 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 13 Oct 2020 15:23:18 +0100 Subject: [PATCH 131/488] Further work on fastq writing, plus some read updates. It now uses the fastq_state struct to track options and tidied up the code a bit. Also added AUX tag output to fastq_format1 and made both input and output tag lists filterable. By default it's all tags, but it can be specified as a specific comma-separated tag whitelist. This matches the behavior of "samtools fastq -T". Added tests for writing. Replace use of kseq by own dedicated fastq parser (reader) Permit the fastq barcode tag to be adjusted (both reading and writing). This is for both parsing CASAVA tags into aux tags, and back out again. Permit fastq mode 'f' to be used in conjunction with bgzf mode 'z'. FASTQ_OPT_AUX string is now processed with a hand written tokeniser instead of strtok_r. Strtok is just more trouble than it's worth! The input needs to be writeable, meaning we cannot pass in literal strings. This string is now also tokenised to 16-bit integers (2 chars of tag code) rather than as a 2 byte string. 
This means the khash used doesn't require the argument passed in to be resident in memory for the duration (or to have to strdup it itself to compensate for this). Finally the CASAVA tag writing now has a check for non "+" symbols between barcodes. The SAM spec recommends "-", so we spot and alter these. --- hts.c | 42 +++- htslib/hts.h | 2 + sam.c | 327 ++++++++++++++++++++++--------- sam_internal.h | 2 +- test/fastq/fastq.tst | 34 +++- test/fastq/filter_casava.fq | 8 +- test/fastq/inter_casavaOX.sam | 10 + test/fastq/interleaved_casava.fq | 20 +- test/fastq/minimal.fq | 4 + test/fastq/minimal.sam | 1 + test/fastq/multiline.fq | 20 ++ test/fastq/multiline.sam | 2 + test/fastq/test-fastq.sh | 3 +- test/test_view.c | 6 +- 14 files changed, 362 insertions(+), 119 deletions(-) create mode 100644 test/fastq/inter_casavaOX.sam create mode 100644 test/fastq/minimal.fq create mode 100644 test/fastq/minimal.sam create mode 100644 test/fastq/multiline.fq create mode 100644 test/fastq/multiline.sam diff --git a/hts.c b/hts.c index 82bbcee68..02fb800b1 100644 --- a/hts.c +++ b/hts.c @@ -1020,7 +1020,15 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { else if (strcmp(o->arg, "fastq_aux") == 0 || strcmp(o->arg, "FASTQ_AUX") == 0) - o->opt = FASTQ_OPT_AUX, o->val.i = 1; + o->opt = FASTQ_OPT_AUX, o->val.s = val; + + else if (strcmp(o->arg, "fastq_barcode") == 0 || + strcmp(o->arg, "FASTQ_BARCODE") == 0) + o->opt = FASTQ_OPT_BARCODE, o->val.s = val; + + else if (strcmp(o->arg, "fastq_rnum") == 0 || + strcmp(o->arg, "FASTQ_RNUM") == 0) + o->opt = FASTQ_OPT_RNUM, o->val.i = 1; else if (strcmp(o->arg, "fastq_casava") == 0 || strcmp(o->arg, "FASTQ_CASAVA") == 0) @@ -1066,6 +1074,8 @@ int hts_opt_apply(htsFile *fp, hts_opt *opts) { case CRAM_OPT_VERSION: case CRAM_OPT_PREFIX: case HTS_OPT_FILTER: + case FASTQ_OPT_AUX: + case FASTQ_OPT_BARCODE: if (hts_set_opt(fp, opts->opt, opts->val.s) != 0) return -1; break; @@ -1542,6 +1552,31 @@ int hts_set_opt(htsFile *fp, enum 
hts_fmt_option opt, ...) { return 0; } + case FASTQ_OPT_CASAVA: + case FASTQ_OPT_RNUM: + if (fp->format.format == fastq_format) + fastq_state_set(fp, opt); + return 0; + + case FASTQ_OPT_AUX: + if (fp->format.format == fastq_format) { + va_start(args, opt); + char *list = va_arg(args, char *); + va_end(args); + fastq_state_set(fp, opt, list); + } + return 0; + + case FASTQ_OPT_BARCODE: + if (fp->format.format == fastq_format) { + va_start(args, opt); + char *bc = va_arg(args, char *); + va_end(args); + fastq_state_set(fp, opt, bc); + } + return 0; + + // Options below here flow through to cram_set_voption case HTS_OPT_COMPRESSION_LEVEL: { va_start(args, opt); int level = va_arg(args, int); @@ -1582,11 +1617,6 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { break; } - case FASTQ_OPT_CASAVA: - case FASTQ_OPT_AUX: - fastq_state_set(fp, opt); - return 0; - default: break; } diff --git a/htslib/hts.h b/htslib/hts.h index b56045f12..198f7e66d 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -333,6 +333,8 @@ enum hts_fmt_option { // Fastq FASTQ_OPT_CASAVA = 1000, FASTQ_OPT_AUX, + FASTQ_OPT_RNUM, + FASTQ_OPT_BARCODE, }; // Profile options for encoding; primarily used at present in CRAM diff --git a/sam.c b/sam.c index 3e94b40e1..f52cb08b2 100644 --- a/sam.c +++ b/sam.c @@ -54,20 +54,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/khash.h" KHASH_DECLARE(s2i, kh_cstr_t, int64_t) - -// Surprising though it is, we've apparently never needed to read data -// directly from htsFile before! Everything else that supports both -// compressed and uncompressed data instead bypasses htsFile and goes -// direct to bgzf, but hts_open doesn't use the bgzf layer for -// uncompressed data so we have to switch functions on the fly instead. -static size_t hts_read(htsFile *fp, void *data, size_t length) { - return fp->is_bgzf - ? 
bgzf_read(fp->fp.bgzf, data, length) - : hread(fp->fp.hfile, data, length); -} - -#include "htslib/kseq.h" -KSEQ_INIT(htsFile*, hts_read) +KHASH_SET_INIT_INT(tag) #ifndef EFTYPE #define EFTYPE ENOEXEC @@ -2413,7 +2400,8 @@ static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) { // The difference between the two is how lenient we are to recognising // non-compliant strings. The FASTQ parser glosses over arbitrary // non-SAM looking strings. -static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient) { +static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient, + khash_t(tag) *tag_whitelist) { int overflow = 0; char logbuf[40]; char *q = start, *p = end; @@ -2422,9 +2410,9 @@ static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient) { do { \ if (cond) { \ if (lenient) { \ - while (q < p && !isspace(*q)) \ + while (q < p && !isspace_c(*q)) \ q++; \ - while (q < p && isspace(*q)) \ + while (q < p && isspace_c(*q)) \ q++; \ goto loop; \ } else { \ @@ -2447,13 +2435,22 @@ static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient) { _parse_err(q[0] < '!' 
|| q[1] < '!', "invalid aux tag id"); if (lenient && (q[2] | q[4]) != ':') { - while (q < p && !isspace(*q)) + while (q < p && !isspace_c(*q)) q++; - while (q < p && isspace(*q)) + while (q < p && isspace_c(*q)) q++; continue; } + if (tag_whitelist) { + int tt = q[0]*256 + q[1]; + if (kh_get(tag, tag_whitelist, tt) == kh_end(tag_whitelist)) { + while (q < p && *q != '\t') + q++; + continue; + } + } + // Copy over id if (possibly_expand_bam_data(b, 2) < 0) goto err_ret; memcpy(b->data + b->l_data, q, 2); b->l_data += 2; @@ -2710,7 +2707,7 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) } // aux - if (aux_parse(p, s->s + s->l, b, 0) < 0) + if (aux_parse(p, s->s + s->l, b, 0, NULL) < 0) goto err_ret; if (bam_tag2cigar(b, 1, 1) < 0) @@ -3561,27 +3558,89 @@ int sam_set_threads(htsFile *fp, int nthreads) { } typedef struct { - kseq_t *seq; - int parse_casava; - int parse_aux; + kstring_t name; + kstring_t comment; // NB: pointer into name, do not free + kstring_t seq; + kstring_t qual; + int casava; + int aux; + int rnum; + char BC[3]; // aux tag ID for barcode + khash_t(tag) *tags; // which aux tags to use (if empty, use all). } fastq_state; -void fastq_state_set(samFile *fp, enum hts_fmt_option opt) { +static fastq_state *fastq_state_init(void) { + fastq_state *x = (fastq_state *)calloc(1, sizeof(*x)); + if (!x) + return NULL; + strcpy(x->BC, "BC"); + + return x; +} + +void fastq_state_destroy(htsFile *fp) { + if (fp->state) { + fastq_state *x = (fastq_state *)fp->state; + if (x->tags) + kh_destroy(tag, x->tags); + ks_free(&x->name); + ks_free(&x->seq); + ks_free(&x->qual); + free(fp->state); + } +} + +void fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) 
{ + va_list args; + if (!fp) return; if (!fp->state) - if (!(fp->state = calloc(1, sizeof(fastq_state)))) + if (!(fp->state = fastq_state_init())) return; fastq_state *x = (fastq_state *)fp->state; switch (opt) { case FASTQ_OPT_CASAVA: - x->parse_casava = 1; + x->casava = 1; break; - case FASTQ_OPT_AUX: - x->parse_aux = 1; + case FASTQ_OPT_AUX: { + va_start(args, opt); + x->aux = 1; + char *tag = va_arg(args, char *); + va_end(args); + if (tag && strcmp(tag, "1") != 0) { + if (!x->tags) + if (!(x->tags = kh_init(tag))) + return; + + size_t i, tlen = strlen(tag); + for (i = 0; i+3 <= tlen+1; i += 3) { + if (tag[i+0] == ',' || tag[i+1] == ',' || + !(tag[i+2] == ',' || tag[i+2] == '\0')) { + hts_log_error("Bad tag format '%.3s'; skipping option", tag+i); + break; + } + int ret, tcode = tag[i+0]*256 + tag[i+1]; + kh_put(tag, x->tags, tcode, &ret); + } + } + break; + } + + case FASTQ_OPT_BARCODE: { + va_start(args, opt); + char *bc = va_arg(args, char *); + va_end(args); + strncpy(x->BC, bc, 2); + x->BC[2] = 0; + break; + } + + case FASTQ_OPT_RNUM: + x->rnum = 1; break; default: @@ -3589,51 +3648,94 @@ void fastq_state_set(samFile *fp, enum hts_fmt_option opt) { } } -int fastq_parse1(htsFile *fp, bam1_t *b) { +static int fastq_parse1(htsFile *fp, bam1_t *b) { fastq_state *x = (fastq_state *)fp->state; - kseq_t *seq = x->seq; - int i, l = kseq_read(seq); + size_t i, l; int ret = 0; - if (l <= 0) { - ret = l; - goto err; + // Read a FASTQ format entry. + ret = hts_getline(fp, KS_SEP_LINE, &x->name); + if (ret == -1) + return -1; // EOF + else if (ret < -1) + return ret; // ERR + + // Name + if (*x->name.s != '@') + return -2; + + i = 0; l = x->name.l; + char *s = x->name.s; + while (i < l && !isspace_c(s[i])) + i++; + if (i < l) { + s[i] = 0; + x->name.l = i++; + } + + // Comment; a kstring struct, but pointer into name line. 
(Do not free) + while (i < l && isspace_c(s[i])) + i++; + x->comment.s = s+i; + x->comment.l = l - i; + + // Seq + x->seq.l = 0; + for (;;) { + if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0) + return -2; + if (*fp->line.s == '+') + break; + kputsn(fp->line.s, fp->line.l, &x->seq); } + // Qual + size_t remainder = x->seq.l; + x->qual.l = 0; + do { + if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0) + return -2; + kputsn(fp->line.s, fp->line.l, &x->qual); + remainder -= fp->line.l; + } while (remainder > 0); + // Decr qual - for (i = 0; i < seq->qual.l; i++) - seq->qual.s[i] -= '!'; + for (i = 0; i < x->qual.l; i++) + x->qual.s[i] -= '!'; int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED; - if (seq->name.l > 2 && - seq->name.s[seq->name.l-2] == '/' && - isdigit(seq->name.s[seq->name.l-1])) { - switch(seq->name.s[seq->name.l-1]) { + if (x->name.l > 2 && + x->name.s[x->name.l-2] == '/' && + isdigit_c(x->name.s[x->name.l-1])) { + switch(x->name.s[x->name.l-1]) { case '1': flag |= BAM_FREAD1 | pflag; break; case '2': flag |= BAM_FREAD2 | pflag; break; default : flag |= BAM_FREAD1 | BAM_FREAD2 | pflag; break; } - seq->name.s[seq->name.l-=2] = 0; + x->name.s[x->name.l-=2] = 0; } // Convert to BAM ret = bam_set1(b, - seq->name.l, seq->name.s, + x->name.l-1, x->name.s+1, flag, -1, -1, 0, // ref '*', pos, mapq, 0, NULL, // no cigar, -1, -1, 0, // mate - seq->seq.l, seq->seq.s, seq->qual.s, + x->seq.l, x->seq.s, x->qual.s, 0); // Identify Illumina CASAVA strings. 
// ::: char *barcode = NULL; int barcode_len = 0; - kstring_t *kc = &seq->comment; - if (x->parse_casava && - kc->l > 6 && (kc->s[1] | kc->s[3] | kc->s[5]) == ':' && - isdigit(kc->s[0]) && isdigit(kc->s[4])) { + kstring_t *kc = &x->comment; + char *endptr; + if (x->casava && + // \d:[YN]:\d+:[ACGTN]+ + kc->l > 6 && (kc->s[1] | kc->s[3]) == ':' && isdigit_c(kc->s[0]) && + strtol(kc->s+4, &endptr, 10) >= 0 && endptr != kc->s+4 + && *endptr == ':') { // read num switch(kc->s[0]) { @@ -3645,38 +3747,32 @@ int fastq_parse1(htsFile *fp, bam1_t *b) { if (kc->s[2] == 'Y') b->core.flag |= BAM_FQCFAIL; - // Barcode - barcode = kc->s + 6; - for (i = 6; i < kc->l; i++) - if (isspace(kc->s[i])) - break; + // Barcode, maybe numeric in which case we skip it + if (!isdigit_c(endptr[1])) { + barcode = endptr+1; + for (i = barcode - kc->s; i < kc->l; i++) + if (isspace_c(kc->s[i])) + break; - kc->s[i] = 0; - barcode_len = i+1-(barcode - kc->s); + kc->s[i] = 0; + barcode_len = i+1-(barcode - kc->s); + } } if (ret >= 0 && barcode_len) - if (bam_aux_append(b, "BC", 'Z', barcode_len, (uint8_t *)barcode) < 0) + if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0) ret = -1; - if (!x->parse_aux) + if (!x->aux) return ret; // Identify any SAM style aux tags in comments too. 
- if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1) < 0) + if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0) ret = -1; - err: return ret; } -void fastq_state_destroy(htsFile *fp) { - if (fp->state) { - kseq_destroy(((fastq_state *)fp->state)->seq); - free(fp->state); - } -} - // Internal component of sam_read1 below static inline int sam_read1_bam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { int ret = bam_read1(fp->fp.bgzf, b); @@ -3820,14 +3916,10 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) case fastq_format: { fastq_state *x = (fastq_state *)fp->state; if (!x) { - if (!(x = calloc(1, sizeof(fastq_state)))) + if (!(fp->state = fastq_state_init())) return -2; - fp->state = x; } - if (!x->seq) - x->seq = kseq_init(fp); - return fastq_parse1(fp, b); } @@ -3939,62 +4031,98 @@ int sam_format1(const bam_hdr_t *h, const bam1_t *b, kstring_t *str) return sam_format1_append(h, b, str); } -int fastq_format1(const bam1_t *b, kstring_t *str) +static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end); +int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str) { unsigned flag = b->core.flag; - int i, len = b->core.l_qseq; + int i, e = 0, len = b->core.l_qseq; uint8_t *seq, *qual; - int mode_slashes = 1, mode_colons = 1, mode_aux = 0; str->l = 0; if (len == 0) return 0; + // Name if (kputc('@', str) == EOF || kputs(bam_get_qname(b), str) == EOF) return -1; - if (mode_slashes && (flag & BAM_FPAIRED)) { - const char *suffix = (flag & BAM_FREAD1)? "/1" : (flag & BAM_FREAD2)? "/2" : ""; - if (kputs(suffix, str) == EOF) return -1; + // /1 or /2 suffix + if (x && x->rnum && (flag & BAM_FPAIRED)) { + int r12 = flag & (BAM_FREAD1 | BAM_FREAD2); + if (r12 == BAM_FREAD1) { + if (kputs("/1", str) == EOF) + return -1; + } else if (r12 == BAM_FREAD2) { + if (kputs("/2", str) == EOF) + return -1; + } } - if (mode_colons) { + // Illumina CASAVA tag. + // This is ::: + if (x && x->casava) { int rnum = (flag & BAM_FREAD1)? 1 : (flag & BAM_FREAD2)? 
2 : 0; char filtered = (flag & BAM_FQCFAIL)? 'Y' : 'N'; - if (ksprintf(str, " %d:%c:0:0", rnum, filtered) < 0) return -1; - } + uint8_t *bc = bam_aux_get(b, x->BC); + if (ksprintf(str, " %d:%c:0:%s", rnum, filtered, + bc ? (char *)bc+1 : "0") < 0) + return -1; - if (mode_aux) { - // ... FIXME + // Replace any non-alpha with '+'. Ie seq-seq to seq+seq + if (bc) { + int l = strlen((char *)bc+1); + char *c = (char *)str->s + str->l - l; + for (i = 0; i < l; i++) + if (!isalpha_c(c[i])) + c[i] = '+'; + } + } + + // Aux tags + if (x && x->aux) { + uint8_t *s = bam_get_aux(b), *end = b->data + b->l_data; + while (s && end - s >= 4) { + int tt = s[0]*256 + s[1]; + if (x->tags == NULL || + kh_get(tag, x->tags, tt) != kh_end(x->tags)) { + e |= kputc_('\t', str) < 0; + if (!(s = (uint8_t *)sam_format_aux1(s, s[2], s+3, end, str))) + return -1; + } else { + s = skip_aux(s+2, end); + } + } + e |= kputsn("", 0, str) < 0; // nul terminate } if (ks_resize(str, str->l + 1 + len+1 + 2 + len+1 + 1) < 0) return -1; + e |= kputc_('\n', str) < 0; - kputc_('\n', str); - + // Seq line seq = bam_get_seq(b); if (flag & BAM_FREVERSE) for (i = len-1; i >= 0; i--) - kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str); + e |= kputc_("!TGKCYSBAWRDMHVN"[bam_seqi(seq, i)], str) < 0; else for (i = 0; i < len; i++) - kputc_(seq_nt16_str[bam_seqi(seq, i)], str); + e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0; kputsn("\n+\n", 3, str); + // Qual line qual = bam_get_qual(b); if (qual[0] == 0xff) for (i = 0; i < len; i++) - kputc_('B', str); + e |= kputc_('B', str) < 0; else if (flag & BAM_FREVERSE) for (i = len-1; i >= 0; i--) - kputc_(33 + qual[i], str); + e |= kputc_(33 + qual[i], str) < 0; else for (i = 0; i < len; i++) - kputc_(33 + qual[i], str); + e |= kputc_(33 + qual[i], str) < 0; - kputc('\n', str); - return str->l; + e |= kputc('\n', str) < 0; + return e ? 
-1 : str->l; } // Sadly we need to be able to modify the bam_hdr here so we can @@ -4126,11 +4254,24 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) } - case fastq_format: - if (fastq_format1(b, &fp->line) < 0 || - hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l) + case fastq_format: { + fastq_state *x = (fastq_state *)fp->state; + if (!x) { + if (!(fp->state = fastq_state_init())) + return -2; + } + + if (fastq_format1(fp->state, b, &fp->line) < 0) return -1; + if (fp->is_bgzf) { + if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l) + return -1; + } else { + if (hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l) + return -1; + } return fp->line.l; + } default: errno = EBADF; diff --git a/sam_internal.h b/sam_internal.h index e15cbe7e3..20f154dcd 100644 --- a/sam_internal.h +++ b/sam_internal.h @@ -37,7 +37,7 @@ int sam_set_thread_pool(htsFile *fp, htsThreadPool *p); int sam_set_threads(htsFile *fp, int nthreads); // Fastq state -void fastq_state_set(samFile *fp, enum hts_fmt_option opt); +void fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...); void fastq_state_destroy(samFile *fp); // bam1_t data (re)allocation diff --git a/test/fastq/fastq.tst b/test/fastq/fastq.tst index 185db37dd..39be05705 100644 --- a/test/fastq/fastq.tst +++ b/test/fastq/fastq.tst @@ -33,6 +33,15 @@ # Command to execute. 
$pileup is replaced with the path to the pileup test # program +# -------------------- +# Reading + +# Minimal +P minimal.sam $tview minimal.fq + +# Multi-line FASTQ +P multiline.sam $tview multiline.fq + # Single file, unpaired data, with / without aux tags P single_noaux.sam $tview single.fq P single_aux.sam $tview -i fastq_aux single.fq @@ -45,14 +54,37 @@ P inter_aux.sam $tview -i fastq_aux interleaved.fq # Single file, interleaved paired data, using CASAVA P inter_casava.sam $tview -i fastq_casava interleaved_casava.fq +P inter_casavaOX.sam $tview -i fastq_barcode=OX -i fastq_casava interleaved_casava.fq # CASAVA with filtering P filter_casava.sam $tview -i fastq_casava filter_casava.fq - # Paired data is mainly tested by the Samtools test harness. # Basically though it's just reading two files and relying on either # this code or explicit overloading of READ1/READ2. # We simply test here we can read r1 and r2 as separate files P r1.sam $tview -i fastq_aux r1.fq P r2.sam $tview -i fastq_aux r2.fq + +# -------------------- +# Writing + +# Minimal +P minimal.fq $tview -f minimal.sam + +# Single file with unpaired data plus aux tags +P single.fq $tview -f -o fastq_aux single_aux.sam + +# Single file, interleaved paired data, with aux and /rnum +P interleaved.fq $tview -f -o fastq_aux -o fastq_rnum inter_aux.sam + +# CASAVA with interleaved data +P interleaved_casava.fq $tview -f -o fastq_casava inter_casava.sam +P interleaved_casava.fq $tview -f -o fastq_barcode=OX -o fastq_casava inter_casavaOX.sam + +# CASAVA with filtering +P filter_casava.fq $tview -f -o fastq_casava filter_casava.sam + +# Paired data +P r1.fq $tview -f -o fastq_aux -o fastq_rnum r1.sam +P r2.fq $tview -f -o fastq_aux -o fastq_rnum r2.sam diff --git a/test/fastq/filter_casava.fq b/test/fastq/filter_casava.fq index 3f69a7702..adb191365 100644 --- a/test/fastq/filter_casava.fq +++ b/test/fastq/filter_casava.fq @@ -1,16 +1,16 @@ -@HS25_09827:2:1201:1505:59795#49 1:N:0:NGTCTATC 
+@HS25_09827:2:1201:1505:59795#49 1:N:0:NGTCTATC CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG + CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE -@HS25_09827:2:1201:1559:70726#49 2:Y:0:NGTCTATC +@HS25_09827:2:1201:1559:70726#49 2:Y:0:NGTCTATC TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC + ;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? diff --git a/test/fastq/inter_casavaOX.sam b/test/fastq/inter_casavaOX.sam new file mode 100644 index 000000000..85fbbdf55 --- /dev/null +++ b/test/fastq/inter_casavaOX.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE OX:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC ;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? 
OX:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E OX:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT ;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG OX:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE OX:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA ;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG OX:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF OX:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC ;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG OX:Z:NGTCTATC diff --git a/test/fastq/interleaved_casava.fq b/test/fastq/interleaved_casava.fq index 0e0e1ac9c..39a19457c 100644 --- a/test/fastq/interleaved_casava.fq +++ b/test/fastq/interleaved_casava.fq @@ -1,40 +1,40 @@ -@HS25_09827:2:1201:1505:59795#49 1:N:0:NGTCTATC +@HS25_09827:2:1201:1505:59795#49 1:N:0:NGTCTATC 
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG + CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGDCHFEFBHAGCGACF7CJI8HBIIEFGFEBG?DCGA?ACFGGI=BEDG?EFEHFFFEHFD?HG+DFH>FFHGFBFE4F@I3HF@>A5F?GFHEFCFHGGGHEJEAJE -@HS25_09827:2:1201:1559:70726#49 2:N:0:NGTCTATC +@HS25_09827:2:1201:1559:70726#49 2:N:0:NGTCTATC TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC + ;CBCEFDHDGFGHDGDIGEF@EJIIGEEIECGFHGFHGGGHHHHGGKIFFEHGEGHFIEFFHHGDHHGJEGF?FBHFFGCHHFFII>GCFCFFGGCEBF? -@HS25_09827:2:1201:1564:39627#49 1:N:0:NGTCTATC +@HS25_09827:2:1201:1564:39627#49 1:N:0:NGTCTATC ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA + BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E -@HS25_09827:2:1201:1564:39627#49 2:N:0:NGTCTATC +@HS25_09827:2:1201:1564:39627#49 2:N:0:NGTCTATC AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT + ;CACCDEBDEFGAGGGF5EGFFFEIDDIICGGF>H?GGG@DGAHD@HIDIFGG@IEEFHFGFEBGIHHDE5GHEIABFG>CB=FCF4GGHLBBJGFEHEG -@HS25_09827:2:1201:1565:91731#49 1:N:0:NGTCTATC +@HS25_09827:2:1201:1565:91731#49 1:N:0:NGTCTATC GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA + CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE -@HS25_09827:2:1201:1565:91731#49 2:N:0:NGTCTATC +@HS25_09827:2:1201:1565:91731#49 2:N:0:NGTCTATC AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA + ;CBABFEEFJFGEFGCCDIGIFFIHDCGIEGHGHHEHGIDBGHHGFEIJHHGG@GFCHGFDHKFFEGHDIEFHHFHEIFCHFGCFBHG@>IBF5*DBHHG -@HS25_09827:2:1201:1624:69925#49 1:N:0:NGTCTATC +@HS25_09827:2:1201:1624:69925#49 1:N:0:NGTCTATC 
GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA + CABEFGFGIFGGGJGHGGFH?FDHGHDHGHEHHJCGHHFHDHDHFGHIGHIFFHGHFGGGI9GHF@IGGH;FICGEFEIHGGIEEFC:DEGGGBDJHHFF -@HS25_09827:2:1201:1624:69925#49 2:N:0:NGTCTATC +@HS25_09827:2:1201:1624:69925#49 2:N:0:NGTCTATC TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC + ;CBCDDDHDHFGGEGGICEGFGJF@GEIFECIFHJCDAI>HDAH?@EIDICDGEIFEB@7DHFFE?CCD4EFIEBHFC?FFBFFG3HC@AGFDFGCBHEG diff --git a/test/fastq/minimal.fq b/test/fastq/minimal.fq new file mode 100644 index 000000000..6f13b948d --- /dev/null +++ b/test/fastq/minimal.fq @@ -0,0 +1,4 @@ +@x +A ++ ++ diff --git a/test/fastq/minimal.sam b/test/fastq/minimal.sam new file mode 100644 index 000000000..e12f4a6b4 --- /dev/null +++ b/test/fastq/minimal.sam @@ -0,0 +1 @@ +x 4 * 0 0 * * 0 0 A + diff --git a/test/fastq/multiline.fq b/test/fastq/multiline.fq new file mode 100644 index 000000000..0c94aec53 --- /dev/null +++ b/test/fastq/multiline.fq @@ -0,0 +1,20 @@ +@seq1 +NAAAAAAAAA +CCCCCCCCCC +GGGGGGGGGG +TTTTTTTTN ++ +++++++++++ +@@@@@@@@@@ +########## +........> +@seq2 +RAAAAAAAAA +CCCCCCCCCC +GGGGGGGGGG +TTTTTTTTY ++ +<<<<<<<<<< +!!!!!!!!!! +########## +@@@@@@@@@ diff --git a/test/fastq/multiline.sam b/test/fastq/multiline.sam new file mode 100644 index 000000000..70442ef3e --- /dev/null +++ b/test/fastq/multiline.sam @@ -0,0 +1,2 @@ +seq1 4 * 0 0 * * 0 0 NAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTN ++++++++++@@@@@@@@@@##########........> +seq2 4 * 0 0 * * 0 0 RAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTY <<<<<<<<< Date: Mon, 19 Oct 2020 15:07:48 +0100 Subject: [PATCH 132/488] Permit FASTA read and write as sequence_data. This is prompted by wanted to simplify "samtools fastq", or more specifically the "fasta" variant of it (it's the same code). 
I don't want to have to keep all the old tag formatting cruft in samtools/bam_fastq.c, especially as it has a few buglets, so that means we want htslib to be able to write in fasta format if we're to keep both "samtools fastq" and "samtools fasta". Logically speaking we should be symmetric, permitting reading fasta as well as writing fasta. However this may not necessarily be what people were expecting... Time will tell whether it gives rope for people to hang themselves with, e.g. "samtools view GRCh37.fa". (That works, but it's more likely they were wanting to specify a reference to use alongside some command reading from stdin and forgot the -T.) Plus more checking of file suffixes and gzip options. I can't believe I've found 5 separate functions that recognise file formats and act on them. A bit of a tangled web, but I believe this is now all of them recognising fastq/fasta properly. --- hts.c | 35 +++++++-- hts_internal.h | 2 +- htslib/hts.h | 3 +- sam.c | 124 ++++++++++++++++++++++--------- test/fastq/fastq.tst | 19 +++++ test/fastq/filter_casava-q.sam | 4 + test/fastq/filter_casava.fa | 8 ++ test/fastq/inter_aux-q.sam | 10 +++ test/fastq/inter_casava-q.sam | 10 +++ test/fastq/inter_casavaOX-q.sam | 10 +++ test/fastq/inter_noaux-q.sam | 10 +++ test/fastq/interleaved.fa | 20 +++++ test/fastq/interleaved_casava.fa | 20 +++++ test/fastq/minimal-q.sam | 1 + test/fastq/minimal.fa | 2 + test/fastq/multiline-q.sam | 2 + test/fastq/multiline.fa | 10 +++ test/fastq/r1-q.sam | 5 ++ test/fastq/r1.fa | 10 +++ test/fastq/r2-q.sam | 5 ++ test/fastq/r2.fa | 10 +++ test/fastq/single.fa | 10 +++ test/fastq/single_aux-q.sam | 5 ++ test/fastq/single_noaux-q.sam | 5 ++ test/test_view.c | 5 +- 25 files changed, 299 insertions(+), 46 deletions(-) create mode 100644 test/fastq/filter_casava-q.sam create mode 100644 test/fastq/filter_casava.fa create mode 100644 test/fastq/inter_aux-q.sam create mode 100644 test/fastq/inter_casava-q.sam create mode 100644
test/fastq/inter_casavaOX-q.sam create mode 100644 test/fastq/inter_noaux-q.sam create mode 100644 test/fastq/interleaved.fa create mode 100644 test/fastq/interleaved_casava.fa create mode 100644 test/fastq/minimal-q.sam create mode 100644 test/fastq/minimal.fa create mode 100644 test/fastq/multiline-q.sam create mode 100644 test/fastq/multiline.fa create mode 100644 test/fastq/r1-q.sam create mode 100644 test/fastq/r1.fa create mode 100644 test/fastq/r2-q.sam create mode 100644 test/fastq/r2.fa create mode 100644 test/fastq/single.fa create mode 100644 test/fastq/single_aux-q.sam create mode 100644 test/fastq/single_noaux-q.sam diff --git a/hts.c b/hts.c index 02fb800b1..dc9097b9e 100644 --- a/hts.c +++ b/hts.c @@ -260,6 +260,7 @@ static enum htsFormatCategory format_category(enum htsExactFormat fmt) case sam: case cram: case fastq_format: + case fasta_format: return sequence_data; case vcf: @@ -278,7 +279,6 @@ static enum htsFormatCategory format_category(enum htsExactFormat fmt) case bed: return region_list; - case fasta_format: case htsget: case hts_crypt4gh_format: return unknown_category; @@ -591,6 +591,7 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) return 0; } else if (len >= 1 && s[0] == '>' && secondline_is_bases(s, &s[len])) { + fmt->category = sequence_data; fmt->format = fasta_format; return 0; } @@ -743,7 +744,7 @@ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) hFILE *hfile = NULL; char fmt_code = '\0'; // see enum htsExactFormat in htslib/hts.h - const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0\0\0\0\0f\0\0"; + const char format_to_mode[] = "\0g\0\0b\0c\0\0b\0g\0\0\0\0\0Ff\0\0"; strncpy(smode, mode, 99); smode[99]=0; @@ -1192,6 +1193,21 @@ int hts_parse_format(htsFormat *format, const char *str) { format->format = fastq_format; format->compression = no_compression; format->compression_level = 0; + } else if (strcmp(fmt, "fastq.gz") == 0 || strcmp(fmt, "fq.gz") == 0) { + format->category = 
sequence_data; + format->format = fastq_format; + format->compression = bgzf; + format->compression_level = 0; + } else if (strcmp(fmt, "fasta") == 0 || strcmp(fmt, "fa") == 0) { + format->category = sequence_data; + format->format = fasta_format; + format->compression = no_compression; + format->compression_level = 0; + } else if (strcmp(fmt, "fasta.gz") == 0 || strcmp(fmt, "fa.gz") == 0) { + format->category = sequence_data; + format->format = fasta_format; + format->compression = bgzf; + format->compression_level = 0; } else { return -1; } @@ -1317,6 +1333,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) if (strchr(simple_mode, 'b')) fmt->format = binary_format; else if (strchr(simple_mode, 'c')) fmt->format = cram; else if (strchr(simple_mode, 'f')) fmt->format = fastq_format; + else if (strchr(simple_mode, 'F')) fmt->format = fasta_format; else fmt->format = text_format; if (strchr(simple_mode, 'z')) fmt->compression = bgzf; @@ -1328,6 +1345,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) case binary_format: fmt->compression = bgzf; break; case cram: fmt->compression = custom; break; case fastq_format: fmt->compression = no_compression; break; + case fasta_format: fmt->compression = no_compression; break; case text_format: fmt->compression = no_compression; break; default: abort(); } @@ -1437,7 +1455,8 @@ int hts_close(htsFile *fp) case vcf: if (fp->format.format == sam) ret = sam_state_destroy(fp); - else if (fp->format.format == fastq_format) + else if (fp->format.format == fastq_format || + fp->format.format == fasta_format) fastq_state_destroy(fp); if (fp->format.compression != no_compression) @@ -1501,6 +1520,7 @@ static hFILE *hts_hfile(htsFile *fp) { case text_format: return fp->fp.hfile; case vcf: // fall through case fastq_format: // fall through + case fasta_format: // fall through case sam: return fp->format.compression != no_compression ? 
bgzf_hfile(fp->fp.bgzf) : fp->fp.hfile; @@ -1554,12 +1574,14 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { case FASTQ_OPT_CASAVA: case FASTQ_OPT_RNUM: - if (fp->format.format == fastq_format) + if (fp->format.format == fastq_format || + fp->format.format == fasta_format) fastq_state_set(fp, opt); return 0; case FASTQ_OPT_AUX: - if (fp->format.format == fastq_format) { + if (fp->format.format == fastq_format || + fp->format.format == fasta_format) { va_start(args, opt); char *list = va_arg(args, char *); va_end(args); @@ -1568,7 +1590,8 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { return 0; case FASTQ_OPT_BARCODE: - if (fp->format.format == fastq_format) { + if (fp->format.format == fastq_format || + fp->format.format == fasta_format) { va_start(args, opt); char *bc = va_arg(args, char *); va_end(args); diff --git a/hts_internal.h b/hts_internal.h index 602348618..b4aa4c0d9 100644 --- a/hts_internal.h +++ b/hts_internal.h @@ -29,7 +29,7 @@ DEALINGS IN THE SOFTWARE. */ #include "htslib/hts.h" #include "textutils_internal.h" -#define HTS_MAX_EXT_LEN 8 +#define HTS_MAX_EXT_LEN 9 #ifdef __cplusplus extern "C" { diff --git a/htslib/hts.h b/htslib/hts.h index 198f7e66d..0c78b71f4 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -521,7 +521,7 @@ char *hts_format_description(const htsFormat *format); @param fn The file name or "-" for stdin/stdout. 
For indexed files with a non-standard naming, the file name can include the name of the index file delimited with HTS_IDX_DELIM - @param mode Mode matching / [rwa][bcefguxz0-9]* / + @param mode Mode matching / [rwa][bcefFguxz0-9]* / @discussion With 'r' opens for reading; any further format mode letters are ignored as the format is detected by checking the first few bytes or BGZF blocks @@ -530,6 +530,7 @@ char *hts_format_description(const htsFormat *format); b binary format (BAM, BCF, etc) rather than text (SAM, VCF, etc) c CRAM format f FASTQ format + F FASTA format g gzip compressed u uncompressed z bgzf compressed diff --git a/sam.c b/sam.c index f52cb08b2..d97fe096d 100644 --- a/sam.c +++ b/sam.c @@ -2029,6 +2029,7 @@ sam_hdr_t *sam_hdr_read(htsFile *fp) return sam_hdr_create(fp); case fastq_format: + case fasta_format: return sam_hdr_init(); case empty_format: @@ -2134,6 +2135,7 @@ int sam_hdr_write(htsFile *fp, const sam_hdr_t *h) break; case fastq_format: + case fasta_format: // Nothing to output; FASTQ has no file headers. break; @@ -3567,13 +3569,17 @@ typedef struct { int rnum; char BC[3]; // aux tag ID for barcode khash_t(tag) *tags; // which aux tags to use (if empty, use all). + char nprefix; } fastq_state; -static fastq_state *fastq_state_init(void) { +// Initialise fastq state. +// Name char of '@' or '>' distinguishes fastq vs fasta variant +static fastq_state *fastq_state_init(int name_char) { fastq_state *x = (fastq_state *)calloc(1, sizeof(*x)); if (!x) return NULL; strcpy(x->BC, "BC"); + x->nprefix = name_char; return x; } @@ -3596,7 +3602,8 @@ void fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) { if (!fp) return; if (!fp->state) - if (!(fp->state = fastq_state_init())) + if (!(fp->state = fastq_state_init(fp->format.format == fastq_format + ? 
'@' : '>'))) return; fastq_state *x = (fastq_state *)fp->state; @@ -3653,15 +3660,27 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) { size_t i, l; int ret = 0; - // Read a FASTQ format entry. - ret = hts_getline(fp, KS_SEP_LINE, &x->name); - if (ret == -1) - return -1; // EOF - else if (ret < -1) - return ret; // ERR + if (fp->format.format == fasta_format && fp->line.s) { + // For FASTA we've already read the >name line; steal it + // Not the most efficient, but we don't optimise for fasta reading. + if (fp->line.l == 0) + return -1; // EOF + + free(x->name.s); + x->name = fp->line; + fp->line.l = fp->line.m = 0; + fp->line.s = NULL; + } else { + // Read a FASTQ format entry. + ret = hts_getline(fp, KS_SEP_LINE, &x->name); + if (ret == -1) + return -1; // EOF + else if (ret < -1) + return ret; // ERR + } // Name - if (*x->name.s != '@') + if (*x->name.s != x->nprefix) return -2; i = 0; l = x->name.l; @@ -3682,26 +3701,30 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) { // Seq x->seq.l = 0; for (;;) { - if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0) - return -2; - if (*fp->line.s == '+') + if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0) + if (fp->format.format == fastq_format || ret < -1) + return -2; + if (*fp->line.s == (fp->format.format == fastq_format ? 
'+' : '>') + || ret == -1) break; kputsn(fp->line.s, fp->line.l, &x->seq); } // Qual - size_t remainder = x->seq.l; - x->qual.l = 0; - do { - if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0) - return -2; - kputsn(fp->line.s, fp->line.l, &x->qual); - remainder -= fp->line.l; - } while (remainder > 0); + if (fp->format.format == fastq_format) { + size_t remainder = x->seq.l; + x->qual.l = 0; + do { + if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0) + return -2; + kputsn(fp->line.s, fp->line.l, &x->qual); + remainder -= fp->line.l; + } while (remainder > 0); - // Decr qual - for (i = 0; i < x->qual.l; i++) - x->qual.s[i] -= '!'; + // Decr qual + for (i = 0; i < x->qual.l; i++) + x->qual.s[i] -= '!'; + } int flag = BAM_FUNMAP; int pflag = BAM_FMUNMAP | BAM_FPAIRED; if (x->name.l > 2 && @@ -3913,10 +3936,12 @@ int sam_read1(htsFile *fp, sam_hdr_t *h, bam1_t *b) ret = sam_read1_sam(fp, h, b); break; + case fasta_format: case fastq_format: { fastq_state *x = (fastq_state *)fp->state; if (!x) { - if (!(fp->state = fastq_state_init())) + if (!(fp->state = fastq_state_init(fp->format.format + == fastq_format ? 
'@' : '>'))) return -2; } @@ -4043,7 +4068,7 @@ int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str) if (len == 0) return 0; // Name - if (kputc('@', str) == EOF || kputs(bam_get_qname(b), str) == EOF) + if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF) return -1; // /1 or /2 suffix @@ -4107,21 +4132,24 @@ int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str) for (i = 0; i < len; i++) e |= kputc_(seq_nt16_str[bam_seqi(seq, i)], str) < 0; - kputsn("\n+\n", 3, str); // Qual line - qual = bam_get_qual(b); - if (qual[0] == 0xff) - for (i = 0; i < len; i++) - e |= kputc_('B', str) < 0; - else if (flag & BAM_FREVERSE) - for (i = len-1; i >= 0; i--) - e |= kputc_(33 + qual[i], str) < 0; - else - for (i = 0; i < len; i++) - e |= kputc_(33 + qual[i], str) < 0; + if (x->nprefix == '@') { + kputsn("\n+\n", 3, str); + qual = bam_get_qual(b); + if (qual[0] == 0xff) + for (i = 0; i < len; i++) + e |= kputc_('B', str) < 0; + else if (flag & BAM_FREVERSE) + for (i = len-1; i >= 0; i--) + e |= kputc_(33 + qual[i], str) < 0; + else + for (i = 0; i < len; i++) + e |= kputc_(33 + qual[i], str) < 0; + } e |= kputc('\n', str) < 0; + return e ? -1 : str->l; } @@ -4254,10 +4282,12 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) } + case fasta_format: case fastq_format: { fastq_state *x = (fastq_state *)fp->state; if (!x) { - if (!(fp->state = fastq_state_init())) + if (!(fp->state = fastq_state_init(fp->format.format + == fastq_format ? 
'@' : '>'))) return -2; } @@ -4741,6 +4771,12 @@ int sam_open_mode(char *mode, const char *fn, const char *format) else if (strcasecmp(format, "sam.gz") == 0) strcpy(mode, "z"); else if (strcasecmp(format, "fastq") == 0 || strcasecmp(format, "fq") == 0) strcpy(mode, "f"); + else if (strcasecmp(format, "fastq.gz") == 0 || + strcasecmp(format, "fq.gz") == 0) strcpy(mode, "fz"); + else if (strcasecmp(format, "fasta") == 0 || + strcasecmp(format, "fa") == 0) strcpy(mode, "F"); + else if (strcasecmp(format, "fasta.gz") == 0 || + strcasecmp(format, "fa.gz") == 0) strcpy(mode, "Fz"); else return -1; return 0; @@ -4802,6 +4838,20 @@ char *sam_open_mode_opts(const char *fn, ; // format mode="" } else if (strncmp(format, "sam.gz", format_len) == 0) { *cp++ = 'z'; + } else if (strncmp(format, "fastq", format_len) == 0 || + strncmp(format, "fq", format_len) == 0) { + *cp++ = 'f'; + } else if (strncmp(format, "fastq.gz", format_len) == 0 || + strncmp(format, "fq.gz", format_len) == 0) { + *cp++ = 'f'; + *cp++ = 'z'; + } else if (strncmp(format, "fasta", format_len) == 0 || + strncmp(format, "fa", format_len) == 0) { + *cp++ = 'F'; + } else if (strncmp(format, "fasta.gz", format_len) == 0 || + strncmp(format, "fa", format_len) == 0) { + *cp++ = 'F'; + *cp++ = 'z'; } else { free(mode_opts); return NULL; diff --git a/test/fastq/fastq.tst b/test/fastq/fastq.tst index 39be05705..5feb31fe1 100644 --- a/test/fastq/fastq.tst +++ b/test/fastq/fastq.tst @@ -38,26 +38,35 @@ # Minimal P minimal.sam $tview minimal.fq +P minimal-q.sam $tview minimal.fa # Multi-line FASTQ P multiline.sam $tview multiline.fq +P multiline-q.sam $tview multiline.fa # Single file, unpaired data, with / without aux tags P single_noaux.sam $tview single.fq +P single_noaux-q.sam $tview single.fa P single_aux.sam $tview -i fastq_aux single.fq +P single_aux-q.sam $tview -i fastq_aux single.fa # Single file, interleaved paired data, no aux P inter_noaux.sam $tview interleaved.fq +P inter_noaux-q.sam $tview 
interleaved.fa # Single file, interleaved paired data, with aux P inter_aux.sam $tview -i fastq_aux interleaved.fq +P inter_aux-q.sam $tview -i fastq_aux interleaved.fa # Single file, interleaved paired data, using CASAVA P inter_casava.sam $tview -i fastq_casava interleaved_casava.fq P inter_casavaOX.sam $tview -i fastq_barcode=OX -i fastq_casava interleaved_casava.fq +P inter_casava-q.sam $tview -i fastq_casava interleaved_casava.fa +P inter_casavaOX-q.sam $tview -i fastq_barcode=OX -i fastq_casava interleaved_casava.fa # CASAVA with filtering P filter_casava.sam $tview -i fastq_casava filter_casava.fq +P filter_casava-q.sam $tview -i fastq_casava filter_casava.fa # Paired data is mainly tested by the Samtools test harness. # Basically though it's just reading two files and relying on either @@ -65,26 +74,36 @@ P filter_casava.sam $tview -i fastq_casava filter_casava.fq # We simply test here we can read r1 and r2 as separate files P r1.sam $tview -i fastq_aux r1.fq P r2.sam $tview -i fastq_aux r2.fq +P r1-q.sam $tview -i fastq_aux r1.fa +P r2-q.sam $tview -i fastq_aux r2.fa # -------------------- # Writing # Minimal P minimal.fq $tview -f minimal.sam +P minimal.fa $tview -F minimal.sam # Single file with unpaired data plus aux tags P single.fq $tview -f -o fastq_aux single_aux.sam +P single.fa $tview -F -o fastq_aux single_aux.sam # Single file, interleaved paired data, with aux and /rnum P interleaved.fq $tview -f -o fastq_aux -o fastq_rnum inter_aux.sam +P interleaved.fa $tview -F -o fastq_aux -o fastq_rnum inter_aux.sam # CASAVA with interleaved data P interleaved_casava.fq $tview -f -o fastq_casava inter_casava.sam P interleaved_casava.fq $tview -f -o fastq_barcode=OX -o fastq_casava inter_casavaOX.sam +P interleaved_casava.fa $tview -F -o fastq_casava inter_casava.sam +P interleaved_casava.fa $tview -F -o fastq_barcode=OX -o fastq_casava inter_casavaOX.sam # CASAVA with filtering P filter_casava.fq $tview -f -o fastq_casava filter_casava.sam +P 
filter_casava.fa $tview -F -o fastq_casava filter_casava.sam # Paired data P r1.fq $tview -f -o fastq_aux -o fastq_rnum r1.sam P r2.fq $tview -f -o fastq_aux -o fastq_rnum r2.sam +P r1.fa $tview -F -o fastq_aux -o fastq_rnum r1.sam +P r2.fa $tview -F -o fastq_aux -o fastq_rnum r2.sam diff --git a/test/fastq/filter_casava-q.sam b/test/fastq/filter_casava-q.sam new file mode 100644 index 000000000..6d7a2ea06 --- /dev/null +++ b/test/fastq/filter_casava-q.sam @@ -0,0 +1,4 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * BC:Z:NGTCTATC +HS25_09827:2:1201:1505:59795#49 141 * 0 0 * * 0 0 AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT * BC:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 589 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * BC:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 653 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC * BC:Z:NGTCTATC diff --git a/test/fastq/filter_casava.fa b/test/fastq/filter_casava.fa new file mode 100644 index 000000000..ba58034c9 --- /dev/null +++ b/test/fastq/filter_casava.fa @@ -0,0 +1,8 @@ +>HS25_09827:2:1201:1505:59795#49 1:N:0:NGTCTATC +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG +>HS25_09827:2:1201:1505:59795#49 2:N:0:NGTCTATC +AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT +>HS25_09827:2:1201:1559:70726#49 1:Y:0:NGTCTATC +TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA +>HS25_09827:2:1201:1559:70726#49 2:Y:0:NGTCTATC +TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC diff --git a/test/fastq/inter_aux-q.sam 
b/test/fastq/inter_aux-q.sam new file mode 100644 index 000000000..cd5fbc407 --- /dev/null +++ b/test/fastq/inter_aux-q.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=BDDDF +HS25_09827:2:1201:1505:59795#49 141 * 0 0 * * 0 0 AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT * RG:Z:1#49 +HS25_09827:2:1201:1559:70726#49 77 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DBDFD +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC * RG:Z:1#49 +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DDDFE +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT * RG:Z:1#49 +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA * RG:Z:1#49 +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC * RG:Z:1#49 diff --git a/test/fastq/inter_casava-q.sam b/test/fastq/inter_casava-q.sam new file 
mode 100644 index 000000000..31f8e8bf6 --- /dev/null +++ b/test/fastq/inter_casava-q.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * BC:Z:NGTCTATC +HS25_09827:2:1201:1505:59795#49 141 * 0 0 * * 0 0 AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT * BC:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 77 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * BC:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC * BC:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * BC:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT * BC:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * BC:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA * BC:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA * BC:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC * BC:Z:NGTCTATC diff --git a/test/fastq/inter_casavaOX-q.sam b/test/fastq/inter_casavaOX-q.sam new file mode 100644 index 000000000..1273184ef --- /dev/null +++ b/test/fastq/inter_casavaOX-q.sam @@ -0,0 +1,10 @@ 
+HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * OX:Z:NGTCTATC +HS25_09827:2:1201:1505:59795#49 141 * 0 0 * * 0 0 AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT * OX:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 77 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * OX:Z:NGTCTATC +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC * OX:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * OX:Z:NGTCTATC +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT * OX:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * OX:Z:NGTCTATC +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA * OX:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA * OX:Z:NGTCTATC +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC * OX:Z:NGTCTATC diff --git a/test/fastq/inter_noaux-q.sam b/test/fastq/inter_noaux-q.sam new file mode 100644 index 000000000..107f31ae1 --- /dev/null +++ b/test/fastq/inter_noaux-q.sam @@ -0,0 +1,10 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 
CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * +HS25_09827:2:1201:1505:59795#49 141 * 0 0 * * 0 0 AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT * +HS25_09827:2:1201:1559:70726#49 77 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC * +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT * +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA * +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA * +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC * diff --git a/test/fastq/interleaved.fa b/test/fastq/interleaved.fa new file mode 100644 index 000000000..bd5ca44b0 --- /dev/null +++ b/test/fastq/interleaved.fa @@ -0,0 +1,20 @@ +>HS25_09827:2:1201:1505:59795#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=BDDDF +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG +>HS25_09827:2:1201:1505:59795#49/2 RG:Z:1#49 +AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT 
+>HS25_09827:2:1201:1559:70726#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DBDFD +TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA +>HS25_09827:2:1201:1559:70726#49/2 RG:Z:1#49 +TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC +>HS25_09827:2:1201:1564:39627#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DDDFE +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA +>HS25_09827:2:1201:1564:39627#49/2 RG:Z:1#49 +AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT +>HS25_09827:2:1201:1565:91731#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA +>HS25_09827:2:1201:1565:91731#49/2 RG:Z:1#49 +AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA +>HS25_09827:2:1201:1624:69925#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA +>HS25_09827:2:1201:1624:69925#49/2 RG:Z:1#49 +TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC diff --git a/test/fastq/interleaved_casava.fa b/test/fastq/interleaved_casava.fa new file mode 100644 index 000000000..8ea7447f9 --- /dev/null +++ b/test/fastq/interleaved_casava.fa @@ -0,0 +1,20 @@ +>HS25_09827:2:1201:1505:59795#49 1:N:0:NGTCTATC +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG +>HS25_09827:2:1201:1505:59795#49 2:N:0:NGTCTATC +AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT +>HS25_09827:2:1201:1559:70726#49 1:N:0:NGTCTATC +TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA +>HS25_09827:2:1201:1559:70726#49 2:N:0:NGTCTATC 
+TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC +>HS25_09827:2:1201:1564:39627#49 1:N:0:NGTCTATC +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA +>HS25_09827:2:1201:1564:39627#49 2:N:0:NGTCTATC +AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT +>HS25_09827:2:1201:1565:91731#49 1:N:0:NGTCTATC +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA +>HS25_09827:2:1201:1565:91731#49 2:N:0:NGTCTATC +AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA +>HS25_09827:2:1201:1624:69925#49 1:N:0:NGTCTATC +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA +>HS25_09827:2:1201:1624:69925#49 2:N:0:NGTCTATC +TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC diff --git a/test/fastq/minimal-q.sam b/test/fastq/minimal-q.sam new file mode 100644 index 000000000..d2daaf47f --- /dev/null +++ b/test/fastq/minimal-q.sam @@ -0,0 +1 @@ +x 4 * 0 0 * * 0 0 A * diff --git a/test/fastq/minimal.fa b/test/fastq/minimal.fa new file mode 100644 index 000000000..4f65d8c17 --- /dev/null +++ b/test/fastq/minimal.fa @@ -0,0 +1,2 @@ +>x +A diff --git a/test/fastq/multiline-q.sam b/test/fastq/multiline-q.sam new file mode 100644 index 000000000..d31407b36 --- /dev/null +++ b/test/fastq/multiline-q.sam @@ -0,0 +1,2 @@ +seq1 4 * 0 0 * * 0 0 NAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTN * +seq2 4 * 0 0 * * 0 0 RAAAAAAAAACCCCCCCCCCGGGGGGGGGGTTTTTTTTY * diff --git a/test/fastq/multiline.fa b/test/fastq/multiline.fa new file mode 100644 index 000000000..2917e066f --- /dev/null +++ b/test/fastq/multiline.fa @@ -0,0 +1,10 @@ +>seq1 +NAAAAAAAAA +CCCCCCCCCC +GGGGGGGGGG +TTTTTTTTN +>seq2 +RAAAAAAAAA +CCCCCCCCCC +GGGGGGGGGG +TTTTTTTTY diff --git a/test/fastq/r1-q.sam 
b/test/fastq/r1-q.sam new file mode 100644 index 000000000..c87f07fe4 --- /dev/null +++ b/test/fastq/r1-q.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 77 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=BDDDF +HS25_09827:2:1201:1559:70726#49 77 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DBDFD +HS25_09827:2:1201:1564:39627#49 77 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DDDFE +HS25_09827:2:1201:1565:91731#49 77 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +HS25_09827:2:1201:1624:69925#49 77 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA * RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DD?DB diff --git a/test/fastq/r1.fa b/test/fastq/r1.fa new file mode 100644 index 000000000..4b7af5cf4 --- /dev/null +++ b/test/fastq/r1.fa @@ -0,0 +1,10 @@ +>HS25_09827:2:1201:1505:59795#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=BDDDF +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG +>HS25_09827:2:1201:1559:70726#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DBDFD +TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA +>HS25_09827:2:1201:1564:39627#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1:DDDFE +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA +>HS25_09827:2:1201:1565:91731#49/1 RG:Z:1#49 BC:Z:NGTCTATC QT:Z:!1=DDFFD +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA +>HS25_09827:2:1201:1624:69925#49/1 RG:Z:1#49 BC:Z:NGTCTATC 
QT:Z:!1=DD?DB +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA diff --git a/test/fastq/r2-q.sam b/test/fastq/r2-q.sam new file mode 100644 index 000000000..033fa98cf --- /dev/null +++ b/test/fastq/r2-q.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 141 * 0 0 * * 0 0 AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT * RG:Z:1#49 +HS25_09827:2:1201:1559:70726#49 141 * 0 0 * * 0 0 TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC * RG:Z:1#49 +HS25_09827:2:1201:1564:39627#49 141 * 0 0 * * 0 0 AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT * RG:Z:1#49 +HS25_09827:2:1201:1565:91731#49 141 * 0 0 * * 0 0 AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA * RG:Z:1#49 +HS25_09827:2:1201:1624:69925#49 141 * 0 0 * * 0 0 TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC * RG:Z:1#49 diff --git a/test/fastq/r2.fa b/test/fastq/r2.fa new file mode 100644 index 000000000..4d6f08e85 --- /dev/null +++ b/test/fastq/r2.fa @@ -0,0 +1,10 @@ +>HS25_09827:2:1201:1505:59795#49/2 RG:Z:1#49 +AAGGAAAGAAGGGAGGGAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAGGAAAGTAGGAAGAATTCATCTACCCAATT +>HS25_09827:2:1201:1559:70726#49/2 RG:Z:1#49 +TTTTCTTTTATTAATTTTATACTTACATTTAAGTCTTTATTCCATTTTGAGTCAATGTTTGTATATGATGAGAGATAGGGGTCTAGTTTCATACTTCTAC +>HS25_09827:2:1201:1564:39627#49/2 RG:Z:1#49 +AATTCCTTGTTTGAAAACACTCCTAGCACCATGAGTCTCTGCTGCTTTATCTAAAGCATCTCTGGGGCTCATGTGTTTGTGAAGAGCTTGCCTAGCTTTT +>HS25_09827:2:1201:1565:91731#49/2 RG:Z:1#49 +AGCACAGATTCATGCTAGCACATGGATGACCCTCAACAGCATTATCCCCAGCGATAGAAGCCAGACACAAAAGAACACATATGACAGGATTCCATTGATA +>HS25_09827:2:1201:1624:69925#49/2 RG:Z:1#49 +TCCTTCCATTCCATTTTTGTGTTTGTTTTGTTCTTTTCTGTCACTGATCCGTATTACCACTTTTGGAAAAAAATAAATAAATAAATAAATAAAAGGCAGC diff --git 
a/test/fastq/single.fa b/test/fastq/single.fa new file mode 100644 index 000000000..cdd58d8dc --- /dev/null +++ b/test/fastq/single.fa @@ -0,0 +1,10 @@ +>HS25_09827:2:1201:1505:59795#49 RG:Z:1#49 +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG +>HS25_09827:2:1201:1559:70726#49 RG:Z:1#49 +TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA +>HS25_09827:2:1201:1564:39627#49 RG:Z:1#49 +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA +>HS25_09827:2:1201:1565:91731#49 RG:Z:1#49 +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA +>HS25_09827:2:1201:1624:69925#49 RG:Z:1#49 +GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA diff --git a/test/fastq/single_aux-q.sam b/test/fastq/single_aux-q.sam new file mode 100644 index 000000000..68af8bc89 --- /dev/null +++ b/test/fastq/single_aux-q.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * RG:Z:1#49 +HS25_09827:2:1201:1559:70726#49 4 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * RG:Z:1#49 +HS25_09827:2:1201:1564:39627#49 4 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * RG:Z:1#49 +HS25_09827:2:1201:1565:91731#49 4 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * RG:Z:1#49 +HS25_09827:2:1201:1624:69925#49 4 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA * RG:Z:1#49 diff --git a/test/fastq/single_noaux-q.sam b/test/fastq/single_noaux-q.sam new file mode 100644 index 000000000..24542c69c 
--- /dev/null +++ b/test/fastq/single_noaux-q.sam @@ -0,0 +1,5 @@ +HS25_09827:2:1201:1505:59795#49 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * +HS25_09827:2:1201:1559:70726#49 4 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * +HS25_09827:2:1201:1564:39627#49 4 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * +HS25_09827:2:1201:1565:91731#49 4 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * +HS25_09827:2:1201:1624:69925#49 4 * 0 0 * * 0 0 GCCAGCCTCCTTCTCAATGGTCTTTTTAAACATTATATGAAAACCAGACATTTACATTTGATTTCTTTTTCAATACTATACAGTTCTAAGAGAAAAAACA * diff --git a/test/test_view.c b/test/test_view.c index d67c72fb6..416ffe98d 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -58,6 +58,7 @@ enum test_op { WRITE_UNCOMPRESSED = 16, WRITE_COMPRESSED = 32, // eg vcf.gz, sam.gz, fastq.gz WRITE_FASTQ = 64, + WRITE_FASTA = 128, }; int sam_loop(int argc, char **argv, int optind, struct opts *opts, htsFile *in, htsFile *out) { @@ -295,7 +296,7 @@ int main(int argc, char *argv[]) opts.index = NULL; opts.min_shift = 0; - while ((c = getopt(argc, argv, "DSIt:i:bzCful:o:N:BZ:@:Mx:m:p:v")) >= 0) { + while ((c = getopt(argc, argv, "DSIt:i:bzCfFul:o:N:BZ:@:Mx:m:p:v")) >= 0) { switch (c) { case 'D': opts.flag |= READ_CRAM; break; case 'S': opts.flag |= READ_COMPRESSED; break; @@ -306,6 +307,7 @@ int main(int argc, char *argv[]) case 'z': opts.flag |= WRITE_COMPRESSED; break; case 'C': opts.flag |= WRITE_CRAM; break; case 'f': opts.flag |= WRITE_FASTQ; break; + case 'F': opts.flag |= WRITE_FASTA; break; case 'u': opts.flag |= WRITE_UNCOMPRESSED; break; // eg u-BAM not SAM case 'l': opts.clevel = atoi(optarg); break; case 'o': if (hts_opt_add(&out_opts, optarg)) return 1; break; @@ -365,6 +367,7 @@ int main(int 
argc, char *argv[]) else if (opts.flag & WRITE_COMPRESSED) strcat(modew, "z"); else if (opts.flag & WRITE_UNCOMPRESSED) strcat(modew, "bu"); if (opts.flag & WRITE_FASTQ) strcat(modew, "f"); + else if (opts.flag & WRITE_FASTA) strcat(modew, "F"); out = hts_open(out_fn, modew); if (out == NULL) { fprintf(stderr, "Error opening standard output\n"); From 5698c91e18d5661b2440d83fce78091781b3335c Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 16 Apr 2021 17:21:43 +0100 Subject: [PATCH 133/488] Return an error if setting fastq option fails --- hts.c | 6 +++--- sam.c | 13 ++++++++----- sam_internal.h | 2 +- 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/hts.c b/hts.c index dc9097b9e..8ef82fb13 100644 --- a/hts.c +++ b/hts.c @@ -1576,7 +1576,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { case FASTQ_OPT_RNUM: if (fp->format.format == fastq_format || fp->format.format == fasta_format) - fastq_state_set(fp, opt); + return fastq_state_set(fp, opt); return 0; case FASTQ_OPT_AUX: @@ -1585,7 +1585,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { va_start(args, opt); char *list = va_arg(args, char *); va_end(args); - fastq_state_set(fp, opt, list); + return fastq_state_set(fp, opt, list); } return 0; @@ -1595,7 +1595,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { va_start(args, opt); char *bc = va_arg(args, char *); va_end(args); - fastq_state_set(fp, opt, bc); + return fastq_state_set(fp, opt, bc); } return 0; diff --git a/sam.c b/sam.c index d97fe096d..fbf83378b 100644 --- a/sam.c +++ b/sam.c @@ -3596,15 +3596,15 @@ void fastq_state_destroy(htsFile *fp) { } } -void fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) { +int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) { va_list args; if (!fp) - return; + return -1; if (!fp->state) if (!(fp->state = fastq_state_init(fp->format.format == fastq_format ? 
'@' : '>'))) - return; + return -1; fastq_state *x = (fastq_state *)fp->state; @@ -3621,17 +3621,19 @@ void fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) { if (tag && strcmp(tag, "1") != 0) { if (!x->tags) if (!(x->tags = kh_init(tag))) - return; + return -1; size_t i, tlen = strlen(tag); for (i = 0; i+3 <= tlen+1; i += 3) { if (tag[i+0] == ',' || tag[i+1] == ',' || !(tag[i+2] == ',' || tag[i+2] == '\0')) { - hts_log_error("Bad tag format '%.3s'; skipping option", tag+i); + hts_log_warning("Bad tag format '%.3s'; skipping option", tag+i); break; } int ret, tcode = tag[i+0]*256 + tag[i+1]; kh_put(tag, x->tags, tcode, &ret); + if (ret < 0) + return -1; } } break; @@ -3653,6 +3655,7 @@ void fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) { default: break; } + return 0; } static int fastq_parse1(htsFile *fp, bam1_t *b) { diff --git a/sam_internal.h b/sam_internal.h index 20f154dcd..b1fce9fe4 100644 --- a/sam_internal.h +++ b/sam_internal.h @@ -37,7 +37,7 @@ int sam_set_thread_pool(htsFile *fp, htsThreadPool *p); int sam_set_threads(htsFile *fp, int nthreads); // Fastq state -void fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...); +int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...); void fastq_state_destroy(samFile *fp); // bam1_t data (re)allocation From 7dc751c9cdf7f7ce5a1ad324bfe4fdba102e85e8 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 16 Apr 2021 17:27:35 +0100 Subject: [PATCH 134/488] Catch more errors in fastq_parse1() Also fixes a couple of places where errors could be incorrectly reported as EOF. --- sam.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/sam.c b/sam.c index fbf83378b..b63372028 100644 --- a/sam.c +++ b/sam.c @@ -3710,7 +3710,8 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) { if (*fp->line.s == (fp->format.format == fastq_format ? 
'+' : '>') || ret == -1) break; - kputsn(fp->line.s, fp->line.l, &x->seq); + if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0) + return -2; } // Qual @@ -3720,7 +3721,10 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) { do { if (hts_getline(fp, KS_SEP_LINE, &fp->line) < 0) return -2; - kputsn(fp->line.s, fp->line.l, &x->qual); + if (fp->line.l > remainder) + return -2; + if (kputsn(fp->line.s, fp->line.l, &x->qual) < 0) + return -2; remainder -= fp->line.l; } while (remainder > 0); @@ -3787,14 +3791,14 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) { if (ret >= 0 && barcode_len) if (bam_aux_append(b, x->BC, 'Z', barcode_len, (uint8_t *)barcode) < 0) - ret = -1; + ret = -2; if (!x->aux) return ret; // Identify any SAM style aux tags in comments too. if (aux_parse(&kc->s[barcode_len], kc->s + kc->l, b, 1, x->tags) < 0) - ret = -1; + ret = -2; return ret; } From 3944da0922f8ef489c3fa38bc44bcf40f4e73dcc Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 21 Apr 2021 12:20:41 +0100 Subject: [PATCH 135/488] Prevent endless aux_parse loop and avoid malformed aux data In lenient mode, skip the check for integer overflows. Fixes an endless loop due to `overflow` not being reset, and by the time the check is made all the input data has been used so trying to loop back won't do anything useful. Make the _parse_err() macro in aux_parse() reset `b->l_data` to where it was at the start of the loop if it detects an error in lenient mode. This cleans up any semi-formed aux records that may be present before the parser tries to skip to the next tag. 
--- sam.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sam.c b/sam.c index b63372028..8a3f8c342 100644 --- a/sam.c +++ b/sam.c @@ -2405,6 +2405,7 @@ static inline unsigned int parse_sam_flag(char *v, char **rv, int *overflow) { static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient, khash_t(tag) *tag_whitelist) { int overflow = 0; + int checkpoint; char logbuf[40]; char *q = start, *p = end; @@ -2416,6 +2417,7 @@ static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient, q++; \ while (q < p && isspace_c(*q)) \ q++; \ + b->l_data = checkpoint; \ goto loop; \ } else { \ hts_log_error(__VA_ARGS__); \ @@ -2426,6 +2428,7 @@ static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient, while (q < p) loop: { char type; + checkpoint = b->l_data; if (p - q < 5) { if (lenient) { break; @@ -2533,7 +2536,7 @@ static inline int aux_parse(char *start, char *end, bam1_t *b, int lenient, q++; } - _parse_err(overflow != 0, "numeric value out of allowed range"); + _parse_err(!lenient && overflow != 0, "numeric value out of allowed range"); #undef _parse_err return 0; From 74302c41838a492d765ea5f3fa6ea01ddd021f61 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 20 Apr 2021 16:12:17 +0100 Subject: [PATCH 136/488] Added documentation to FASTQ hts_fmt_option enums. --- htslib/hts.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/htslib/hts.h b/htslib/hts.h index 0c78b71f4..219368a90 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -331,9 +331,33 @@ enum hts_fmt_option { HTS_OPT_PROFILE, // Fastq + + // Boolean. + // Read / Write CASAVA 1.8 format. 
+ // See https://emea.support.illumina.com/content/dam/illumina-support/documents/documentation/software_documentation/bcl2fastq/bcl2fastq_letterbooklet_15038058brpmi.pdf + // + // The CASAVA tag matches \d:[YN]:\d+:[ACGTN]+ + // The first \d is read 1/2 (1 or 2), [YN] is QC-PASS/FAIL flag, + // \d+ is a control number, and the sequence at the end is + // for barcode sequence. Barcodes are read into the aux tag defined + // by FASTQ_OPT_BARCODE ("BC" by default). FASTQ_OPT_CASAVA = 1000, + + // String. + // Whether to read / write extra SAM format aux tags from the fastq + // identifier line. For reading this can simply be "1" to request + // decoding aux tags. For writing it is a comma separated list of aux + // tag types to be written out. FASTQ_OPT_AUX, + + // Boolean. + // Whether to add /1 and /2 to read identifiers when writing FASTQ. + // These come from the BAM_FREAD1 or BAM_FREAD2 flags. + // (Detecting the /1 and /2 is automatic when reading fastq.) FASTQ_OPT_RNUM, + + // Two character string. + // Barcode aux tag for CASAVA; defaults to "BC". FASTQ_OPT_BARCODE, }; From 9be3c82504a3fb77b7b0cb3e9e4e0f164eead979 Mon Sep 17 00:00:00 2001 From: ihsinme Date: Thu, 22 Apr 2021 11:22:01 +0300 Subject: [PATCH 137/488] fix for double freeing of memory. my analysis shows that it is possible to achieve double freeing of memory. to eliminate it, I propose a simple fix. 
--- sam.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sam.c b/sam.c index 8bda92384..7253f2fa3 100644 --- a/sam.c +++ b/sam.c @@ -1804,6 +1804,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { if (sn) { hts_log_warning("SQ header line has more than one SN: tag"); free(sn); + sn = NULL; } sn = (char*)calloc(r - q + 1, 1); if (!sn) @@ -1831,6 +1832,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { if (!absent) { hts_log_warning("Duplicated sequence '%s'", sn); free(sn); + sn = NULL; } else { if (ln >= UINT32_MAX) { // Stash away ref length that @@ -1841,7 +1843,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { if (!long_refs) goto error; } - k2 = kh_put(s2i, long_refs, sn, &absent); + k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent); if (absent < 0) goto error; kh_val(long_refs, k2) = ln; @@ -1854,6 +1856,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { } else { hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn); free(sn); + sn = NULL; } } else { hts_log_warning("Ignored @SQ line with missing SN: tag"); @@ -1916,6 +1919,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { if (!absent) { hts_log_warning("Duplicated sequence '%s'", sn); free(sn); + sn = NULL; } else { if (ln >= UINT32_MAX) { // Stash away ref length that @@ -1927,7 +1931,7 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { if (!long_refs) goto error; } - k2 = kh_put(s2i, long_refs, sn, &absent); + k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent); if (absent < 0) goto error; kh_val(long_refs, k2) = ln; From e1b1adceba70aa003f2416ba322e867e0153dd81 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Thu, 22 Apr 2021 14:50:48 +0000 Subject: [PATCH 138/488] Set sn to NULL after it has been attached to the reference dictionary. Always break the loop instead of `goto error`, to free the kstring. 
--- sam.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/sam.c b/sam.c index 7253f2fa3..87e29c23e 100644 --- a/sam.c +++ b/sam.c @@ -1804,7 +1804,6 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { if (sn) { hts_log_warning("SQ header line has more than one SN: tag"); free(sn); - sn = NULL; } sn = (char*)calloc(r - q + 1, 1); if (!sn) @@ -1830,10 +1829,10 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { goto error; if (!absent) { - hts_log_warning("Duplicated sequence '%s'", sn); + hts_log_warning("Duplicated sequence \"%s\" in file \"%s\"", sn, fp->fn); free(sn); - sn = NULL; } else { + sn = NULL; if (ln >= UINT32_MAX) { // Stash away ref length that // doesn't fit in target_len array @@ -1856,7 +1855,6 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { } else { hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn); free(sn); - sn = NULL; } } else { hts_log_warning("Ignored @SQ line with missing SN: tag"); @@ -1907,20 +1905,25 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { continue; sn = (char*)calloc(tab-line.s+1, 1); - if (!sn) + if (!sn) { + e = 1; break; + } memcpy(sn, line.s, tab-line.s); k = kh_put(s2i, d, sn, &absent); - if (absent < 0) + if (absent < 0) { + e = 1; break; + } ln = strtoll(tab, NULL, 10); if (!absent) { - hts_log_warning("Duplicated sequence '%s'", sn); + hts_log_warning("Duplicated sequence \"%s\" in the file \"%s\"", sn, fai_fn); free(sn); sn = NULL; } else { + sn = NULL; if (ln >= UINT32_MAX) { // Stash away ref length that // doesn't fit in target_len array @@ -1928,12 +1931,16 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { int absent = -1; if (!long_refs) { long_refs = kh_init(s2i); - if (!long_refs) - goto error; + if (!long_refs) { + e = 1; + break; + } } k2 = kh_put(s2i, long_refs, kh_key(d, k), &absent); - if (absent < 0) - goto error; + if (absent < 0) { + e = 1; + break; + } kh_val(long_refs, k2) = ln; kh_val(d, k) = ((int64_t) (kh_size(d) - 1) << 32 | 
UINT32_MAX); From bfbe07dc103042f528ea4253e519be9b5e9a8480 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 26 Apr 2021 11:57:57 +0100 Subject: [PATCH 139/488] Fix to make samtools cat work on CRAMs again. Samtools cat utilises low level APIs to turn codecs from a read cram_fd to codecs suitable for a write cram_fd. This had a missing initialisation in the HUFFMAN code and a small memory leak, caused by the addition of the CRAM v4 code (but affecting CRAM v3). Also while adding CRAM v4, the EOF block was changed from being a literal string of bytes into simply encoding an empty container and letting the code auto-generate the correct byte stream. There were bugs in cram_encode_compression_header for empty blocks, which affected cram blocks that had gone through this read/write transcode process. Specifically the compression header's num_record check is invalid as this is only populated for CRAM v1.0. We check the container instead, which is always correct and permits us to find the empty EOF blocks. Note this EOF writing change also affected more than samtools cat. All CRAM files being output contained a slightly larger EOF block, containing an unused preservation map. It is debatable whether this is within the spec or not. The spec states both the meaning of the EOF block (an empty container with a specific ref seq id and alignment start) as well as listing what this encodes to in hex. The former worked, but the latter differed. Fixes samtools/samtools#1420 --- cram/cram_codecs.c | 2 ++ cram/cram_encode.c | 6 ++++-- cram/cram_io.c | 2 ++ 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 0c6cb654e..2b077f2c7 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -3931,6 +3931,7 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { // unify this. 
cram_codec *t = malloc(sizeof(*t)); if (!t) return -1; + t->vv = c->vv; t->codec = E_HUFFMAN; t->free = cram_huffman_encode_free; t->store = cram_huffman_encode_store; @@ -4017,6 +4018,7 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { // {len,val}_{encoding,dat} are undefined, but unused. // Leaving them unset here means we can test that assertion. *c = *t; + free(t); break; } diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 315279dae..192278f17 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -107,12 +107,14 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, } } - if (h->preservation_map) + if (h->preservation_map) { kh_destroy(map, h->preservation_map); + h->preservation_map = NULL; + } /* Create in-memory preservation map */ /* FIXME: should create this when we create the container */ - if (h->num_records > 0) { + if (c->num_records > 0) { khint_t k; int r; diff --git a/cram/cram_io.c b/cram/cram_io.c index 05f7c7b06..6d1d1877c 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -5368,6 +5368,8 @@ int cram_write_eof_block(cram_fd *fd) { cram_free_block(c.comp_hdr_block); return -1; } + if (ch.preservation_map) + kh_destroy(map, ch.preservation_map); cram_free_block(c.comp_hdr_block); // V2.1 bytes From 22297149162411bc69215f2606152fdbb3165a0e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 21 Apr 2021 12:21:22 +0100 Subject: [PATCH 140/488] Make mpileup's overlap removal choose a random sequence. Currently it always chooses the second sequence (except for the circumstance of differing base calls). This is essentially random strand and random coordinate in most library strategies, but some targeted sequencing methods have a very strong strand bias (first is + strand, second is - strand) or positional bias (eg PCR amplicons). Given SNPs near the end of sequences can give rise to poor BAQ scores, both position and strand bias are detrimental. 
This change makes it select either read 'a' or 'b' based on a hash of the read name. Unlike using a traditional random number generator, this gives it consistent behaviour regardless of how many sequences have gone before. An example from SynDip region 1:185M-200M: No overlap removal: SNP Q>0 / Filtered SNP TP 18830 / 18803 SNP FP 264 / 238 SNP GT 56 / 53 SNP FN 459 / 486 InDel TP 2788 / 2697 InDel FP 1022 / 86 InDel GT 353 / 345 InDel FN 596 / 687 Old removal strategy: SNP Q>0 / Filtered SNP TP 18841 / 18813 SNP FP 270 / 243 SNP GT 56 / 54 SNP FN 448 / 476 InDel TP 2754 / 2663 InDel FP 985 / 83 InDel GT 413 / 404 InDel FN 630 / 721 This PR: SNP Q>0 / Filtered SNP TP 18841 / 18814 SNP FP 272 / 242 SNP GT 55 / 53 SNP FN 448 / 475 InDel TP 2765 / 2679 InDel FP 996 / 85 InDel GT 382 / 375 InDel FN 619 / 705 The CPU cost on bcftools mpileup | bcftools call between the latter two tests was 0.4% (which may also just be random fluctuation). Vs the old removal system, this is a marginal improvement for SNPs and, oddly, a significant improvement to Indels. (It's still behind no overlap removal for indels, but I'm unsure on the veracity of small indels in that truth set). Fixes samtools/bcftools#1459 --- htscodecs | 2 +- sam.c | 121 ++++++++++++++++++++++++++++++++++-------------------- 2 files changed, 77 insertions(+), 46 deletions(-) diff --git a/htscodecs b/htscodecs index d7e357946..30bc9fdca 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit d7e357946ead219b81cc1becbe0de8a99d96ca84 +Subproject commit 30bc9fdca45e144bd975eb2a2563c1cac43c2ec5 diff --git a/sam.c b/sam.c index 64f5d0bae..397288b61 100644 --- a/sam.c +++ b/sam.c @@ -5322,10 +5322,18 @@ static inline int cigar_iref2iseq_next(const uint32_t **cigar, return -1; } +// Given overlapping read 'a' (left) and 'b' (right) on the same +// template, adjust quality values to zero for either a or b. +// Note versions 1.12 and earlier always removed quality from 'b' for +// matching bases. 
Now we select a or b semi-randomly based on name hash. +// Returns 0 on success, +// -1 on failure static int tweak_overlap_quality(bam1_t *a, bam1_t *b) { - const uint32_t *a_cigar = bam_get_cigar(a), *a_cigar_max = a_cigar + a->core.n_cigar; - const uint32_t *b_cigar = bam_get_cigar(b), *b_cigar_max = b_cigar + b->core.n_cigar; + const uint32_t *a_cigar = bam_get_cigar(a), + *a_cigar_max = a_cigar + a->core.n_cigar; + const uint32_t *b_cigar = bam_get_cigar(b), + *b_cigar_max = b_cigar + b->core.n_cigar; hts_pos_t a_icig = 0, a_iseq = 0; hts_pos_t b_icig = 0, b_iseq = 0; uint8_t *a_qual = bam_get_qual(a), *b_qual = bam_get_qual(b); @@ -5334,69 +5342,92 @@ static int tweak_overlap_quality(bam1_t *a, bam1_t *b) hts_pos_t iref = b->core.pos; hts_pos_t a_iref = iref - a->core.pos; hts_pos_t b_iref = iref - b->core.pos; - int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref); - if ( a_ret<0 ) return a_ret<-1 ? -1:0; // no overlap or error - int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); - if ( b_ret<0 ) return b_ret<-1 ? -1:0; // no overlap or error - #if DBG - fprintf(stderr,"tweak %s n_cigar=%d %d .. %d-%d vs %"PRIhts_pos"-%"PRIhts_pos"\n", bam_get_qname(a), a->core.n_cigar, b->core.n_cigar, - a->core.pos+1,a->core.pos+bam_cigar2rlen(a->core.n_cigar,bam_get_cigar(a)), b->core.pos+1, b->core.pos+bam_cigar2rlen(b->core.n_cigar,bam_get_cigar(b))); - #endif + int a_ret = cigar_iref2iseq_set(&a_cigar, a_cigar_max, + &a_icig, &a_iseq, &a_iref); + if ( a_ret<0 ) + // no overlap or error + return a_ret<-1 ? -1:0; + + int b_ret = cigar_iref2iseq_set(&b_cigar, b_cigar_max, + &b_icig, &b_iseq, &b_iref); + if ( b_ret<0 ) + // no overlap or error + return b_ret<-1 ? -1:0; + + // Determine which seq is the one getting modified qualities. 
+ uint8_t amul, bmul; + if (__ac_Wang_hash(__ac_X31_hash_string(bam_get_qname(a))) & 1) { + amul = 1; + bmul = 0; + } else { + amul = 0; + bmul = 1; + } + // Loop over the overlapping region nulling qualities in either + // seq a or b. int err = 0; while ( 1 ) { - // Increment reference position + // Step to next matching reference position in a and b while ( a_ret >= 0 && a_iref>=0 && a_iref < iref - a->core.pos ) - a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, &a_icig, &a_iseq, &a_iref); - if ( a_ret<0 ) { err = a_ret<-1?-1:0; break; } // done - if ( iref < a_iref + a->core.pos ) iref = a_iref + a->core.pos; + a_ret = cigar_iref2iseq_next(&a_cigar, a_cigar_max, + &a_icig, &a_iseq, &a_iref); + if ( a_ret<0 ) { // done + err = a_ret<-1?-1:0; + break; + } + if ( iref < a_iref + a->core.pos ) + iref = a_iref + a->core.pos; while ( b_ret >= 0 && b_iref>=0 && b_iref < iref - b->core.pos ) - b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, &b_iseq, &b_iref); - if ( b_ret<0 ) { err = b_ret<-1?-1:0; break; } // done - if ( iref < b_iref + b->core.pos ) iref = b_iref + b->core.pos; + b_ret = cigar_iref2iseq_next(&b_cigar, b_cigar_max, &b_icig, + &b_iseq, &b_iref); + if ( b_ret<0 ) { // done + err = b_ret<-1?-1:0; + break; + } + if ( iref < b_iref + b->core.pos ) + iref = b_iref + b->core.pos; iref++; - if ( a_iref+a->core.pos != b_iref+b->core.pos ) continue; // only CMATCH positions, don't know what to do with indels + + if ( a_iref+a->core.pos != b_iref+b->core.pos ) + // only CMATCH positions, don't know what to do with indels + continue; if (a_iseq > a->core.l_qseq || b_iseq > b->core.l_qseq) - return -1; // Fell off end of sequence, bad CIGAR? + // Fell off end of sequence, bad CIGAR? + return -1; - if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) - { - #if DBG - fprintf(stderr,"%c",seq_nt16_str[bam_seqi(a_seq,a_iseq)]); - #endif - // we are very confident about this base + // We're finally at the same ref base in both a and b. 
+ // Check if the bases match (confident) or mismatch + // (not so confident). + if ( bam_seqi(a_seq,a_iseq) == bam_seqi(b_seq,b_iseq) ) { + // We are very confident about this base. Use sum of quals int qual = a_qual[a_iseq] + b_qual[b_iseq]; - a_qual[a_iseq] = qual>200 ? 200 : qual; - b_qual[b_iseq] = 0; - } - else - { - if ( a_qual[a_iseq] >= b_qual[b_iseq] ) - { - #if DBG - fprintf(stderr,"[%c/%c]",seq_nt16_str[bam_seqi(a_seq,a_iseq)],tolower_c(seq_nt16_str[bam_seqi(b_seq,b_iseq)])); - #endif - a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; // not so confident about a_qual anymore given the mismatch + a_qual[a_iseq] = amul * (qual>200 ? 200 : qual); + b_qual[b_iseq] = bmul * (qual>200 ? 200 : qual);; + } else { + // Not so confident about anymore given the mismatch. + // Reduce qual for lowest quality base. + if ( a_qual[a_iseq] > b_qual[b_iseq] ) { + // A highest qual base; keep + a_qual[a_iseq] = 0.8 * a_qual[a_iseq]; b_qual[b_iseq] = 0; - } - else - { - #if DBG - fprintf(stderr,"[%c/%c]",tolower_c(seq_nt16_str[bam_seqi(a_seq,a_iseq)]),seq_nt16_str[bam_seqi(b_seq,b_iseq)]); - #endif + } else if (a_qual[a_iseq] < b_qual[b_iseq] ) { + // B highest qual base; keep b_qual[b_iseq] = 0.8 * b_qual[b_iseq]; a_qual[a_iseq] = 0; + } else { + // Both equal, so pick randomly + a_qual[a_iseq] = amul * 0.8 * a_qual[a_iseq]; + b_qual[b_iseq] = bmul * 0.8 * b_qual[b_iseq]; } } } - #if DBG - fprintf(stderr,"\n"); - #endif + return err; } From 5b684ce625d772dc558c68fd360c8af3ac155f7e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 27 Apr 2021 17:08:05 +0100 Subject: [PATCH 141/488] Update htscodecs submodule. This was accidentally rolled back to v1.0 (2 commit previous) in the previous commit. 
--- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 30bc9fdca..d7e357946 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 30bc9fdca45e144bd975eb2a2563c1cac43c2ec5 +Subproject commit d7e357946ead219b81cc1becbe0de8a99d96ca84 From 3d9132430c4937588cbbb765c325e1d3d1c24261 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Thu, 15 Apr 2021 15:46:09 +1000 Subject: [PATCH 142/488] Prevent assert() in bcf_sr_set_regions It seems bcf_sr_set_regions ought be used before bcf_sr_add_reader. However, if a user unaware of this calls bcf_sr_set_regions **after** bcf_sr_add_reader, they are met with a failing assert. This is odd, as bcf_sr_set_regions already contains a useful error message about exactly this error. This patch reorders the check for incorrect usage and the assert(), so that if an API user stuffs up the order of the calls, they get the nice message that already exists and not an assert(). I'm not sure if the assert() is even needed here, and probably is best replaced with a nice hts_log_error() in any case. I think `readers->regions != NULL` only when either bcf_sr_add_reader or bcf_sr_set_regions has already been called. I'll leave this in the maintainer's hands. 
--- synced_bcf_reader.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 10604b16c..d24101573 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -171,12 +171,14 @@ static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters) int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file) { - assert( !readers->regions ); if ( readers->nreaders ) { hts_log_error("Must call bcf_sr_set_regions() before bcf_sr_add_reader()"); return -1; } + + assert( !readers->regions ); + readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2); if ( !readers->regions ) return -1; readers->explicit_regs = 1; From f4cfd6fac5900f8546d4a61ad5d28d72a19a5f48 Mon Sep 17 00:00:00 2001 From: Kevin Murray Date: Tue, 20 Apr 2021 09:42:28 +1000 Subject: [PATCH 143/488] Better handling of bcf_sr_set_region/target misuse As suggested by @valeriuo, do away with assert() and document in the header file. --- htslib/synced_bcf_reader.h | 3 +++ synced_bcf_reader.c | 11 +++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index 76f889b30..f262327ff 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -302,6 +302,9 @@ int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file); * file are currently not supported. * Targets (but not regions) can be prefixed with "^" to request logical complement, * for example "^X,Y,MT" indicates that sequences X, Y and MT should be skipped. + * + * API note: bcf_sr_set_regions/bcf_sr_set_targets MUST be called before the + * first call to bcf_sr_add_reader(). 
*/ HTSLIB_EXPORT int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles); diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index d24101573..44b2f154b 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -171,23 +171,26 @@ static int *init_filters(bcf_hdr_t *hdr, const char *filters, int *nfilters) int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file) { - if ( readers->nreaders ) + if ( readers->nreaders || readers->regions ) { hts_log_error("Must call bcf_sr_set_regions() before bcf_sr_add_reader()"); return -1; } - assert( !readers->regions ); - readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2); if ( !readers->regions ) return -1; readers->explicit_regs = 1; readers->require_index = REQUIRE_IDX_; return 0; } + int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles) { - assert( !readers->targets ); + if ( readers->nreaders || readers->targets ) + { + hts_log_error("Must call bcf_sr_set_targets() before bcf_sr_add_reader()"); + return -1; + } if ( targets[0]=='^' ) { readers->targets_exclude = 1; From 9aa2d03b1f139ad4173366e0fcfcb518772998ee Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 4 May 2021 11:16:27 +0100 Subject: [PATCH 144/488] Fix undefined behaviour in cram_xdelta_encode_char. "dat" is a malloc result and cp_end is computed from dat, irrespective of whether dat returned NULL or not. As pointer arithmetic with a NULL pointer is undefined, this is bad. We do check for dat being NULL, but the undefined behaviour has already been triggered. It is not believed however that this leads to any optimisation bugs or bounds checks being removed causing security issues. The compiler knows malloc *may* (and probably will) return non-NULL so it is required to produce code for handling that case. 
If it chooses to produce another code path for the NULL case it can, and that could have optimisation differences, but it'll be unreachable due to the malloc return value check. --- cram/cram_codecs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 2b077f2c7..88ba55d02 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1900,9 +1900,10 @@ int cram_xdelta_encode_int(cram_slice *slice, cram_codec *c, int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c, char *in, int in_size) { - char *dat = malloc(in_size*5), *cp = dat, *cp_end = dat + in_size*5; + char *dat = malloc(in_size*5); if (!dat) return -1; + char *cp = dat, *cp_end = dat + in_size*5; c->u.e_xdelta.last = 0; // reset for each new array switch(c->u.e_xdelta.word_size) { From d112e4fc33c006258efb67c03630268cf2e69ad8 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 4 May 2021 15:49:49 +0100 Subject: [PATCH 145/488] Guard against a gcc 11.1.0 false positive warning. Gcc 11 complains (it's a warning, but upgraded due to -Werror): cram/cram_codecs.c:1916:19: error: 'cp_end' may be used uninitialized [-Werror=maybe-uninitialized] 1916 | cp += c->vv->varint_put32(cp, cp_end, zigzag16(c->u.e_xdelta.last)); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The cause of this appears to be that cp (dat) is malloced rather than calloced, so gcc knows the buffer is uninitialised. The varint_put32 prototype takes "const char *" for the cp_end argument, which leads gcc to warn because it thinks there is no way the function can modify the data and it can detect we haven't initialised it beforehand. However it hasn't acknowledged that cp and cp_end are both pointers to the same malloc block, and that cp is accepted as "char *" by varint_put32, so it is indeed writing and not reading.
This has been filed as https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100417 This commit explicitly removes this warning for this one function on gcc 11.1 only. It's included as a demonstration of fixing the issue in case other people report it, but I am not convinced we should be doing so as hopefully it will be transient. (Or, quite possibly, the gcc team will educate us as to why our code is undefined in some manner.) --- cram/cram_codecs.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 88ba55d02..0956f29f0 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1898,16 +1898,20 @@ int cram_xdelta_encode_int(cram_slice *slice, cram_codec *c, return -1; } +#if __GNUC__ == 11 && __GNUC_MINOR__ == 1 +// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100417 +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c, - char *in, int in_size) { + char *in, int in_size) { char *dat = malloc(in_size*5); if (!dat) return -1; char *cp = dat, *cp_end = dat + in_size*5; c->u.e_xdelta.last = 0; // reset for each new array - switch(c->u.e_xdelta.word_size) { - case 2: { + if (c->u.e_xdelta.word_size == 2) { int i, part; part = in_size%2; @@ -1923,9 +1927,6 @@ int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c, c->u.e_xdelta.last = le_int2(in16[i]); cp += c->vv->varint_put32(cp, cp_end, zigzag16(d)); } - - break; - } } if (c->u.e_xdelta.sub_codec->encode(slice, c->u.e_xdelta.sub_codec, (char *)dat, cp-dat)) { @@ -1936,6 +1937,9 @@ int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c, free(dat); return 0; } +#if __GNUC__ == 11 && __GNUC_MINOR__ == 1 +#pragma GCC diagnostic pop +#endif void cram_xdelta_encode_free(cram_codec *c) { if (!c) return; From b331413a039cd30be39fc0c1c009b9b3d829415f Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 4 May 2021 13:08:25 +0100 Subject:
[PATCH 146/488] Use bcf_itr_destroy() with bcf_itr_querys() [minor] Both tbx_itr_destroy() and bcf_itr_destroy() are currently just aliases for hts_itr_destroy(), but correct it anyway as tabix.c may be used as sample code. --- tabix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tabix.c b/tabix.c index 1471b09c5..969f902f6 100644 --- a/tabix.c +++ b/tabix.c @@ -263,7 +263,7 @@ static int query_regions(args_t *args, tbx_conf_t *conf, char *fname, char **reg if (ret < -1) { error_errno("Reading \"%s\" failed", fname); } - tbx_itr_destroy(itr); + bcf_itr_destroy(itr); } bcf_destroy(rec); } From eac27c9a42d6a1a0b76953afd651d2b4d31b45a8 Mon Sep 17 00:00:00 2001 From: indraniel Date: Sun, 14 Mar 2021 13:05:16 -0500 Subject: [PATCH 147/488] + introduce GCP requester pays bucket access - this is my first attempt at introducing the ability for HTSLIB to access files in Google Cloud Storage buckets that have enabled [requester pays][0]. To enable requester pays, one will need to export the additional environment variable: GCS_REQUESTER_PAYS_PROJECT=my-project-name [0]: https://cloud.google.com/storage/docs/requester-pays --- hfile_gcs.c | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/hfile_gcs.c b/hfile_gcs.c index e6f72ae4c..d622ca754 100644 --- a/hfile_gcs.c +++ b/hfile_gcs.c @@ -42,10 +42,11 @@ static hFILE * gcs_rewrite(const char *gsurl, const char *mode, int mode_has_colon, va_list *argsp) { - const char *bucket, *path, *access_token; + const char *bucket, *path, *access_token, *requester_pays_project; kstring_t mode_colon = { 0, 0, NULL }; kstring_t url = { 0, 0, NULL }; kstring_t auth_hdr = { 0, 0, NULL }; + kstring_t requester_pays_hdr = { 0, 0, NULL }; hFILE *fp = NULL; // GCS URL format is gs[+SCHEME]://BUCKET/PATH @@ -81,15 +82,35 @@ gcs_rewrite(const char *gsurl, const char *mode, int mode_has_colon, kputs(access_token, &auth_hdr); } - if (argsp || auth_hdr.l > 0 || mode_has_colon) { + 
requester_pays_project = getenv("GCS_REQUESTER_PAYS_PROJECT"); + + if (requester_pays_project) { + kputs("X-Goog-User-Project: ", &requester_pays_hdr); + kputs(requester_pays_project, &requester_pays_hdr); + } + + if (argsp || mode_has_colon || auth_hdr.l > 0 || requester_pays_hdr.l > 0) { if (! mode_has_colon) { kputs(mode, &mode_colon); kputc(':', &mode_colon); mode = mode_colon.s; } - fp = hopen(url.s, mode, "va_list", argsp, - "httphdr", (auth_hdr.l > 0)? auth_hdr.s : NULL, NULL); + if (auth_hdr.l > 0 && requester_pays_hdr.l > 0) { + fp = hopen( + url.s, mode, "va_list", argsp, + "httphdr:l", + auth_hdr.s, + requester_pays_hdr.s, + NULL, + NULL + ); + + } + else { + fp = hopen(url.s, mode, "va_list", argsp, + "httphdr", (auth_hdr.l > 0)? auth_hdr.s : NULL, NULL); + } } else fp = hopen(url.s, mode); @@ -97,6 +118,7 @@ gcs_rewrite(const char *gsurl, const char *mode, int mode_has_colon, free(mode_colon.s); free(url.s); free(auth_hdr.s); + free(requester_pays_hdr.s); return fp; } From e36da5371e7527bd2328555942c1d5c3a7bce609 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Tue, 4 May 2021 11:36:22 +0100 Subject: [PATCH 148/488] Add method `dump_index` for dumping the content of an index struct to stderr. 
--- hts.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/hts.c b/hts.c index 8ef82fb13..7c74891c6 100644 --- a/hts.c +++ b/hts.c @@ -1992,6 +1992,38 @@ static char * idx_format_name(int fmt) { } } +#if DEBUG_INDEX +static void dump_index(const hts_idx_t *idx) { + int i; + int64_t j; + + if (!idx) fprintf(stderr, "Null index\n"); + + fprintf(stderr, "format='%s', min_shift=%d, n_lvls=%d, n_bins=%d, l_meta=%u ", + idx_format_name(idx->fmt), idx->min_shift, idx->n_lvls, idx->n_bins, idx->l_meta); + fprintf(stderr, "n=%d, m=%d, n_no_coor=%"PRIu64"\n", idx->n, idx->m, idx->n_no_coor); + for (i = 0; i < idx->n; i++) { + bidx_t *bidx = idx->bidx[i]; + lidx_t *lidx = &idx->lidx[i]; + khint_t k; + fprintf(stderr, "======== BIN Index - tid=%d, n_buckets=%d, size=%d\n", i, bidx->n_buckets, bidx->size); + int b; + for (b = 0; b < META_BIN(idx); b++) { + if ((k = kh_get(bin, bidx, b)) != kh_end(bidx)) { + bins_t *entries = &kh_value(bidx, k); + fprintf(stderr, "\tbin=%d, parent=%d, n_entries=%d, loff=%"PRIu64"\n", + b, hts_bin_parent(b), entries->n, entries->loff); + for (j = 0; j < entries->n; j++) + fprintf(stderr, "\t\tchunk=%"PRId64", u=%"PRIu64", v=%"PRIu64"\n", j, entries->list[j].u, entries->list[j].v); + } + } + fprintf(stderr, "======== LINEAR Index - tid=%d, n_values=%"PRId64"\n", i, lidx->n); + for (j = 0; j < lidx->n; j++) + fprintf(stderr, "\t\tentry=%"PRId64", offset=%"PRIu64"\n", j, lidx->offset[j]); + } +} +#endif + static inline int insert_to_b(bidx_t *b, int bin, uint64_t beg, uint64_t end) { khint_t k; From ae54a47eef3852d727442ccf88e3e7c855786071 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Tue, 4 May 2021 13:07:32 +0100 Subject: [PATCH 149/488] Fix the update of `min_off`, when unmapped reads are detected. Previously, `min_off` was set to the start offset of the first chunk of the first matching bin, but this relied on the loose assumption that that first chunk would contain the region of interest. 
The present solution just resets the `min_off` value to the bin index `loff`, in case no better offset is found in the linear index. --- hts.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/hts.c b/hts.c index 7c74891c6..c710c8372 100644 --- a/hts.c +++ b/hts.c @@ -2966,6 +2966,8 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t if (min_off < idx->lidx[tid].offset[rel_off]) min_off = idx->lidx[tid].offset[rel_off]; if (unmapped) { + // unmapped reads are not covered by the linear index, + // so search backwards for a smaller offset int tmp_off; for (tmp_off = rel_off-1; tmp_off >= 0; tmp_off--) { if (idx->lidx[tid].offset[tmp_off] < min_off) { @@ -2973,13 +2975,14 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t break; } } - - if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).list[0].u || tmp_off < 0)) - min_off = kh_val(bidx, k).list[0].u; + // if the search went too far back or no satisfactory entry + // was found, revert to the bin index loff value + if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0)) + min_off = kh_val(bidx, k).loff; } } else if (unmapped) { //CSI index if (k != kh_end(bidx)) - min_off = kh_val(bidx, k).list[0].u; + min_off = kh_val(bidx, k).loff; } // compute max_off: a virtual offset from a bin to the right of end @@ -3143,12 +3146,12 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) } } - if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).list[0].u || tmp_off < 0)) - min_off = kh_val(bidx, k).list[0].u; + if (k != kh_end(bidx) && (min_off < kh_val(bidx, k).loff || tmp_off < 0)) + min_off = kh_val(bidx, k).loff; } } else if (unmapped) { //CSI index if (k != kh_end(bidx)) - min_off = kh_val(bidx, k).list[0].u; + min_off = kh_val(bidx, k).loff; } // compute max_off: a virtual offset from a bin to the right of end From 2b68267146f11f94c8d2ea8a746900532702cf48 Mon Sep 17 00:00:00 2001 From: James 
Bonfield Date: Thu, 18 Mar 2021 11:17:01 +0000 Subject: [PATCH 150/488] Permit platform specific BAQ parameters. The flag passed to sam_prob_realn can now contain platform data, encoded along with other bits via the htsRealnPlatform enum. It also automatically selects long-read parameters if the read length is bigger than 1kb, but otherwise it defaults to the same parameters as before. This helps bcftools mpileup call SNPs on PacBio CCS reads. TODO: ideally we would have an additional API that accepts a BAM header and returns a platform field for us to ease use of this function. Eg: sam_prob_realn(b, ref, len, BAQ_APPLY | BAQ_EXTEND | sam_baq_platform(hdr)); Or to add a new function taking an extra argument for the tuned probaln_par and similar header-processing function. Eg: sam_prob_realn2(b, ref, len, BAQ_APPLY | BAQ_EXTEND, sam_platform_probaln_par(hdr)); --- htslib/sam.h | 35 +++++++++++++++++++++++++++++++---- realn.c | 29 +++++++++++++++++++++++++---- 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index c3481808f..e359a83f4 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -2031,6 +2031,21 @@ typedef struct bam_mplp_s *bam_mplp_t; HTSLIB_EXPORT int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres); +// Used as flag parameter in sam_prob_realn. +enum htsRealnFlags { + BAQ_APPLY = 1, + BAQ_EXTEND = 2, + BAQ_REDO = 4, + + // Platform subfield, in bit position 3 onwards + BAQ_AUTO = 0<<3, + BAQ_ILLUMINA = 1<<3, + BAQ_PACBIOCCS = 2<<3, + BAQ_PACBIO = 3<<3, + BAQ_ONT = 4<<3, + BAQ_GENAPSYS = 5<<3 +}; + /// Calculate BAQ scores /** @param b BAM record @param ref Reference sequence @@ -2045,9 +2060,15 @@ This function calculates base alignment quality (BAQ) values using the method described in "Improving SNP discovery by base alignment quality", Heng Li, Bioinformatics, Volume 27, Issue 8 (https://doi.org/10.1093/bioinformatics/btr076). 
+The @param flag value can be generated using the htsRealnFlags enum, but for +backwards compatibilty reasons is retained as an "int". An example usage +of the enum could be this, equivalent to flag 19: + + sam_prob_realn(b, ref, len, BAQ_APPLY | BAQ_EXTEND | BAQ_PACBIOCCS); + The following @param flag bits can be used: -Bit 0: Adjust the quality values using the BAQ values +Bit 0 (BAQ_APPLY): Adjust the quality values using the BAQ values If set, the data in the BQ:Z tag is used to adjust the quality values, and the BQ:Z tag is renamed to ZQ:Z. @@ -2055,22 +2076,28 @@ Bit 0: Adjust the quality values using the BAQ values If clear, and a ZQ:Z tag is present, the quality values are reverted using the data in the tag, and the tag is renamed to BQ:Z. -Bit 1: Use "extended" BAQ. +Bit 1 (BAQ_EXTEND): Use "extended" BAQ. Changes the BAQ calculation to increase sensitivity at the expense of reduced specificity. -Bit 2: Recalculate BAQ, even if a BQ tag is present. +Bit 2 (BAQ_REDO): Recalculate BAQ, even if a BQ tag is present. Force BAQ to be recalculated. Note that a ZQ:Z tag will always disable recalculation. +Bits 3-10: Choose parameters tailored to a specific instrument type. + + One of BAQ_AUTO, BAQ_ILLUMINA, BAQ_PACBIOCCS, BAQ_PACBIO, BAQ_ONT and + BAQ_GENAPSYS. The BAQ parameter tuning are still a work in progress and + at the time of writing mainly consist of Illumina vs long-read technology + adjustments. + @bug If the input read has both BQ:Z and ZQ:Z tags, the ZQ:Z one will be removed. Depending on what previous processing happened, this may or may not be the correct thing to do. It would be wise to avoid this situation if possible. 
*/ - HTSLIB_EXPORT int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag); diff --git a/realn.c b/realn.c index 40b796b46..dfb338cb5 100644 --- a/realn.c +++ b/realn.c @@ -103,13 +103,32 @@ static int realn_check_tag(const uint8_t *tg, enum htsLogLevel severity, return 0; } -int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) -{ - int k, bw, y, yb, ye, xb, xe, apply_baq = flag&1, extend_baq = flag>>1&1, redo_baq = flag&4, fix_bq = 0; +int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) { + int k, bw, y, yb, ye, xb, xe, fix_bq = 0, apply_baq = flag & BAQ_APPLY, + extend_baq = flag & BAQ_EXTEND, redo_baq = flag & BAQ_REDO; + enum htsRealnFlags system = flag & (0xff << 3); hts_pos_t i, x; uint32_t *cigar = bam_get_cigar(b); bam1_core_t *c = &b->core; - probaln_par_t conf = { 0.001, 0.1, 10 }; + + // d(I) e(M) band + probaln_par_t conf = { 0.001, 0.1, 10 }; // Illumina + + if (b->core.l_qseq > 1000 || system > BAQ_ILLUMINA) { + // Params that work well on PacBio CCS 15k. Unknown if they + // help other long-read platforms yet, but likely better than + // the short-read tuned ones. + // + // This function has no access to the SAM header. + // Ideally the calling function would check for e.g. + // @RG PL = "PACBIO" and DS contains "READTYPE=CCS". + // + // In the absense of this, we simply auto-detect via a crude + // short vs long strategy. 
+ conf.d = 1e-7; + conf.e = 1e-1; + } + uint8_t *bq = NULL, *zq = NULL, *qual = bam_get_qual(b); int *state = NULL; if ((c->flag & BAM_FUNMAP) || b->core.l_qseq == 0 || qual[0] == (uint8_t)-1) @@ -177,6 +196,7 @@ int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) if (abs((xe - xb) - (ye - yb)) > bw) bw = abs((xe - xb) - (ye - yb)) + 3; conf.bw = bw; + xb -= yb + bw/2; if (xb < 0) xb = 0; xe += c->l_qseq - ye + bw/2; if (xe - xb - c->l_qseq > bw) @@ -282,6 +302,7 @@ int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) } else bam_aux_append(b, "BQ", 'Z', c->l_qseq + 1, bq); free(bq); free(state); } + return 0; fail: From 85b0a7248dc0bc049df93edb88611d0735b649ed Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 5 May 2021 17:01:38 +0100 Subject: [PATCH 151/488] Silence a gcc 11.1 warning with kseq. The new uninitialised memory checking in gcc 11 is tripped up as it thinks there may be overflow cases in ks_expand meaning the resized buffer still isn't large enough. We have already fixed this in kroundup.h, but it's not sufficient to silence it. However adding an explicit check to spot if kroundup32 has overflowed is enough to silence it. NB: this gcc 11.1 issue wasn't visible from within Htslib, but Samtools tickled it in samtools/dict.c. 
Fixes #1283 --- htslib/kseq.h | 1 + 1 file changed, 1 insertion(+) diff --git a/htslib/kseq.h b/htslib/kseq.h index ee13dd0d8..3e2404568 100644 --- a/htslib/kseq.h +++ b/htslib/kseq.h @@ -207,6 +207,7 @@ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ seq->seq.m = seq->seq.l + 2; \ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ + if (seq->seq.l + 1 >= seq->seq.m) return -3; /* error: adjusting m overflowed */ \ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ } \ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ From bf32d5624d9732824af039fbae157a9c3e41bd28 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Fri, 30 Apr 2021 16:07:46 +0100 Subject: [PATCH 152/488] Detect incorrect number of AC fields Incorrect number of AC fields (other than nALTs) could cause writes beyond allocated memory, resulting in core dumps and cryptic messages such as "corrupted size vs. prev_size" or "munmap_chunk(): invalid pointer" --- vcfutils.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/vcfutils.c b/vcfutils.c index aff5cdd55..0a44f5197 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -53,6 +53,17 @@ int bcf_calc_ac(const bcf_hdr_t *header, bcf1_t *line, int *ac, int which) } if ( an>=0 && ac_ptr ) { + if ( ac_len != line->n_allele - 1 ) + { + static int warned = 0; + if ( !warned ) + { + hts_log_warning("Incorrect number of AC fields at %s:%"PRIhts_pos". (This message is printed only once.)\n", + header->id[BCF_DT_CTG][line->rid].key, line->pos+1); + warned = 1; + } + return 0; + } int nac = 0; #define BRANCH_INT(type_t, convert) { \ for (i=0; i Date: Thu, 6 May 2021 10:21:17 +0100 Subject: [PATCH 153/488] Remove const from varint_putNN() endp parameters This works around a spurious warning from GCC 11.1 whereby the compiler assumed the const pointer cp_end/endp would be read from (because it was const, so what else would you do with it?). Remove PR #1280's previous workaround. See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100417.
As endp points to the same array as cp, it might as well be char* as well, which avoids the spurious warning. As partially noted in htscodecs/varint.h, the ideal signatures for the getters and putters would be int64_t (*varint_get32)(const char **cp, const char *endp, int *err); int (*varint_put32)(char *cp, char *endp, int32_t val_p); /* etc */ Adding const to the getters would have to occur after htscodecs had been updated correspondingly (if at all), but happily removing const from the putter callers can be done in advance of any corresponding htscodecs change. While in the neighbourhood, make the "for now" wrappers static. --- cram/cram_codecs.c | 8 -------- cram/cram_io.c | 22 +++++++++++----------- cram/cram_structs.h | 8 ++++---- 3 files changed, 15 insertions(+), 23 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 0956f29f0..a913b9c86 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1898,11 +1898,6 @@ int cram_xdelta_encode_int(cram_slice *slice, cram_codec *c, return -1; } -#if __GNUC__ == 11 && __GNUC_MINOR__ == 1 -// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=100417 -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" -#endif int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c, char *in, int in_size) { char *dat = malloc(in_size*5); @@ -1937,9 +1932,6 @@ int cram_xdelta_encode_char(cram_slice *slice, cram_codec *c, free(dat); return 0; } -#if __GNUC__ == 11 && __GNUC_MINOR__ == 1 -#pragma GCC diagnostic pop -#endif void cram_xdelta_encode_free(cram_codec *c) { if (!c) return; diff --git a/cram/cram_io.c b/cram/cram_io.c index 6d1d1877c..3bd98dd7f 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -739,15 +739,15 @@ static int64_t safe_ltf8_get(char **cp, const char *endp, int *err) { } // Wrapper for now -int safe_itf8_put(char *cp, const char *cp_end, int32_t val) { +static int safe_itf8_put(char *cp, char *cp_end, int32_t val) { return itf8_put(cp, val); } -int 
safe_ltf8_put(char *cp, const char *cp_end, int64_t val) { +static int safe_ltf8_put(char *cp, char *cp_end, int64_t val) { return ltf8_put(cp, val); } -int itf8_size(int64_t v) { +static int itf8_size(int64_t v) { return ((!((v)&~0x7f))?1:(!((v)&~0x3fff))?2:(!((v)&~0x1fffff))?3:(!((v)&~0xfffffff))?4:5); } @@ -796,20 +796,20 @@ static int64_t sint7_get_64(char **cp, const char *endp, int *err) { return val; } -static int uint7_put_32(char *cp, const char *endp, int32_t val) { - return var_put_u32((uint8_t *)cp, (const uint8_t *)endp, val); +static int uint7_put_32(char *cp, char *endp, int32_t val) { + return var_put_u32((uint8_t *)cp, (uint8_t *)endp, val); } -static int sint7_put_32(char *cp, const char *endp, int32_t val) { - return var_put_s32((uint8_t *)cp, (const uint8_t *)endp, val); +static int sint7_put_32(char *cp, char *endp, int32_t val) { + return var_put_s32((uint8_t *)cp, (uint8_t *)endp, val); } -static int uint7_put_64(char *cp, const char *endp, int64_t val) { - return var_put_u64((uint8_t *)cp, (const uint8_t *)endp, val); +static int uint7_put_64(char *cp, char *endp, int64_t val) { + return var_put_u64((uint8_t *)cp, (uint8_t *)endp, val); } -static int sint7_put_64(char *cp, const char *endp, int64_t val) { - return var_put_s64((uint8_t *)cp, (const uint8_t *)endp, val); +static int sint7_put_64(char *cp, char *endp, int64_t val) { + return var_put_s64((uint8_t *)cp, (uint8_t *)endp, val); } // Put direct to to cram_block diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 0c3ae8542..11cc4fa13 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -753,10 +753,10 @@ typedef struct varint_vec { int64_t (*varint_get64s)(char **cp, const char *endp, int *err); // Returns the number of bytes written, <= 0 on error. 
- int (*varint_put32) (char *cp, const char *endp, int32_t val_p); - int (*varint_put32s)(char *cp, const char *endp, int32_t val_p); - int (*varint_put64) (char *cp, const char *endp, int64_t val_p); - int (*varint_put64s)(char *cp, const char *endp, int64_t val_p); + int (*varint_put32) (char *cp, char *endp, int32_t val_p); + int (*varint_put32s)(char *cp, char *endp, int32_t val_p); + int (*varint_put64) (char *cp, char *endp, int64_t val_p); + int (*varint_put64s)(char *cp, char *endp, int64_t val_p); // Returns the number of bytes written, <= 0 on error. int (*varint_put32_blk) (cram_block *blk, int32_t val_p); From 818008a750eefb347bb3732dff9fb60afc367de6 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 14 Apr 2021 19:34:05 +0100 Subject: [PATCH 154/488] Support building in a separate build directory Traditional autoconf-using projects generate a directory tree of recursive Makefiles from a directory tree of Makefile.in files. Thus when configuring for an out-of-tree build, typically the tree of builddirs and their Makefiles are created fairly automatically. HTSlib instead uses a single non-recursive non-generated Makefile that includes fragments generated by configure. Hence configure.ac needs to detect out-of-tree configuration as a special case and explicitly create Makefile and the tree of subdirectories in the build directory. Add builddir_vars.mk.in, which for an out-of-tree build (only) is generated into $builddir/htslib_vars.mk. It sets VPATH and some other additional variables and then includes the real $srcdir/htslib_vars.mk. This is used to override $srcdir etc for out-of-tree builds, and notably adds an -I option for htscodecs/version.h which in an in-tree build is found by being in the same directory as the C file that includes it. HTSlib also provides htslib.mk for use by external projects. Split its variables and rules into $(HTSDIR) and $(HTSSRCDIR) as appropriate. 
These are the same by default, but builddir_vars.mk.in overrides $(HTSSRCDIR) to be $(HTSDIR)/$([HTSlib's ]srcdir) instead, taking care to handle the case of $(srcdir) being an absolute path. The `include` in htscodecs.mk.in would need to be $(srcdir)/selected.mk for Makefile but $(HTSPREFIX)selected.mk for htslib.mk. Rather than arrange this, it's easier to generate $builddir/htscodecs.mk as a symlink to $srcdir/selected.mk instead, using AC_CONFIG_LINKS. Add $(srcdir)/ or $(srcprefix) (for tidiness in in-tree build logs) to commands in recipes containing explicit filenames in source directories. This has been done for `make all install` but not yet for `make test`. --- Makefile | 32 +++++--- builddir_vars.mk.in | 58 +++++++++++++ configure.ac | 26 +++++- htscodecs.mk.in | 5 -- htslib.mk | 195 ++++++++++++++++++++++---------------------- 5 files changed, 198 insertions(+), 118 deletions(-) create mode 100644 builddir_vars.mk.in delete mode 100644 htscodecs.mk.in diff --git a/Makefile b/Makefile index e69ae7320..bca111198 100644 --- a/Makefile +++ b/Makefile @@ -113,13 +113,15 @@ htscodecs.mk: echo '# Default htscodecs.mk generated by Makefile' > $@ echo 'include $$(HTSPREFIX)htscodecs_bundled.mk' >> $@ +srcdir = . +srcprefix = HTSPREFIX = include htslib_vars.mk include htscodecs.mk # If not using GNU make, you need to copy the version number from version.sh # into here. -PACKAGE_VERSION := $(shell ./version.sh) +PACKAGE_VERSION := $(shell $(srcdir)/version.sh) LIBHTS_SOVERSION = 3 @@ -132,7 +134,7 @@ MACH_O_CURRENT_VERSION = 3.1.12 # $(NUMERIC_VERSION) is for items that must have a numeric X.Y.Z string # even if this is a dirty or untagged Git working tree. -NUMERIC_VERSION := $(shell ./version.sh numeric) +NUMERIC_VERSION := $(shell $(srcdir)/version.sh numeric) # Force version.h to be remade if $(PACKAGE_VERSION) has changed. 
version.h: $(if $(wildcard version.h),$(if $(findstring "$(PACKAGE_VERSION)",$(shell cat version.h)),,force)) @@ -254,7 +256,7 @@ config.h: # on htslib.pc.in listed, as if that file is newer the usual way to regenerate # this target is via configure or config.status rather than this rule. htslib.pc.tmp: - sed -e '/^static_libs=/s/@static_LIBS@/$(htslib_default_libs)/;s#@[^-][^@]*@##g' htslib.pc.in > $@ + sed -e '/^static_libs=/s/@static_LIBS@/$(htslib_default_libs)/;s#@[^-][^@]*@##g' $(srcprefix)htslib.pc.in > $@ # Create a makefile fragment listing the libraries and LDFLAGS needed for # static linking. This can be included by projects that want to build @@ -449,16 +451,15 @@ htscodecs/htscodecs: # Build the htscodecs/htscodecs/version.h file if necessary htscodecs/htscodecs/version.h: force - @if test -e htscodecs/.git && test -e htscodecs/configure.ac ; then \ - cd htscodecs && \ - vers=`git describe --always --dirty --match 'v[0-9]\.[0-9]*'` && \ + @if test -e $(srcdir)/htscodecs/.git && test -e $(srcdir)/htscodecs/configure.ac ; then \ + vers=`cd $(srcdir)/htscodecs && git describe --always --dirty --match 'v[0-9]\.[0-9]*'` && \ case "$$vers" in \ v*) vers=$${vers#v} ;; \ - *) iv=`awk '/^AC_INIT/ { match($$0, /^AC_INIT\(htscodecs, *([0-9](\.[0-9])*)\)/, m); print substr($$0, m[1, "start"], m[1, "length"]) }' configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \ + *) iv=`awk '/^AC_INIT/ { match($$0, /^AC_INIT\(htscodecs, *([0-9](\.[0-9])*)\)/, m); print substr($$0, m[1, "start"], m[1, "length"]) }' $(srcdir)/htscodecs/configure.ac` ; vers="$$iv$${vers:+-g$$vers}" ;; \ esac ; \ - if ! grep -s -q '"'"$$vers"'"' htscodecs/version.h ; then \ + if ! 
grep -s -q '"'"$$vers"'"' $@ ; then \ echo 'Updating $@ : #define HTSCODECS_VERSION_TEXT "'"$$vers"'"' ; \ - echo '#define HTSCODECS_VERSION_TEXT "'"$$vers"'"' > htscodecs/version.h ; \ + echo '#define HTSCODECS_VERSION_TEXT "'"$$vers"'"' > $@ ; \ fi ; \ fi endif @@ -470,6 +471,11 @@ maintainer-check: test/maintainer/check_copyright.pl . test/maintainer/check_spaces.pl . +# Create a shorthand. We use $(SRC) or $(srcprefix) rather than $(srcdir)/ +# for brevity in test and install rules, and so that build logs do not have +# ./ sprinkled throughout. +SRC = $(srcprefix) + # For tests that might use it, set $REF_PATH explicitly to use only reference # areas within the test suite (or set it to ':' to use no reference areas). # @@ -687,11 +693,11 @@ shlib-exports-dll.txt: hts.dll.a install: libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) installdirs install-$(SHLIB_FLAVOUR) install-pkgconfig $(INSTALL_PROGRAM) $(BUILT_PROGRAMS) $(DESTDIR)$(bindir) if test -n "$(BUILT_PLUGINS)"; then $(INSTALL_PROGRAM) $(BUILT_PLUGINS) $(DESTDIR)$(plugindir); fi - $(INSTALL_DATA) htslib/*.h $(DESTDIR)$(includedir)/htslib + $(INSTALL_DATA) $(SRC)htslib/*.h $(DESTDIR)$(includedir)/htslib $(INSTALL_DATA) libhts.a $(DESTDIR)$(libdir)/libhts.a - $(INSTALL_MAN) bgzip.1 htsfile.1 tabix.1 $(DESTDIR)$(man1dir) - $(INSTALL_MAN) faidx.5 sam.5 vcf.5 $(DESTDIR)$(man5dir) - $(INSTALL_MAN) htslib-s3-plugin.7 $(DESTDIR)$(man7dir) + $(INSTALL_MAN) $(SRC)bgzip.1 $(SRC)htsfile.1 $(SRC)tabix.1 $(DESTDIR)$(man1dir) + $(INSTALL_MAN) $(SRC)faidx.5 $(SRC)sam.5 $(SRC)vcf.5 $(DESTDIR)$(man5dir) + $(INSTALL_MAN) $(SRC)htslib-s3-plugin.7 $(DESTDIR)$(man7dir) installdirs: $(INSTALL_DIR) $(DESTDIR)$(bindir) $(DESTDIR)$(includedir) $(DESTDIR)$(includedir)/htslib $(DESTDIR)$(libdir) $(DESTDIR)$(man1dir) $(DESTDIR)$(man5dir) $(DESTDIR)$(man7dir) $(DESTDIR)$(pkgconfigdir) diff --git a/builddir_vars.mk.in b/builddir_vars.mk.in new file mode 100644 index 000000000..09bb20fea --- /dev/null +++ b/builddir_vars.mk.in @@ -0,0 
+1,58 @@ +# Separate build directory Makefile overrides for htslib. +# +# Copyright (C) 2021 University of Glasgow. +# +# Author: John Marshall +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# This is @configure_input@ +# +# When building from a separate build directory, this file is included by +# HTSlib's Makefile or htslib.mk instead of htslib_vars.mk. It adjusts +# variables to account for a separate source directory and then includes +# the real makefile fragment. + +ifneq "$(HTSPREFIX)" "" +# When included externally via htslib.mk, just update $(HTSSRCDIR) and hence +# $(HTSPREFIX) to point to the source directory (without using any extra +# unprefixed variables, which would be in the external project's namespace). 
+ +# Set to $(HTSDIR)/$(srcdir) (or just $(srcdir) if that's absolute) +HTSSRCDIR = @HTSDIRslash_if_relsrcdir@@srcdir@ + +include $(HTSSRCDIR)/htslib_vars.mk + +else +# When included from HTSlib's Makefile, override $(srcdir) and set VPATH, +# and make any other adjustments required. ($(HTSPREFIX) remains empty as +# the items it prefixes will be found via VPATH instead.) + +srcdir = @srcdir@ +VPATH = @srcdir@ + +srcprefix = $(srcdir)/ + +# Ensure that htscodecs.c can include its version.h. This -I option must come +# before -I. so that these targets get this version.h rather than HTSlib's. +htscodecs/htscodecs/htscodecs.o htscodecs/htscodecs/htscodecs.pico: ALL_CPPFLAGS = -Ihtscodecs/htscodecs -I. $(CPPFLAGS) + +include $(srcdir)/htslib_vars.mk + +endif diff --git a/configure.ac b/configure.ac index 53d03c16d..8b50697da 100644 --- a/configure.ac +++ b/configure.ac @@ -326,7 +326,6 @@ in the top-level htslib directory to update it, and then re-run configure. You have an incomplete distribution. Please try downloading one of the official releases from https://www.htslib.org ])])])]) -AC_SUBST([selected_htscodecs_mk]) AS_IF([test "x$with_libdeflate" != "xno"], [libdeflate=ok @@ -474,5 +473,28 @@ AC_SUBST([private_LIBS]) AC_SUBST([static_LDFLAGS]) AC_SUBST([static_LIBS]) -AC_CONFIG_FILES([config.mk htscodecs.mk htslib.pc.tmp:htslib.pc.in]) +AC_CONFIG_FILES([config.mk htslib.pc.tmp:htslib.pc.in]) +AC_CONFIG_LINKS([htscodecs.mk:$selected_htscodecs_mk]) + +if test "$srcdir" != .; then + # Set up for a separate build directory. As HTSlib uses a non-recursive + # makefile, we need to create additional build subdirectories explicitly. 
+ AC_CONFIG_LINKS([Makefile:Makefile htslib.mk:htslib.mk]) + AC_CONFIG_FILES([htslib_vars.mk:builddir_vars.mk.in]) + AC_CONFIG_COMMANDS([mkdir], + [AS_MKDIR_P([cram]) + AS_MKDIR_P([htscodecs/htscodecs]) + AS_MKDIR_P([htscodecs/tests]) + AS_MKDIR_P([test/fuzz]) + AS_MKDIR_P([test/longrefs]) + AS_MKDIR_P([test/tabix])]) +fi + +# @HTSDIRslash_if_relsrcdir@ will be empty when $srcdir is absolute +case "$srcdir" in + /*) HTSDIRslash_if_relsrcdir= ;; + *) HTSDIRslash_if_relsrcdir='$(HTSDIR)/' ;; +esac +AC_SUBST([HTSDIRslash_if_relsrcdir]) + AC_OUTPUT diff --git a/htscodecs.mk.in b/htscodecs.mk.in deleted file mode 100644 index f3bba4f31..000000000 --- a/htscodecs.mk.in +++ /dev/null @@ -1,5 +0,0 @@ -# This is @configure_input@ - -# This file selects Makefile rules for htscodecs - -include $(HTSPREFIX)@selected_htscodecs_mk@ diff --git a/htslib.mk b/htslib.mk index 6b199e382..3a4675814 100644 --- a/htslib.mk +++ b/htslib.mk @@ -27,12 +27,13 @@ # If your source file foo.c #includes and , # you can write the correct prerequisites for foo.o as: # -# HTSDIR = +# HTSDIR = # include $(HTSDIR)/htslib.mk # # foo.o: foo.c $(htslib_hts_h) $(htslib_kstring_h) -HTSPREFIX = $(HTSDIR)/ +HTSSRCDIR = $(HTSDIR) +HTSPREFIX = $(HTSSRCDIR)/ include $(HTSDIR)/htslib_vars.mk # This file provides the HTSCODECS_SOURCES variable. 
It may not be present @@ -54,101 +55,101 @@ include $(HTSDIR)/htslib_vars.mk # $(HTSDIR)/tabix -p bed bar.bed.bgz HTSLIB_PUBLIC_HEADERS = \ - $(HTSDIR)/htslib/bgzf.h \ - $(HTSDIR)/htslib/cram.h \ - $(HTSDIR)/htslib/faidx.h \ - $(HTSDIR)/htslib/hfile.h \ - $(HTSDIR)/htslib/hts.h \ - $(HTSDIR)/htslib/hts_defs.h \ - $(HTSDIR)/htslib/hts_endian.h \ - $(HTSDIR)/htslib/hts_expr.h \ - $(HTSDIR)/htslib/hts_log.h \ - $(HTSDIR)/htslib/hts_os.h \ - $(HTSDIR)/htslib/kbitset.h \ - $(HTSDIR)/htslib/kfunc.h \ - $(HTSDIR)/htslib/khash.h \ - $(HTSDIR)/htslib/khash_str2int.h \ - $(HTSDIR)/htslib/klist.h \ - $(HTSDIR)/htslib/kseq.h \ - $(HTSDIR)/htslib/ksort.h \ - $(HTSDIR)/htslib/kstring.h \ - $(HTSDIR)/htslib/regidx.h \ - $(HTSDIR)/htslib/sam.h \ - $(HTSDIR)/htslib/synced_bcf_reader.h \ - $(HTSDIR)/htslib/tbx.h \ - $(HTSDIR)/htslib/thread_pool.h \ - $(HTSDIR)/htslib/vcf.h \ - $(HTSDIR)/htslib/vcf_sweep.h \ - $(HTSDIR)/htslib/vcfutils.h + $(HTSSRCDIR)/htslib/bgzf.h \ + $(HTSSRCDIR)/htslib/cram.h \ + $(HTSSRCDIR)/htslib/faidx.h \ + $(HTSSRCDIR)/htslib/hfile.h \ + $(HTSSRCDIR)/htslib/hts.h \ + $(HTSSRCDIR)/htslib/hts_defs.h \ + $(HTSSRCDIR)/htslib/hts_endian.h \ + $(HTSSRCDIR)/htslib/hts_expr.h \ + $(HTSSRCDIR)/htslib/hts_log.h \ + $(HTSSRCDIR)/htslib/hts_os.h \ + $(HTSSRCDIR)/htslib/kbitset.h \ + $(HTSSRCDIR)/htslib/kfunc.h \ + $(HTSSRCDIR)/htslib/khash.h \ + $(HTSSRCDIR)/htslib/khash_str2int.h \ + $(HTSSRCDIR)/htslib/klist.h \ + $(HTSSRCDIR)/htslib/kseq.h \ + $(HTSSRCDIR)/htslib/ksort.h \ + $(HTSSRCDIR)/htslib/kstring.h \ + $(HTSSRCDIR)/htslib/regidx.h \ + $(HTSSRCDIR)/htslib/sam.h \ + $(HTSSRCDIR)/htslib/synced_bcf_reader.h \ + $(HTSSRCDIR)/htslib/tbx.h \ + $(HTSSRCDIR)/htslib/thread_pool.h \ + $(HTSSRCDIR)/htslib/vcf.h \ + $(HTSSRCDIR)/htslib/vcf_sweep.h \ + $(HTSSRCDIR)/htslib/vcfutils.h HTSLIB_ALL = \ $(HTSLIB_PUBLIC_HEADERS) \ - $(HTSDIR)/bcf_sr_sort.c \ - $(HTSDIR)/bcf_sr_sort.h \ - $(HTSDIR)/bgzf.c \ + $(HTSSRCDIR)/bcf_sr_sort.c \ + $(HTSSRCDIR)/bcf_sr_sort.h \ + 
$(HTSSRCDIR)/bgzf.c \ $(HTSDIR)/config.h \ - $(HTSDIR)/errmod.c \ - $(HTSDIR)/faidx.c \ - $(HTSDIR)/header.c \ - $(HTSDIR)/header.h \ - $(HTSDIR)/hfile_internal.h \ - $(HTSDIR)/hfile.c \ - $(HTSDIR)/hfile_gcs.c \ - $(HTSDIR)/hfile_libcurl.c \ - $(HTSDIR)/hfile_s3.c \ - $(HTSDIR)/hfile_s3_write.c \ - $(HTSDIR)/hts.c \ - $(HTSDIR)/hts_expr.c \ - $(HTSDIR)/hts_internal.h \ - $(HTSDIR)/hts_os.c \ - $(HTSDIR)/kfunc.c \ - $(HTSDIR)/kstring.c \ - $(HTSDIR)/md5.c \ - $(HTSDIR)/multipart.c \ - $(HTSDIR)/plugin.c \ - $(HTSDIR)/probaln.c \ - $(HTSDIR)/realn.c \ - $(HTSDIR)/regidx.c \ - $(HTSDIR)/region.c \ - $(HTSDIR)/sam.c \ - $(HTSDIR)/sam_internal.h \ - $(HTSDIR)/synced_bcf_reader.c \ - $(HTSDIR)/tbx.c \ - $(HTSDIR)/textutils.c \ - $(HTSDIR)/textutils_internal.h \ - $(HTSDIR)/thread_pool.c \ - $(HTSDIR)/thread_pool_internal.h \ - $(HTSDIR)/vcf.c \ - $(HTSDIR)/vcf_sweep.c \ - $(HTSDIR)/vcfutils.c \ - $(HTSDIR)/cram/cram.h \ - $(HTSDIR)/cram/cram_codecs.c \ - $(HTSDIR)/cram/cram_codecs.h \ - $(HTSDIR)/cram/cram_decode.c \ - $(HTSDIR)/cram/cram_decode.h \ - $(HTSDIR)/cram/cram_encode.c \ - $(HTSDIR)/cram/cram_encode.h \ - $(HTSDIR)/cram/cram_external.c \ - $(HTSDIR)/cram/cram_index.c \ - $(HTSDIR)/cram/cram_index.h \ - $(HTSDIR)/cram/cram_io.c \ - $(HTSDIR)/cram/cram_io.h \ - $(HTSDIR)/cram/cram_samtools.h \ - $(HTSDIR)/cram/cram_stats.c \ - $(HTSDIR)/cram/cram_stats.h \ - $(HTSDIR)/cram/cram_structs.h \ - $(HTSDIR)/cram/mFILE.c \ - $(HTSDIR)/cram/mFILE.h \ - $(HTSDIR)/cram/misc.h \ - $(HTSDIR)/cram/open_trace_file.c \ - $(HTSDIR)/cram/open_trace_file.h \ - $(HTSDIR)/cram/os.h \ - $(HTSDIR)/cram/pooled_alloc.c \ - $(HTSDIR)/cram/pooled_alloc.h \ - $(HTSDIR)/cram/string_alloc.c \ - $(HTSDIR)/cram/string_alloc.h \ - $(HTSDIR)/os/lzma_stub.h \ - $(HTSDIR)/os/rand.c \ + $(HTSSRCDIR)/errmod.c \ + $(HTSSRCDIR)/faidx.c \ + $(HTSSRCDIR)/header.c \ + $(HTSSRCDIR)/header.h \ + $(HTSSRCDIR)/hfile_internal.h \ + $(HTSSRCDIR)/hfile.c \ + $(HTSSRCDIR)/hfile_gcs.c \ + 
$(HTSSRCDIR)/hfile_libcurl.c \ + $(HTSSRCDIR)/hfile_s3.c \ + $(HTSSRCDIR)/hfile_s3_write.c \ + $(HTSSRCDIR)/hts.c \ + $(HTSSRCDIR)/hts_expr.c \ + $(HTSSRCDIR)/hts_internal.h \ + $(HTSSRCDIR)/hts_os.c \ + $(HTSSRCDIR)/kfunc.c \ + $(HTSSRCDIR)/kstring.c \ + $(HTSSRCDIR)/md5.c \ + $(HTSSRCDIR)/multipart.c \ + $(HTSSRCDIR)/plugin.c \ + $(HTSSRCDIR)/probaln.c \ + $(HTSSRCDIR)/realn.c \ + $(HTSSRCDIR)/regidx.c \ + $(HTSSRCDIR)/region.c \ + $(HTSSRCDIR)/sam.c \ + $(HTSSRCDIR)/sam_internal.h \ + $(HTSSRCDIR)/synced_bcf_reader.c \ + $(HTSSRCDIR)/tbx.c \ + $(HTSSRCDIR)/textutils.c \ + $(HTSSRCDIR)/textutils_internal.h \ + $(HTSSRCDIR)/thread_pool.c \ + $(HTSSRCDIR)/thread_pool_internal.h \ + $(HTSSRCDIR)/vcf.c \ + $(HTSSRCDIR)/vcf_sweep.c \ + $(HTSSRCDIR)/vcfutils.c \ + $(HTSSRCDIR)/cram/cram.h \ + $(HTSSRCDIR)/cram/cram_codecs.c \ + $(HTSSRCDIR)/cram/cram_codecs.h \ + $(HTSSRCDIR)/cram/cram_decode.c \ + $(HTSSRCDIR)/cram/cram_decode.h \ + $(HTSSRCDIR)/cram/cram_encode.c \ + $(HTSSRCDIR)/cram/cram_encode.h \ + $(HTSSRCDIR)/cram/cram_external.c \ + $(HTSSRCDIR)/cram/cram_index.c \ + $(HTSSRCDIR)/cram/cram_index.h \ + $(HTSSRCDIR)/cram/cram_io.c \ + $(HTSSRCDIR)/cram/cram_io.h \ + $(HTSSRCDIR)/cram/cram_samtools.h \ + $(HTSSRCDIR)/cram/cram_stats.c \ + $(HTSSRCDIR)/cram/cram_stats.h \ + $(HTSSRCDIR)/cram/cram_structs.h \ + $(HTSSRCDIR)/cram/mFILE.c \ + $(HTSSRCDIR)/cram/mFILE.h \ + $(HTSSRCDIR)/cram/misc.h \ + $(HTSSRCDIR)/cram/open_trace_file.c \ + $(HTSSRCDIR)/cram/open_trace_file.h \ + $(HTSSRCDIR)/cram/os.h \ + $(HTSSRCDIR)/cram/pooled_alloc.c \ + $(HTSSRCDIR)/cram/pooled_alloc.h \ + $(HTSSRCDIR)/cram/string_alloc.c \ + $(HTSSRCDIR)/cram/string_alloc.h \ + $(HTSSRCDIR)/os/lzma_stub.h \ + $(HTSSRCDIR)/os/rand.c \ $(HTSCODECS_SOURCES) $(HTSDIR)/config.h: @@ -166,15 +167,13 @@ $(HTSDIR)/libhts.so: $(HTSLIB_ALL) $(HTSDIR)/libhts.dylib $(HTSDIR)/libhts.dll.a $(HTSDIR)/hts.dll.a: $(HTSDIR)/hts-object-files +cd $(HTSDIR) && $(MAKE) lib-shared -$(HTSDIR)/bgzip: $(HTSDIR)/bgzip.c 
$(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a +$(HTSDIR)/bgzip: $(HTSSRCDIR)/bgzip.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a +cd $(HTSDIR) && $(MAKE) bgzip -$(HTSDIR)/htsfile: $(HTSDIR)/htsfile.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a - +$(HTSDIR)/htsfile: $(HTSSRCDIR)/htsfile.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a +cd $(HTSDIR) && $(MAKE) htsfile -$(HTSDIR)/tabix: $(HTSDIR)/tabix.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a - +$(HTSDIR)/tabix: $(HTSSRCDIR)/tabix.c $(HTSLIB_PUBLIC_HEADERS) $(HTSDIR)/libhts.a +cd $(HTSDIR) && $(MAKE) tabix $(HTSDIR)/htslib_static.mk: $(HTSDIR)/htslib.pc.tmp From 3ae534251a08d99a55157f8ea456ebca105f45fe Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Fri, 14 May 2021 11:42:22 +0100 Subject: [PATCH 155/488] Fix a potential invalid memory access. Use the original allele count. --- vcfutils.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vcfutils.c b/vcfutils.c index 0a44f5197..02af68465 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -258,6 +258,8 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb // create map of indexes from old to new ALT numbering and modify ALT kstring_t str = {0,0,0}; + if (!line->d.allele) + bcf_unpack(line, BCF_UN_STR); kputs(line->d.allele[0], &str); int nrm = 0, i,j; // i: ori alleles, j: new alleles @@ -507,8 +509,8 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb } // Update GT fields, the allele indexes might have changed - for (i=1; in_allele; i++) if ( map[i]!=i ) break; - if ( in_allele ) + for (i=1; i Date: Mon, 17 May 2021 13:46:22 +0100 Subject: [PATCH 156/488] Mark removed alleles as missing. 
--- vcfutils.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/vcfutils.c b/vcfutils.c index 02af68465..9d0d7fdb3 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -532,7 +532,9 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb bcf_seqname_safe(header,line), line->pos+1, al, nR_ori, map[al]); goto err; } - ptr[j] = (map[al]+1)<<1 | (ptr[j]&1); + // if an allele other than the reference is mapped to 0, it has been removed, + // so translate it to 'missing', while preserving the phasing bit + ptr[j] = ((al>0 && !map[al]) ? bcf_gt_missing : (map[al]+1)<<1) | (ptr[j]&1); } ptr += nret; } From f3b29c5300fe94761b2a5d67330365bebbe651b1 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 21 May 2021 20:39:53 +0100 Subject: [PATCH 157/488] Fix knet_open() and knet_dopen() error handling Return early on open failure in these knet stubs, as otherwise we assign NULL and then immediately dereference it. --- hfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hfile.c b/hfile.c index 708957863..ebb5b2244 100644 --- a/hfile.c +++ b/hfile.c @@ -1359,7 +1359,7 @@ knetFile *knet_open(const char *fn, const char *mode) { if (!fp) return NULL; if (!(fp->hf = hopen(fn, mode))) { free(fp); - fp = NULL; + return NULL; } // FD backend is the only one implementing knet_fileno @@ -1376,7 +1376,7 @@ knetFile *knet_dopen(int fd, const char *mode) { if (!fp) return NULL; if (!(fp->hf = hdopen(fd, mode))) { free(fp); - fp = NULL; + return NULL; } fp->fd = fd; return fp; From 3a8781a25a1f198a3eca4d40b827f79eb22ec142 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 21 May 2021 20:42:28 +0100 Subject: [PATCH 158/488] Fix memory leak [minor] --- cram/cram_codecs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index a913b9c86..a5857c100 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1728,8 +1728,10 @@ cram_codec 
*cram_xdelta_decode_init(cram_block_compression_hdr *hdr, else if (option == E_BYTE_ARRAY_BLOCK) { option = E_BYTE_ARRAY; c->decode = cram_xdelta_decode_block; - } else + } else { + free(c); return NULL; + } c->free = cram_xdelta_decode_free; c->size = cram_xdelta_decode_size; c->get_block = cram_xdelta_get_block; From 575db47d185481fdaec0dfab45a6bbf99f8e123e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 21 May 2021 15:32:28 +0100 Subject: [PATCH 159/488] Improve compression metrics for unsorted files. Unsorted data is recognised by the rapid change in reference ID giving rise to many tiny slices. A flurry of this switches to unsorted mode where we use the RI data series to store sequences from multiple references in the same slice. Unfortunately this also skews the cram data series metrics, which look at average compression ratios for a variety of codecs, as the first few slices may be tiny and the optimal codec isn't chosen (initially). Eg picking rANS0 for qualities as the buffers are tiny, while rANS1 would be better. Io_lib/scramble has a solution which resets the compression metrics on detection of unsorted data. This works, but here I try a more general and hopefully stable approach which is to look for unusually very large fluctuations in data series sizes. --- cram/cram_io.c | 35 ++++++++++++++++++++++++++++++++++- cram/cram_structs.h | 1 + 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 3bd98dd7f..92102ffd7 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1957,8 +1957,27 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, return 0; } +#ifndef ABS +# define ABS(a) ((a)>=0?(a):-(a)) +#endif + if (metrics) { pthread_mutex_lock(&fd->metrics_lock); + // Sudden changes in size trigger a retrial. These are mainly + // triggered when switching to sorted / unsorted, where the number + // of elements in a slice radically changes. 
+ // + // We also get large fluctuations based on genome coordinate for + // e.g. SA:Z and SC series, but we consider the typical scale of + // delta between blocks and use this to look for abnormality. + if (metrics->input_avg_sz && + (b->uncomp_size + 1000 > 4*(metrics->input_avg_sz+1000) || + b->uncomp_size + 1000 < (metrics->input_avg_sz+1000)/4) && + ABS(b->uncomp_size-metrics->input_avg_sz) + > 10*metrics->input_avg_delta) { + metrics->next_trial = 0; + } + if (metrics->trial > 0 || --metrics->next_trial <= 0) { int m, unpackable = metrics->unpackable; size_t sz_best = b->uncomp_size; @@ -1966,6 +1985,13 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, int method_best = 0; // RAW char *c_best = NULL, *c = NULL; + metrics->input_avg_delta = + 0.9 * (metrics->input_avg_delta + + ABS(b->uncomp_size - metrics->input_avg_sz)); + + metrics->input_avg_sz += b->uncomp_size*.2; + metrics->input_avg_sz *= 0.8; + if (metrics->revised_method) method = metrics->revised_method; else @@ -2124,7 +2150,7 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, metrics->sz[m] *= 1+(meth_cost[m]-1)/3; } // else cost is ignored - // Ensure these are never used + // Ensure these are never used; BSC and ZSTD metrics->sz[9] = metrics->sz[10] = INT_MAX; for (m = 0; m < CRAM_MAX_METHOD; m++) { @@ -2189,6 +2215,13 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, } pthread_mutex_unlock(&fd->metrics_lock); } else { + metrics->input_avg_delta = + 0.9 * (metrics->input_avg_delta + + ABS(b->uncomp_size - metrics->input_avg_sz)); + + metrics->input_avg_sz += b->uncomp_size*.2; + metrics->input_avg_sz *= 0.8; + strat = metrics->strat; method = metrics->method; diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 11cc4fa13..ce27bc1a4 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -289,6 +289,7 @@ struct cram_metrics { // aggregate sizes during trials int sz[CRAM_MAX_METHOD]; + int input_avg_sz, input_avg_delta; // resultant method from trials int method, 
revised_method; From 3aad5e4e1cdf17d79ec865a7c4b4718745ef8bc0 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Wed, 5 May 2021 08:19:36 +0100 Subject: [PATCH 160/488] Show genomic coordinates covered by a bin. Show bin level. Add documentation to bin methods. --- hts.c | 44 ++++++++++++++++++++++++++++---------------- htslib/hts.h | 26 ++++++++++++++++++++++---- 2 files changed, 50 insertions(+), 20 deletions(-) diff --git a/hts.c b/hts.c index c710c8372..d194ab543 100644 --- a/hts.c +++ b/hts.c @@ -1992,8 +1992,8 @@ static char * idx_format_name(int fmt) { } } -#if DEBUG_INDEX -static void dump_index(const hts_idx_t *idx) { +#ifdef DEBUG_INDEX +static void idx_dump(const hts_idx_t *idx) { int i; int64_t j; @@ -2005,22 +2005,30 @@ static void dump_index(const hts_idx_t *idx) { for (i = 0; i < idx->n; i++) { bidx_t *bidx = idx->bidx[i]; lidx_t *lidx = &idx->lidx[i]; - khint_t k; - fprintf(stderr, "======== BIN Index - tid=%d, n_buckets=%d, size=%d\n", i, bidx->n_buckets, bidx->size); - int b; - for (b = 0; b < META_BIN(idx); b++) { - if ((k = kh_get(bin, bidx, b)) != kh_end(bidx)) { - bins_t *entries = &kh_value(bidx, k); - fprintf(stderr, "\tbin=%d, parent=%d, n_entries=%d, loff=%"PRIu64"\n", - b, hts_bin_parent(b), entries->n, entries->loff); - for (j = 0; j < entries->n; j++) - fprintf(stderr, "\t\tchunk=%"PRId64", u=%"PRIu64", v=%"PRIu64"\n", j, entries->list[j].u, entries->list[j].v); + if (bidx) { + fprintf(stderr, "======== BIN Index - tid=%d, n_buckets=%d, size=%d\n", i, bidx->n_buckets, bidx->size); + int b; + for (b = 0; b < META_BIN(idx); b++) { + khint_t k; + if ((k = kh_get(bin, bidx, b)) != kh_end(bidx)) { + bins_t *entries = &kh_value(bidx, k); + int l = hts_bin_level(b); + int64_t bin_width = 1LL << ((idx->n_lvls - l) * 3 + idx->min_shift); + fprintf(stderr, "\tbin=%d, level=%d, parent=%d, n_chunks=%d, loff=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n", + b, l, hts_bin_parent(b), entries->n, entries->loff, (b-hts_bin_first(l))*bin_width+1, 
(b+1-hts_bin_first(l))*bin_width); + for (j = 0; j < entries->n; j++) + fprintf(stderr, "\t\tchunk=%"PRId64", u=%"PRIu64", v=%"PRIu64"\n", j, entries->list[j].u, entries->list[j].v); + } + } + } + if (lidx) { + fprintf(stderr, "======== LINEAR Index - tid=%d, n_values=%"PRId64"\n", i, lidx->n); + for (j = 0; j < lidx->n; j++) { + fprintf(stderr, "\t\tentry=%"PRId64", offset=%"PRIu64", interval=[%"PRId64" - %"PRId64"]\n", + j, lidx->offset[j], j*(1<<idx->min_shift)+1, (j+1)*(1<<idx->min_shift)); } } - fprintf(stderr, "======== LINEAR Index - tid=%d, n_values=%"PRId64"\n", i, lidx->n); - for (j = 0; j < lidx->n; j++) - fprintf(stderr, "\t\tentry=%"PRId64", offset=%"PRIu64"\n", j, lidx->offset[j]); - } + } } #endif @@ -2590,6 +2598,10 @@ static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt) } if (bgzf_read(fp, &idx->n_no_coor, 8) != 8) idx->n_no_coor = 0; if (is_be) ed_swap_8p(&idx->n_no_coor); +#ifdef DEBUG_INDEX + idx_dump(idx); +#endif + return 0; } diff --git a/htslib/hts.h b/htslib/hts.h index 219368a90..8d5774382 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -845,8 +845,10 @@ typedef struct hts_itr_t { typedef hts_itr_t hts_itr_multi_t; - #define hts_bin_first(l) (((1<<(((l)<<1) + (l))) - 1) / 7) - #define hts_bin_parent(l) (((l) - 1) >> 3) +/// Compute the first bin on a given level +#define hts_bin_first(l) (((1<<(((l)<<1) + (l))) - 1) / 7) +/// Compute the parent bin of a given bin +#define hts_bin_parent(b) (((b) - 1) >> 3) /////////////////////////////////////////////////////////// // Low-level API for building indexes.
@@ -1446,10 +1448,26 @@ static inline int hts_reg2bin(hts_pos_t beg, hts_pos_t end, int min_shift, int n return 0; } +/// Compute the level of a bin in a binning index +static inline int hts_bin_level(int bin) { + int l, b; + for (l = 0, b = bin; b; ++l, b = hts_bin_parent(b)); + return l; +} + +/// Compute the corresponding entry into the linear index of a given bin from +/// a binning index +/** @param bin The bin number + * @param n_lvls The index depth (number of levels - 0 based) + * @return The integer offset into the linear index + * + * Explanation of the return value formula: + * Each bin on level l covers exp(2, (n_lvls - l)*3 + min_shift) base pairs. + * A linear index entry covers exp(2, min_shift) base pairs. + */ static inline int hts_bin_bot(int bin, int n_lvls) { - int l, b; - for (l = 0, b = bin; b; ++l, b = hts_bin_parent(b)); // compute the level of bin + int l = hts_bin_level(bin); return (bin - hts_bin_first(l)) << (n_lvls - l) * 3; } From 18305510d010aab54706fe4abec4d90cd6462ecc Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Tue, 16 Mar 2021 13:34:44 +0000 Subject: [PATCH 161/488] Reverse linear index initialization of missing entries. Rename `hts_idx_save_core` to `idx_save_core`, to indicate its static use. 
--- hts.c | 26 ++++++++++++-------------- htslib/hts.h | 7 ++++--- 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/hts.c b/hts.c index d194ab543..17039aa43 100644 --- a/hts.c +++ b/hts.c @@ -2115,17 +2115,11 @@ static void update_loff(hts_idx_t *idx, int i, int free_lidx) lidx_t *lidx = &idx->lidx[i]; khint_t k; int l; - uint64_t offset0 = 0; - if (bidx) { - k = kh_get(bin, bidx, META_BIN(idx)); - if (k != kh_end(bidx)) - offset0 = kh_val(bidx, k).list[0].u; - for (l = 0; l < lidx->n && lidx->offset[l] == (uint64_t)-1; ++l) - lidx->offset[l] = offset0; - } else l = 1; - for (; l < lidx->n; ++l) // fill missing values + // the last entry is always valid + for (l=lidx->n-2; l >= 0; l--) { if (lidx->offset[l] == (uint64_t)-1) - lidx->offset[l] = lidx->offset[l-1]; + lidx->offset[l] = lidx->offset[l+1]; + } if (bidx == 0) return; for (k = kh_begin(bidx); k != kh_end(bidx); ++k) // set loff if (kh_exist(bidx, k)) @@ -2429,7 +2423,7 @@ static inline void swap_bins(bins_t *p) } } -static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt) +static int idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt) { int32_t i, j; @@ -2482,6 +2476,10 @@ static int hts_idx_save_core(const hts_idx_t *idx, BGZF *fp, int fmt) } check(idx_write_uint64(fp, idx->n_no_coor)); +#ifdef DEBUG_INDEX + idx_dump(idx); +#endif + return 0; #undef check } @@ -2531,7 +2529,7 @@ int hts_idx_save_as(const hts_idx_t *idx, const char *fn, const char *fnidx, int check(bgzf_write(fp, "BAI\1", 4)); } else abort(); - check(hts_idx_save_core(idx, fp, fmt)); + check(idx_save_core(idx, fp, fmt)); return bgzf_close(fp); #undef check @@ -2591,8 +2589,8 @@ static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt) if (l->offset == NULL) return -2; if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1; if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]); - for (j = 1; j < l->n; ++j) // fill missing values; may happen given older samtools and tabix - if (l->offset[j] 
== 0) l->offset[j] = l->offset[j-1]; + for (j = l->n-1; j > 0; j--) // fill missing values; may happen given older samtools and tabix + if (l->offset[j-1] == 0) l->offset[j-1] = l->offset[j]; update_loff(idx, i, 0); } } diff --git a/htslib/hts.h b/htslib/hts.h index 8d5774382..44b923b45 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -1455,9 +1455,10 @@ static inline int hts_bin_level(int bin) { return l; } -/// Compute the corresponding entry into the linear index of a given bin from -/// a binning index -/** @param bin The bin number +//! Compute the corresponding entry into the linear index of a given bin from +//! a binning index +/*! + * @param bin The bin number * @param n_lvls The index depth (number of levels - 0 based) * @return The integer offset into the linear index * From e0d76101d50bb29a2787d4e41927b6fca3825d47 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Thu, 3 Jun 2021 08:53:49 +0100 Subject: [PATCH 162/488] Add new method `hts_idx_nseq` for retrieving the number of contigs covered by reads from the index struct. Use internal method `idx_filename` for consistency. --- NEWS | 3 +++ hts.c | 18 +++++++++++++++--- htslib/hts.h | 7 +++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 5acdf4877..4a6c90cf1 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,9 @@ Noteworthy changes in release a.b Features and Updates -------------------- +* New method `hts_idx_nseq` returns the number of contigs covered by reads + from an index structure. + * In case a PG header line has multiple ID tags supplied by other applications, the header API now selects the first one encountered as the identifying tag and issues a warning when detecting subsequent ID tags. 
diff --git a/hts.c b/hts.c index 17039aa43..7430a310c 100644 --- a/hts.c +++ b/hts.c @@ -2723,6 +2723,18 @@ const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, return names; } +int hts_idx_nseq(const hts_idx_t *idx) { + int tid = 0, i; + for (i=0; i < idx->n; i++) + { + bidx_t *bidx = idx->bidx[i]; + if ( !bidx ) continue; + tid++; + } + + return tid; +} + int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped) { if ( idx->fmt == HTS_FMT_CRAI ) { @@ -4372,11 +4384,11 @@ static hts_idx_t *idx_find_and_load(const char *fn, int fmt, int flags) if (hts_idx_check_local(fn, fmt, &fnidx) == 0 && hisremote(fn)) { if (flags & HTS_IDX_SAVE_REMOTE) { - fnidx = hts_idx_getfn(fn, ".csi"); + fnidx = idx_filename(fn, ".csi", HTS_IDX_SAVE_REMOTE); if (!fnidx) { switch (fmt) { - case HTS_FMT_BAI: fnidx = hts_idx_getfn(fn, ".bai"); break; - case HTS_FMT_TBI: fnidx = hts_idx_getfn(fn, ".tbi"); break; + case HTS_FMT_BAI: fnidx = idx_filename(fn, ".bai", HTS_IDX_SAVE_REMOTE); break; + case HTS_FMT_TBI: fnidx = idx_filename(fn, ".tbi", HTS_IDX_SAVE_REMOTE); break; default: break; } } diff --git a/htslib/hts.h b/htslib/hts.h index 44b923b45..132701fd4 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -1252,6 +1252,13 @@ int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) HTS_RESULT_USED HTSLIB_EXPORT const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values +/// Return the number of targets covered by reads from an index +/** @param idx Index + @return The number of targets + */ +HTSLIB_EXPORT +int hts_idx_nseq(const hts_idx_t *idx); + /********************************** * Iterator with multiple regions * **********************************/ From 8489e4a7a4becd627b327d5155db03551c34d64b Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Fri, 4 Jun 2021 16:42:56 +0100 Subject: [PATCH 163/488] Don't overwrite 0 value linear index entries at the
beginning. They could be valid offsets. Stop at the first non-zero entry. --- hts.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/hts.c b/hts.c index 7430a310c..51ded68ac 100644 --- a/hts.c +++ b/hts.c @@ -2577,7 +2577,7 @@ static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt) if (is_be) swap_bins(p); } if (fmt != HTS_FMT_CSI) { // load linear index - int j; + int j, k; uint32_t x; if (bgzf_read(fp, &x, 4) != 4) return -1; if (is_be) ed_swap_4p(&x); @@ -2589,7 +2589,8 @@ static int idx_read_core(hts_idx_t *idx, BGZF *fp, int fmt) if (l->offset == NULL) return -2; if (bgzf_read(fp, l->offset, l->n << 3) != l->n << 3) return -1; if (is_be) for (j = 0; j < l->n; ++j) ed_swap_8p(&l->offset[j]); - for (j = l->n-1; j > 0; j--) // fill missing values; may happen given older samtools and tabix + for (k = j = 0; j < l->n && l->offset[j] == 0; k = ++j); // stop at the first non-zero entry + for (j = l->n-1; j > k; j--) // fill missing values; may happen given older samtools and tabix if (l->offset[j-1] == 0) l->offset[j-1] = l->offset[j]; update_loff(idx, i, 0); } From 12bed3990fed76c7dfcf8d0498f5b5e6b40366d9 Mon Sep 17 00:00:00 2001 From: Alberto Casas Ortiz Date: Fri, 19 Feb 2021 20:17:31 -0700 Subject: [PATCH 164/488] Added case of bracketed list of values to bcf_hdr_parse_line. --- vcf.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/vcf.c b/vcf.c index 9b9e4d36a..c49692404 100644 --- a/vcf.c +++ b/vcf.c @@ -426,11 +426,23 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) if (bcf_hrec_add_key(hrec, p, q-p-m) < 0) goto fail; p = ++q; while ( *q && *q==' ' ) { p++; q++; } - int quoted = *p=='"' ? 
1 : 0; + + int quoted = 0; + char ending; + switch (*p) { + case '"': + quoted = 1; + ending = '"'; + break; + case '[': + quoted = 1; + ending = ']'; + break; + } if ( quoted ) p++, q++; while ( *q && *q != '\n' ) { - if ( quoted ) { if ( *q=='"' && !is_escaped(p,q) ) break; } + if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; } else { if ( *q=='<' ) nopen++; @@ -444,7 +456,7 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) while ( r > p && r[-1] == ' ' ) r--; if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0) goto fail; - if ( quoted && *q=='"' ) q++; + if ( quoted && *q==ending ) q++; if ( *q=='>' ) { nopen--; q++; } } From 723e261e5c4fbae0023cb07c0ec22111f7d679b8 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 9 Apr 2021 17:58:48 +0100 Subject: [PATCH 165/488] Attempt to preserve square brackets in VCF header lines Quoted lines are actually stored with the quote marks surrounding them, so we can do the same with the square-bracket syntax used in META lines to enable round-tripping. Add VCF META header tag round-trip test VCF test file comes from the hts-specs repository file test/vcf/4.3/passed/passed_meta_meta.vcf modified to add contig and FILTER headers so `htsfile -c` will round-trip it without making changes or printing any warnings. 
--- test/test.pl | 2 ++ test/vcf_meta_meta.vcf | 9 +++++++++ vcf.c | 18 ++++++++++++++++-- 3 files changed, 27 insertions(+), 2 deletions(-) create mode 100644 test/vcf_meta_meta.vcf diff --git a/test/test.pl b/test/test.pl index a33ead381..392f53117 100755 --- a/test/test.pl +++ b/test/test.pl @@ -890,6 +890,8 @@ sub test_vcf_various cmd => "$$opts{bin}/htsfile -c $$opts{path}/noroundtrip.vcf"); test_cmd($opts, %args, out => "formatmissing-out.vcf", cmd => "$$opts{bin}/htsfile -c $$opts{path}/formatmissing.vcf"); + test_cmd($opts, %args, out => "vcf_meta_meta.vcf", + cmd => "$$opts{bin}/htsfile -c $$opts{path}/vcf_meta_meta.vcf"); } sub write_multiblock_bgzf { diff --git a/test/vcf_meta_meta.vcf b/test/vcf_meta_meta.vcf new file mode 100644 index 000000000..43aba73f7 --- /dev/null +++ b/test/vcf_meta_meta.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##META= +##META= +##META= +##META= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 123 . TC T . . . diff --git a/vcf.c b/vcf.c index c49692404..cc77f14e0 100644 --- a/vcf.c +++ b/vcf.c @@ -428,18 +428,19 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) while ( *q && *q==' ' ) { p++; q++; } int quoted = 0; - char ending; + char ending = '\0'; switch (*p) { case '"': quoted = 1; ending = '"'; + p++; break; case '[': quoted = 1; ending = ']'; break; } - if ( quoted ) p++, q++; + if ( quoted ) q++; while ( *q && *q != '\n' ) { if ( quoted ) { if ( *q==ending && !is_escaped(p,q) ) break; } @@ -453,6 +454,19 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) q++; } const char *r = q; + if (quoted && ending == ']') { + if (*q == ending) { + r++; + q++; + quoted = 0; + } else { + char buffer[320]; + hts_log_error("Missing ']' in header line %s", + hts_strprint(buffer, sizeof(buffer), '"', + line, q-line)); + goto fail; + } + } while ( r > p && r[-1] == ' ' ) r--; if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0) goto fail; From 
031365407c145bfec2e8f443de2da3a910c6971d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 11 Jun 2021 09:27:25 +0100 Subject: [PATCH 166/488] Fix the filter expression "cigar" on unmapped reads. Our error checking for kputw/kputc failing also wrongly claimed that the empty string was also a failure. The empty CIGAR string is now "*", as per SAM. To check for not-something while also handling unmapped reads, use e.g. samtools view -e 'cigar!="*" && cigar!~"D"' or samtools view -e 'ncigar && cigar!~"D"' Fixes samtools/samtools#1445 --- sam.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/sam.c b/sam.c index 397288b61..9fcef88c5 100644 --- a/sam.c +++ b/sam.c @@ -1153,12 +1153,16 @@ static int bam_sym_lookup(void *data, char *str, char **end, ks_clear(&res->s); uint32_t *cigar = bam_get_cigar(b); int i, n = b->core.n_cigar, r = 0; - for (i = 0; i < n; i++) { - r |= kputw (bam_cigar_oplen(cigar[i]), &res->s); - r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s); + if (n) { + for (i = 0; i < n; i++) { + r |= kputw (bam_cigar_oplen(cigar[i]), &res->s) < 0; + r |= kputc_(bam_cigar_opchr(cigar[i]), &res->s) < 0; + } + r |= kputs("", &res->s) < 0; + } else { + r |= kputs("*", &res->s) < 0; } - kputs("", &res->s); - return r ? 0 : -1; + return r ? -1 : 0; } break; From 82560b1bf1464689cb57b2f5305d1a3e4f318ef3 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 15 Jun 2021 21:18:06 +0100 Subject: [PATCH 167/488] Fix typos in INSTALL [minor] --- INSTALL | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL b/INSTALL index e8d0bea6b..d0eb638c5 100644 --- a/INSTALL +++ b/INSTALL @@ -49,7 +49,7 @@ Storage is enabled. Amazon S3 support requires an HMAC function to calculate a message authentication code. On MacOS, the CCHmac function from the standard -library is used. Systems that do not have CChmac will get this from +library is used. Systems that do not have CCHmac will get this from libcrypto. 
libcrypto is part of OpenSSL or one of its derivatives (LibreSSL or BoringSSL). @@ -57,7 +57,7 @@ On Microsoft Windows we recommend use of Mingw64/Msys2. Note that currently for the test harness to work you will need to override the test temporary directory with e.g.: make check TEST_OPTS="-t C:/msys64/tmp/_" Whilst the code may work on Windows with other environments, these have -not be verified. +not been verified. Update htscodecs submodule ========================== @@ -103,7 +103,7 @@ configure and just type 'make; make install' as for previous versions of HTSlib. However if the build fails you should run './configure' as it can diagnose the common reasons for build failures. -The 'make' command builds the HTSlib library and and various useful +The 'make' command builds the HTSlib library and various useful utilities: bgzip, htsfile, and tabix. If compilation fails you should run './configure' as it can diagnose problems with your build environment that cause build failures. From 4ece5376063403c76fce4997b4fd5642c1e0e660 Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Wed, 16 Jun 2021 14:54:12 +0100 Subject: [PATCH 168/488] Improvements of index methods (#1299) Change `hts_idx_nseq` to return the total number of contigs from an index. Add pointer validations to a few index related public methods. 
--- hts.c | 15 +++++---------- htslib/hts.h | 43 ++++++++++++++++++++++--------------------- 2 files changed, 27 insertions(+), 31 deletions(-) diff --git a/hts.c b/hts.c index 51ded68ac..7f70aa0ef 100644 --- a/hts.c +++ b/hts.c @@ -2706,7 +2706,7 @@ uint8_t *hts_idx_get_meta(hts_idx_t *idx, uint32_t *l_meta) const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr) { - if ( !idx->n ) + if ( !idx || !idx->n ) { *n = 0; return NULL; @@ -2725,25 +2725,20 @@ const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, } int hts_idx_nseq(const hts_idx_t *idx) { - int tid = 0, i; - for (i=0; i < idx->n; i++) - { - bidx_t *bidx = idx->bidx[i]; - if ( !bidx ) continue; - tid++; - } - - return tid; + if (!idx) return -1; + return idx->n; } int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* unmapped) { + if (!idx) return -1; if ( idx->fmt == HTS_FMT_CRAI ) { *mapped = 0; *unmapped = 0; return -1; } bidx_t *h = idx->bidx[tid]; + if (!h) return -1; khint_t k = kh_get(bin, h, META_BIN(idx)); if (k != kh_end(h)) { *mapped = kh_val(h, k).list[1].u; diff --git a/htslib/hts.h b/htslib/hts.h index 132701fd4..a92b03810 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -1010,6 +1010,8 @@ hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags); /////////////////////////////////////////////////////////// // Functions for accessing meta-data stored in indexes +typedef const char *(*hts_id2name_f)(void*, int); + /// Get extra index meta-data /** @param idx The index @param l_meta Pointer to where the length of the extra data is stored @@ -1066,6 +1068,26 @@ int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* HTSLIB_EXPORT uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx); +/// Return a list of target names from an index +/** @param idx Index + @param[out] n Location to store the number of targets + @param getid Callback function to get the name for a 
target ID + @param hdr Header from indexed file + @return An array of pointers to the names on success; NULL on failure + + @note The names are pointers into the header data structure. When cleaning + up, only the array should be freed, not the names. + */ +HTSLIB_EXPORT +const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values + +/// Return the number of targets from an index +/** @param idx Index + @return The number of targets + */ +HTSLIB_EXPORT +int hts_idx_nseq(const hts_idx_t *idx); + /////////////////////////////////////////////////////////// // Region parsing @@ -1089,7 +1111,6 @@ HTSLIB_EXPORT long long hts_parse_decimal(const char *str, char **strend, int flags); typedef int (*hts_name2id_f)(void*, const char*); -typedef const char *(*hts_id2name_f)(void*, int); /// Parse a "CHR:START-END"-style region string /** @param str String to be parsed @@ -1239,26 +1260,6 @@ hts_itr_t *hts_itr_querys(const hts_idx_t *idx, const char *reg, hts_name2id_f g HTSLIB_EXPORT int hts_itr_next(BGZF *fp, hts_itr_t *iter, void *r, void *data) HTS_RESULT_USED; -/// Return a list of target names from an index -/** @param idx Index - @param[out] n Location to store the number of targets - @param getid Callback function to get the name for a target ID - @param hdr Header from indexed file - @return An array of pointers to the names on success; NULL on failure - - @note The names are pointers into the header data structure. When cleaning - up, only the array should be freed, not the names. 
- */ -HTSLIB_EXPORT -const char **hts_idx_seqnames(const hts_idx_t *idx, int *n, hts_id2name_f getid, void *hdr); // free only the array, not the values - -/// Return the number targets covered by reads from an index -/** @param idx Index - @return The number of targets - */ -HTSLIB_EXPORT -int hts_idx_nseq(const hts_idx_t *idx); - /********************************** * Iterator with multiple regions * **********************************/ From bfb2df1e8628f5610da3196f38238b868def7724 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 17 Jun 2021 15:19:54 +0100 Subject: [PATCH 169/488] Make version.sh work from a separate build directory Fix a bug whereby version.sh reported the master release version rather than a git commit description when run from a separate build directory. --- version.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/version.sh b/version.sh index fc04f26c2..9eea911c6 100755 --- a/version.sh +++ b/version.sh @@ -27,10 +27,11 @@ VERSION=1.12 # If we have a git clone, then check against the current tag -if [ -e .git ] +srcdir=${0%/version.sh} +if [ -e $srcdir/.git ] then # If we ever get to 10.x this will need to be more liberal - VERSION=`git describe --match '[0-9].[0-9]*' --dirty` + VERSION=`cd $srcdir && git describe --match '[0-9].[0-9]*' --dirty` fi # Numeric version is for use in .dylib or .so libraries From ba109a49f37e89337d05ed466df7eaa444df9b43 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 16 Jun 2021 22:43:17 +0100 Subject: [PATCH 170/488] On MinGW-i686, testing for recv() requires the system header MinGW's 32-bit version of <winsock2.h> declares recv() as __stdcall__. When AC_SEARCH_LIBS does a trial link without declaring recv() at all, linking fails due to the mismatch of calling conventions. Before reporting the error with MSG_ERROR, try again to link against ws2_32 this time including the system header to get the annotated declaration. 
This produces the right $LIBS on MinGW-i686, and leaves the MSG_ERROR failure unchanged on other platforms. --- configure.ac | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/configure.ac b/configure.ac index 8b50697da..1216ecc21 100644 --- a/configure.ac +++ b/configure.ac @@ -234,7 +234,16 @@ AC_SEARCH_LIBS([recv], [socket ws2_32], [ if test "$ac_cv_search_recv" != "none required" then static_LIBS="$static_LIBS $ac_cv_search_recv" -fi], [MSG_ERROR([unable to find the recv() function])]) +fi], + dnl on MinGW-i686, checking recv() linking requires an annotated declaration + [AC_MSG_CHECKING([for library containing recv using declaration]) + LIBS="-lws2_32 $LIBS" + AC_LINK_IFELSE( + [AC_LANG_PROGRAM([[#include <winsock2.h>]], [[recv(0, 0, 0, 0);]])], + [AC_MSG_RESULT([-lws2_32]) + static_LIBS="$static_LIBS -lws2_32"], + [AC_MSG_RESULT([no]) + MSG_ERROR([unable to find the recv() function])])]) if test "$enable_bz2" != no; then bz2_devel=ok From f7975f1ac0f55285d53cb4e72424f44f01e70981 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Tue, 22 Jun 2021 11:24:52 +0100 Subject: [PATCH 171/488] Summer 2021 release copyright updates. --- LICENSE | 2 +- bgzf.c | 2 +- hfile_gcs.c | 2 +- hfile_s3.c | 2 +- hts.c | 2 +- htslib.mk | 2 +- htslib/hfile.h | 2 +- htslib/hts.h | 2 +- htslib/sam.h | 2 +- realn.c | 2 +- synced_bcf_reader.c | 2 +- test/test-vcf-api.c | 2 +- test/test_introspection.c | 2 +- vcf.c | 2 +- vcfutils.c | 2 +- 15 files changed, 15 insertions(+), 15 deletions(-) diff --git a/LICENSE b/LICENSE index 8456da591..5075c3f53 100644 --- a/LICENSE +++ b/LICENSE @@ -3,7 +3,7 @@ according to the terms of the following MIT/Expat license.] The MIT/Expat License -Copyright (C) 2012-2020 Genome Research Ltd. +Copyright (C) 2012-2021 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/bgzf.c b/bgzf.c index 89f441c1b..ca01c9717 100644 --- a/bgzf.c +++ b/bgzf.c @@ -2,7 +2,7 @@ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos - Copyright (C) 2009, 2013-2020 Genome Research Ltd + Copyright (C) 2009, 2013-2021 Genome Research Ltd Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/hfile_gcs.c b/hfile_gcs.c index d622ca754..2f01a20ce 100644 --- a/hfile_gcs.c +++ b/hfile_gcs.c @@ -1,6 +1,6 @@ /* hfile_gcs.c -- Google Cloud Storage backend for low-level file streams. - Copyright (C) 2016 Genome Research Ltd. + Copyright (C) 2016, 2021 Genome Research Ltd. Author: John Marshall diff --git a/hfile_s3.c b/hfile_s3.c index 4f1d536dc..0453ba5e4 100644 --- a/hfile_s3.c +++ b/hfile_s3.c @@ -1,6 +1,6 @@ /* hfile_s3.c -- Amazon S3 backend for low-level file streams. - Copyright (C) 2015-2017, 2019-2020 Genome Research Ltd. + Copyright (C) 2015-2017, 2019-2021 Genome Research Ltd. Author: John Marshall diff --git a/hts.c b/hts.c index 7f70aa0ef..132f499e7 100644 --- a/hts.c +++ b/hts.c @@ -1,6 +1,6 @@ /* hts.c -- format-neutral I/O, indexing, and iterator API functions. - Copyright (C) 2008, 2009, 2012-2020 Genome Research Ltd. + Copyright (C) 2008, 2009, 2012-2021 Genome Research Ltd. Copyright (C) 2012, 2013 Broad Institute. Author: Heng Li diff --git a/htslib.mk b/htslib.mk index 3a4675814..9c60ffc2b 100644 --- a/htslib.mk +++ b/htslib.mk @@ -1,6 +1,6 @@ # Makefile rules useful for third-party code using htslib's public API. # -# Copyright (C) 2013-2017, 2019 Genome Research Ltd. +# Copyright (C) 2013-2017, 2019, 2021 Genome Research Ltd. 
# # Author: John Marshall # diff --git a/htslib/hfile.h b/htslib/hfile.h index 55cae244e..987acb7c8 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -1,7 +1,7 @@ /// @file htslib/hfile.h /// Buffered low-level input/output streams. /* - Copyright (C) 2013-2020 Genome Research Ltd. + Copyright (C) 2013-2021 Genome Research Ltd. Author: John Marshall diff --git a/htslib/hts.h b/htslib/hts.h index a92b03810..e24d33a4c 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -1,7 +1,7 @@ /// @file htslib/hts.h /// Format-neutral I/O, indexing, and iterator API functions. /* - Copyright (C) 2012-2020 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Copyright (C) 2010, 2012 Broad Institute. Portions copyright (C) 2003-2006, 2008-2010 by Heng Li diff --git a/htslib/sam.h b/htslib/sam.h index e359a83f4..a80c89bb1 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1,7 +1,7 @@ /// @file htslib/sam.h /// High-level SAM/BAM/CRAM sequence file operations. /* - Copyright (C) 2008, 2009, 2013-2020 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013-2021 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. Author: Heng Li diff --git a/realn.c b/realn.c index dfb338cb5..f05db6ac4 100644 --- a/realn.c +++ b/realn.c @@ -1,6 +1,6 @@ /* realn.c -- BAQ calculation and realignment. - Copyright (C) 2009-2011, 2014-2016, 2018 Genome Research Ltd. + Copyright (C) 2009-2011, 2014-2016, 2018, 2021 Genome Research Ltd. Portions copyright (C) 2009-2011 Broad Institute. Author: Heng Li diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 44b2f154b..3dd6c2066 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1,6 +1,6 @@ /* synced_bcf_reader.c -- stream through multiple VCF files. - Copyright (C) 2012-2020 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. 
Author: Petr Danecek diff --git a/test/test-vcf-api.c b/test/test-vcf-api.c index 22886e7af..51a9f41f6 100644 --- a/test/test-vcf-api.c +++ b/test/test-vcf-api.c @@ -1,6 +1,6 @@ /* test/test-vcf-api.c -- VCF test harness. - Copyright (C) 2013, 2014, 2017-2020 Genome Research Ltd. + Copyright (C) 2013, 2014, 2017-2021 Genome Research Ltd. Author: Petr Danecek diff --git a/test/test_introspection.c b/test/test_introspection.c index 658d92f92..843e45eb8 100644 --- a/test/test_introspection.c +++ b/test/test_introspection.c @@ -1,6 +1,6 @@ /* test/test_introspection.c -- demonstration of introspection function usage - Copyright (C) 2020 Genome Research Ltd. + Copyright (C) 2020-2021 Genome Research Ltd. Author: James Bonfield diff --git a/vcf.c b/vcf.c index cc77f14e0..bfa253fbe 100644 --- a/vcf.c +++ b/vcf.c @@ -1,7 +1,7 @@ /* vcf.c -- VCF/BCF API functions. Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2020 Genome Research Ltd. + Copyright (C) 2012-2021 Genome Research Ltd. Portions copyright (C) 2014 Intel Corporation. Author: Heng Li diff --git a/vcfutils.c b/vcfutils.c index 9d0d7fdb3..4153c5c60 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -1,6 +1,6 @@ /* vcfutils.c -- allele-related utility functions. - Copyright (C) 2012-2018, 2020 Genome Research Ltd. + Copyright (C) 2012-2018, 2020-2021 Genome Research Ltd. 
Author: Petr Danecek From dc65e7ec39a9b0dd227e1d7be6df40e08b99163e Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Tue, 6 Jul 2021 09:15:24 +0100 Subject: [PATCH 172/488] Summer 2021 NEWS updates (PR #1303) Co-authored-by: Valeriu Ohan Co-authored-by: John Marshall Co-authored-by: Rob Davies --- NEWS | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 105 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 4a6c90cf1..7b60c913e 100644 --- a/NEWS +++ b/NEWS @@ -4,12 +4,114 @@ Noteworthy changes in release a.b Features and Updates -------------------- -* New method `hts_idx_nseq` returns the number of contigs covered by reads - from an index structure. - * In case a PG header line has multiple ID tags supplied by other applications, the header API now selects the first one encountered as the identifying tag and issues a warning when detecting subsequent ID tags. + (#1256; fixed samtools/samtools#1393) + +* VCF header reading function (vcf_hdr_read) no longer tries to download a + remote index file by default. + (#1266; fixes #380) + +* Support reading and writing FASTQ format in the same way as SAM, BAM or CRAM. + Records read from a FASTQ file will be treated as unmapped data. + (#1156) + +* Added GCP requester pays bucket access. Thanks to @indraniel. + (#1255) + +* Made mpileup's overlap removal choose which copy to remove at random instead + of always removing the second one. This avoids strand bias in experiments + where the +ve and -ve strand reads always appear in the same order. + (#1273; fixes samtools/bcftools#1459) + +* It is now possible to use platform specific BAQ parameters. This also + selects long-read parameters for read lengths bigger than 1kb, which helps + bcftools mpileup call SNPs on PacBio CCS reads. + (#1275) + +* Improved bcf_remove_allele_set. This fixes a bug that stopped iteration over + alleles prematurely, marks removed alleles as 'missing' and does automatic + lazy unpacking. 
+ (#1288; fixes #1259) + +* Improved compression metrics for unsorted CRAM files. This improves the + choice of codecs when handling unsorted data. + (#1291) + +* Linear index entries for empty intervals are now initialised with the file + offset in the next non-empty interval instead of the previous one. This + may reduce the amount of data iterators have to discard before reaching + the desired region, when the starting location is in a sequence gap. + Thanks to @carsonh for reporting the issue. + (#1286; fixes #486) + +* A new hts_bin_level API function has been added, to compute the level of a + given bin in the binning index. + (#1286) + +* Related to the above, a new API method, hts_idx_nseq, now returns the total + number of contigs from an index. + (#1295 and #1299) + +* Added bracket handling to bcf_hdr_parse_line, for use with ##META lines. + Thanks to Alberto Casas Ortiz. + (#1240) + +Build changes +------------- + +These are compiler, configuration and makefile based changes. + +* Added a curl/curl.h check to configure and improved INSTALL documentation on + build options. Thanks to Melanie Kirsche and John Marshall. + (#1265; fixes #1261) + +* Some fixes to address GCC 11.1 warnings. + (#1280, #1284, #1285; fixes #1283) + +* Supports building HTSlib in a separate directory. Thanks to John Marshall. + (#1277; fixes #231) + +* Supports building HTSlib on MinGW 32-bit environments. Thanks to + John Marshall. + (#1301) + +Bug fixes +--------- + +* Fixed hts_itr_query() et al region queries: fixed bug introduced in + HTSlib 1.12, which led to iterators producing very few reads for some + queries (especially for larger target regions) when unmapped reads were + present. HTSlib 1.11 had a related problem in which iterators would omit + a few unmapped reads that should have been produced; cf #1142. + Thanks to Daniel Cooke for reporting the issue. + (#1281; fixes #1279) + +* Removed compressBound assertions on opening bgzf files. 
Thanks to + Gurt Hulselmans for reporting the issue. + (#1258; fixed #1257) + +* Duplicate sample name error message for a VCF file now only displays the + duplicated name rather than the entire sample name list. + (#1262; fixes samtools/bcftools#1451) + +* Fix to make samtools cat work on CRAMs again. + (#1276; fixes samtools/samtools#1420) + +* Fix for a double memory free in SAM header creation. Thanks to @ihsineme. + (#1274) + +* Prevent assert in bcf_sr_set_regions. Thanks to Dr K D Murray. + (#1270) + +* Fixed crash in knet_open() etc stubs. Thanks to John Marshall. + (#1289) + +* Fixed filter expression "cigar" on unmapped reads. Stop treating an empty + CIGAR string as an error. Thanks to Chang Y for reporting the issue. + (#1298, fixes samtools/samtools#1445) + Noteworthy changes in release 1.12 (17th March 2021) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 09255e618fa6b25e5404df9a776f888ad0b162ac Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 7 Jul 2021 09:33:05 +0100 Subject: [PATCH 173/488] Update htscodecs module to release 1.1.1 Fixes a bug with name tokeniser and variable number of names per slice. --- NEWS | 11 +++++++++++ htscodecs | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index 7b60c913e..df74abebe 100644 --- a/NEWS +++ b/NEWS @@ -63,6 +63,8 @@ Build changes These are compiler, configuration and makefile based changes. +* HTSlib now uses libhtscodecs release 1.1.1. + * Added a curl/curl.h check to configure and improved INSTALL documentation on build options. Thanks to Melanie Kirsche and John Marshall. (#1265; fixes #1261) @@ -112,6 +114,15 @@ Bug fixes CIGAR string as an error. Thanks to Chang Y for reporting the issue. (#1298, fixes samtools/samtools#1445) +* Bug fixes in the bundled copy of htscodecs: + + - Fixed an uninitialized access in the name tokeniser decoder. 
+ (samtools/htscodecs#23) + + - Fixed a bug with name tokeniser and variable number of names per slice, + causing it to incorrectly report an error on certain valid inputs. + (samtools/htscodecs#24) + Noteworthy changes in release 1.12 (17th March 2021) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/htscodecs b/htscodecs index d7e357946..d13043b81 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit d7e357946ead219b81cc1becbe0de8a99d96ca84 +Subproject commit d13043b812b4101a3545412d8a4c1a816f593aa2 From d8dbdeb4574f9fd4440acf387f02d43fa762314e Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 7 Jul 2021 12:34:43 +0100 Subject: [PATCH 174/488] Add "endpos" filter expression "endpos" is the (1-based inclusive) position of the rightmost mapped base of the read, as measured using the CIGAR string. (bam_endpos() returns 0-based exclusive, which is the same thing.) For unmapped reads, it is the same as "pos". --- sam.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/sam.c b/sam.c index 9fcef88c5..b42d6c6d2 100644 --- a/sam.c +++ b/sam.c @@ -1166,6 +1166,14 @@ static int bam_sym_lookup(void *data, char *str, char **end, } break; + case 'e': + if (memcmp(str, "endpos", 6) == 0) { + *end = str+6; + res->d = bam_endpos(b); + return 0; + } + break; + case 'f': if (memcmp(str, "flag", 4) == 0) { str = *end = str+4; From d16bed5a5ad93f3ce07dfcc19a8d079f6961edd1 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 8 Jul 2021 14:34:55 +0100 Subject: [PATCH 175/488] Speed up BAM, CRAM 3.1 and 4.0 testing in test/test.pl's test_view. Many of the SAM test files cover corner cases that don't change with 3.1 or 4.0, so we don't need to rerun every test in every format variation. Additionally we don't need to use compression for every test using BAM. These have been tested using gcov to validate the total code coverage doesn't change. - Only run 3.1/4.0 tests on a subset of SAM files. - Only test 3.1/4.0 when running with threads. 
(Second call to test_view). TODO: Likely this could be applied to 3.0 tests too to limit which files are tested in a threaded environment. - Drop some of the 2.1 tests. If we can go from SAM -> 2.1 -> SAM then it's good enough to validate encode and decode, with no need for 2.1 to 3 interchange checks. - Drop some of the 3.0/3.1/4.0 interchange tests. As with 2.1, we only need to know we can go to/from a common SAM format to test the code paths. - Don't test compressed BAM on all files. 1 is enough, and then uncompressed for the rest (lvl 1). - Don't test uncompressed CRAM on all files. Just do it for 1. We do want compressed mostly though as CRAM has a lot of codecs to explore. - No point in BAM -> CRAM -> BAM -> SAM when BAM -> CRAM -> SAM is sufficient. - Drop the multi-ref CRAM testing for v3.1 and v4.0 as it's the same code path tested in v3.0 anyway. This does change coverage marginally, but it's trivial (3 lines reallocing MD in cram_decode.c). - Only do multi-ref CRAM testing on CRAM files with > 1 ref and when threaded. (Albeit with a minor 2 line testing difference to cram_io.c.) - Reduce the number of CRAM profiles tested. Do full 4 profiles on one file per v3.1 and v4.0. Otherwise a single profile. The impact of this is that time spent in the test_view subroutine drops from real 2m3.525s user 1m20.362s sys 0m26.804s to real 0m21.618s user 0m12.925s sys 0m4.148s This single function dominated all testing (which on the same system took 2m37s for the entire test suite), so the impact on overall time is considerable. --- test/test.pl | 141 +++++++++++++++++++++++--------------------------- 1 file changed, 64 insertions(+), 77 deletions(-) diff --git a/test/test.pl b/test/test.pl index 392f53117..e2e5f8b20 100755 --- a/test/test.pl +++ b/test/test.pl @@ -520,6 +520,32 @@ sub test_view my ($opts, $nthreads) = @_; my $tv_args = $nthreads ? 
"-\@$nthreads" : ""; + # Files appropriate for CRAM V3.1 and V4.0 testing + my %cram31 = ("auxf#values.sam" => 1, + "c1#pad3.sam" => 1, + "ce#5.sam" => 1, + "ce#1000.sam", => 1, + "ce#large_seq.sam", => 1, + "ce#supp.sam", => 1, + "xx#MD.sam", => 1, + "xx#blank.sam", => 1, + "xx#large_aux.sam", => 1, + "xx#pair.sam", => 1, + "xx#tlen.sam" => 1); + + # Files appropriate for CRAM multi-ref containers + my %cram_ms = ("ce#1000.sam" => 1, + "ce#5.sam" => 1, + "ce#5b.sam" => 1, + "ce#unmap.sam" => 1, + "ce#unmap1.sam" => 1, + "ce#unmap2.sam" => 1, + "xx#blank.sam" => 1, + "xx#minimal.sam" => 1, + "xx#tlen.sam" => 1, + "xx#tlen2.sam" => 1, + "xx#triplet.sam" => 1); + foreach my $sam (glob("*#*.sam")) { my ($base, $ref) = ($sam =~ /((.*)#.*)\.sam/); $ref .= ".fa"; @@ -536,9 +562,11 @@ sub test_view $test_view_failures = 0; # SAM -> BAM -> SAM - testv $opts, "./test_view $tv_args -S -b $sam > $bam"; - testv $opts, "./test_view $tv_args $bam > $bam.sam_"; - testv $opts, "./compare_sam.pl $sam $bam.sam_"; + if ($sam eq "ce#1000.sam") { + testv $opts, "./test_view $tv_args -S -b $sam > $bam"; + testv $opts, "./test_view $tv_args $bam > $bam.sam_"; + testv $opts, "./compare_sam.pl $sam $bam.sam_"; + } # SAM -> BAMu -> SAM testv $opts, "./test_view $tv_args -S -l0 -b $sam > $bam"; @@ -550,93 +578,52 @@ sub test_view testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; - # BAM -> CRAM2 -> BAM -> SAM - $cram = "$bam.cram"; - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=2.1 $bam > $cram"; - testv $opts, "./test_view $tv_args -b -D $cram > $cram.bam"; - testv $opts, "./test_view $tv_args $cram.bam > $cram.bam.sam_"; - testv $opts, "./compare_sam.pl $md $sam $cram.bam.sam_"; - # SAM -> CRAM3u -> SAM - $cram = "$base.tmp.cram"; - testv $opts, "./test_view $tv_args -t $ref -S -l0 -C -o VERSION=3.0 $sam > $cram"; - testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; - testv $opts, "./compare_sam.pl $md $sam 
$cram.sam_"; - - # BAM -> CRAM3 -> BAM -> SAM - $cram = "$bam.cram"; - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 $bam > $cram"; - testv $opts, "./test_view $tv_args -b -D $cram > $cram.bam"; - testv $opts, "./test_view $tv_args $cram.bam > $cram.bam.sam_"; - testv $opts, "./compare_sam.pl $md $sam $cram.bam.sam_"; - - # CRAM3 -> CRAM2 - $cram = "$base.tmp.cram"; - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=2.1 $cram > $cram.cram"; - - # CRAM2 -> CRAM3 - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 $cram.cram > $cram"; - - # CRAM3 -> CRAM3 + multi-slice - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 -o seqs_per_slice=7 -o slices_per_container=5 $cram.cram > $cram"; - testv $opts, "./test_view $tv_args $cram > $cram.sam_"; - testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; - - ## Experimental CRAM 3.1 support. - # SAM -> CRAM31u -> SAM - foreach my $profile (qw/fast normal small archive/) { + if ($sam eq "ce#1000.sam") { $cram = "$base.tmp.cram"; - testv $opts, "./test_view $tv_args -t $ref -S -l7 -C -o VERSION=3.1 -o $profile $sam > $cram"; + testv $opts, "./test_view $tv_args -t $ref -S -l0 -C -o VERSION=3.0 $sam > $cram"; testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; } - # BAM -> CRAM31 -> BAM -> SAM + # BAM -> CRAM3 -> SAM $cram = "$bam.cram"; - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.1 $bam > $cram"; - testv $opts, "./test_view $tv_args -b -D $cram > $cram.bam"; - testv $opts, "./test_view $tv_args $cram.bam > $cram.bam.sam_"; + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 $bam > $cram"; + testv $opts, "./test_view $tv_args $cram > $cram.bam.sam_"; testv $opts, "./compare_sam.pl $md $sam $cram.bam.sam_"; - # CRAM31 -> CRAM30 - $cram = "$base.tmp.cram"; - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 $cram > $cram.cram"; - - # CRAM30 -> CRAM31 - testv $opts, "./test_view $tv_args 
-t $ref -C -o VERSION=3.1 $cram.cram > $cram"; - - # CRAM31 -> CRAM31 + multi-slice - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.1 -o seqs_per_slice=7 -o slices_per_container=5 $cram.cram > $cram"; - testv $opts, "./test_view $tv_args $cram > $cram.sam_"; - testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; - - ## Experimental CRAM 4.0 support. - # SAM -> CRAM40u -> SAM - foreach my $profile (qw/fast normal small archive/) { - $cram = "$base.tmp.cram"; - testv $opts, "./test_view $tv_args -t $ref -S -l7 -C -o VERSION=4.0 -o $profile $sam > $cram"; - testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; + # CRAM3 -> CRAM3 + multi-slice + if (exists($cram_ms{$sam}) && $nthreads > 0) { + testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 -o seqs_per_slice=7 -o slices_per_container=5 $cram > $cram.ms"; + testv $opts, "./test_view $tv_args $cram.ms > $cram.sam_"; testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; } - # BAM -> CRAM40 -> BAM -> SAM - $cram = "$bam.cram"; - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=4.0 $bam > $cram"; - testv $opts, "./test_view $tv_args -b -D $cram > $cram.bam"; - testv $opts, "./test_view $tv_args $cram.bam > $cram.bam.sam_"; - testv $opts, "./compare_sam.pl $md $sam $cram.bam.sam_"; - - # CRAM40 -> CRAM30 - $cram = "$base.tmp.cram"; - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=3.0 $cram > $cram.cram"; - - # CRAM30 -> CRAM40 - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=4.0 $cram.cram > $cram"; + if (exists($cram31{$sam}) && $nthreads > 0) { + ## Experimental CRAM 3.1 support. + # SAM -> CRAM31 -> SAM + my @p = $sam eq "ce#1000.sam" + ? 
(qw/fast normal small archive/) + : (qw/archive/); + foreach my $profile (@p) { + $cram = "$base.tmp.cram"; + testv $opts, "./test_view $tv_args -t $ref -S -l7 -C -o VERSION=3.1 -o $profile $sam > $cram"; + testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + } - # CRAM40 -> CRAM40 + multi-slice - testv $opts, "./test_view $tv_args -t $ref -C -o VERSION=4.0 -o seqs_per_slice=7 -o slices_per_container=5 $cram.cram > $cram"; - testv $opts, "./test_view $tv_args $cram > $cram.sam_"; - testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + ## Experimental CRAM 4.0 support. + # SAM -> CRAM40 -> SAM + my @p = $sam eq "ce#large_seq.sam" || $sam eq "xx#large_aux.sam" + ? (qw/fast normal small archive/) + : (qw/archive/); + foreach my $profile (@p) { + $cram = "$base.tmp.cram"; + testv $opts, "./test_view $tv_args -t $ref -S -l7 -C -o VERSION=4.0 -o $profile $sam > $cram"; + testv $opts, "./test_view $tv_args -D $cram > $cram.sam_"; + testv $opts, "./compare_sam.pl $md $sam $cram.sam_"; + } + } # Java pre-made CRAM -> SAM my $jcram = "${base}_java.cram"; From 06520ce34262239143a9deb6ce407dffb4525b12 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 28 Jul 2021 11:43:00 +0100 Subject: [PATCH 176/488] Update NEWS to add CVE number for 1.11's VCF parser fix CVE-2020-36403 was recently allocated for this legacy issue. --- NEWS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS b/NEWS index c388edb47..745d7dbae 100644 --- a/NEWS +++ b/NEWS @@ -522,7 +522,7 @@ Bug fixes * Fixed potential integer overflows in the VCF parser and ensured that the total length of FORMAT fields cannot go over 2Gbytes. [fuzz] (#1044, - #1104) + #1104; latter is CVE-2020-36403 affecting HTSlib versions 1.10 to 1.10.2) * Download index files atomically in idx_test_and_fetch(). This prevents corruption when running parallel jobs on S3 files. Thanks to John Marshall. 
From 5fbd7e7ce50e29d649b422a2452dedeb2db27eab Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 15 Jul 2021 15:55:32 +0100 Subject: [PATCH 177/488] Work around a deadlock in hts_tpool_process_flush. There is a very rare deadlock in hts_tpool_process_flush. The test harness step can trigger this: ./test_view -@4 -o seqs_per_slice=100 -o no_ref=1 -o multi_seq_per_slice=-1 -S -C multi_ref.tmp.sam [This occurred for me about 1 in 750 times on one host. Adding a sleep in cram_io.c's cram_close function, just before the "if (fd->pool &&" check, makes this trigger much more frequently.] As an explanation: the tpool_worker threads have a check on && q->qsize - q->n_output > p->tsize - p->nwaiting This was added in bd32ec6c and is designed to avoid CPU clock-scaling issues when running many more threads than we can keep active. To avoid exhausting the input queue and alternating between many dormant threads and active threads, we only take a job off the input queue if we have more in the queue than the workers currently executing something. However, when flushing, we need to leave a bit more leeway. It's possible that we haven't yet consumed the results, so the output queue is full, meaning the flush isn't something that we can call and just wait on unless we have a separate thread that is able to drain the queue. (This is valid for SAM and BAM, but not CRAM where the result consumer is also the same thread that calls the flush). Hts_tpool_process_flush attempts to fix this, albeit with an ironic comment of "Shouldn't be possible to get here, but just in case". It can, and it matters, but it's not sufficient. The condition was: if (q->qsize < q->n_output + q->n_input + q->n_processing) We can see from tpool_worker above however that it checks "p->tsize - p->nwaiting", so it's not just a case of qsize vs n_output. Adding "p->tsize" to our qsize check above avoids this potential deadlock, but the main cause is elsewhere (spotted by Rob). 
The primary issue is the initial check in tpool_worker didn't consider that the current worker thread isn't executing, so "p->tsize - p->nwaiting - 1" is the correct assessment. This does also cure the deadlock, but @daviesrob suggested "q->n_processing" (which can be anywhere from 0 to p->tsize - p->nwaiting - 1, hence also fixing the same issue) is a better fix and also now matches the hts_tpool_process_flush logic. I've verified this with ~100,000 tests of the test_view command above. It's harder to state with certainty that it doesn't alter the initial aim of bd32ec6c as more modern systems appear to be considerably less affected by frequency scaling issues, but the effect is still measurable, albeit now very slight (~5% wall-clock time differences). This change appears to not undo that improvement. Co-authored-by: Rob Davies --- thread_pool.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/thread_pool.c b/thread_pool.c index f56b781b3..252a9d24c 100644 --- a/thread_pool.c +++ b/thread_pool.c @@ -529,7 +529,7 @@ static void *tpool_worker(void *arg) { // room to put the result. //if (q && q->input_head && !hts_tpool_process_output_full(q)) { if (q && q->input_head - && q->qsize - q->n_output > p->tsize - p->nwaiting + && q->qsize - q->n_output > q->n_processing && !q->shutdown) { work_to_do = 1; break; @@ -949,7 +949,9 @@ int hts_tpool_process_flush(hts_tpool_process *q) { pthread_cond_signal(&p->t[i].pending_c); // Ensure there is room for the final sprint. - // Shouldn't be possible to get here, but just in case. + // Ideally we shouldn't get here, but the "q->qsize - q->n_output > + // n_processing" check in tpool_worker means we can trigger a + // deadlock there. This negates that possibility. 
if (q->qsize < q->n_output + q->n_input + q->n_processing) q->qsize = q->n_output + q->n_input + q->n_processing; From 43c127ec10d5e38c9f5b5ec3679c3745148f7c61 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 28 Jul 2021 14:23:46 +0100 Subject: [PATCH 178/488] Update htscodecs submodule * Fix NEWS typo --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index d13043b81..ed325d7e4 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit d13043b812b4101a3545412d8a4c1a816f593aa2 +Subproject commit ed325d7e406926b2be7697bdb185cf1692879c2a From c5aa1582dcf069b24e05268d4f6e201d9b048c94 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 4 Aug 2021 16:38:55 +0100 Subject: [PATCH 179/488] Fix cram_dependent_data_series when FN is used. If we get into the primary loop of cram_decode_seq, iterating over feature (count FN) then we have unguarded code that always checks for sequence overlapping the reference. To do this, seq_pos must be set. Some data series we know how seq_pos is adjusted irrespective of whether we decode, eg BS is always +1, but others are strings and the only way we know how to update seq_pos is to decode them. Hence added FN as having a dependency on SC, IN and BB. Fixes samtools/samtools#1475 --- cram/cram_decode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 26c9c592b..e65e11d08 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -668,6 +668,13 @@ int cram_dependent_data_series(cram_fd *fd, s->data_series |= CRAM_CF | CRAM_NF; if (s->data_series & (CRAM_BA | CRAM_QS | CRAM_BB | CRAM_QQ)) s->data_series |= CRAM_BF | CRAM_CF | CRAM_RL; + if (s->data_series & CRAM_FN) { + // The CRAM_FN loop checks for reference length boundaries, + // which needs a working seq_pos. Some fields are fixed size + // irrespective of if we decode (BS), but others need to know + // the size of the string fetched back (SC, IN, BB). 
+ s->data_series |= CRAM_SC | CRAM_IN | CRAM_BB; + } orig_ds = s->data_series; From 87bfcd3c6e80644d9c10a72be26922d354e443af Mon Sep 17 00:00:00 2001 From: Jun Aruga Date: Sun, 8 Aug 2021 16:18:50 +0200 Subject: [PATCH 180/488] README.md: Update for tabix. [skip ci] The tabix creates .tbi/.csi by indexing other file formats. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f90f2a30f..47afdba2a 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ HTSlib implements a generalized BAM index, with file extension `.csi` (coordinate-sorted index). The HTSlib file reader first looks for the new index and then for the old if the new index is absent. -This project also includes the popular tabix indexer, which indexes both `.tbi` +This project also includes the popular tabix indexer, which creates both `.tbi` and `.csi` formats, and the bgzip compression utility. [1]: http://samtools.github.io/hts-specs/ From 4811c2d847eea79503652355f7a600dbee1c10f3 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 9 Aug 2021 12:07:17 +0100 Subject: [PATCH 181/488] Make bam_tag2cigar's CG tag handling more lenient. The SAM specification explicitly states the type for this tag is CG:B:I. However Htsjdk (sometimes or always?) writes these tags as CG:B:i. Htslib then silently ignores the tag, assuming it's some unofficial (and incorrect) abuse of an uppercase tag for some local purpose. Given that there is data published in the wild using the incorrect data type, it would be less problematic for us to simply handle the incorrect value sign than the minimal risk of misinterpreting someone's private tag data as CIGAR. (Plus they'd bring such woe onto themselves by using the official name-space.) 
Fixes samtools/samtools#1477 See also https://github.com/samtools/htsjdk/issues/1560 --- sam.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sam.c b/sam.c index b42d6c6d2..9e6511f8f 100644 --- a/sam.c +++ b/sam.c @@ -654,7 +654,8 @@ static int bam_tag2cigar(bam1_t *b, int recal_bin, int give_warning) // return 0 errno = saved_errno; // restore errno on expected no-CG-tag case return 0; } - if (CG[0] != 'B' || CG[1] != 'I') return 0; // not of type B,I + if (CG[0] != 'B' || !(CG[1] == 'I' || CG[1] == 'i')) + return 0; // not of type B,I CG_len = le_to_u32(CG + 2); if (CG_len < c->n_cigar || CG_len >= 1U<<29) return 0; // don't move if the real CIGAR length is shorter than the fake cigar length From b71f048ae7c6af2e5335e8e580a53cdd86e3cadc Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 19 Aug 2020 09:34:01 +0100 Subject: [PATCH 182/488] Added base modification interfaces. The general principle is 1. Allocate a base modification state 2. Parse mods 3. Iterate over mods or query positions 4. Destroy state. An example usage is as follows (minus error checking for clarity): hts_base_mod_state *m = hts_base_mod_state_alloc(); sam_read1(fp, hdr, b); bam_parse_basemod(b, m); hts_base_mod mod; while ((n = bam_next_basemod(b, m, &mod, 1, &pos)) > 0) { printf("Modified %c to %c at %d\n", mod.canonical_base, mod.modified_base, pos); } hts_base_mod_state_free(m); If being called from an iterator that marches along the query sequence we may wish to have a loop calling bam_mods_at_next_pos instead (pos 0, 1, 2, 3 etc). A simple example of this is in test/test_mod.c It may also be used from within a pileup iterator which marches along query position instead of sequence position. In this case call bam_mods_at_qpos instead with pileup->qpos; which is normally +1 each time but may be more or less with indels. A simple example of this is in test/pileup_mod.c although it doesn't look for modifications within indels. 
(That would need to mix in bam_mods_at_next_pos calls for each base in the indel.) Note currently bam_mods_at_qpos is a brain-dead loop around bam_mods_at_next_pos which means it may be excessive when huge soft-clips are being used, however this optimisation may happen later. Right now the overhead of pileup with modifications isn't vast. On ~1Kb long seqs it's around 70% slower than the same code without these calls. On ~50Kb seqs it drops to around 20% slower. Hence the significant cost is in the initial bam_parse_basemod call. --- Makefile | 11 + htslib/sam.h | 131 +++++++++++ sam.c | 427 ++++++++++++++++++++++++++++++++++ test/base_mods/MM-chebi.out | 47 ++++ test/base_mods/MM-chebi.sam | 2 + test/base_mods/MM-double.out | 47 ++++ test/base_mods/MM-double.sam | 3 + test/base_mods/MM-multi.out | 95 ++++++++ test/base_mods/MM-multi.sam | 7 + test/base_mods/MM-orient.sam | 6 + test/base_mods/MM-pileup.out | 36 +++ test/base_mods/MM-pileup.sam | 5 + test/base_mods/MM-pileup2.out | 36 +++ test/base_mods/MM-pileup2.sam | 4 + test/base_mods/base-mods.sh | 35 +++ test/base_mods/base-mods.tst | 42 ++++ test/pileup_mod.c | 223 ++++++++++++++++++ test/simple_test_driver.sh | 3 +- test/test_mod.c | 127 ++++++++++ 19 files changed, 1286 insertions(+), 1 deletion(-) create mode 100644 test/base_mods/MM-chebi.out create mode 100644 test/base_mods/MM-chebi.sam create mode 100644 test/base_mods/MM-double.out create mode 100644 test/base_mods/MM-double.sam create mode 100644 test/base_mods/MM-multi.out create mode 100644 test/base_mods/MM-multi.sam create mode 100644 test/base_mods/MM-orient.sam create mode 100644 test/base_mods/MM-pileup.out create mode 100644 test/base_mods/MM-pileup.sam create mode 100644 test/base_mods/MM-pileup2.out create mode 100644 test/base_mods/MM-pileup2.sam create mode 100755 test/base_mods/base-mods.sh create mode 100644 test/base_mods/base-mods.tst create mode 100644 test/pileup_mod.c create mode 100644 test/test_mod.c diff --git a/Makefile 
b/Makefile index fb6b1d613..e607ee8d4 100644 --- a/Makefile +++ b/Makefile @@ -74,12 +74,14 @@ BUILT_TEST_PROGRAMS = \ test/fieldarith \ test/hfile \ test/pileup \ + test/pileup_mod \ test/plugins-dlhts \ test/sam \ test/test_bgzf \ test/test_expr \ test/test_kfunc \ test/test_kstring \ + test/test_mod \ test/test_realn \ test/test-regidx \ test/test_str2int \ @@ -497,6 +499,7 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) $(HTSCODEC cd test/tabix && ./test-tabix.sh tabix.tst cd test/mpileup && ./test-pileup.sh mpileup.tst cd test/fastq && ./test-fastq.sh + cd test/base_mods && ./base-mods.sh base-mods.tst REF_PATH=: test/sam test/ce.fa test/faidx.fa test/fastqs.fq test/test-regidx cd test && REF_PATH=: ./test.pl $${TEST_OPTS:-} @@ -516,6 +519,9 @@ test/hfile: test/hfile.o libhts.a test/pileup: test/pileup.o libhts.a $(CC) $(LDFLAGS) -o $@ test/pileup.o libhts.a $(LIBS) -lpthread +test/pileup_mod: test/pileup_mod.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/pileup_mod.o libhts.a $(LIBS) -lpthread + test/plugins-dlhts: test/plugins-dlhts.o $(CC) $(LDFLAGS) -o $@ test/plugins-dlhts.o $(LIBS) @@ -534,6 +540,9 @@ test/test_kfunc: test/test_kfunc.o libhts.a test/test_kstring: test/test_kstring.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_kstring.o libhts.a -lz $(LIBS) -lpthread +test/test_mod: test/test_mod.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_mod.o libhts.a $(LIBS) -lpthread + test/test_realn: test/test_realn.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_realn.o libhts.a $(LIBS) -lpthread @@ -622,12 +631,14 @@ test/fuzz/hts_open_fuzzer.o: test/fuzz/hts_open_fuzzer.c config.h $(htslib_hfile test/fieldarith.o: test/fieldarith.c config.h $(htslib_sam_h) test/hfile.o: test/hfile.c config.h $(htslib_hfile_h) $(htslib_hts_defs_h) $(htslib_kstring_h) test/pileup.o: test/pileup.c config.h $(htslib_sam_h) $(htslib_kstring_h) +test/pileup_mod.o: test/pileup_mod.c config.h $(htslib_sam_h) test/plugins-dlhts.o: test/plugins-dlhts.c config.h test/sam.o: 
test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_khash_h) $(htslib_hts_log_h) test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hfile_internal_h) test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h) test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h) test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h) +test/test_mod.o: test/test_mod.c config.h $(htslib_sam_h) test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_sam_h) test/test_realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(textutils_internal_h) diff --git a/htslib/sam.h b/htslib/sam.h index a80c89bb1..27690f7a7 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -2101,6 +2101,137 @@ correct thing to do. It would be wise to avoid this situation if possible. HTSLIB_EXPORT int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag); +// --------------------------- +// Base modification retrieval + +/*! @typedef + @abstract Holds a single base modification. + @field modified_base The short base code (m, h, etc) or -ChEBI (negative) + @field canonical_base The canonical base referred to in the MM tag. + One of A, C, G, T or N. Note this may not be the + explicit base recorded in the SEQ column (esp. if N). + @field stran 0 or 1, indicating + or - strand from MM tag. + @field qual Quality code (256*probability), or -1 if unknown + + @discussion + Note this doesn't hold any location data or information on which other + modifications may be possible at this site. +*/ +typedef struct hts_base_mod { + int modified_base; + int canonical_base; + int strand; + int qual; +} hts_base_mod; + +/*! @typedef + @abstract An opaque type used for caching base modification state between + successive calls to bam_mods_* functions. 
+*/ +typedef struct hts_base_mod_state hts_base_mod_state; + +/// Allocates an hts_base_mode_state. +/** + * @return An hts_base_mode_state pointer on success, + * NULL on failure. + * + * This just allocates the memory. The initialisation of the contents is + * done using bam_parse_basemod. Successive calls may be made to that + * without the need to free and allocate a new state. + * + * The state be destroyed using the hts_base_mode_state_free function. + */ +HTSLIB_EXPORT +hts_base_mod_state *hts_base_mod_state_alloc(void); + +/// Destroys an hts_base_mode_state. +/** + * @param state The base modification state pointer. + * + * The should have previously been created by hts_base_mode_state_alloc. + */ +HTSLIB_EXPORT +void hts_base_mod_state_free(hts_base_mod_state *state); + +/// Parses the Mm and Ml tags out of a bam record. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @return 0 on success, + * -1 on failure. + * + * This fills out the contents of the modification state, resetting the + * iterator location to the first sequence base. + */ +HTSLIB_EXPORT +int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state); + +/// Returns modification status for the next base position in the query seq. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param mods A supplied array for returning base modifications + * @param n_mods The size of the mods array + * @return The number of modifications found on success, + * -1 on failure. + * + * This is intended to be used as an iterator, with one call per location + * along the query sequence. + * + * If no modifications are found, the returned value is zero. + * If more than n_mods modifications are found, the total found is returned. + * Note this means the caller needs to check whether this is higher than + * n_mods. 
+ */ +HTSLIB_EXPORT +int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods); + +/// Finds the next location containing base modifications and returns them +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param mods A supplied array for returning base modifications + * @param n_mods The size of the mods array + * @return The number of modifications found on success, + * 0 if no more modifications are present, + * -1 on failure. + * + * Unlike bam_mods_at_next_pos this skips ahead to the next site + * with modifications. + * + * If more than n_mods modifications are found, the total found is returned. + * Note this means the caller needs to check whether this is higher than + * n_mods. + */ +HTSLIB_EXPORT +int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods, int *pos); + +/// Returns modification status for a specific query position. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param mods A supplied array for returning base modifications + * @param n_mods The size of the mods array + * @return The number of modifications found on success, + * -1 on failure. + * + * Note if called multipled times, qpos must be higher than the previous call. + * Hence this is suitable for use from a pileup iterator. If more random + * access is required, bam_parse_basemod must be called each time to reset + * the state although this has an efficiency cost. + * + * If no modifications are found, the returned value is zero. + * If more than n_mods modifications are found, the total found is returned. + * Note this means the caller needs to check whether this is higher than + * n_mods. 
+ */ +HTSLIB_EXPORT +int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods); + + #ifdef __cplusplus } #endif diff --git a/sam.c b/sam.c index 9e6511f8f..747cd4779 100644 --- a/sam.c +++ b/sam.c @@ -5832,3 +5832,430 @@ void bam_mplp_destructor(bam_mplp_t iter, } #endif // ~!defined(BAM_NO_PILEUP) + +// --------------------------- +// Base Modification retrieval +// +// These operate by recording state in an opaque type, allocated and freed +// via the functions below. +// +// Initially we call bam_parse_basemod to process the tags and record the +// modifications in the state structure, and then functions such as +// bam_next_basemod can iterate over this cached state. + +/* + * Base modification are stored in MM/Mm tags as defined as + * + * ::= | "" + * ::= + * + * ::= "A" | "C" | "G" | "T" | "N". + * + * ::= "+" | "-". + * + * ::= | + * ::= | + * ::= + * ::= + * + * ::= "," | ";" + * + * We do not allocate additional memory other than the fixed size + * state, thus we track up to 256 pointers to different locations + * within the MM and ML tags. Each pointer is for a distinct + * modification code (simple or ChEBI), meaning some may point to the + * same delta-list when multiple codes are combined together + * (e.g. "C+mh,1,5,18,3;"). This is the MM[] array. + * + * Each numeric in the delta-list is tracked in MMcount[], counted + * down until it hits zero in which case the next delta is fetched. + * + * ML array similarly holds the locations in the quality (ML) tag per + * type, but these are interleaved so C+mhfc,10,15 will have 4 types + * all pointing to the same delta position, but in ML we store + * Q(m0)Q(h0)Q(f0)Q(c0) followed by Q(m1)Q(h1)Q(f1)Q(c1). This ML + * also has MLstride indicating how many positions along ML to jump + * each time we consume a base. (4 in our above example, but usually 1 + * for the simple case). 
+ * + * One complexity of the base modification system is that mods are + * always stored in the original DNA orientation. This is so that + * tools that may reverse-complement a sequence (eg "samtools fastq -T + * MM,ML") can pass through these modification tags irrespective of + * whether they have any knowledge of their internal workings. + * + * Because we don't wish to allocate extra memory, we cannot simply + * reverse the MM and ML tags. Sadly this means we have to manage the + * reverse complementing ourselves on-the-fly. + * For reversed reads we start at the right end of MM and no longer + * stop at the semicolon. Instead we use MMend[] array to mark the + * termination point. + */ +#define MAX_BASE_MOD 256 +struct hts_base_mod_state { + int type[MAX_BASE_MOD]; // char or minus-CHEBI + int canonical[MAX_BASE_MOD];// canonical base, as seqi (1,2,4,8,15) + char strand[MAX_BASE_MOD]; // strand of modification; + or - + int MMcount[MAX_BASE_MOD]; // no. canonical bases left until next mod + char *MM[MAX_BASE_MOD]; // next pos delta (string) + char *MMend[MAX_BASE_MOD]; // end of pos-delta string + uint8_t *ML[MAX_BASE_MOD]; // next qual + int MLstride[MAX_BASE_MOD]; // bytes between quals for this type + int seq_pos; // current position along sequence + int nmods; // used array size (0 to MAX_BASE_MOD-1). 
+}; + +hts_base_mod_state *hts_base_mod_state_alloc(void) { + return calloc(1, sizeof(hts_base_mod_state)); +} + +void hts_base_mod_state_free(hts_base_mod_state *state) { + free(state); +} + +/* + * Count frequency of A, C, G, T and N canonical bases in the sequence + */ +static void seq_freq(const bam1_t *b, int freq[16]) { + int i; + + memset(freq, 0, 16*sizeof(*freq)); + uint8_t *seq = bam_get_seq(b); + for (i = 0; i < b->core.l_qseq; i++) + freq[bam_seqi(seq, i)]++; + freq[15] = b->core.l_qseq; // all bases count as N for base mods +} + +//0123456789ABCDEF +//=ACMGRSVTWYHKDBN aka seq_nt16_str[] +//=TGKCYSBAWRDMHVN comp1ement of seq_nt16_str +//084C2A6E195D3B7F +static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; + +/* + * Parse the MM and ML tags to populate the base mod state. + * This structure will have been previously allocated via + * hts_base_mod_state_alloc, but it does not need to be repeatedly + * freed and allocated for each new bam record. (Although obviously + * it requires a new call to this function.) + * + */ +int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { + // Read MM and ML tags + uint8_t *mm = bam_aux_get(b, "MM"); + if (!mm) mm = bam_aux_get(b, "Mm"); + if (!mm) + return 0; + if (mm[0] != 'Z') { + hts_log_error("MM tag is not of type Z"); + return -1; + } + + uint8_t *ml = bam_aux_get(b, "ML"); + if (!ml) ml = bam_aux_get(b, "Ml"); + if (ml && (ml[0] != 'B' || ml[1] != 'C')) { + hts_log_error("ML tag is not of type B,C"); + return -1; + } + uint8_t *ml_end = ml ? 
ml+6 + le_to_u32(ml+2) : NULL; + if (ml) ml += 6; + + state->seq_pos = 0; + + // Aggregate freqs of ACGTN if reversed, to get final-delta (later) + int freq[16]; + if (b->core.flag & BAM_FREVERSE) + seq_freq(b, freq); + + char *cp = (char *)mm+1; + int mod_num = 0; + while (*cp) { + for (; *cp; cp++) { + // cp should be [ACGTNU][+-][^,]*(,\d+)*; + unsigned char btype = *cp++; + + if (btype != 'A' && btype != 'C' && + btype != 'G' && btype != 'T' && + btype != 'U' && btype != 'N') + return -1; + if (btype == 'U') btype = 'T'; + + btype = seq_nt16_table[btype]; + + // Strand + if (*cp != '+' && *cp != '-') + return -1; // malformed + char strand = *cp++; + + // List of modification types + char *ms = cp, *me; // mod code start and end + char *cp_end = NULL; + int chebi = 0; + if (isdigit(*cp)) { + chebi = strtol(cp, &cp_end, 10); + cp = cp_end; + ms = cp-1; + } else { + while (*cp && *cp != ',' && *cp != ';') + cp++; + if (*cp == '\0') + return -1; + } + me = cp; + + long delta; + int n = 0; // nth symbol in a multi-mod string + int stride = me-ms; + int ndelta = 0; + + if (b->core.flag & BAM_FREVERSE) { + // We process the sequence in left to right order, + // but delta is successive count of bases to skip + // counting right to left. This also means the number + // of bases to skip at left edge is unrecorded (as it's + // the remainder). + // + // To output mods in left to right, we step through the + // MM list in reverse and need to identify the left-end + // "remainder" delta. + int total_seq = 0; + for (;;) { + cp += (*cp == ','); + if (*cp == 0 || *cp == ';') + break; + + delta = strtol(cp, &cp_end, 10); + if (cp_end == cp) { + hts_log_error("Hit end of MM tag. Missing semicolon?"); + return -1; + } + + cp = cp_end; + total_seq += delta+1; + ndelta++; + } + delta = freq[seqi_rc[btype]] - total_seq; // remainder + } else { + delta = *cp == ',' + ? 
strtol(cp+1, &cp_end, 10) + : 0; + if (!cp_end) { + // empty list + delta = INT_MAX; + cp_end = cp+1; + } + } + // Now delta is first in list or computed remainder, + // and cp_end is either start or end of the MM list. + while (ms < me) { + state->type [mod_num] = chebi ? -chebi : *ms; + state->strand [mod_num] = (strand == '-'); + state->canonical[mod_num] = btype; + state->MLstride [mod_num] = stride; + + state->MMcount [mod_num] = delta; + if (b->core.flag & BAM_FREVERSE) { + state->MM [mod_num] = cp+1; + state->MMend[mod_num] = cp_end; + state->ML [mod_num] = ml ? ml+n +(ndelta-1)*stride: NULL; + } else { + state->MM [mod_num] = cp_end; + state->MMend[mod_num] = NULL; + state->ML [mod_num] = ml ? ml+n : NULL; + } + + if (++mod_num >= MAX_BASE_MOD) { + hts_log_error("Too many base modification types"); + return -1; + } + ms++; n++; + } + + // Skip modification deltas + if (ml) { + if (b->core.flag & BAM_FREVERSE) { + ml += ndelta*stride; + } else { + while (*cp && *cp != ';') { + if (*cp == ',') + ml+=stride; + cp++; + } + } + if (ml > ml_end) { + hts_log_error("Insufficient number of entries in ML tag"); + return -1; + } + } else { + // cp_end already known if FREVERSE + if (cp_end && (b->core.flag & BAM_FREVERSE)) + cp = cp_end; + else + while (*cp && *cp != ';') + cp++; + } + if (!*cp) { + hts_log_error("Hit end of MM tag. Missing semicolon?"); + return -1; + } + } + } + + state->nmods = mod_num; + + return 0; +} + +/* + * Fills out mods[] with the base modifications found. + * Returns the number found (0 if none), which may be more than + * the size of n_mods if more were found than reported. + * Returns <= -1 on error. + * + * This always marches left to right along sequence, irrespective of + * reverse flag or modification strand. 
+ */ +int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods) { + if (b->core.flag & BAM_FREVERSE) { + if (state->seq_pos < 0) + return -1; + } else { + if (state->seq_pos >= b->core.l_qseq) + return -1; + } + + int i, j, n = 0; + unsigned char base = bam_seqi(bam_get_seq(b), state->seq_pos); + state->seq_pos++; + if (b->core.flag & BAM_FREVERSE) + base = seqi_rc[base]; + + for (i = 0; i < state->nmods; i++) { + if (state->canonical[i] != base && state->canonical[i] != 15/*N*/) + continue; + + if (state->MMcount[i]-- > 0) + continue; + + char *MMptr = state->MM[i]; + if (n < n_mods) { + mods[n].modified_base = state->type[i]; + mods[n].canonical_base = seq_nt16_str[state->canonical[i]]; + mods[n].strand = state->strand[i]; + mods[n].qual = state->ML[i] ? *state->ML[i] : -1; + } + n++; + if (state->ML[i]) + state->ML[i] += (b->core.flag & BAM_FREVERSE) + ? -state->MLstride[i] + : +state->MLstride[i]; + + if (b->core.flag & BAM_FREVERSE) { + // process MM list backwards + char *cp; + for (cp = state->MMend[i]-1; cp != state->MM[i]; cp--) + if (*cp == ',') + break; + state->MMend[i] = cp; + if (cp != state->MM[i]) + state->MMcount[i] = strtol(cp+1, NULL, 10); + else + state->MMcount[i] = INT_MAX; + } else { + if (*state->MM[i] == ',') + state->MMcount[i] = strtol(state->MM[i]+1, &state->MM[i], 10); + else + state->MMcount[i] = INT_MAX; + } + + // Multiple mods at the same coords. + for (j=i+1; j < state->nmods && state->MM[j] == MMptr; j++) { + if (n < n_mods) { + mods[n].modified_base = state->type[j]; + mods[n].canonical_base = seq_nt16_str[state->canonical[j]]; + mods[n].strand = state->strand[j]; + mods[n].qual = state->ML[j] ? *state->ML[j] : -1; + } + n++; + state->MMcount[j] = state->MMcount[i]; + state->MM[j] = state->MM[i]; + if (state->ML[j]) + state->ML[j] += (b->core.flag & BAM_FREVERSE) + ? 
-state->MLstride[j] + : +state->MLstride[j]; + } + i = j-1; + } + + return n; +} + +/* + * Looks for the next location with a base modification. + */ +int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods, int *pos) { + if (state->seq_pos >= b->core.l_qseq) + return 0; + + // Look through state->MMcount arrays to see when the next lowest is + // per base type; + int next[16], freq[16] = {0}, i; + memset(next, 0x7f, 16*sizeof(*next)); + if (b->core.flag & BAM_FREVERSE) { + for (i = 0; i < state->nmods; i++) { + if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i]) + next[seqi_rc[state->canonical[i]]] = state->MMcount[i]; + } + } else { + for (i = 0; i < state->nmods; i++) { + if (next[state->canonical[i]] > state->MMcount[i]) + next[state->canonical[i]] = state->MMcount[i]; + } + } + + // Now step through the sequence counting off base types. + for (i = state->seq_pos; i < b->core.l_qseq; i++) { + unsigned char bc = bam_seqi(bam_get_seq(b), i); + if (next[bc] <= freq[bc] || next[15] <= freq[15]) + break; + freq[bc]++; + if (bc != 15) // N + freq[15]++; + } + *pos = state->seq_pos = i; + + if (i >= b->core.l_qseq) + return 0; + + if (b->core.flag & BAM_FREVERSE) { + for (i = 0; i < state->nmods; i++) + state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]]; + } else { + for (i = 0; i < state->nmods; i++) + state->MMcount[i] -= freq[state->canonical[i]]; + } + + int r = bam_mods_at_next_pos(b, state, mods, n_mods); + return r > 0 ? r : 0; +} + +/* + * As per bam_mods_at_next_pos, but at a specific qpos >= the previous qpos. + * This can only march forwards along the read, but can do so by more than + * one base-pair. + * + * This makes it useful for calling from pileup iterators where qpos may + * start part way through a read for the first occurrence of that record. 
+ */ +int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods) { + // FIXME: for now this is ineffecient in implementation. + int r = 0; + while (state->seq_pos <= qpos) + if ((r = bam_mods_at_next_pos(b, state, mods, n_mods)) < 0) + break; + + return r; +} diff --git a/test/base_mods/MM-chebi.out b/test/base_mods/MM-chebi.out new file mode 100644 index 000000000..663a682df --- /dev/null +++ b/test/base_mods/MM-chebi.out @@ -0,0 +1,47 @@ +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m102 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n212 +16 A +17 C C+m128 +18 G +19 C C+(76792)161 +20 C C+m153 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m179 +32 A +33 C +34 C C+m204 C+(76792)187 +35 A +--- +6 C C+m102 +15 N N+n212 +17 C C+m128 +19 C C+(76792)161 +20 C C+m153 +31 C C+m179 +34 C C+m204 C+(76792)187 + +=== + diff --git a/test/base_mods/MM-chebi.sam b/test/base_mods/MM-chebi.sam new file mode 100644 index 000000000..62920ecc1 --- /dev/null +++ b/test/base_mods/MM-chebi.sam @@ -0,0 +1,2 @@ +@CO Separate m, h and N modifications +* 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+76792,6,7;N+n,15; Ml:B:C,102,128,153,179,204,161,187,212,169 diff --git a/test/base_mods/MM-double.out b/test/base_mods/MM-double.out new file mode 100644 index 000000000..82d086a2f --- /dev/null +++ b/test/base_mods/MM-double.out @@ -0,0 +1,47 @@ +0 A +1 G G-m115 +2 G +3 A +4 T +5 C +6 T +7 C C+m128 +8 T +9 A +10 G +11 C +12 G G-m141 +13 G G-m166 G+o102 +14 A +15 T +16 C +17 G +18 G +19 C +20 G +21 G +22 G G-m192 +23 G +24 G +25 A +26 T +27 A +28 T +29 G +30 C C+m153 +31 C C+m179 +32 A +33 T +34 A +35 T +--- +1 G G-m115 +7 C C+m128 +12 G G-m141 +13 G G-m166 G+o102 +22 G G-m192 +30 C C+m153 +31 C C+m179 + +=== + diff --git a/test/base_mods/MM-double.sam b/test/base_mods/MM-double.sam new file mode 100644 index 000000000..608516fc1 --- /dev/null +++ b/test/base_mods/MM-double.sam @@ -0,0 +1,3 @@ 
+@CO Modifications called on both strands of the same record, +@CO including potentially at the same location simultaneously. +* 0 * 0 0 * * 0 0 AGGATCTCTAGCGGATCGGCGGGGGATATGCCATAT * Mm:Z:C+m,1,3,0;G-m,0,2,0,4;G+o,4; Ml:B:C,128,153,179,115,141,166,192,102 diff --git a/test/base_mods/MM-multi.out b/test/base_mods/MM-multi.out new file mode 100644 index 000000000..23c98d97b --- /dev/null +++ b/test/base_mods/MM-multi.out @@ -0,0 +1,95 @@ +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m128 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n215 +16 A +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m204 +32 A +33 C +34 C C+m230 C+h6 +35 A +--- +6 C C+m128 +15 N N+n215 +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +31 C C+m204 +34 C C+m230 C+h6 + +=== + +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m77 C+h159 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n240 +16 A +17 C C+m103 C+h133 +18 G +19 C C+m128 C+h108 +20 C C+m154 C+h82 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m179 C+h57 +32 A +33 C +34 C C+m204 C+h31 +35 A +--- +6 C C+m77 C+h159 +15 N N+n240 +17 C C+m103 C+h133 +19 C C+m128 C+h108 +20 C C+m154 C+h82 +31 C C+m179 C+h57 +34 C C+m204 C+h31 + +=== + diff --git a/test/base_mods/MM-multi.sam b/test/base_mods/MM-multi.sam new file mode 100644 index 000000000..b2259a09e --- /dev/null +++ b/test/base_mods/MM-multi.sam @@ -0,0 +1,7 @@ +@CO Testing multiple m, h and N modifications on the same read. +@CO r1 has them separated out. +@CO r2 has them combined together, for example as produced by +@CO a joint basecaller which assigns probabilities to all +@CO trained events simultaneously. 
+r1 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r2 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+mh,2,2,0,0,4,1;N+n,15; Ml:B:C,77,159,103,133,128,108,154,82,179,57,204,31,240 diff --git a/test/base_mods/MM-orient.sam b/test/base_mods/MM-orient.sam new file mode 100644 index 000000000..363e7c2be --- /dev/null +++ b/test/base_mods/MM-orient.sam @@ -0,0 +1,6 @@ +@CO Testing mods on top and bottom strand, but also in +@CO original vs reverse-complemented orientation +top-fwd 0 * 0 0 * * 0 0 AGGATCTCTAGCGGATCGGCGGGGGATATGCCATAT * Mm:Z:C+m,1,3,0; Ml:B:C,128,153,179 +top-rev 16 * 0 0 * * 0 0 ATATGGCATATCCCCCGCCGATCCGCTAGAGATCCT * Mm:Z:C+m,1,3,0; Ml:B:C,128,153,179 +bot-fwd 0 * 0 0 * * 0 0 AGGATCTCTAGCGGATCGGCGGGGGATATGCCATAT * Mm:Z:G-m,0,0,4,3; Ml:B:C,115,141,166,192 +bot-rev 16 * 0 0 * * 0 0 ATATGGCATATCCCCCGCCGATCCGCTAGAGATCCT * Mm:Z:G-m,0,0,4,3; Ml:B:C,115,141,166,192 diff --git a/test/base_mods/MM-pileup.out b/test/base_mods/MM-pileup.out new file mode 100644 index 000000000..8c603d661 --- /dev/null +++ b/test/base_mods/MM-pileup.out @@ -0,0 +1,36 @@ +I 0 AA DD +I 1 GG FF +I 2 CC ?? +I 3 TTT GGG +I 4 CCC CCC +I 5 TTT HHH +I 6 C[+m128]C[-m128]C[+m128] 888 +I 7 CCC 888 +I 8 AAA ... +I 9 GGG EEE +I 10 AAAA GGGG +I 11 GGGG 8888 +I 12 TTTT .... +I 13 CCCC 7777 +I 14 GGGG @@@@ +I 15 N[+n215]N[-n215]N[+n215]N[+n215] EEEE +I 16 AAAA 9999 +I 17 C[+m153]C[-m153]C[+m153]C[+m153+h0] GGGG +I 18 G[+n240]G[-n240]G[+n240]G[+n240] 8888 +I 19 C[+h159]C[-h159]C[+h159]C[+m0+h159] AAAA +I 20 C[+m179]C[-m179]C[+m179]C[+m179+h0] ???? +I 21 AAAA HHHH +I 22 TTTT 9999 +I 23 YYYY .... +I 24 CCCC :::: +I 25 GGGG CCCC +I 26 CCCC ???? 
+I 27 GGGG 8888 +I 28 CCCC ,,,, +I 29 GGGG @@@@ +I 30 CCC ,,, +I 31 C[+m204]C[-m204]C[+m204] ,,, +I 32 AAA 999 +I 33 CCC FFF +I 34 C[+m230+h6]C[-m230-h6]C[+m230+h6] @@@ +I 35 AAA AAA diff --git a/test/base_mods/MM-pileup.sam b/test/base_mods/MM-pileup.sam new file mode 100644 index 000000000..0021c7271 --- /dev/null +++ b/test/base_mods/MM-pileup.sam @@ -0,0 +1,5 @@ +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 +r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 diff --git a/test/base_mods/MM-pileup2.out b/test/base_mods/MM-pileup2.out new file mode 100644 index 000000000..51e6973d3 --- /dev/null +++ b/test/base_mods/MM-pileup2.out @@ -0,0 +1,36 @@ +I 0 AA D~ +I 1 GG F~ +I 2 CC ?~ +I 3 TT G~ +I 4 CC C~ +I 5 TT H~ +I 6 C[+m128]C[+m77+h159] 8~ +I 7 CC 8~ +I 8 AA .~ +I 9 GG E~ +I 10 AA G~ +I 11 GG 8~ +I 12 TT .~ +I 13 CC 7~ +I 14 GG @~ +I 15 NN[+n240] E~ +I 16 AA 9~ +I 17 C[+m153]C[+m103+h133] G~ +I 18 GG 8~ +I 19 C[+(76792)159]C[+m128+h108] A~ +I 20 C[+m179]C[+m154+h82] ?~ +I 21 AA H~ +I 22 TT 9~ +I 23 YY .~ +I 24 CC :~ +I 25 GG C~ +I 26 CC ?~ +I 27 GG 8~ +I 28 CC ,~ +I 29 GG @~ +I 30 CC ,~ +I 31 C[+m204]C[+m179+h57] ,~ +I 32 AA 9~ +I 33 CC F~ +I 34 C[+m230+(76792)6]C[+m204+h31] @~ +I 35 AA A~ diff --git a/test/base_mods/MM-pileup2.sam b/test/base_mods/MM-pileup2.sam new file mode 100644 index 000000000..7aa9ab3ab --- /dev/null +++ 
b/test/base_mods/MM-pileup2.sam @@ -0,0 +1,4 @@ +@CO With added CHeBI codes and mod-code combining. +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+76792,6,7; Ml:B:C,128,153,179,204,230,159,6 +r2 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+mh,2,2,0,0,4,1;N+n,15; Ml:B:C,77,159,103,133,128,108,154,82,179,57,204,31,240 diff --git a/test/base_mods/base-mods.sh b/test/base_mods/base-mods.sh new file mode 100755 index 000000000..0f67f927f --- /dev/null +++ b/test/base_mods/base-mods.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# +# Copyright (C) 2020 Genome Research Ltd. +# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Load in the test driver +. ../simple_test_driver.sh + +echo "Testing base-modifications..." + +test_mod="../test_mod" +pileup_mod="../pileup_mod" + +test_driver $@ + +exit $? 
diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst new file mode 100644 index 000000000..865a539c7 --- /dev/null +++ b/test/base_mods/base-mods.tst @@ -0,0 +1,42 @@ +# Copyright (C) 2020 Genome Research Ltd. +# +# Author: James Bonfield +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# First field: +# INIT = initialisation, not counted in testing +# P = expected to pass +# F = expected to fail + +# Second field: +# Filename of expected output + +# Third onwards; command to execute. $fmt is replaced by the current file +# format, ie sam, bam or cram. $samtools is a pointer to the desired +# samtools binary. This can be useful for testing older versions. 
+ +# Test files from SAM spec +P MM-chebi.out $test_mod MM-chebi.sam +P MM-double.out $test_mod MM-double.sam +P MM-multi.out $test_mod MM-multi.sam + +# Pileup testing +P MM-pileup.out $pileup_mod < MM-pileup.sam +P MM-pileup2.out $pileup_mod < MM-pileup2.sam diff --git a/test/pileup_mod.c b/test/pileup_mod.c new file mode 100644 index 000000000..7a55769e2 --- /dev/null +++ b/test/pileup_mod.c @@ -0,0 +1,223 @@ +/* test/pileup_mod.c -- simple pileup tester with base modifications + + Copyright (C) 2020 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include +#include "../htslib/sam.h" + +typedef struct { + samFile *fp; + sam_hdr_t *h; +} plp_dat; + +static int readaln(void *data, bam1_t *b) { + plp_dat *dat = (plp_dat *)data; + return sam_read1(dat->fp, dat->h, b); +} + +#ifndef MIN +# define MIN(a,b) ((a)<(b)?(a):(b)) +#endif + +// No modification reporting. 
+// This is just a simple base-line for comparison against mod_pileup1 for +// performance testing. +void process_pileup(sam_hdr_t *h, const bam_pileup1_t *p, + int tid, int pos, int n) { + kstring_t s = {0,0}; + printf("%s\t%d\t", sam_hdr_tid2name(h, tid), pos); + int i; + for (i = 0; i < n; i++, p++) { + if (p->is_del) { + putchar('*'); + continue; + } + + uint8_t *seq = bam_get_seq(p->b); + uint8_t *qual = bam_get_qual(p->b); + unsigned char c = seq_nt16_str[bam_seqi(seq, p->qpos)]; + putchar(c); + kputc(MIN('~','!'+qual[p->qpos]), &s); + } + putchar('\t'); + puts(s.l ? s.s : ""); + + free(s.s); +} + +// Initialise and destroy the base modifier state data. This is called +// as each new read is added or removed from the pileups. +int pileup_cd_create(void *data, const bam1_t *b, bam_pileup_cd *cd) { + hts_base_mod_state *m = hts_base_mod_state_alloc(); + bam_parse_basemod(b, m); + cd->p = m; + return 0; +} + +int pileup_cd_destroy(void *data, const bam1_t *b, bam_pileup_cd *cd) { + hts_base_mod_state_free(cd->p); + return 0; +} + +// Report a line of pileup, including base modifications inline with +// the sequence (including qualities), as [...] 
+void process_mod_pileup1(sam_hdr_t *h, const bam_pileup1_t *p, + int tid, int pos, int n) { + kstring_t s = {0,0}; + printf("%s\t%d\t", sam_hdr_tid2name(h, tid), pos); + int i; + for (i = 0; i < n; i++, p++) { + if (p->is_del) { + putchar('*'); + continue; + } + + uint8_t *seq = bam_get_seq(p->b); + uint8_t *qual = bam_get_qual(p->b); + unsigned char c = seq_nt16_str[bam_seqi(seq, p->qpos)]; + putchar(c); + kputc(MIN('~','!'+qual[p->qpos]), &s); + + // Simple mod detection; assumes at most 5 mods + hts_base_mod_state *m = p->cd.p; + hts_base_mod mod[5]; + int nm; + if ((nm = bam_mods_at_qpos(p->b, p->qpos, m, mod, 5)) > 0) { + int j; + putchar('['); + for (j = 0; j < nm && j < 5; j++) { + if (mod[j].modified_base < 0) + // ChEBI + printf("%c(%d)%d", "+-"[mod[j].strand], + -mod[j].modified_base, mod[j].qual); + else + printf("%c%c%d", "+-"[mod[j].strand], + mod[j].modified_base, mod[j].qual); + } + putchar(']'); + } + } + putchar('\t'); + puts(s.l ? s.s : ""); + + free(s.s); +} + +// Report a line of pileup, including base modifications. +// This replaces the base with the mod call (NB this can be confusing +// as both C and G can map to m depending on orientation). +// It also reports qualities in the QUAl column, remapped to +// phred scale as only one single mod is supported and hence extreme +// unlikely probabilities shouldn't be reported (although we don't +// scan to pick the highest). 
+void process_mod_pileup2(sam_hdr_t *h, const bam_pileup1_t *p, + int tid, int pos, int n) { + kstring_t s = {0,0}; + printf("%s\t%d\t%d\t", sam_hdr_tid2name(h, tid), pos, n); + int i; + for (i = 0; i < n; i++, p++) { + if (p->is_del) { + putchar('*'); + continue; + } + + uint8_t *seq = bam_get_seq(p->b); + uint8_t *qual = bam_get_qual(p->b); + unsigned char c = seq_nt16_str[bam_seqi(seq, p->qpos)]; + + // Simple mod detection; assumes at most 2 non-ChEBI mods + hts_base_mod_state *m = p->cd.p; + int n, is_rev = bam_is_rev(p->b); + hts_base_mod mod; + char base; + uint8_t q = qual[p->qpos]; + if ((n = bam_mods_at_qpos(p->b, p->qpos, m, &mod, 1)) > 0) { + base = mod.modified_base; + // base mod as phred scale + q = -10 * log10(1-((mod.qual+0.5)/256)) + 0.5; + } else { + base = c; + } + + // Case is inappropriate here as some mods (eg "a") are lc. + // So we dim/bold them instead using ANSI escape codes. + // It's a test script, so I'm not going to care about curses. + if (is_rev) { + printf("\033[2m%c\033[0m", base); + } else { + printf("\033[1m%c\033[0m", base); + } + kputc(MIN('~','!'+q), &s); + } + putchar('\t'); + puts(s.l ? s.s : ""); + + free(s.s); +} + +int main(int argc, char **argv) { + int compact = 0; + while (argc > 1 && strcmp(argv[1], "-c") == 0) { + compact++; + argc--; + argv++; + } + + samFile *in = sam_open(argc > 1 ? 
argv[1] : "-", "r"); + bam1_t *b = bam_init1(); + sam_hdr_t *h = sam_hdr_read(in); + + // Pileup iterator with constructor/destructor to parse base mod tags + plp_dat dat = { + .fp = in, + .h = h, + }; + bam_plp_t iter = bam_plp_init(readaln, &dat); + bam_plp_constructor(iter, pileup_cd_create); + bam_plp_destructor(iter, pileup_cd_destroy); + + const bam_pileup1_t *p; + int tid, pos, n; + while ((p = bam_plp_auto(iter, &tid, &pos, &n)) != 0) { + switch (compact) { + case 0: + process_mod_pileup1(h, p, tid, pos, n); + break; + case 1: + process_mod_pileup2(h, p, tid, pos, n); + break; + default: + process_pileup(h, p, tid, pos, n); + break; + } + } + bam_plp_destroy(iter); + + sam_close(in); + bam_destroy1(b); + sam_hdr_destroy(h); + + return 0; +} diff --git a/test/simple_test_driver.sh b/test/simple_test_driver.sh index 7e4a3bdc2..9ab412511 100644 --- a/test/simple_test_driver.sh +++ b/test/simple_test_driver.sh @@ -3,7 +3,8 @@ # # Copyright (C) 2017-2018 Genome Research Ltd. # -# Author: Robert Davies +# Author: James Bonfield +# Robert Davies # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal diff --git a/test/test_mod.c b/test/test_mod.c new file mode 100644 index 000000000..80b7181a8 --- /dev/null +++ b/test/test_mod.c @@ -0,0 +1,127 @@ +/* test/test_mod.c -- testing of base modification functions + + Copyright (C) 2020 Genome Research Ltd. 
+ + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#include +#include + +#include "../htslib/sam.h" + +static char *code(int id) { + static char code[20]; + if (id > 0) { + code[0] = id; + code[1] = 0; + } else { + sprintf(code, "(%d)", -id); + } + + return code; +} + +int main(int argc, char **argv) { + char out[1024] = {0}; + if (argc < 2) + return 1; + + samFile *in = sam_open(argv[1], "r"); + if (!in) + return 1; + + bam1_t *b = bam_init1(); + sam_hdr_t *h = sam_hdr_read(in); + hts_base_mod_state *m = hts_base_mod_state_alloc(); + if (!h || !b || !m) + goto err; + + int r; + while ((r = sam_read1(in, h, b)) >= 0) { + if (bam_parse_basemod(b, m) < 0) { + fprintf(stderr, "Failed to parse MM/ML aux tags\n"); + goto err; + } + + // per-base iterator + int i, j, n; + hts_base_mod mods[5]; + for (i = 0; i < b->core.l_qseq; i++) { + char line[8192], *lp = line; + n = bam_mods_at_next_pos(b, m, mods, 5); + lp += sprintf(lp, "%d\t%c\t", + i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); + for (j = 0; j < n && j < 5; j++) + lp += sprintf(lp, "%c%c%s%d ", + mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + mods[j].qual); + *lp++ = '\n'; + *lp++ = 0; + + if (argc > 1) + printf("%s", line); + else + strcat(out, line); + } + + if (argc > 1) puts("---"); + + bam_parse_basemod(b, m); + + int pos; + while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) { + char line[8192]={0}, *lp = line; + lp += sprintf(lp, "%d\t%c\t", pos, + seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); + for (j = 0; j < n && j < 5; j++) + lp += sprintf(lp, "%c%c%s%d ", + mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + mods[j].qual); + *lp++ = '\n'; + *lp++ = 0; + + if (argc > 1) + printf("%s", line); + else + strcat(out, line); + } + + if (argc > 1) puts("\n===\n"); + } + fflush(stdout); + if (sam_close(in) != 0 || r < -1) + goto err; + + bam_destroy1(b); + sam_hdr_destroy(h); + hts_base_mod_state_free(m); + return 0; + + err: + bam_destroy1(b); + sam_hdr_destroy(h); + 
hts_base_mod_state_free(m); + return 1; +} From 0b771f468d5f0d6fe925317b7b784d46a5ebc68a Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 28 Aug 2020 12:29:20 +0100 Subject: [PATCH 183/488] Fix copyright notice --- test/base_mods/base-mods.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/base_mods/base-mods.sh b/test/base_mods/base-mods.sh index 0f67f927f..f3f3ca4b7 100755 --- a/test/base_mods/base-mods.sh +++ b/test/base_mods/base-mods.sh @@ -2,7 +2,7 @@ # # Copyright (C) 2020 Genome Research Ltd. # -# Author: James Bonfield +# Author: James Bonfield # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal From 8a7cc84642c0e90fed04a8569131acf69a445a5d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 9 Sep 2020 16:43:33 +0100 Subject: [PATCH 184/488] The pileup constructor now handles errors. The API for this always had both constructor and destructor as type int, but it was never documented what this return value is for and nor was it ever checked. Although the ABI hasn't changed, this is an API change as we have added meaning to an unused element. However (hopefully) the new meaning is in line with expectations and library norms. --- htslib/sam.h | 11 +++++++---- sam.c | 11 ++++++++--- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index 27690f7a7..2f7d826f2 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1948,10 +1948,13 @@ typedef struct bam_mplp_s *bam_mplp_t; /** * bam_plp_constructor() - sets a callback to initialise any per-pileup1_t fields. * @plp: The bam_plp_t initialised using bam_plp_init. - * @func: The callback function itself. When called, it is given the - * data argument (specified in bam_plp_init), the bam structure and - * a pointer to a locally allocated bam_pileup_cd union. This union - * will also be present in each bam_pileup1_t created. 
+ * @func: The callback function itself. When called, it is given + * the data argument (specified in bam_plp_init), the bam + * structure and a pointer to a locally allocated + * bam_pileup_cd union. This union will also be present in + * each bam_pileup1_t created. + * The callback function should have a negative return + * value to indicate an error. (Similarly for destructor.) */ HTSLIB_EXPORT void bam_plp_constructor(bam_plp_t plp, diff --git a/sam.c b/sam.c index 747cd4779..3750b582b 100644 --- a/sam.c +++ b/sam.c @@ -5612,9 +5612,14 @@ int bam_plp_push(bam_plp_t iter, const bam1_t *b) iter->error = 1; return -1; } - if (iter->plp_construct) - iter->plp_construct(iter->data, &iter->tail->b, - &iter->tail->cd); + if (iter->plp_construct) { + if (iter->plp_construct(iter->data, &iter->tail->b, + &iter->tail->cd) < 0) { + mp_free(iter->mp, next); + iter->error = 1; + return -1; + } + } if (overlap_push(iter, iter->tail) < 0) { mp_free(iter->mp, next); iter->error = 1; From 17b4d2ec0468e2f67deeb17b328f884f1552150e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 10 Sep 2020 10:09:47 +0100 Subject: [PATCH 185/488] Added bam_plp_insertion_mod API. This is a generalised version of bam_plp_insertion (which now calls this with a NULL argument) that can also include markup for base modifications. Used by samtools mpileup. --- htslib/sam.h | 35 +++++++++++++++++++++++++++++------ sam.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 7 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index 2f7d826f2..66b7ed453 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1979,6 +1979,35 @@ typedef struct bam_mplp_s *bam_mplp_t; HTSLIB_EXPORT int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) HTS_RESULT_USED; + + /*! @typedef + @abstract An opaque type used for caching base modification state between + successive calls to bam_mods_* functions. 
+ */ + typedef struct hts_base_mod_state hts_base_mod_state; + + /// Get pileup padded insertion sequence, including base modifications + /** + * @param p pileup data + * @param m state data for the base modification finder + * @param ins the kstring where the insertion sequence will be written + * @param del_len location for deletion length + * @return the length of insertion string on success; -1 on failure. + * + * Fills out the kstring with the padded insertion sequence for the current + * location in 'p'. If this is not an insertion site, the string is blank. + * + * The modification state needs to have been previously initialised using + * bam_parse_basemod. It is permitted to be passed in as NULL, in which + * case this function outputs identically to bam_plp_insertion. + * + * If del_len is not NULL, the location pointed to is set to the length of + * any deletion immediately following the insertion, or zero if none. + */ + HTSLIB_EXPORT + int bam_plp_insertion_mod(const bam_pileup1_t *p, hts_base_mod_state *m, + kstring_t *ins, int *del_len) HTS_RESULT_USED; + /// Create a new bam_mplp_t structure /** The struct returned by a successful call should be freed * via bam_mplp_destroy() when it is no longer needed. @@ -2127,12 +2156,6 @@ typedef struct hts_base_mod { int qual; } hts_base_mod; -/*! @typedef - @abstract An opaque type used for caching base modification state between - successive calls to bam_mods_* functions. -*/ -typedef struct hts_base_mod_state hts_base_mod_state; - /// Allocates an hts_base_mode_state. /** * @return An hts_base_mode_state pointer on success, diff --git a/sam.c b/sam.c index 3750b582b..4763f5693 100644 --- a/sam.c +++ b/sam.c @@ -5105,10 +5105,14 @@ static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) * Fills out the kstring with the padded insertion sequence for the current * location in 'p'. If this is not an insertion site, the string is blank. 
* + * This variant handles base modifications, but only when "m" is non-NULL. + * * Returns the length of insertion string on success; * -1 on failure. */ -int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { +int bam_plp_insertion_mod(const bam_pileup1_t *p, + hts_base_mod_state *m, + kstring_t *ins, int *del_len) { int j, k, indel; uint32_t *cigar; @@ -5159,6 +5163,34 @@ int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + j - p->is_del)]; ins->s[indel++] = c; + int nm; + hts_base_mod mod[256]; + if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del, + m, mod, 256)) > 0) { + if (ks_resize(ins, ins->l + nm*16+3) < 0) + return -1; + ins->s[indel++] = '['; + int j; + for (j = 0; j < nm; j++) { + char qual[20]; + if (mod[j].qual >= 0) + sprintf(qual, "%d", mod[j].qual); + else + *qual=0; + if (mod[j].modified_base < 0) + // ChEBI + indel += sprintf(&ins->s[indel], "%c(%d)%s", + "+-"[mod[j].strand], + -mod[j].modified_base, + qual); + else + indel += sprintf(&ins->s[indel], "%c%c%s", + "+-"[mod[j].strand], + mod[j].modified_base, + qual); + } + ins->s[indel++] = ']'; + } } break; case BAM_CDEL: @@ -5177,6 +5209,20 @@ int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { return indel; } +/* + * Fills out the kstring with the padded insertion sequence for the current + * location in 'p'. If this is not an insertion site, the string is blank. + * + * This is the original interface with no capability for reporting base + * modifications. + * + * Returns the length of insertion string on success; + * -1 on failure. 
+ */ +int bam_plp_insertion(const bam_pileup1_t *p, kstring_t *ins, int *del_len) { + return bam_plp_insertion_mod(p, NULL, ins, del_len); +} + /*********************** *** Pileup iterator *** ***********************/ From 85cbd8c481cd9446beaf8b097ffcdeb03db0d2c9 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 15 Jul 2021 16:11:48 +0100 Subject: [PATCH 186/488] Change bam_plp_insertion_mod to return number of bases, not length. When we have no base modification, the return value and ins->l are both the same thing (number of bases inserted). When we have base modifications, the old length was string length (and ironically ins->l was incorrectly left as the number of bases). That's not particularly obvious, nor useful. It's now reversed, so ins->l is the length of the string ins->s and the return value is the number of bases. Eg "+3AG[+o99]A" vs "+3AGA" both now return 3. --- htslib/sam.h | 3 ++- sam.c | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index 66b7ed453..bc4f9f42c 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1992,7 +1992,8 @@ typedef struct bam_mplp_s *bam_mplp_t; * @param m state data for the base modification finder * @param ins the kstring where the insertion sequence will be written * @param del_len location for deletion length - * @return the length of insertion string on success; -1 on failure. + * @return the number of insertion string on success, with string length + * being accessable via ins->l; -1 on failure. * * Fills out the kstring with the padded insertion sequence for the current * location in 'p'. If this is not an insertion site, the string is blank. diff --git a/sam.c b/sam.c index 4763f5693..30e6e7a67 100644 --- a/sam.c +++ b/sam.c @@ -5107,13 +5107,14 @@ static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) * * This variant handles base modifications, but only when "m" is non-NULL. 
* - * Returns the length of insertion string on success; + * Returns the number of inserted base on success, with string length being + * accessable via ins->l; * -1 on failure. */ int bam_plp_insertion_mod(const bam_pileup1_t *p, hts_base_mod_state *m, kstring_t *ins, int *del_len) { - int j, k, indel; + int j, k, indel, nb = 0; uint32_t *cigar; if (p->indel <= 0) { @@ -5143,7 +5144,7 @@ int bam_plp_insertion_mod(const bam_pileup1_t *p, } k++; } - ins->l = indel; + nb = ins->l = indel; // Produce sequence if (ks_resize(ins, indel+1) < 0) @@ -5205,8 +5206,9 @@ int bam_plp_insertion_mod(const bam_pileup1_t *p, k++; } ins->s[indel] = '\0'; + ins->l = indel; // string length - return indel; + return nb; // base length } /* From cd9d95326e0603e1e6306c9ca31210c8d2cca7ea Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 4 Aug 2021 15:13:16 +0100 Subject: [PATCH 187/488] Add some error detection to bam_next_basemod. Having an MM tag refer to bases that are beyond the end of the sequence now generates a warning and returns -1 instead of 0 from the function. It's a warning because it's harmless to ignore these, but the caller may wish to check and treat it as a hard error (as the test harness does). Note: this isn't trivial to do for the reverse strand, so (for now) we only check the top strand data. Also fixed a typo in a comment. --- sam.c | 13 +++++++++++-- test/test_mod.c | 2 ++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sam.c b/sam.c index 30e6e7a67..5300e9841 100644 --- a/sam.c +++ b/sam.c @@ -6279,8 +6279,17 @@ int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, } *pos = state->seq_pos = i; - if (i >= b->core.l_qseq) + if (i >= b->core.l_qseq) { + // Check for more MM elements than bases present. 
+ for (i = 0; i < state->nmods; i++) { + if (!(b->core.flag & BAM_FREVERSE) && + state->MMcount[i] < 0x7f000000) { + hts_log_warning("MM tag refers to bases beyond sequence length"); + return -1; + } + } return 0; + } if (b->core.flag & BAM_FREVERSE) { for (i = 0; i < state->nmods; i++) @@ -6304,7 +6313,7 @@ int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, */ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, hts_base_mod *mods, int n_mods) { - // FIXME: for now this is ineffecient in implementation. + // FIXME: for now this is inefficient in implementation. int r = 0; while (state->seq_pos <= qpos) if ((r = bam_mods_at_next_pos(b, state, mods, n_mods)) < 0) diff --git a/test/test_mod.c b/test/test_mod.c index 80b7181a8..aade3733c 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -107,6 +107,8 @@ int main(int argc, char **argv) { else strcat(out, line); } + if (n < 0) + goto err; if (argc > 1) puts("\n===\n"); } From 7ba9ecd79b179bb5cdf6e2322e4a4a880d6915bb Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 9 Aug 2021 17:19:59 +0100 Subject: [PATCH 188/488] Small fix to MM-chebi.sam methylation test data. The format of data was correct, but the values summed to a probability higher than 1. 
As reported in samtools/hts-specs#584 --- test/base_mods/MM-chebi.out | 4 ++-- test/base_mods/MM-chebi.sam | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/base_mods/MM-chebi.out b/test/base_mods/MM-chebi.out index 663a682df..cefdc545c 100644 --- a/test/base_mods/MM-chebi.out +++ b/test/base_mods/MM-chebi.out @@ -32,7 +32,7 @@ 31 C C+m179 32 A 33 C -34 C C+m204 C+(76792)187 +34 C C+m204 C+(76792)33 35 A --- 6 C C+m102 @@ -41,7 +41,7 @@ 19 C C+(76792)161 20 C C+m153 31 C C+m179 -34 C C+m204 C+(76792)187 +34 C C+m204 C+(76792)33 === diff --git a/test/base_mods/MM-chebi.sam b/test/base_mods/MM-chebi.sam index 62920ecc1..0ec8b9ddb 100644 --- a/test/base_mods/MM-chebi.sam +++ b/test/base_mods/MM-chebi.sam @@ -1,2 +1,2 @@ @CO Separate m, h and N modifications -* 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+76792,6,7;N+n,15; Ml:B:C,102,128,153,179,204,161,187,212,169 +* 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+76792,6,7;N+n,15; Ml:B:C,102,128,153,179,204,161,33,212,169 From d67da0ecee170b717b8933e2a50f6d10f2c30c1e Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 10 Aug 2021 19:55:42 +0100 Subject: [PATCH 189/488] Minor tidy ups for methylation test programs Add omitted #include, and ignore new built test programs. 
--- .gitignore | 2 ++ test/pileup_mod.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/.gitignore b/.gitignore index 16a6b288e..1573a5bf7 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ shlib-exports-*.txt /test/hts_endian /test/longrefs/*.tmp.* /test/pileup +/test/pileup_mod /test/plugins-dlhts /test/sam /test/tabix/*.tmp.* @@ -61,6 +62,7 @@ shlib-exports-*.txt /test/test_introspection /test/test_kfunc /test/test_kstring +/test/test_mod /test/test-parse-reg /test/test_realn /test/test-regidx diff --git a/test/pileup_mod.c b/test/pileup_mod.c index 7a55769e2..95c353771 100644 --- a/test/pileup_mod.c +++ b/test/pileup_mod.c @@ -22,6 +22,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include + #include #include #include From ddc4c4d3da1089b6aea2a12212dc1bada6b83e38 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 12 Aug 2021 12:13:40 +0100 Subject: [PATCH 190/488] Pair lines with symbolic alleles by END tag While non-symbolic variation is uniquely identified by POS,REF,ALT, symbolic alleles starting at the same position were undistinguishable. This prevented correct matching of records with the same positions and variant type but different length (INFO/END) A test case is added in bcftools in a separate commit (annotate24.*.vcf) --- bcf_sr_sort.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/bcf_sr_sort.c b/bcf_sr_sort.c index b7299a4d0..01e98bb39 100644 --- a/bcf_sr_sort.c +++ b/bcf_sr_sort.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017-2019 Genome Research Ltd. + Copyright (C) 2017-2021 Genome Research Ltd. 
Author: Petr Danecek @@ -259,6 +259,7 @@ static int cmpstringp(const void *p1, const void *p2) return strcmp(* (char * const *) p1, * (char * const *) p2); } +#define DEBUG_VSETS 0 #if DEBUG_VSETS void debug_vsets(sr_sort_t *srt) { @@ -280,6 +281,7 @@ void debug_vsets(sr_sort_t *srt) } #endif +#define DEBUG_VBUF 0 #if DEBUG_VBUF void debug_vbuf(sr_sort_t *srt) { @@ -380,13 +382,33 @@ static int bcf_sr_sort_set(bcf_srs_t *readers, sr_sort_t *srt, const char *chr, if ( srt->str.l ) kputc(';',&srt->str); srt->off[srt->noff++] = srt->str.l; - size_t beg = srt->str.l; + size_t beg = srt->str.l; + int end_pos = -1; for (ivar=1; ivarn_allele; ivar++) { if ( ivar>1 ) kputc(',',&srt->str); kputs(line->d.allele[0],&srt->str); kputc('>',&srt->str); kputs(line->d.allele[ivar],&srt->str); + + // If symbolic allele, check also the END tag in case there are multiple events, + // such as s, starting at the same positions + if ( line->d.allele[ivar][0]=='<' ) + { + if ( end_pos==-1 ) + { + bcf_info_t *end_info = bcf_get_info(reader->header,line,"END"); + if ( end_info ) + end_pos = (int)end_info->v1.i; // this is only to create a unique id, we don't mind a potential int64 overflow + else + end_pos = 0; + } + if ( end_pos ) + { + kputc('/',&srt->str); + kputw(end_pos, &srt->str); + } + } } if ( line->n_allele==1 ) { From 31bf087c52b13bf200b6253a9f159c7be31f4f8d Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Fri, 13 Aug 2021 14:47:33 +0100 Subject: [PATCH 191/488] Modified warning to make it less severe. --- bgzf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bgzf.c b/bgzf.c index ca01c9717..68cfa41d3 100644 --- a/bgzf.c +++ b/bgzf.c @@ -1022,7 +1022,7 @@ int bgzf_read_block(BGZF *fp) if (j->hit_eof) { if (!fp->last_block_eof && !fp->no_eof_block) { fp->no_eof_block = 1; - hts_log_warning("EOF marker is absent. The input is probably truncated"); + hts_log_warning("EOF marker is absent. 
The input may be truncated"); } fp->mt->hit_eof = 1; } @@ -1124,7 +1124,7 @@ int bgzf_read_block(BGZF *fp) if (count == 0) { // no data read if (!fp->last_block_eof && !fp->no_eof_block && !fp->is_gzip) { fp->no_eof_block = 1; - hts_log_warning("EOF marker is absent. The input is probably truncated"); + hts_log_warning("EOF marker is absent. The input may be truncated"); } fp->block_length = 0; return 0; From 238fe32d8c7aa05d3ac75d2249c61d8e268be58f Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sat, 28 Aug 2021 22:55:31 +0100 Subject: [PATCH 192/488] Add hts_flush() to flush buffered htsFile output data Implemented by calling through to the various underlying flush routines. Also add sam_flush() and bcf_flush() wrappers. --- NEWS | 4 ++++ hts.c | 32 ++++++++++++++++++++++++++++++++ htslib/hts.h | 9 +++++++++ htslib/sam.h | 1 + htslib/vcf.h | 1 + 5 files changed, 47 insertions(+) diff --git a/NEWS b/NEWS index 745d7dbae..29742cabc 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,10 @@ Noteworthy changes in release a.b ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* New API function hts_flush()/sam_flush()/bcf_flush() for flushing output + htsFile/samFile/vcfFile streams. 
(PR #1326, thanks to John Marshall) + + Noteworthy changes in release 1.13 (7th July 2021) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/hts.c b/hts.c index 132f499e7..6db9f55c5 100644 --- a/hts.c +++ b/hts.c @@ -1482,6 +1482,38 @@ int hts_close(htsFile *fp) return ret; } +int hts_flush(htsFile *fp) +{ + if (fp == NULL) return 0; + + switch (fp->format.format) { + case binary_format: + case bam: + case bcf: + return bgzf_flush(fp->fp.bgzf); + + case cram: + return cram_flush(fp->fp.cram); + + case empty_format: + case text_format: + case bed: + case fasta_format: + case fastq_format: + case sam: + case vcf: + if (fp->format.compression != no_compression) + return bgzf_flush(fp->fp.bgzf); + else + return hflush(fp->fp.hfile); + + default: + break; + } + + return 0; +} + const htsFormat *hts_get_format(htsFile *fp) { return fp? &fp->format : NULL; diff --git a/htslib/hts.h b/htslib/hts.h index 03a76534e..c6f4f701c 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -599,6 +599,15 @@ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) HTSLIB_EXPORT htsFile *hts_hopen(struct hFILE *fp, const char *fn, const char *mode); +/*! + @abstract For output streams, flush any buffered data + @param fp The file handle to be flushed + @return 0 for success, or negative if an error occurred. + @since 1.14 +*/ +HTSLIB_EXPORT +int hts_flush(htsFile *fp); + /*! 
@abstract Close a file handle, flushing buffered data for output streams @param fp The file handle to be closed diff --git a/htslib/sam.h b/htslib/sam.h index bc4f9f42c..d37877e48 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1370,6 +1370,7 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, #define sam_open(fn, mode) (hts_open((fn), (mode))) #define sam_open_format(fn, mode, fmt) (hts_open_format((fn), (mode), (fmt))) + #define sam_flush(fp) hts_flush((fp)) #define sam_close(fp) hts_close(fp) HTSLIB_EXPORT diff --git a/htslib/vcf.h b/htslib/vcf.h index 732f25607..7a001aca6 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -304,6 +304,7 @@ typedef struct bcf1_t { typedef htsFile vcfFile; #define bcf_open(fn, mode) hts_open((fn), (mode)) #define vcf_open(fn, mode) hts_open((fn), (mode)) + #define bcf_flush(fp) hts_flush((fp)) #define bcf_close(fp) hts_close(fp) #define vcf_close(fp) hts_close(fp) From b7420ae08affe41dc75a9879d0b03f421d8ed5dc Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 7 Sep 2021 11:03:44 +0100 Subject: [PATCH 193/488] Finer control of --regions vs --targets overlap This is to address a long-standing design flaw in handling regions and targets, as described in these BCFtools issues: https://github.com/samtools/bcftools/issues/1420 https://github.com/samtools/bcftools/issues/1421 HTSlib (and BCFtools) recognize two sets of behaviors / options for restricting VCF/BCF files by region, one is for streaming (`-t/-T`) and one for index-jumping (`-r/-R`). They behave differently, the first includes only records with POS coordinate within the regions, the other includes overlapping regions. This allows to modify the default behavior and provides three options: - Include only records with POS starting in the regions/targets - Include VCF records that overlap regions/targets, even if POS itself is outside the regions - Include only VCF records where the true variation overlaps regions/targets, e.g.
consider the difference between `TC>T-` and `C>-` Most importantly, this allows to make the regions and targets behave the same way. Note that the default behavior remains unchanged. --- htslib/synced_bcf_reader.h | 9 ++- synced_bcf_reader.c | 122 ++++++++++++++++++++++++++++++------- 2 files changed, 107 insertions(+), 24 deletions(-) diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index f262327ff..86dd53a8e 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -1,7 +1,7 @@ /// @file htslib/synced_bcf_reader.h /// Stream through multiple VCF files. /* - Copyright (C) 2012-2017, 2019-2020 Genome Research Ltd. + Copyright (C) 2012-2017, 2019-2021 Genome Research Ltd. Author: Petr Danecek @@ -96,7 +96,9 @@ typedef enum { BCF_SR_REQUIRE_IDX, BCF_SR_PAIR_LOGIC, // combination of the PAIR_* values above - BCF_SR_ALLOW_NO_IDX // allow to proceed even if required index is not present (at the user's risk) + BCF_SR_ALLOW_NO_IDX, // allow to proceed even if required index is not present (at the user's risk) + BCF_SR_REGIONS_OVERLAP, // include overlapping records with POS outside the regions: 0=no, 1=VCF line overlap, 2=true variant overlap [1] + BCF_SR_TARGETS_OVERLAP // include overlapping records with POS outside the targets: 0=no, 1=VCF line overlap, 2=true variant overlap [0] } bcf_sr_opt_t; @@ -110,7 +112,8 @@ typedef struct bcf_sr_regions_t kstring_t line; // holder of the current line, set only when reading from tabix-indexed files htsFile *file; char *fname; - int is_bin; // is open in binary mode (tabix access) + int is_bin:30, // is open in binary mode (tabix access) + overlap:2; // see BCF_SR_REGIONS_OVERLAP/BCF_SR_TARGETS_OVERLAP char **als; // parsed alleles if targets_als set and _regions_match_alleles called kstring_t als_str; // block of parsed alleles int nals, mals; // number of set alleles and the size of allocated array diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 3dd6c2066..dae70099d 100644 --- 
a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -67,6 +67,7 @@ region_t; typedef struct { sr_sort_t sort; + int regions_overlap, targets_overlap; } aux_t; @@ -123,6 +124,18 @@ int bcf_sr_set_opt(bcf_srs_t *readers, bcf_sr_opt_t opt, ...) BCF_SR_AUX(readers)->sort.pair = va_arg(args, int); return 0; + case BCF_SR_REGIONS_OVERLAP: + va_start(args, opt); + BCF_SR_AUX(readers)->regions_overlap = va_arg(args, int); + if ( readers->regions ) readers->regions->overlap = BCF_SR_AUX(readers)->regions_overlap; + return 0; + + case BCF_SR_TARGETS_OVERLAP: + va_start(args, opt); + BCF_SR_AUX(readers)->targets_overlap = va_arg(args, int); + if ( readers->targets ) readers->targets->overlap = BCF_SR_AUX(readers)->targets_overlap; + return 0; + default: break; } @@ -181,6 +194,7 @@ int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file) if ( !readers->regions ) return -1; readers->explicit_regs = 1; readers->require_index = REQUIRE_IDX_; + readers->regions->overlap = BCF_SR_AUX(readers)->regions_overlap; return 0; } @@ -199,6 +213,7 @@ int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int readers->targets = bcf_sr_regions_init(targets,is_file,0,1,-2); if ( !readers->targets ) return -1; readers->targets_als = alleles; + readers->targets->overlap = BCF_SR_AUX(readers)->targets_overlap; return 0; } @@ -391,6 +406,8 @@ bcf_srs_t *bcf_sr_init(void) bcf_srs_t *files = (bcf_srs_t*) calloc(1,sizeof(bcf_srs_t)); files->aux = (aux_t*) calloc(1,sizeof(aux_t)); bcf_sr_sort_init(&BCF_SR_AUX(files)->sort); + bcf_sr_set_opt(files,BCF_SR_REGIONS_OVERLAP,1); + bcf_sr_set_opt(files,BCF_SR_TARGETS_OVERLAP,0); return files; } @@ -545,6 +562,35 @@ static int _readers_next_region(bcf_srs_t *files) return 0; } +static void _set_variant_boundaries(bcf1_t *rec, hts_pos_t *beg, hts_pos_t *end) +{ + hts_pos_t off; + if ( rec->n_allele ) + { + off = rec->rlen; + bcf_unpack(rec, BCF_UN_STR); + int i; + for (i=1; in_allele; i++) + { + // Make symbolic 
alleles start at POS, although this is not strictly true for + // , where POS should be the position BEFORE the deletion/insertion. + // However, since arbitrary symbolic alleles can be defined by the user, we + // will simplify the interpretation of --targets-overlap and --region-overlap. + int j = 0; + char *ref = rec->d.allele[0]; + char *alt = rec->d.allele[i]; + while ( ref[j] && alt[j] && ref[j]==alt[j] ) j++; + if ( off > j ) off = j; + if ( !off ) break; + } + } + else + off = 0; + + *beg = rec->pos + off; + *end = rec->pos + rec->rlen - 1; +} + /* * _reader_fill_buffer() - buffers all records with the same coordinate */ @@ -606,8 +652,28 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) bcf_subset_format(reader->header,reader->buffer[reader->nbuffer+1]); } - // prevent creation of duplicates from records overlapping multiple regions - if ( files->regions && reader->buffer[reader->nbuffer+1]->pos <= files->regions->prev_end ) continue; + // Prevent creation of duplicates from records overlapping multiple regions + // and recognize true variant overlaps vs record overlaps (e.g. 
TA>T vs A>-) + if ( files->regions ) + { + hts_pos_t beg, end; + if ( BCF_SR_AUX(files)->regions_overlap==0 ) + beg = end = reader->buffer[reader->nbuffer+1]->pos; + else if ( BCF_SR_AUX(files)->regions_overlap==1 ) + { + beg = reader->buffer[reader->nbuffer+1]->pos; + end = reader->buffer[reader->nbuffer+1]->pos + reader->buffer[reader->nbuffer+1]->rlen - 1; + } + else if ( BCF_SR_AUX(files)->regions_overlap==2 ) + _set_variant_boundaries(reader->buffer[reader->nbuffer+1], &beg,&end); + else + { + hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap); + exit(1); + } + + if ( beg <= files->regions->prev_end || end < files->regions->start || beg > files->regions->end ) continue; + } // apply filter if ( !reader->nfilter_ids ) @@ -637,23 +703,18 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) } /* - * _readers_shift_buffer() - removes the first line and all subsequent lines with the same position + * _readers_shift_buffer() - removes the first line */ static void _reader_shift_buffer(bcf_sr_t *reader) { + if ( !reader->nbuffer ) return; int i; + bcf1_t *tmp = reader->buffer[1]; for (i=2; i<=reader->nbuffer; i++) - if ( reader->buffer[i]->rid!=reader->buffer[1]->rid || reader->buffer[i]->pos!=reader->buffer[1]->pos ) break; - if ( i<=reader->nbuffer ) - { - // A record with a different position follows, swap it. Because of the reader's logic, - // only one such line can be present. 
- assert( i==reader->nbuffer ); - bcf1_t *tmp = reader->buffer[1]; reader->buffer[1] = reader->buffer[i]; reader->buffer[i] = tmp; - reader->nbuffer = 1; - } - else - reader->nbuffer = 0; // no other line + reader->buffer[i-1] = reader->buffer[i]; + if ( reader->nbuffer > 1 ) + reader->buffer[reader->nbuffer] = tmp; + reader->nbuffer--; } static int next_line(bcf_srs_t *files) @@ -704,19 +765,38 @@ static int next_line(bcf_srs_t *files) // Skip this position if not present in targets if ( files->targets ) { - int ret = bcf_sr_regions_overlap(files->targets, chr, min_pos, min_pos); - if ( (!files->targets_exclude && ret<0) || (files->targets_exclude && !ret) ) + int match = 0; + for (i=0; inreaders; i++) + { + if ( !files->readers[i].nbuffer || files->readers[i].buffer[1]->pos!=min_pos ) continue; + hts_pos_t beg, end; + if ( BCF_SR_AUX(files)->targets_overlap==0 ) + beg = end = min_pos; + else if ( BCF_SR_AUX(files)->targets_overlap==1 ) + { + beg = min_pos; + end = min_pos + files->readers[i].buffer[1]->rlen - 1; + } + else if ( BCF_SR_AUX(files)->targets_overlap==2 ) + _set_variant_boundaries(files->readers[i].buffer[1], &beg,&end); + else + { + hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap); + exit(1); + } + int overlap = bcf_sr_regions_overlap(files->targets, chr, beg, end)==0 ? 
1 : 0; + if ( (!files->targets_exclude && !overlap) || (files->targets_exclude && overlap) ) + _reader_shift_buffer(&files->readers[i]); + else + match = 1; + } + if ( !match ) { - // Remove all lines with this position from the buffer - for (i=0; inreaders; i++) - if ( files->readers[i].nbuffer && files->readers[i].buffer[1]->pos==min_pos ) - _reader_shift_buffer(&files->readers[i]); min_pos = HTS_POS_MAX; chr = NULL; continue; } } - break; // done: chr and min_pos are set } if ( !chr ) return 0; From 6f184540af05c5cae071b791234bc5ffa49de257 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 23 Aug 2021 18:14:35 +0100 Subject: [PATCH 194/488] Make version.sh work on shallow git checkouts. Use "git describe --always" to silence "No names found, cannot describe anything." messages on shallow checkouts where the last tagged release may not be available. Where this is the case, git describe --always returns the short commit hash. This is appended to the hard-coded version number to make something similar to the normal git describe output. As we don't know how many commits there are from the last tag in this case, that part is set to 1 for want of anything better. 
--- version.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/version.sh b/version.sh index 05175fde7..e4896228e 100755 --- a/version.sh +++ b/version.sh @@ -31,7 +31,11 @@ srcdir=${0%/version.sh} if [ -e $srcdir/.git ] then # If we ever get to 10.x this will need to be more liberal - VERSION=`cd $srcdir && git describe --match '[0-9].[0-9]*' --dirty` + v=`cd $srcdir && git describe --always --match '[0-9].[0-9]*' --dirty` + case $v in + [0-9]*.[0-9]*) VERSION="$v" ;; + [0-9a-f][0-9a-f]*) VERSION="$VERSION-1-g$v" ;; + esac fi # Numeric version is for use in .dylib or .so libraries From 3eada2f748dddb7743bb336e326511eda77892b2 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 17 Aug 2021 17:54:32 +0100 Subject: [PATCH 195/488] Look for untracked files in CI builds Adds a check for untracked files after the tests have run. This is enabled for two builds - one with configure, and one without. The others skip the new test so that it's still possible to see that a proposed change works irrespective of issues with untracked files. --- .cirrus.yml | 4 +++- Makefile | 9 ++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 4df3dfd08..2e45de544 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -48,7 +48,7 @@ test_template: &TEST test_script: | make test-shlib-exports make test - + if test "x$DO_UNTRACKED_FILE_CHECK" = "xyes"; then make check-untracked ; fi #-------------------------------------------------- # Task: linux builds. 
@@ -68,6 +68,7 @@ gcc_task: matrix: - environment: DO_MAINTAINER_CHECKS: yes + DO_UNTRACKED_FILE_CHECK: yes USE_CONFIG: no - environment: USE_CONFIG: yes @@ -96,6 +97,7 @@ ubuntu_task: matrix: - environment: USE_CONFIG: yes + DO_UNTRACKED_FILE_CHECK: yes - environment: USE_CONFIG: yes CFLAGS: -g -Wall -O3 -fsanitize=address diff --git a/Makefile b/Makefile index e607ee8d4..bf66ee3bb 100644 --- a/Makefile +++ b/Makefile @@ -473,6 +473,13 @@ maintainer-check: test/maintainer/check_copyright.pl . test/maintainer/check_spaces.pl . +# Look for untracked files in the git repository. +check-untracked: + @if test -e .git && git status --porcelain | grep '^\?'; then \ + echo 'Untracked files detected (see above). Please either clean up, add to .gitignore, or for test output files consider naming them to match *.tmp or *.tmp.*' ; \ + false ; \ + fi + # Create a shorthand. We use $(SRC) or $(srcprefix) rather than $(srcdir)/ # for brevity in test and install rules, and so that build logs do not have # ./ sprinkled throughout. @@ -801,7 +808,7 @@ distdir: force: -.PHONY: all check clean distclean distdir force +.PHONY: all check check-untracked clean distclean distdir force .PHONY: install install-pkgconfig installdirs lib-shared lib-static .PHONY: maintainer-check maintainer-clean mostlyclean plugins .PHONY: print-config print-version show-version tags From a78b2877dc3a22bf431f621d411df713f36b7a74 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 20 Aug 2021 14:48:28 +0100 Subject: [PATCH 196/488] Add FASTQ_OPT_NAME2 option for parsing of SRA data. This variant of FASTQ has the read name as the second field on the name line, just to be awkward. 
--- hts.c | 5 +++++ htslib/hts.h | 5 +++++ sam.c | 27 ++++++++++++++++++++++++--- 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/hts.c b/hts.c index 6db9f55c5..e31615822 100644 --- a/hts.c +++ b/hts.c @@ -1035,6 +1035,10 @@ int hts_opt_add(hts_opt **opts, const char *c_arg) { strcmp(o->arg, "FASTQ_CASAVA") == 0) o->opt = FASTQ_OPT_CASAVA, o->val.i = 1; + else if (strcmp(o->arg, "fastq_name2") == 0 || + strcmp(o->arg, "FASTQ_NAME2") == 0) + o->opt = FASTQ_OPT_NAME2, o->val.i = 1; + else { hts_log_error("Unknown option '%s'", o->arg); free(o->arg); @@ -1606,6 +1610,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...) { case FASTQ_OPT_CASAVA: case FASTQ_OPT_RNUM: + case FASTQ_OPT_NAME2: if (fp->format.format == fastq_format || fp->format.format == fasta_format) return fastq_state_set(fp, opt); diff --git a/htslib/hts.h b/htslib/hts.h index c6f4f701c..d477f4f34 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -359,6 +359,11 @@ enum hts_fmt_option { // Two character string. // Barcode aux tag for CASAVA; defaults to "BC". FASTQ_OPT_BARCODE, + + // Process SRA and ENA read names which pointlessly move the original + // name to the second field and insert a constructed . + // name in its place. + FASTQ_OPT_NAME2, }; // Profile options for encoding; primarily used at present in CRAM diff --git a/sam.c b/sam.c index 5300e9841..7c82b4a7a 100644 --- a/sam.c +++ b/sam.c @@ -3597,6 +3597,7 @@ typedef struct { char BC[3]; // aux tag ID for barcode khash_t(tag) *tags; // which aux tags to use (if empty, use all). char nprefix; + int sra_names; } fastq_state; // Initialise fastq state. @@ -3640,6 +3641,10 @@ int fastq_state_set(samFile *fp, enum hts_fmt_option opt, ...) 
{ x->casava = 1; break; + case FASTQ_OPT_NAME2: + x->sra_names = 1; + break; + case FASTQ_OPT_AUX: { va_start(args, opt); x->aux = 1; @@ -3710,11 +3715,27 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) { } // Name + if (*x->name.s != x->nprefix) return -2; - i = 0; l = x->name.l; - char *s = x->name.s; + // Reverse the SRA strangeness of putting the run_name.number before + // the read name. + i = 0; + char *name = x->name.s+1; + if (x->sra_names) { + char *cp = strpbrk(x->name.s, " \t"); + if (cp) { + while (*cp == ' ' || *cp == '\t') + cp++; + *--cp = '@'; + i = cp - x->name.s; + name = cp+1; + } + } + + l = x->name.l; + char *s = x->name.s + i; while (i < l && !isspace_c(s[i])) i++; if (i < l) { @@ -3774,7 +3795,7 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) { // Convert to BAM ret = bam_set1(b, - x->name.l-1, x->name.s+1, + x->name.s + x->name.l - name, name, flag, -1, -1, 0, // ref '*', pos, mapq, 0, NULL, // no cigar, From 4a79f257b116e1c61b3f3cd7d78ff7f8b2230515 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 22 Sep 2021 12:38:12 +0100 Subject: [PATCH 197/488] Add fastq_name2 option tests --- test/fastq/fastq.tst | 4 ++++ test/fastq/name2-q.sam | 4 ++++ test/fastq/name2.fa | 8 ++++++++ test/fastq/name2.fq | 16 ++++++++++++++++ test/fastq/name2.sam | 4 ++++ 5 files changed, 36 insertions(+) create mode 100644 test/fastq/name2-q.sam create mode 100644 test/fastq/name2.fa create mode 100644 test/fastq/name2.fq create mode 100644 test/fastq/name2.sam diff --git a/test/fastq/fastq.tst b/test/fastq/fastq.tst index 5feb31fe1..966f0ed8a 100644 --- a/test/fastq/fastq.tst +++ b/test/fastq/fastq.tst @@ -77,6 +77,10 @@ P r2.sam $tview -i fastq_aux r2.fq P r1-q.sam $tview -i fastq_aux r1.fa P r2-q.sam $tview -i fastq_aux r2.fa +# Simple tests for the FASTQ_NAME2 option. 
+P name2.sam $tview -i fastq_name2 name2.fq +P name2-q.sam $tview -i fastq_name2 name2.fa + # -------------------- # Writing diff --git a/test/fastq/name2-q.sam b/test/fastq/name2-q.sam new file mode 100644 index 000000000..914233fd2 --- /dev/null +++ b/test/fastq/name2-q.sam @@ -0,0 +1,4 @@ +name_001 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG * +name_002 4 * 0 0 * * 0 0 TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA * +name_003 4 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA * +name_004 4 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA * diff --git a/test/fastq/name2.fa b/test/fastq/name2.fa new file mode 100644 index 000000000..10a225336 --- /dev/null +++ b/test/fastq/name2.fa @@ -0,0 +1,8 @@ +>ignore_001 name_001 +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG +>ignore_002 name_002 +TTGTTAAAATGACCATACCCAAAGTGATCTACAGACTCAATACAATTTCTATTGAAATACCAATCACACTCTTCACAGAACTAGAAAAACAGTTCTAAAA +>ignore_003 name_003 +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA +>name_004 +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA diff --git a/test/fastq/name2.fq b/test/fastq/name2.fq new file mode 100644 index 000000000..6ac19c92b --- /dev/null +++ b/test/fastq/name2.fq @@ -0,0 +1,16 @@ +@ignore_001 name_001 +CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG ++ +CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGEFCFHGGGHEJEAJE +@ignore_003 name_003 +ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA ++ 
+BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +@name_004 +GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA ++ +CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE diff --git a/test/fastq/name2.sam b/test/fastq/name2.sam new file mode 100644 index 000000000..a79a05805 --- /dev/null +++ b/test/fastq/name2.sam @@ -0,0 +1,4 @@ +name_001 4 * 0 0 * * 0 0 CCGTTAGAGCATTTGTTGAAAATGCTTTCCTTGCTCCATGTGATGACTCTGGTGCCCTTGTCAAAAGCCAGCTGGGCCTATTCGTGTGGGTCTGTTTCTG CABCFGDEEFFEFHGHGGFFGDIGIJFIFHHGHEIFGHBCGHDIFBE9GIAICGGICFIBFGGHGDGGGHE?GIGDFGGHEGIEJG>;FGEFCFHGGGHEJEAJE +name_003 4 * 0 0 * * 0 0 ACGCGGCAATCCAATGTGTGAGTTGAGAAGCGGTGAGGAGGGAATCCTAATTTTATGAGCAGGTCAGGACCGTGGGAGATACCTGACACCTGAGATGGTA BACCFGBFGFHGGJGHGGFEGHIGIJHFEH:HHEHGHHBGGH9IAGHGFHIFJFFAFGIFDIGHKEIGF,CGD66?7EFI5EEG>EGGGGD5=HH6E +name_004 4 * 0 0 * * 0 0 GACATGCCATAACATTCATGTTTTATGTGTACAAGTCAATGAATTTTAGTATATTTACAGAGTTGTATGACTGTCTCCACAATCTAATTTTAGGTTTCCA CABFFGFFJFHEGEGJGGDG?FIGHHHBGHHHGIIGHGHGGHDGHFHIDFCIKEGIFHGGII9HFFGGGEEIGGEEHGGEEGDEHFH>FGGGGHAFAHGE From dfb1947a5e556d0b23af7d76d6bc357558c22ca4 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Thu, 16 Sep 2021 09:19:51 +0100 Subject: [PATCH 198/488] Add the ability to handle no credentials. Signature v4 requests were trying to create authorisation strings when no credentials were present. This change adds the v2 lines that handle this. 
--- hfile_s3.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hfile_s3.c b/hfile_s3.c index 0453ba5e4..e8e505e2a 100644 --- a/hfile_s3.c +++ b/hfile_s3.c @@ -1058,6 +1058,10 @@ static int v4_auth_header_callback(void *ctx, char ***hdrs) { return -1; } + if (!ad->id.l || !ad->secret.l) { + return copy_auth_headers(ad, hdrs); + } + hash_string("", 0, content_hash); // empty hash ad->canonical_query_string.l = 0; From d3788baf8a6ac03572653a2181f45ca86239484b Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Mon, 27 Sep 2021 14:58:49 +0100 Subject: [PATCH 199/488] Fix clang-13 unused variable messages. --- bgzf.c | 4 +--- cram/cram_codecs.c | 4 +--- vcf.c | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/bgzf.c b/bgzf.c index 68cfa41d3..e72ed566d 100644 --- a/bgzf.c +++ b/bgzf.c @@ -1467,7 +1467,7 @@ static void *bgzf_mt_writer(void *vp) { int bgzf_mt_read_block(BGZF *fp, bgzf_job *j) { uint8_t header[BLOCK_HEADER_LENGTH], *compressed_block; - int count, size = 0, block_length, remaining; + int count, block_length, remaining; // NOTE: Guaranteed to be compressed as we block multi-threading in // uncompressed mode. 
However it may be gzip compression instead @@ -1496,7 +1496,6 @@ int bgzf_mt_read_block(BGZF *fp, bgzf_job *j) if (count != sizeof(header)) // no data read return -1; - size = count; block_length = unpackInt16((uint8_t*)&header[16]) + 1; // +1 because when writing this number, we used "-1" if (block_length < BLOCK_HEADER_LENGTH) { j->errcode |= BGZF_ERR_HEADER; @@ -1510,7 +1509,6 @@ int bgzf_mt_read_block(BGZF *fp, bgzf_job *j) j->errcode |= BGZF_ERR_IO; return -1; } - size += count; j->comp_len = block_length; j->uncomp_len = BGZF_MAX_BLOCK_SIZE; j->block_address = block_address; diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index a5857c100..9f112863e 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -3069,7 +3069,7 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, int version, varint_vec *vv) { int *vals = NULL, *freqs = NULL, *lens = NULL, code, len; int *new_vals, *new_freqs; - int i, ntot = 0, max_val = 0, min_val = INT_MAX, k; + int i, max_val = 0, min_val = INT_MAX, k; size_t nvals, vals_alloc = 0; cram_codec *c; cram_huffman_code *codes; @@ -3095,7 +3095,6 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, vals[nvals] = i; freqs[nvals] = st->freqs[i]; assert(st->freqs[i] > 0); - ntot += freqs[nvals]; if (max_val < i) max_val = i; if (min_val > i) min_val = i; nvals++; @@ -3118,7 +3117,6 @@ cram_codec *cram_huffman_encode_init(cram_stats *st, vals[nvals]= kh_key(st->h, k); freqs[nvals] = kh_val(st->h, k); assert(freqs[nvals] > 0); - ntot += freqs[nvals]; if (max_val < i) max_val = i; if (min_val > i) min_val = i; nvals++; diff --git a/vcf.c b/vcf.c index bfa253fbe..d43658a51 100644 --- a/vcf.c +++ b/vcf.c @@ -3584,7 +3584,7 @@ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) return dst; } - int i, ndst_ori = dst->nhrec, need_sync = 0, ret = 0, res; + int i, ndst_ori = dst->nhrec, need_sync = 0, res; for (i=0; i<src->nhrec; i++) { if ( src->hrec[i]->type==BCF_HL_GEN && src->hrec[i]->value ) @@ -3641,13 +3641,11 @@ bcf_hdr_t 
*bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) { hts_log_warning("Trying to combine \"%s\" tag definitions of different lengths", src->hrec[i]->vals[0]); - ret |= 1; } if ( (kh_val(d_src,k_src).info[rec->type]>>4 & 0xf) != (kh_val(d_dst,k_dst).info[rec->type]>>4 & 0xf) ) { hts_log_warning("Trying to combine \"%s\" tag definitions of different types", src->hrec[i]->vals[0]); - ret |= 1; } } } From dcf09a9cd52140e190a848fb5e15b7c076bf92c6 Mon Sep 17 00:00:00 2001 From: Alex Petty Date: Wed, 15 Sep 2021 11:46:55 -0500 Subject: [PATCH 200/488] Add a keep option to preserve input files to bgzip. Flag is -k or --keep, just like other compression tools --- bgzip.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/bgzip.c b/bgzip.c index 22eb0404a..16a3a9479 100644 --- a/bgzip.c +++ b/bgzip.c @@ -120,14 +120,14 @@ static int bgzip_main_usage(FILE *fp, int status) fprintf(fp, " -g, --rebgzip use an index file to bgzip a file\n"); fprintf(fp, " -s, --size INT decompress INT bytes (uncompressed size)\n"); fprintf(fp, " -@, --threads INT number of compression threads to use [1]\n"); - fprintf(fp, " -t, --test test integrity of compressed file"); - fprintf(fp, "\n"); + fprintf(fp, " -t, --test test integrity of compressed file\n"); + fprintf(fp, " -k, --keep don't delete input files during operation\n"); return status; } int main(int argc, char **argv) { - int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0; + int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0, keep; BGZF *fp; void *buffer; long start, end, size; @@ -150,11 +150,12 @@ int main(int argc, char **argv) {"threads", required_argument, NULL, '@'}, {"test", no_argument, NULL, 't'}, {"version", no_argument, NULL, 1}, + {"keep", no_argument, NULL, 'k'}, {NULL, 0, NULL, 0} }; - compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; - while((c = 
getopt_long(argc, argv, "cdh?fb:@:s:iI:l:grt",loptions,NULL)) >= 0){ + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; keep = 0; + while((c = getopt_long(argc, argv, "cdh?fb:@:s:iI:l:grtk",loptions,NULL)) >= 0){ switch(c){ case 'd': compress = 0; break; case 'c': pstdout = 1; break; @@ -168,6 +169,7 @@ int main(int argc, char **argv) case 'r': reindex = 1; compress = 0; break; case '@': threads = atoi(optarg); break; case 't': test = 1; compress = 0; reindex = 0; break; + case 'k': keep = 1; break; case 1: printf( "bgzip (htslib) %s\n" @@ -272,7 +274,7 @@ int main(int argc, char **argv) } } if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode); - if (argc > optind && !pstdout) unlink(argv[optind]); + if (argc > optind && !pstdout && !keep) unlink(argv[optind]); free(buffer); close(f_src); return 0; @@ -416,7 +418,7 @@ int main(int argc, char **argv) } free(buffer); if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode); - if (argc > optind && !pstdout && !test) unlink(argv[optind]); + if (argc > optind && !pstdout && !test && !keep) unlink(argv[optind]); return 0; } } From ab22e8408c4710e9462ff4c4f2b3fb4325cfe42e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 15 Sep 2021 16:50:50 +0100 Subject: [PATCH 201/488] Move the "start<1" error check in cram_get_ref to before thread locking. This fixes #1329, which was discovered by code scanning and reported by Github @ryancaicse. I do not believe it is likely to be triggered, but the value of this file can sometimes come from a CRAM file so it is possible malformed data could lead to a threading deadlock. 
(Untested) --- cram/cram_io.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 92102ffd7..e87aa7fa0 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -3363,7 +3363,7 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) { char *seq; int ostart = start; - if (id == -1) + if (id == -1 || start < 1) return NULL; /* FIXME: axiomatic query of r->seq being true? @@ -3439,8 +3439,6 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) { end = r->length; if (end >= r->length) end = r->length; - if (start < 1) - return NULL; if (end - start >= 0.5*r->length || fd->shared_ref) { start = 1; From f05aff6ac306f663b0e605f96874d56a355e1a19 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 28 Sep 2021 12:16:49 +0100 Subject: [PATCH 202/488] Fix incorrect offset into fastq header in name2 mode The offset to skip over the first name in the fastq header was being applied twice, potentially causing the search for the comment part to fall off the end of the allocated string. Thanks to John Marshall for reporting this. --- sam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sam.c b/sam.c index 7c82b4a7a..1f02f428e 100644 --- a/sam.c +++ b/sam.c @@ -3735,7 +3735,7 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) { } l = x->name.l; - char *s = x->name.s + i; + char *s = x->name.s; while (i < l && !isspace_c(s[i])) i++; if (i < l) { From 2dbbcf655e077bb3d69de0a0ba9f8bb6db99b704 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 27 Sep 2021 20:47:22 +0100 Subject: [PATCH 203/488] Fix hts_idx_get_n_no_coor() out-of-bounds memory access For CRAI-indexed CRAM files idx actually points to an hts_cram_idx_t, which has no n_no_coor field. Return an appropriate "not available" value instead. 
--- hts.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hts.c b/hts.c index e31615822..e2d6ba44d 100644 --- a/hts.c +++ b/hts.c @@ -2789,6 +2789,7 @@ int hts_idx_get_stat(const hts_idx_t* idx, int tid, uint64_t* mapped, uint64_t* uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) { + if (idx->fmt == HTS_FMT_CRAI) return 0; return idx->n_no_coor; } From 39fd425be1de1ea5c40b29c6affbee1ae771206a Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Tue, 28 Sep 2021 17:16:09 +0100 Subject: [PATCH 204/488] Update to bgzip man page and usage text. (PR #1336) Added missing keep and test options to man page. Reordered the program usage to match the man page. Updated NEWS file. --- NEWS | 3 +++ bgzip.1 | 24 +++++++++++++++--------- bgzip.c | 8 ++++---- 3 files changed, 22 insertions(+), 13 deletions(-) diff --git a/NEWS b/NEWS index 29742cabc..06dc8defb 100644 --- a/NEWS +++ b/NEWS @@ -4,6 +4,9 @@ Noteworthy changes in release a.b * New API function hts_flush()/sam_flush()/bcf_flush() for flushing output htsFile/samFile/vcfFile streams. (PR #1326, thanks to John Marshall) +* Added a keep option to bgzip to leave the original file untouched. This + brings bgzip into line with gzip. (PR #1331, thanks to Alex Petty). + Noteworthy changes in release 1.13 (7th July 2021) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bgzip.1 b/bgzip.1 index 48dd1eb6f..48f66f9c9 100644 --- a/bgzip.1 +++ b/bgzip.1 @@ -4,7 +4,7 @@ bgzip \- Block compression/decompression utility .\" .\" Copyright (C) 2009-2011 Broad Institute. -.\" Copyright (C) 2018 Genome Research Limited. +.\" Copyright (C) 2018, 2021 Genome Research Limited. .\" .\" Author: Heng Li .\" @@ -43,7 +43,7 @@ bgzip \- Block compression/decompression utility .SH SYNOPSIS .PP .B bgzip -.RB [ -cdfhir ] +.RB [ -cdfhikrt ] .RB [ -b .IR virtualOffset ] .RB [ -I @@ -90,6 +90,13 @@ Overwrite files without asking, or decompress files that don't have a known compression filename extension (e.g., \fI.gz\fR) without asking. 
Use \fB--force\fR twice to do both without asking. .TP +.B "-g, --rebgzip" +Try to use an existing index to create a compressed file with matching +block offsets. +Note that this assumes that the same compression library and level are in use +as when making the original file. +Don't use it unless you know what you're doing. +.TP .B "-h, --help" Displays a help message. .TP @@ -101,6 +108,9 @@ file with .gzi appended to it. .BI "-I, --index-name " FILE Index file name. .TP +.B "-k, --keep" +Do not delete input file during operation. +.TP .BI "-l, --compress-level " INT Compression level to use when compressing. From 0 to 9, or -1 for the default level set by the compression library. [-1] @@ -108,17 +118,13 @@ From 0 to 9, or -1 for the default level set by the compression library. [-1] .B "-r, --reindex" Rebuild the index on an existing compressed file. .TP -.B "-g, --rebgzip" -Try to use an existing index to create a compressed file with matching -block offsets. -Note that this assumes that the same compression library and level are in use -as when making the original file. -Don't use it unless you know what you're doing. -.TP .BI "-s, --size " INT Decompress INT bytes (uncompressed size) to standard output. Implies -c. .TP +.B "-t, --test" +Test the integrity of the compressed file. +.TP .BI "-@, --threads " INT Number of threads to use [1]. .PP diff --git a/bgzip.c b/bgzip.c index 16a3a9479..f9abf99c5 100644 --- a/bgzip.c +++ b/bgzip.c @@ -1,7 +1,7 @@ /* bgzip.c -- Block compression/decompression utility. Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology - Copyright (C) 2010, 2013-2019 Genome Research Ltd. + Copyright (C) 2010, 2013-2019, 2021 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -112,16 +112,16 @@ static int bgzip_main_usage(FILE *fp, int status) fprintf(fp, " -c, --stdout write on standard output, keep original files unchanged\n"); fprintf(fp, " -d, --decompress decompress\n"); fprintf(fp, " -f, --force overwrite files without asking\n"); + fprintf(fp, " -g, --rebgzip use an index file to bgzip a file\n"); fprintf(fp, " -h, --help give this help\n"); fprintf(fp, " -i, --index compress and create BGZF index\n"); fprintf(fp, " -I, --index-name FILE name of BGZF index file [file.gz.gzi]\n"); + fprintf(fp, " -k, --keep don't delete input files during operation\n"); fprintf(fp, " -l, --compress-level INT Compression level to use when compressing; 0 to 9, or -1 for default [-1]\n"); fprintf(fp, " -r, --reindex (re)index compressed file\n"); - fprintf(fp, " -g, --rebgzip use an index file to bgzip a file\n"); fprintf(fp, " -s, --size INT decompress INT bytes (uncompressed size)\n"); - fprintf(fp, " -@, --threads INT number of compression threads to use [1]\n"); fprintf(fp, " -t, --test test integrity of compressed file\n"); - fprintf(fp, " -k, --keep don't delete input files during operation\n"); + fprintf(fp, " -@, --threads INT number of compression threads to use [1]\n"); return status; } From c5b815c31e9b2f06c7484f534bad609c76b3c15d Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Fri, 10 Sep 2021 13:58:15 +0200 Subject: [PATCH 205/488] Warn if spaces instead of tabs are detected to prevent confusion such as in https://github.com/samtools/bcftools/issues/1575 Also require the #CHROM line strictly conforms to VCF specification --- vcf.c | 38 +++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/vcf.c b/vcf.c index d43658a51..0d59be6e6 100644 --- a/vcf.c +++ b/vcf.c @@ -143,21 +143,33 @@ int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s) 
return bcf_hdr_add_sample_len(h, s, 0); } -int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *h, const char *str) +int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str) { - int ret = 0; - int i = 0; - const char *p, *q; - // add samples - for (p = q = str;; ++q) { - if ((unsigned char) *q > '\n') continue; - if (++i > 9) { - if ( bcf_hdr_add_sample_len(h, p, q - p) < 0 ) ret = -1; - } - if (*q == 0 || *q == '\n' || ret < 0) break; - p = q + 1; + const char *mandatory = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO"; + if ( strncmp(str,mandatory,strlen(mandatory)) ) + { + hts_log_error("Could not parse the \"#CHROM..\" line, either the fields are incorrect or spaces are present instead of tabs:\n\t%s",str); + return -1; + } + + const char *beg = str + strlen(mandatory), *end; + if ( !*beg || *beg=='\n' ) return 0; + if ( strncmp(beg,"\tFORMAT\t",8) ) + { + hts_log_error("Could not parse the \"#CHROM..\" line, either FORMAT is missing or spaces are present instead of tabs:\n\t%s",str); + return -1; } + beg += 8; + int ret = 0; + while ( *beg ) + { + end = beg; + while ( *end && *end!='\t' && *end!='\n' ) end++; + if ( bcf_hdr_add_sample_len(hdr, beg, end-beg) < 0 ) ret = -1; + if ( !*end || *end=='\n' || ret<0 ) break; + beg = end + 1; + } return ret; } @@ -873,7 +885,7 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) // operations do not really care about a few malformed lines). // In the future we may want to add a strict mode that errors in // this case. 
- if ( strncmp("#CHROM\tPOS",p,10) != 0 ) { + if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) { char *eol = strchr(p, '\n'); if (*p != '\0') { char buffer[320]; From 8a7e0a7a2d35ad0aa0c7f43b32e95a21351cf279 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 21 Sep 2021 20:53:25 +0100 Subject: [PATCH 206/488] Ensure decompress_peek() errors are propagated In the unlikely event of an error from decompress_peek() (now renamed in anticipation of similar functions for other compression formats), ensure hts_detect_format() also returns an error rather than incorrectly detecting the contents as being empty. --- hts.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/hts.c b/hts.c index e2d6ba44d..17f07519b 100644 --- a/hts.c +++ b/hts.c @@ -296,13 +296,14 @@ static enum htsFormatCategory format_category(enum htsExactFormat fmt) // Decompress several hundred bytes by peeking at the file, which must be // positioned at the start of a GZIP block. -static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize) +static ssize_t +decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize) { unsigned char buffer[2048]; z_stream zs; ssize_t npeek = hpeek(fp, buffer, sizeof buffer); - if (npeek < 0) return 0; + if (npeek < 0) return -1; zs.zalloc = NULL; zs.zfree = NULL; @@ -310,7 +311,7 @@ static size_t decompress_peek(hFILE *fp, unsigned char *dest, size_t destsize) zs.avail_in = npeek; zs.next_out = dest; zs.avail_out = destsize; - if (inflateInit2(&zs, 31) != Z_OK) return 0; + if (inflateInit2(&zs, 31) != Z_OK) return -1; while (zs.total_out < destsize) if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break; @@ -489,7 +490,7 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) if (len >= 9 && s[2] == 8) fmt->compression_level = (s[8] == 2)? 9 : (s[8] == 4)? 
1 : -1; - len = decompress_peek(hfile, s, sizeof s); + len = decompress_peek_gz(hfile, s, sizeof s); } else if (len >= 10 && memcmp(s, "BZh", 3) == 0 && (memcmp(&s[4], "\x31\x41\x59\x26\x53\x59", 6) == 0 || From a641bb4e9a67c01b97e0a34c0df58769aa999035 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 21 Sep 2021 21:05:00 +0100 Subject: [PATCH 207/488] Partially recognise XZ- and Zstd-compressed data Similarly to hts_detect_format()'s basic detection of bzip2-compressed data, recognise the magic numbers of these two formats. If HAVE_LIBLZMA is defined, we can decompress the first few KB (as we do for gzipped data) to detect the contents. Otherwise, and for Zstandard compression, we can just report "Unknown {XZ|Zstandard}-compressed data". --- Makefile | 2 +- hts.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++ htslib/hts.h | 1 + 3 files changed, 57 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index bf66ee3bb..78d0e17e6 100644 --- a/Makefile +++ b/Makefile @@ -365,7 +365,7 @@ hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstrin hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) -hts.o hts.pico: hts.c config.h $(htslib_hts_expr_h) $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h) +hts.o hts.pico: hts.c config.h os/lzma_stub.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) 
$(hfile_internal_h) $(sam_internal_h) $(htslib_hts_expr_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h) hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) diff --git a/hts.c b/hts.c index 17f07519b..490a9905d 100644 --- a/hts.c +++ b/hts.c @@ -41,6 +41,14 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#ifdef HAVE_LIBLZMA +#ifdef HAVE_LZMA_H +#include <lzma.h> +#else +#include "os/lzma_stub.h" +#endif +#endif + #include "htslib/hts.h" #include "htslib/bgzf.h" #include "cram/cram.h" @@ -322,6 +330,38 @@ decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize) return destsize; } +#ifdef HAVE_LIBLZMA +// Similarly decompress a portion by peeking at the file, which must be +// positioned at the start of the file. +static ssize_t +decompress_peek_xz(hFILE *fp, unsigned char *dest, size_t destsize) +{ + unsigned char buffer[2048]; + ssize_t npeek = hpeek(fp, buffer, sizeof buffer); + if (npeek < 0) return -1; + + lzma_stream ls = LZMA_STREAM_INIT; + if (lzma_stream_decoder(&ls, lzma_easy_decoder_memusage(9), 0) != LZMA_OK) + return -1; + + ls.next_in = buffer; + ls.avail_in = npeek; + ls.next_out = dest; + ls.avail_out = destsize; + + int r = lzma_code(&ls, LZMA_RUN); + if (! (r == LZMA_OK || r == LZMA_STREAM_END)) { + lzma_end(&ls); + return -1; + } + + destsize = ls.total_out; + lzma_end(&ls); + + return destsize; +} +#endif + // Parse "x.y" text, taking care because the string is not NUL-terminated // and filling in major/minor only when the digits are followed by a delimiter, // so we don't misread "1.10" as "1.1" due to reaching the end of the buffer. 
@@ -504,6 +544,19 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) if (s[4] == '\x31') return 0; else len = 0; } + else if (len >= 6 && memcmp(s, "\xfd""7zXZ\0", 6) == 0) { + fmt->compression = xz_compression; +#ifdef HAVE_LIBLZMA + len = decompress_peek_xz(hfile, s, sizeof s); +#else + // Without liblzma, we can't recognise the decompressed contents. + return 0; +#endif + } + else if (len >= 4 && memcmp(s, "\x28\xb5\x2f\xfd", 4) == 0) { + fmt->compression = zstd_compression; + return 0; + } else { len = hpeek(hfile, s, sizeof s); } @@ -683,6 +736,8 @@ char *hts_format_description(const htsFormat *format) switch (format->compression) { case bzip2_compression: kputs(" bzip2-compressed", &str); break; case razf_compression: kputs(" legacy-RAZF-compressed", &str); break; + case xz_compression: kputs(" XZ-compressed", &str); break; + case zstd_compression: kputs(" Zstandard-compressed", &str); break; case custom: kputs(" compressed", &str); break; case gzip: kputs(" gzip-compressed", &str); break; case bgzf: diff --git a/htslib/hts.h b/htslib/hts.h index d477f4f34..7c44dbd79 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -210,6 +210,7 @@ enum htsExactFormat { enum htsCompression { no_compression, gzip, bgzf, custom, bzip2_compression, razf_compression, + xz_compression, zstd_compression, compression_maximum = 32767 }; From 790842c194796be75eeb4f69b70034bd9ba2bd3d Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 5 Oct 2021 14:06:33 +0100 Subject: [PATCH 208/488] Fix lzma_mem_inflate() error handling memory leak --- cram/cram_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index e87aa7fa0..bc4f03646 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1344,7 +1344,7 @@ static char *lzma_mem_inflate(char *cdata, size_t csize, size_t *size) { r = lzma_code(&strm, LZMA_FINISH); if (r != LZMA_OK && r != LZMA_STREAM_END) { hts_log_error("Call to lzma_code failed with error %d", r); - return NULL; + goto 
fail; } new_out = realloc(out, strm.total_out > 0 ? strm.total_out : 1); From 8b38a1bb001e02f34b878387b2e48df2b7581b4b Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sat, 2 Oct 2021 17:33:42 +0100 Subject: [PATCH 209/488] Add basic recognition of D4 quantitative interval format --- hts.c | 10 ++++++++++ htslib/hts.h | 1 + 2 files changed, 11 insertions(+) diff --git a/hts.c b/hts.c index 490a9905d..194fb1f7a 100644 --- a/hts.c +++ b/hts.c @@ -285,6 +285,7 @@ static enum htsFormatCategory format_category(enum htsExactFormat fmt) return index_file; case bed: + case d4_format: return region_list; case htsget: @@ -634,6 +635,13 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) fmt->version.major = 1, fmt->version.minor = -1; return 0; } + else if (len >= 8 && memcmp(s, "d4\xdd\xdd", 4) == 0) { + fmt->category = region_list; + fmt->format = d4_format; + // How to decode the D4 Format Version bytes is not yet specified + // so we don't try to set fmt->version.{major,minor}. + return 0; + } else if (cmp_nonblank("{\"htsget\":", s, &s[len]) == 0) { fmt->category = unknown_category; fmt->format = htsget; @@ -718,6 +726,7 @@ char *hts_format_description(const htsFormat *format) case gzi: kputs("GZI", &str); break; case tbi: kputs("Tabix", &str); break; case bed: kputs("BED", &str); break; + case d4_format: kputs("D4", &str); break; case htsget: kputs("htsget", &str); break; case hts_crypt4gh_format: kputs("crypt4gh", &str); break; case empty_format: kputs("empty", &str); break; @@ -1597,6 +1606,7 @@ const char *hts_format_file_extension(const htsFormat *format) { case gzi: return "gzi"; case tbi: return "tbi"; case bed: return "bed"; + case d4_format: return "d4"; case fasta_format: return "fa"; case fastq_format: return "fq"; default: return "?"; diff --git a/htslib/hts.h b/htslib/hts.h index 7c44dbd79..62cd4beec 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -205,6 +205,7 @@ enum htsExactFormat { empty_format, // File is empty (or empty after decompression) 
fasta_format, fastq_format, fai_format, fqi_format, hts_crypt4gh_format, + d4_format, format_maximum = 32767 }; From ee017a52a558ae89fdc8fa4e59600cf3b56cb49e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 5 Oct 2021 16:24:35 +0100 Subject: [PATCH 210/488] Fix a filter expression bug for flag.read2 This was compared with string length 6 instead of 5, meaning it included the nul terminator byte. This means "flag.read2" works at the end of strings only. Fixes samtools/samtools#1513 --- sam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sam.c b/sam.c index 1f02f428e..04b8b9372 100644 --- a/sam.c +++ b/sam.c @@ -1211,7 +1211,7 @@ static int bam_sym_lookup(void *data, char *str, char **end, *end = str+5; res->d = b->core.flag & BAM_FREAD1; return 0; - } else if (!memcmp(str, "read2", 6)) { + } else if (!memcmp(str, "read2", 5)) { *end = str+5; res->d = b->core.flag & BAM_FREAD2; return 0; From 0cd58034a148fb81ad5581f9fd5540dc4fec9ab8 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 5 Oct 2021 15:20:55 +0100 Subject: [PATCH 211/488] Add the new field as integer, not as bit field There was some uncertainty how #1327 would behave with programs and htslib on different endian platforms when the library and the program is compiled using different compilers. Adding the new field as an integer at the end of the structure was deemed safer. 
--- htslib/synced_bcf_reader.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index 86dd53a8e..78e9a0b4a 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -112,8 +112,7 @@ typedef struct bcf_sr_regions_t kstring_t line; // holder of the current line, set only when reading from tabix-indexed files htsFile *file; char *fname; - int is_bin:30, // is open in binary mode (tabix access) - overlap:2; // see BCF_SR_REGIONS_OVERLAP/BCF_SR_TARGETS_OVERLAP + int is_bin; // is open in binary mode (tabix access) char **als; // parsed alleles if targets_als set and _regions_match_alleles called kstring_t als_str; // block of parsed alleles int nals, mals; // number of set alleles and the size of allocated array @@ -134,6 +133,7 @@ typedef struct bcf_sr_regions_t hts_pos_t start, end; // current position: start, end of the region (0-based) int prev_seq; hts_pos_t prev_start, prev_end; + int overlap; // see BCF_SR_REGIONS_OVERLAP/BCF_SR_TARGETS_OVERLAP } bcf_sr_regions_t; From decd7a0c746ef02007c114d214252077ba645344 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Wed, 6 Oct 2021 15:53:37 +0100 Subject: [PATCH 212/488] Autumn 2021 Release Copyright Update --- test/test.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test.pl b/test/test.pl index e2e5f8b20..5ba6022b4 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1,6 +1,6 @@ #!/usr/bin/env perl # -# Copyright (C) 2012-2020 Genome Research Ltd. +# Copyright (C) 2012-2021 Genome Research Ltd. # # Author: Petr Danecek # From ab6065cd02a7cb4f104f2d20f25792ebc4c76406 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 15 Sep 2021 19:49:13 +0100 Subject: [PATCH 213/488] Print diagnostic for SAM input corrupted by aligner log messages Motivated by samtools/samtools#1499: When headers are malformed and an error is already being printed, if the malformation is due to a known (e.g. 
aligner) error or log message, add a note recommending using that tool's -o (or similar) option rather than the shell redirection that has inadvertently captured stderr too. Motivated by samtools/samtools#1458: Do this also in the case that the header type code is malformed (e.g. `@S[W::foo]Q`). Previously this was only detected in sam_hrecs_parse_lines() so could be spuriously accepted by e.g. `samtools view --no-PG`; this adds a check to sam_hdr_create() so that such invalid headers are always rejected when reading SAM. --- sam.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/sam.c b/sam.c index 04b8b9372..393a3b22e 100644 --- a/sam.c +++ b/sam.c @@ -1716,6 +1716,22 @@ sam_hdr_t *sam_hdr_parse(size_t l_text, const char *text) return bh; } +static int valid_sam_header_type(const char *s) { + if (s[0] != '@') return 0; + switch (s[1]) { + case 'H': + return s[2] == 'D' && s[3] == '\t'; + case 'S': + return s[2] == 'Q' && s[3] == '\t'; + case 'R': + case 'P': + return s[2] == 'G' && s[3] == '\t'; + case 'C': + return s[2] == 'O'; + } + return 0; +} + // Minimal sanitisation of a header to ensure. // - null terminated string. // - all lines start with @ (also implies no blank lines). @@ -1790,6 +1806,20 @@ static sam_hdr_t *sam_hdr_sanitise(sam_hdr_t *h) { return h; } +static void known_stderr(const char *tool, const char *advice) { + hts_log_warning("SAM file corrupted by embedded %s error/log message", tool); + hts_log_warning("%s", advice); +} + +static void warn_if_known_stderr(const char *line) { + if (strstr(line, "M::bwa_idx_load_from_disk") != NULL) + known_stderr("bwa", "Use `bwa mem -o file.sam ...` or `bwa sampe -f file.sam ...` instead of `bwa ... > file.sam`"); + else if (strstr(line, "M::mem_pestat") != NULL) + known_stderr("bwa", "Use `bwa mem -o file.sam ...` instead of `bwa mem ... 
> file.sam`"); + else if (strstr(line, "loaded/built the index") != NULL) + known_stderr("minimap2", "Use `minimap2 -o file.sam ...` instead of `minimap2 ... > file.sam`"); +} + static sam_hdr_t *sam_hdr_create(htsFile* fp) { kstring_t str = { 0, 0, NULL }; khint_t k; @@ -1868,13 +1898,21 @@ static sam_hdr_t *sam_hdr_create(htsFile* fp) { } } else { hts_log_warning("Ignored @SQ SN:%s : bad or missing LN tag", sn); + warn_if_known_stderr(fp->line.s); free(sn); } } else { hts_log_warning("Ignored @SQ line with missing SN: tag"); + warn_if_known_stderr(fp->line.s); } sn = NULL; } + else if (!valid_sam_header_type(fp->line.s)) { + hts_log_error("Invalid header line: must start with @HD/@SQ/@RG/@PG/@CO"); + warn_if_known_stderr(fp->line.s); + goto error; + } + if (kputsn(fp->line.s, fp->line.l, &str) < 0) goto error; From 7ecf4e4153d3d3d2ec0adb24611369494a41427d Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 7 Oct 2021 18:31:41 +0100 Subject: [PATCH 214/488] NEWS updates for autumn 2021 release --- NEWS | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 06dc8defb..c07bc5930 100644 --- a/NEWS +++ b/NEWS @@ -1,12 +1,95 @@ Noteworthy changes in release a.b ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* New API function hts_flush()/sam_flush()/bcf_flush() for flushing output - htsFile/samFile/vcfFile streams. (PR #1326, thanks to John Marshall) +Features and Updates +-------------------- * Added a keep option to bgzip to leave the original file untouched. This - brings bgzip into line with gzip. (PR #1331, thanks to Alex Petty). + brings bgzip into line with gzip. (PR #1331, thanks to Alex Petty) + +* "endpos" has been added to the filter language, giving the position + of the rightmost mapped base as measured by the CIGAR string. For + unmapped reads it is the same as "pos". 
(PR #1307, thanks to John Marshall) + +* Interfaces have been added to interpret the new base modification tags + added to the SAMtags document in samtools/hts-specs#418. (PR #1132) + +* New API functions hts_flush()/sam_flush()/bcf_flush() for flushing output + htsFile/samFile/vcfFile streams. (PR #1326, thanks to John Marshall) + +* The synced_bcf_reader now sorts lines with symbolic alleles by END tag as + well as POS. (PR #1321) + +* Added synced_bcf_reader options BCF_SR_REGIONS_OVERLAP and + BCF_SR_TARGETS_OVERLAP for better control of how records that start outside + the desired region but overlap it are handled. Fixes samtools/bcftools#1420 + and samtools/bcftools#1421 raised by John Marshall. (PR #1327) + +* HTSlib will now accept long-cigar CG:B: tags made by htsjdk which don't + quite follow the specification properly (using signed values instead of + unsigned). Thanks to Colin Diesh for reporting an example file. (PR #1317) + +* The warning printed when the BGZF reader finds a file with no EOF block + has been changed to be less alarming. Unfortunately some third-party + BGZF encoders don't write EOF blocks at the end of files. Thanks to + Keiran Raine for reporting an example file. (PR #1323) + +* The FASTA and FASTQ readers get an option to skip over the first item on + the header line, and use the second as the read name. It allows the original + name to be restored on some of the fastq files served from the European + Nucleotide Archive (ENA). (PR #1325) + +* HTSlib is now more strict when parsing the VCF samples line (beginning + #CHROM). It will only accept tabs between the mandatory field names and + sample names must be separated with tabs. (PR #1328) + +* HTSlib will now warn if it looks like the header has been corrupted + by diagnostic messages from the program that made it. This can happen when + using `nohup`, which by default mixes stdout and stderr into the same + stream. 
(PR #1339, thanks to John Marshall) + +* File format detection will now recognise signatures for XZ, Zstd and D4 + files (note that HTSlib will not read them yet). (PR #1340, thanks to + John Marshall) + +Build changes +------------- + +These are compiler, configuration and makefile based changes. + +* Some redundant tests have been removed from the test harness, speeding it up. + (PR #1308) + +* The version.sh script now works better on shallow checkouts. (PR #1324) + +* A check-untracked Makefile target has been added to catch untracked files + (mostly) left by the test harness. (PR #1324) + +Bug fixes +--------- + +* Fixed a case where flushing the thread pool could very occasionally cause + a deadlock. (PR #1309) + +* Fixed a bug where some CRAM files could fail to decode if the required_fields + option was in use. Thanks to Matt Sexton for reporting the issue. + (PR #1314, fixes samtools/samtools#1475) + +* Fixed a regression where the S3 plugin could not read public files unless + you supplied some Amazon credentials. Thanks to Chris Saunders for reporting. + (PR #1332, fixes samtools/samtools#1491) + +* Fixed a possible CRAM thread deadlock discovered by @ryancaicse. + (PR #1330, fixes #1329) + +* Some set-but-unused variables have been removed. (PR #1334) + +* Fixed a bug which prevented "flag.read2" from working in the filter + language unless it was at the end of the expression. Thanks to Vamsi Kodali + for reporting the issue. (PR #1342) +* Fixed a memory leak that could happen if CRAM fails to inflate an LZMA + block. (PR #1340, thanks to John Marshall) Noteworthy changes in release 1.13 (7th July 2021) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From b136e5da63234fa467e7abe189e95086ce547349 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 15 Oct 2021 16:03:05 +0100 Subject: [PATCH 215/488] Resolve icc 2021 warning about HTS_DEPRECATED_ENUM.
In file included from htslib/synced_bcf_reader.h(59), from bcf_sr_sort.h(37), from bcf_sr_sort.c(31): htslib/hts.h(204): warning #2650: attributes ignored here json HTS_DEPRECATED_ENUM("Use htsExactFormat 'htsget' instead") = htsget, Note that the normal HTS_DEPRECATED is fine, so it appears to only be problematic when used with an enum. (It's possible this is a new issue in icc 2021, but the licenses have expired for all the old installs.) --- htslib/hts_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/hts_defs.h b/htslib/hts_defs.h index 8b24167e9..7719215c1 100644 --- a/htslib/hts_defs.h +++ b/htslib/hts_defs.h @@ -79,7 +79,7 @@ DEALINGS IN THE SOFTWARE. */ #define HTS_DEPRECATED(message) #endif -#if HTS_COMPILER_HAS(__deprecated__) || HTS_GCC_AT_LEAST(6,4) +#if (HTS_COMPILER_HAS(__deprecated__) || HTS_GCC_AT_LEAST(6,4)) && !defined(__ICC) #define HTS_DEPRECATED_ENUM(message) __attribute__ ((__deprecated__ (message))) #else #define HTS_DEPRECATED_ENUM(message) From f674651974de23bb73b6ef702b460e74fe2348dd Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 29 Oct 2021 10:35:49 +0100 Subject: [PATCH 216/488] Fill in cr->rg when writing CRAM v1 As per the other branches of the if-else, set cr->rg. Also rewrite the assert: the previous code produced "brg is unused" warnings when compiled with -DNDEBUG. --- cram/cram_encode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 192278f17..468822247 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -3057,7 +3057,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, cr->rg = brg ? 
brg->id : -1; } else if (CRAM_MAJOR_VERS(fd->version) == 1) { sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN"); - assert(brg); + if (!brg) goto block_err; + cr->rg = brg->id; } else { cr->rg = -1; } From 9045785cef02e7cd97b0b4caecbdcaf6217a986a Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sun, 31 Oct 2021 20:37:11 +0000 Subject: [PATCH 217/488] Add hts_detect_format2() that also considers filename extensions Format detection to date uses only the stream contents, as filenames are not always available (e.g., when reading from standard input) or may be inaccurate or unexpected. However there are a very few cases where the filename extension is important: * FASTA/Q indexes (uncommon for hts_open()) are a particular case of 5/6-column BED files (comparatively common). We don't want to misrecognise any actual BED files as FASTA/Q indexes, so require a .fai/.fqi extension for the latter -- which are unlikely to appear on standard input anyway, so filenames will usually be available. * GZI indexes have not previously been recognised, as they have no magic numbers. They can now be recognised by their .gzi extension. Fixes #1085, fixes #1165, and fixes #1347. 
--- hts.c | 38 +++++++++++++++++++++++++++++++------- htsfile.c | 2 +- htslib/hts.h | 19 +++++++++++++++++++ 3 files changed, 51 insertions(+), 8 deletions(-) diff --git a/hts.c b/hts.c index 194fb1f7a..9d5af1289 100644 --- a/hts.c +++ b/hts.c @@ -505,7 +505,12 @@ static int colmatch(const char *columns, const char *pattern) int hts_detect_format(hFILE *hfile, htsFormat *fmt) { - char columns[24]; + return hts_detect_format2(hfile, NULL, fmt); +} + +int hts_detect_format2(hFILE *hfile, const char *fname, htsFormat *fmt) +{ + char extension[HTS_MAX_EXT_LEN], columns[24]; unsigned char s[1024]; int complete = 0; ssize_t len = hpeek(hfile, s, 18); @@ -568,6 +573,18 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) return 0; } + // We avoid using filename extensions wherever possible (as filenames are + // not always available), but in a few cases they must be considered: + // - FASTA/Q indexes are simply tab-separated text; files that match these + // patterns but not the fai/fqi extension are usually generic BED files + // - GZI indexes have no magic numbers so can only be detected by filename + if (fname && strcmp(fname, "-") != 0) { + char *s; + if (find_file_extension(fname, extension) < 0) extension[0] = '\0'; + for (s = extension; *s; s++) *s = tolower_c(*s); + } + else extension[0] = '\0'; + if (len >= 6 && memcmp(s,"CRAM",4) == 0 && s[4]>=1 && s[4]<=7 && s[5]<=7) { fmt->category = sequence_data; fmt->format = cram; @@ -613,6 +630,13 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) fmt->format = tbi; return 0; } + // GZI indexes have no magic numbers, so must be recognised solely by + // filename extension. 
+ else if (strcmp(extension, "gzi") == 0) { + fmt->category = index_file; + fmt->format = gzi; + return 0; + } } else if (len >= 16 && memcmp(s, "##fileformat=VCF", 16) == 0) { fmt->category = variant_data; @@ -679,12 +703,12 @@ int hts_detect_format(hFILE *hfile, htsFormat *fmt) fmt->format = crai; return 0; } - else if (colmatch(columns, "Ziiiii") == 6) { + else if (strstr(extension, "fqi") && colmatch(columns, "Ziiiii") == 6) { fmt->category = index_file; fmt->format = fqi_format; return 0; } - else if (colmatch(columns, "Ziiii") == 5) { + else if (strstr(extension, "fai") && colmatch(columns, "Ziiii") == 5) { fmt->category = index_file; fmt->format = fai_format; return 0; @@ -1367,7 +1391,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) if (strchr(simple_mode, 'r')) { const int max_loops = 5; // Should be plenty int loops = 0; - if (hts_detect_format(hfile, &fp->format) < 0) goto error; + if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error; // Deal with formats that re-direct an underlying file via a plug-in. 
// Loops as we may have crypt4gh served via htsget, or @@ -1392,7 +1416,7 @@ htsFile *hts_hopen(hFILE *hfile, const char *fn, const char *mode) } // Re-detect format against the result of the redirection - if (hts_detect_format(hfile, &fp->format) < 0) goto error; + if (hts_detect_format2(hfile, fn, &fp->format) < 0) goto error; } } else if (strchr(simple_mode, 'w') || strchr(simple_mode, 'a')) { @@ -2014,7 +2038,7 @@ int hts_file_type(const char *fname) if (f == NULL) return 0; htsFormat fmt; - if (hts_detect_format(f, &fmt) < 0) { hclose_abruptly(f); return 0; } + if (hts_detect_format2(f, fname, &fmt) < 0) { hclose_abruptly(f); return 0; } if (hclose(f) < 0) return 0; switch (fmt.format) { @@ -4220,7 +4244,7 @@ static int idx_test_and_fetch(const char *fn, const char **local_fn, int *local_ free(s.s); return -1; } - if (hts_detect_format(remote_hfp, &fmt)) { + if (hts_detect_format2(remote_hfp, fn, &fmt)) { hts_log_error("Failed to detect format of index file '%s'", fn); goto fail; } diff --git a/htsfile.c b/htsfile.c index d6d6b4e69..bbddc1c39 100644 --- a/htsfile.c +++ b/htsfile.c @@ -283,7 +283,7 @@ int main(int argc, char **argv) if (mode == identify) { htsFormat fmt; - if (hts_detect_format(fp, &fmt) < 0) { + if (hts_detect_format2(fp, argv[i], &fmt) < 0) { error("detecting \"%s\" format failed", argv[i]); hclose_abruptly(fp); continue; diff --git a/htslib/hts.h b/htslib/hts.h index 4fff1321b..48c263cab 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -534,10 +534,29 @@ const char *hts_feature_string(void); @param fp File opened for reading, positioned at the beginning @param fmt Format structure that will be filled out on return @return 0 for success, or negative if an error occurred. + + Equivalent to hts_detect_format2(fp, NULL, fmt). */ HTSLIB_EXPORT int hts_detect_format(struct hFILE *fp, htsFormat *fmt); +/*! 
+ @abstract Determine format primarily by peeking at the start of a file + @param fp File opened for reading, positioned at the beginning + @param fname Name of the file, or NULL if not available + @param fmt Format structure that will be filled out on return + @return 0 for success, or negative if an error occurred. + @since 1.15 + +Some formats are only recognised if the filename is available and has the +expected extension, as otherwise more generic files may be misrecognised. +In particular: + - FASTA/Q indexes must have .fai/.fqi extensions; without this requirement, + some similar BED files would be misrecognised as indexes. +*/ +HTSLIB_EXPORT +int hts_detect_format2(struct hFILE *fp, const char *fname, htsFormat *fmt); + /*! @abstract Get a human-readable description of the file format @param fmt Format structure holding type, version, compression, etc. From 2f2318e7e5a62dc8d2ef5a147b4d521e4dbb8981 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 9 Nov 2021 15:39:26 +0000 Subject: [PATCH 218/488] Update cirrus-ci macos from Catalina to Monterey. 
--- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 2e45de544..ea97dbc64 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -146,7 +146,7 @@ centos_task: macosx_task: name: macosx + clang osx_instance: - image: catalina-base + image: monterey-base environment: CC: clang From f867feab092936b4a7df294e1fe4fd59fc87fffe Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 9 Nov 2021 16:27:28 +0000 Subject: [PATCH 219/488] Update to latest htscodecs - Fix off-by-one error on initial size check for arith_dynamic - Work around an icc-2021 compiler bug (and speed up varint too) --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index ed325d7e4..b6615cdf5 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit ed325d7e406926b2be7697bdb185cf1692879c2a +Subproject commit b6615cdf542a83badc805d2195bc514220f19ce6 From 049574852175e0850234d154c0ccc0822a940c67 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 10 Nov 2021 17:12:38 +0000 Subject: [PATCH 220/488] Make bcf_remove_allele_set() work on unpacked records (#1358) The function accesses alleles, info and genotypes, so everything needs to be unpacked.
Resolves #1353 --- vcfutils.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vcfutils.c b/vcfutils.c index 4153c5c60..b2a057a75 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -256,10 +256,10 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int *map = (int*) calloc(line->n_allele, sizeof(int)); uint8_t *dat = NULL; + bcf_unpack(line, BCF_UN_ALL); + // create map of indexes from old to new ALT numbering and modify ALT kstring_t str = {0,0,0}; - if (!line->d.allele) - bcf_unpack(line, BCF_UN_STR); kputs(line->d.allele[0], &str); int nrm = 0, i,j; // i: ori alleles, j: new alleles From 1d79f449cb3b02eda8fc151556395b7b50ccd730 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 3 Nov 2021 14:46:52 +0000 Subject: [PATCH 221/488] Fix indexing bug with "placed" unmapped reads. Unmapped-but-placed (having REF/POS) reads are not included in the index. Hence if a placed unmapped read is the first record in a bin, then it may not be returned. Note most aligners write out mapped followed by unmapped which does not trigger this problem. The SAM spec states that all unmapped placed reads should be considered as having an alignment length of 1. While it doesn't seem to explicitly state these must therefore be in the index, it does imply it. It appears that picard also indexes placed reads in this manner. Originally reported by John Marshall.
Fixes #1142 --- hts.c | 14 ++++++-------- test/index2.sam | 11 +++++++++++ test/test.pl | 25 +++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 8 deletions(-) create mode 100644 test/index2.sam diff --git a/hts.c b/hts.c index 9d5af1289..4bbe41eaa 100644 --- a/hts.c +++ b/hts.c @@ -2408,14 +2408,12 @@ int hts_idx_push(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end, uint64_t if ( tid>=0 ) { if (idx->bidx[tid] == 0) idx->bidx[tid] = kh_init(bin); - if (is_mapped) { - // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin - if (beg < 0) beg = 0; - if (end <= 0) end = 1; - // idx->z.last_off points to the start of the current record - if (insert_to_l(&idx->lidx[tid], beg, end, - idx->z.last_off, idx->min_shift) < 0) return -1; - } + // shoehorn [-1,0) (VCF POS=0) into the leftmost bottom-level bin + if (beg < 0) beg = 0; + if (end <= 0) end = 1; + // idx->z.last_off points to the start of the current record + if (insert_to_l(&idx->lidx[tid], beg, end, + idx->z.last_off, idx->min_shift) < 0) return -1; } else idx->n_no_coor++; bin = hts_reg2bin(beg, end, idx->min_shift, idx->n_lvls); diff --git a/test/index2.sam b/test/index2.sam new file mode 100644 index 000000000..97d39e65b --- /dev/null +++ b/test/index2.sam @@ -0,0 +1,11 @@ +@HD VN:1.4 SO:coordinate +@SQ SN:1 LN:249250621 M5:1b22b98cdeb4a9304cb5d48026a85128 +@SQ SN:2 LN:243199373 M5:a0d9851da00400dec1098a9255ac712e +um1 69 1 1000000 0 * * 0 0 AAAAAAAAAA * +um1 137 1 1000000 44 10M * 0 0 AAAAAAAAAA * +um2 69 1 2000000 0 * * 0 0 AAAAAAAAAA * +um2 137 1 2000000 44 10M * 0 0 AAAAAAAAAA * +mu1 137 2 1000000 44 10M * 0 0 AAAAAAAAAA * +mu1 69 2 1000000 0 * * 0 0 AAAAAAAAAA * +mu2 137 2 2000000 44 10M * 0 0 AAAAAAAAAA * +mu2 69 2 2000000 0 * * 0 0 AAAAAAAAAA * diff --git a/test/test.pl b/test/test.pl index 5ba6022b4..ff5601b0d 100755 --- a/test/test.pl +++ b/test/test.pl @@ -840,6 +840,31 @@ sub test_index $wtmp =~ s/\//\\\\/g; } test_cmd($opts,out=>'tabix.out',cmd=>"$$opts{bin}/tabix 
$wtmp/index.vcf.gz##idx##$wtmp/index.vcf.gz.tbi 1:10000060-10000060"); + + cmd("$$opts{path}/test_view -b -p $$opts{tmp}/index2.bam -x $$opts{tmp}/index2.bam.bai $$opts{path}/index2.sam"); + for (my $tid = 1; $tid <= 2; $tid++) { + for (my $pos = 1; $pos <= 2; $pos++) { + # All queries should return exactly two sequences. + # The input data consists of mapped/unmapped and unmapped/mapped + # in both orders. + # Done verbatim as test_cmd cannot return $out for us to check. + my $test = "$$opts{path}/test_view $$opts{tmp}/index2.bam $tid:${pos}000000-${pos}000000"; + print "test_index:\n\t$test\n"; + my ($ret, $out) = _cmd($test); + if ($ret ne 0) { + failed($opts, $test); + } else { + my $rnum = ($out =~ s/^[^@].*\n//gm); + if ($rnum ne 2) { + failed($opts, $test); + } else { + passed($opts, $test); + } + } + } + } + unlink("$$opts{tmp}/index2.bam"); + unlink("$$opts{tmp}/index2.bam.bai"); } sub test_bcf2vcf From f1ac6bbdc2bf44c0464ceed2f9aa1675bb1dcbfe Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 17 Nov 2021 14:30:55 +0000 Subject: [PATCH 222/488] Add pathA##idx##pathB support for CRAM indices. 
Fixes samtools/samtools#1541 --- cram/cram_index.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cram/cram_index.c b/cram/cram_index.c index b567d3e05..45d420df2 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -173,6 +173,11 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { idx_stack[idx_stack_ptr] = idx; + // Support pathX.cram##idx##pathY.crai + const char *fn_delim = strstr(fn, HTS_IDX_DELIM); + if (fn_delim && !fn_idx) + fn_idx = fn_delim + strlen(HTS_IDX_DELIM); + if (!fn_idx) { if (hts_idx_check_local(fn, HTS_FMT_CRAI, &tfn_idx) == 0 && hisremote(fn)) tfn_idx = hts_idx_getfn(fn, ".crai"); From e769401d18c19ce6150a1d40bdccd2af814e6b1c Mon Sep 17 00:00:00 2001 From: David Seifert Date: Sat, 11 Dec 2021 12:47:51 +0100 Subject: [PATCH 223/488] Make public API `-Wdouble-promotion` clean --- htslib/sam.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index d37877e48..79ac26e4b 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1492,7 +1492,8 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, ++s; } else if (type == 'f') { if (end - s >= 4) { - ksprintf(ks, "f:%g", le_to_float(s)); + // cast to avoid triggering -Wdouble-promotion + ksprintf(ks, "f:%g", (double)le_to_float(s)); s += 4; } else goto bad_aux; @@ -1594,7 +1595,8 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, if (ks_expand(ks, n*8) < 0) goto mem_err; for (i = 0; i < n; ++i) { ks->s[ks->l++] = ','; - r |= kputd(le_to_float(s), ks) < 0; + // cast to avoid triggering -Wdouble-promotion + r |= kputd((double)le_to_float(s), ks) < 0; s += 4; } break; From dc8a63199c8908505a4677ad04468396f2185d17 Mon Sep 17 00:00:00 2001 From: pd3 Date: Thu, 9 Dec 2021 10:51:32 +0000 Subject: [PATCH 224/488] Make repeated seeks possible Repeated seeks with implicitly created region list wouldn't initialize all internal structures to the original clean state, as demonstrated by the issue #1362, 
resolved by this commit --- synced_bcf_reader.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index dae70099d..ce2d7c780 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -75,6 +75,7 @@ static int _regions_add(bcf_sr_regions_t *reg, const char *chr, hts_pos_t start, static bcf_sr_regions_t *_regions_init_string(const char *str); static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec); static void _regions_sort_and_merge(bcf_sr_regions_t *reg); +static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end, int missed_reg_handler); char *bcf_sr_strerror(int errnum) { @@ -838,6 +839,7 @@ static void bcf_sr_seek_start(bcf_srs_t *readers) for (i=0; inseqs; i++) reg->regs[i].creg = -1; reg->iseq = 0; + reg->prev_seq = -1; } @@ -851,8 +853,18 @@ int bcf_sr_seek(bcf_srs_t *readers, const char *seq, hts_pos_t pos) bcf_sr_seek_start(readers); return 0; } - bcf_sr_regions_overlap(readers->regions, seq, pos, pos); + int i, nret = 0; + + // Need to position both the readers and the regions. The latter is a bit of a mess + // because we can have in memory or external regions. 
The safe way is: + // - reset all regions as if they were not read from at all (bcf_sr_seek_start) + // - find the requested iseq (stored in the seq_hash) + // - position regions to the requested position (bcf_sr_regions_overlap) + bcf_sr_seek_start(readers); + if ( khash_str2int_get(readers->regions->seq_hash, seq, &i)>=0 ) readers->regions->iseq = i; + _bcf_sr_regions_overlap(readers->regions, seq, pos, pos, 0); + for (i=0; inreaders; i++) { nret += _reader_seek(&readers->readers[i],seq,pos,MAX_CSI_COOR-1); @@ -1406,14 +1418,20 @@ static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *re } int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end) +{ + return _bcf_sr_regions_overlap(reg,seq,start,end,1); +} + +static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end, int missed_reg_handler) { int iseq; if ( khash_str2int_get(reg->seq_hash, seq, &iseq)<0 ) return -1; // no such sequence + if ( missed_reg_handler && !reg->missed_reg_handler ) missed_reg_handler = 0; if ( reg->prev_seq==-1 || iseq!=reg->prev_seq || reg->prev_start > start ) // new chromosome or after a seek { // flush regions left on previous chromosome - if ( reg->missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 ) + if ( missed_reg_handler && reg->prev_seq!=-1 && reg->iseq!=-1 ) bcf_sr_regions_flush(reg); bcf_sr_regions_seek(reg, seq); @@ -1427,7 +1445,7 @@ int bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t sta { if ( bcf_sr_regions_next(reg) < 0 ) return -2; // no more regions left if ( reg->iseq != iseq ) return -1; // does not overlap any regions - if ( reg->missed_reg_handler && reg->end < start ) reg->missed_reg_handler(reg, reg->missed_reg_data); + if ( missed_reg_handler && reg->end < start ) reg->missed_reg_handler(reg, reg->missed_reg_data); } if ( reg->start <= end ) return 0; // region overlap return -1; // no overlap From 
72886381d348aae422a86d3e9f99a9ade5257d4c Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 7 Jan 2022 09:43:34 +0000 Subject: [PATCH 225/488] Fix missing autotools on Appveyor msys2 commit a37deb34 removed autotools from their base-devel package, so we now need to ask for it explicitly. --- .appveyor.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.appveyor.yml b/.appveyor.yml index 45550bade..eb3059750 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -27,7 +27,7 @@ install: - set MSYSTEM=MINGW64 - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% - set MINGWPREFIX=x86_64-w64-mingw32 - - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl\"" + - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-autotools mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl\"" build_script: - set HOME=. From 7d3eeec730321544a09c85dfb53e1bfd307a4d49 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 6 Jan 2022 13:31:03 +0000 Subject: [PATCH 226/488] Prevent empty FORMAT fields and output ":.:" instead of "::" --- vcfutils.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/vcfutils.c b/vcfutils.c index b2a057a75..d99e78ffb 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -747,7 +747,7 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb nnew = nR_new; } - #define BRANCH(type_t,is_vector_end) \ + #define BRANCH(type_t,is_vector_end,set_missing) \ { \ for (j=0; jn_sample; j++) \ { \ @@ -757,7 +757,12 @@ int bcf_remove_allele_set(const bcf_hdr_t *header, bcf1_t *line, const struct kb int k_src, k_dst = 0; \ for (k_src=0; k_src Date: Fri, 7 Jan 2022 11:15:50 +0000 Subject: [PATCH 227/488] Fix CRAM region queries when CIGAR isn't being decoded. 
If using the required_fields option to select columns for decoding, excluding CIGAR, then the sequence end position (cr->apos to cr->aend) is not set correctly. With CIGAR computed, ref_pos is increased by each CIGAR option. Without it it'll end up being equivalent to cr->apos-1. Now it is always minimum cr->apos itself (ie covers apos to apos inclusive). The impact of this is filtering for the entirety of a single chromosome could leave a sequence as pos 1 with apos=1 aend=0, which then rejected the sequence as aend < 1 (for region chr:1-LEN). I think this also fixes samtools/samtools#1574, but cannot be sure without confirmation. --- cram/cram_decode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index e65e11d08..e6b4977b7 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1777,7 +1777,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } cr->ncigar = ncigar - cr->cigar; - cr->aend = ref_pos; + cr->aend = ref_pos > cr->apos ? ref_pos : cr->apos; //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos); From ac22d4d4fb2914b1f64559940b2a11cbf8102127 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 10 Jan 2022 11:49:44 +0000 Subject: [PATCH 228/488] Happy New Year --- LICENSE | 4 ++-- bgzip.c | 2 +- htsfile.c | 2 +- tabix.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/LICENSE b/LICENSE index 5075c3f53..cf2e9f82a 100644 --- a/LICENSE +++ b/LICENSE @@ -3,7 +3,7 @@ according to the terms of the following MIT/Expat license.] The MIT/Expat License -Copyright (C) 2012-2021 Genome Research Ltd. +Copyright (C) 2012-2022 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,7 +29,7 @@ according to the terms of the following Modified 3-Clause BSD license.] 
The Modified-BSD License -Copyright (C) 2012-2020 Genome Research Ltd. +Copyright (C) 2012-2022 Genome Research Ltd. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/bgzip.c b/bgzip.c index f9abf99c5..bd0374811 100644 --- a/bgzip.c +++ b/bgzip.c @@ -173,7 +173,7 @@ int main(int argc, char **argv) case 1: printf( "bgzip (htslib) %s\n" -"Copyright (C) 2021 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2022 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 'h': return bgzip_main_usage(stdout, EXIT_SUCCESS); case '?': return bgzip_main_usage(stderr, EXIT_FAILURE); diff --git a/htsfile.c b/htsfile.c index bbddc1c39..0391a98fa 100644 --- a/htsfile.c +++ b/htsfile.c @@ -258,7 +258,7 @@ int main(int argc, char **argv) case 1: printf( "htsfile (htslib) %s\n" -"Copyright (C) 2021 Genome Research Ltd.\n", +"Copyright (C) 2022 Genome Research Ltd.\n", hts_version()); exit(EXIT_SUCCESS); break; diff --git a/tabix.c b/tabix.c index 969f902f6..a79a7b968 100644 --- a/tabix.c +++ b/tabix.c @@ -581,7 +581,7 @@ int main(int argc, char *argv[]) case 1: printf( "tabix (htslib) %s\n" -"Copyright (C) 2021 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2022 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 2: return usage(stdout, EXIT_SUCCESS); From 4289991d9c37981d6ef1c1436ec6e791d9993437 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 11 Jan 2022 12:11:22 +0000 Subject: [PATCH 229/488] Provide a definition of ssize_t when compiling with MSVC Define `ssize_t` within the public headers that use it, enabling them to be compiled with MSVC, which does not provide this type definition in its or any other headers. Having `ssize_t` as a macro may confuse other headers, so be sure to do this only after all other #includes and to undefine it again at the end of each header. 
User code or other libraries may also have workarounds for this issue, so checking it's not already #defined and removing our `ssize_t` macro afterwards also avoids conflicting with other workarounds. Under MSVC, including //etc also causes to be included, which defines `size_t` and `intptr_t` as unsigned/signed versions of the same type. So we define `ssize_t` as that, if it is not already defined (by some other workaround) and if `_INTPTR_T_DEFINED` has indeed been defined by (to avoid confusing error messages). We also check `_SSIZE_T_DEFINED`, which MSVC will likely define if it's ever fixed to supply this typedef. Also define it (permanently) in <htslib/hts_os.h>, unless suppressed by defining `HTS_NO_SSIZE_T` before including the header. --- htslib/bgzf.h | 12 ++++++++++++ htslib/hfile.h | 12 ++++++++++++ htslib/hts_os.h | 6 ++++++ htslib/knetfile.h | 12 ++++++++++++ htslib/kstring.h | 12 ++++++++++++ htslib/sam.h | 12 ++++++++++++ 6 files changed, 66 insertions(+) diff --git a/htslib/bgzf.h b/htslib/bgzf.h index 8e6b9b17e..24d787bdf 100644 --- a/htslib/bgzf.h +++ b/htslib/bgzf.h @@ -35,6 +35,13 @@ #include "hts_defs.h" +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header. +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + #ifdef __cplusplus extern "C" { #endif @@ -450,4 +457,9 @@ typedef struct BGZF BGZF; } #endif +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif diff --git a/htslib/hfile.h b/htslib/hfile.h index 987acb7c8..038591cbc 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -32,6 +32,13 @@ DEALINGS IN THE SOFTWARE. */ #include "hts_defs.h" +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header.
+#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + #ifdef __cplusplus extern "C" { #endif @@ -368,4 +375,9 @@ int hfile_has_plugin(const char *name); } #endif +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif diff --git a/htslib/hts_os.h b/htslib/hts_os.h index b71cb89e7..c715b0612 100644 --- a/htslib/hts_os.h +++ b/htslib/hts_os.h @@ -77,4 +77,10 @@ extern int is_cygpty(int fd); #define random rand #endif +/* MSVC does not provide ssize_t in its . This ensures the type + is available (unless suppressed by defining HTS_NO_SSIZE_T first). */ +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined HTS_NO_SSIZE_T && !defined ssize_t +#define ssize_t intptr_t +#endif + #endif // HTSLIB_HTS_OS_H diff --git a/htslib/knetfile.h b/htslib/knetfile.h index da9cdc5e8..cfddd6b67 100644 --- a/htslib/knetfile.h +++ b/htslib/knetfile.h @@ -44,6 +44,13 @@ #define netclose(fd) closesocket(fd) #endif +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header. +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + // FIXME: currently I/O is unbuffered #define KNF_TYPE_LOCAL 1 @@ -102,4 +109,9 @@ extern "C" { } #endif +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif diff --git a/htslib/kstring.h b/htslib/kstring.h index 150757ca6..09bc9e3d9 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -55,6 +55,13 @@ #endif #endif +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header. 
+#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + /* kstring_t is a simple non-opaque type whose fields are likely to be * used directly by user code (but see also ks_str() and ks_len() below). * A kstring_t object is initialised by either of @@ -396,4 +403,9 @@ static inline int *ksplit(kstring_t *s, int delimiter, int *n) return offsets; } +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif diff --git a/htslib/sam.h b/htslib/sam.h index 79ac26e4b..e61b9a779 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -33,6 +33,13 @@ DEALINGS IN THE SOFTWARE. */ #include "hts.h" #include "hts_endian.h" +// Ensure ssize_t exists within this header. All #includes must precede this, +// and ssize_t must be undefined again at the end of this header. +#if defined _MSC_VER && defined _INTPTR_T_DEFINED && !defined _SSIZE_T_DEFINED && !defined ssize_t +#define HTSLIB_SSIZE_T +#define ssize_t intptr_t +#endif + #ifdef __cplusplus extern "C" { #endif @@ -2266,4 +2273,9 @@ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, } #endif +#ifdef HTSLIB_SSIZE_T +#undef HTSLIB_SSIZE_T +#undef ssize_t +#endif + #endif From 4424468ef856fcd0bda07ff58a8f9f6f155b4a73 Mon Sep 17 00:00:00 2001 From: Aidan Bickford Date: Mon, 24 Jan 2022 10:12:28 -0700 Subject: [PATCH 230/488] Update docs: Iterate through bam1_t records (PR #1381) I think this needs to iterate through the records, as it is just passing the index right now. Ensures 'recs' was allocated when cleaning up. Co-authored-by: Rob Davies --- htslib/sam.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index e61b9a779..ea585a45a 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -944,10 +944,12 @@ void bam_destroy1(bam1_t *b); // ... use data ... 
cleanup: - for (size_t i = 0; i < nrecs; i++) - bam_destroy1(i); + if (recs) { + for (size_t i = 0; i < nrecs; i++) + bam_destroy1(&recs[i]); + free(recs); + } free(buffer); - free(recs); \endcode */ From 4469ab18377e7fc074c0e38e8aba0f4e886daaaf Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 25 Jan 2022 12:42:38 +0000 Subject: [PATCH 231/488] Permit 2 letter extension in find_file_extension. sam_open_mode checks for ".fa" and ".fq" extension types, but these were never returned before as it had a hard rule of >= 3 characters. --- hts_internal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hts_internal.h b/hts_internal.h index b4aa4c0d9..61956da21 100644 --- a/hts_internal.h +++ b/hts_internal.h @@ -139,7 +139,8 @@ static inline int find_file_extension(const char *fn, char ext_out[static HTS_MA { for (ext--; ext > fn && *ext != '.' && *ext != '/'; --ext) {} } - if (*ext != '.' || delim - ext > HTS_MAX_EXT_LEN || delim - ext < 4) return -1; + if (*ext != '.' || delim - ext > HTS_MAX_EXT_LEN || delim - ext < 3) + return -1; memcpy(ext_out, ext + 1, delim - ext - 1); ext_out[delim - ext - 1] = '\0'; return 0; From b28a043b2ae11b02b5de98e57a74d74acca49f1c Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 31 Jan 2022 15:05:23 +0000 Subject: [PATCH 232/488] Replace CentOS test build with Rocky Linux CentOS has moved to a rolling distribution. Rocky Linux is more like the old CentOS, being a community build of the current RedHat. 
See https://wiki.rockylinux.org/ --- .cirrus.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index ea97dbc64..83e36af9c 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -117,11 +117,11 @@ ubuntu_task: << : *TEST -# CentOS -centos_task: - name: centos-gcc +# Rocky Linux +rocky_task: + name: rockylinux-gcc container: - image: centos:latest + image: rockylinux:latest cpu: 2 memory: 1G From e2d1f256b5bd0c1e23676087abf0f84078fc7044 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 20 Jan 2022 11:32:25 +0000 Subject: [PATCH 233/488] Detect libdeflate >= 1.9 and adjust CRAM RN encoding. Previously for read names compression levels between 4 and 7 zlib considerably beat libdeflate due to libdeflate's poor selection of minimum match length. This was raised as issue ebiggers/libdeflate#85. It's now been resolved in libdeflate 1.9 (along with some general improvements elsewhere), and in all cases libdeflate is a better choice. Also fixed the mapping of levels 1..9 (standard zlib) to 1..12 (libdeflate). The maths in the comment was incorrect as it's an integer calculation not a floating point one. Figures from converting 1 million NovaSeq records from BAM to CRAM 3.0: Libdeflate 1.9+PR-7 0m43.816s 204732408 48381374 (RN=libdeflate) Libdeflate 1.8-7 0m45.379s 206626451 50580708 (RN=zlib) Libdeflate 1.8-7 1m1.431s 210172035 *54126292 (RN=libdeflate, forced) Zlib only -7 0m48.531s 207189920 50580708 (RN=zlib) (Default level) Libdeflate 1.9+PR-5 0m30.323s 207793059 51023626 (RN=libdeflate) Libdeflate 1.8-5 0m33.265s 208714328 51612215 (RN=zlib, as devel) Libdeflate 1.8-5 0m29.753s 213024792 *55922679 (RN=libdeflate, forced) Zlib only -5 0m40.353s 208499406 51612215 (RN=zlib) We can clearly see the problem(*) in using libdeflate for read-names in 1.8, how it's fixed in 1.9, and how it is now smaller/faster than zlib again. 
At level 9 it was using libdeflate for RN already, but we see improvements to both RN and elsewhere which are simply down to other changes in the library: Time Size RN Libdeflate 1.9+PR-9 2m21.757s 202890458 47327597 (RN=libdeflate) Libdeflate 1.8-9 2m6.304s 204292448 48541687 (RN=libdeflate) Zlib only -9 1m20.966s 206482425 49988310 (RN=zlib) Finally, the impact of switching level 9 from the old mapping (11) to new (12; "9+"), along with a more complete table for curiosity's sake: Time Size RN Libdeflate 1.9+PR-9+ 2m54.664s 202315823 46783148 Libdeflate 1.9+PR-9 2m21.757s 202890458 47327597 Libdeflate 1.9+PR-8 1m39.040s 202934405 47247996 Libdeflate 1.9+PR-7 0m43.816s 204732408 48381374 Libdeflate 1.9+PR-6 0m31.521s 207437149 50768595 Libdeflate 1.9+PR-5 0m30.323s 207793059 51023626 (default level) Libdeflate 1.9+PR-4 0m29.478s 210425588 52946850 Libdeflate 1.9+PR-1 0m27.460s 215975209 57142706 (no change) (Note: "1.9" here is actually master, which is a few commits on from the tag, but the main gist of it is the same.) --- cram/cram_io.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index bc4f03646..3045b59dd 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1108,8 +1108,8 @@ char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { static char *libdeflate_deflate(char *data, size_t size, size_t *cdata_size, int level, int strat) { level = level > 0 ?
level : 6; // libdeflate doesn't honour -1 as default - level *= 1.2; // NB levels go up to 12 here; 5 onwards is +1 - if (level >= 8) level += level/8; // 8->10, 9->12 + level *= 1.23; // NB levels go up to 12 here; 5 onwards is +1 + level += level>=8; // 5,6,7->6,7,8 8->10 9->12 if (level > 12) level = 12; if (strat == Z_RLE) // not supported by libdeflate @@ -1213,6 +1213,7 @@ char *zlib_mem_inflate(char *cdata, size_t csize, size_t *size) { } #endif +#if !defined(HAVE_LIBDEFLATE) || LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8) static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size, int level, int strat) { z_stream s; @@ -1269,6 +1270,7 @@ static char *zlib_mem_deflate(char *data, size_t size, size_t *cdata_size, } return (char *)cdata; } +#endif #ifdef HAVE_LIBLZMA /* ------------------------------------------------------------------------ */ @@ -1754,9 +1756,11 @@ static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, // // Eg RN at level 5; libdeflate=55.9MB zlib=51.6MB #ifdef HAVE_LIBDEFLATE +# if (LIBDEFLATE_VERSION_MAJOR < 1 || (LIBDEFLATE_VERSION_MAJOR == 1 && LIBDEFLATE_VERSION_MINOR <= 8)) if (content_id == DS_RN && level >= 4 && level <= 7) return zlib_mem_deflate(in, in_size, out_size, level, strat); else +# endif return libdeflate_deflate(in, in_size, out_size, level, strat); #else return zlib_mem_deflate(in, in_size, out_size, level, strat); From 5dc29e9cfbb9ee787663c7760526eb8706b505b6 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 1 Feb 2022 17:25:42 +0000 Subject: [PATCH 234/488] Fix CRAM reading of "alignment span" for unmapped data. When reading, the code that filters data by the requested range now has an exception for ref_seq -1 (unmapped) so that alignment span is not checked. This fixes the check for end-start-1 failing. Data output (correctly) by htsjdk triggered a bug where range filtering was exiting early, making e.g. 
'samtools view in.file "*"' return nothing. We didn't spot this before because htslib (wrongly) writes span=1. See next commit. Fixes #1387 --- cram/cram_decode.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index e6b4977b7..835cbdb1c 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -3228,7 +3228,8 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { } // position beyond end of range; bail out - if (c_next->ref_seq_start > fd->range.end) { + if (fd->range.refid != -1 && + c_next->ref_seq_start > fd->range.end) { cram_free_container(c_next); fd->ctr_mt = NULL; fd->ooc = 1; @@ -3236,7 +3237,8 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { } // before start of range; skip to next container - if (c_next->ref_seq_start + c_next->ref_seq_span-1 < + if (fd->range.refid != -1 && + c_next->ref_seq_start + c_next->ref_seq_span-1 < fd->range.start) { c_next->curr_slice_mt = c_next->max_slice; cram_seek(fd, c_next->length, SEEK_CUR); @@ -3301,7 +3303,8 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { } // position beyond end of range; bail out - if (s_next->hdr->ref_seq_start > fd->range.end) { + if (fd->range.refid != -1 && + s_next->hdr->ref_seq_start > fd->range.end) { fd->ooc = 1; cram_free_slice(s_next); c_next->slice = s_next = NULL; @@ -3309,7 +3312,8 @@ static cram_slice *cram_next_slice(cram_fd *fd, cram_container **cp) { } // before start of range; skip to next slice - if (s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 < + if (fd->range.refid != -1 && + s_next->hdr->ref_seq_start + s_next->hdr->ref_seq_span-1 < fd->range.start) { cram_free_slice(s_next); c_next->slice = s_next = NULL; From 0f3074633646b7c550ad196f3868364304a735dd Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 1 Feb 2022 17:30:23 +0000 Subject: [PATCH 235/488] Correct writing CRAM alignment span field for unmapped data. 
The specification states unmapped data must have alignment start and alignment span both zero. Previously span was 1 (from end-start+1). We now adhere to the specification for 3.1 onwards, but it is left as-is (incorrect) for 3.0. See previous commit for implications on reading. Outputting spec compliant CRAM files would make older builds of htslib/samtools fail to return unmapped data from a CRAM index query. This is not helpful, so in this case we feel the specification is best amended to permit other values in the alignment span field (albeit to keep the existing values as recommendations). --- cram/cram_encode.c | 30 ++++++++++++++++++++++++++---- cram/cram_encode.h | 2 +- cram/cram_io.c | 4 ++-- 3 files changed, 29 insertions(+), 7 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 468822247..03738f65d 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1397,6 +1397,9 @@ static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s, return -1; } +// CRAM version >= 3.1 +#define CRAM_ge31(v) ((v) >= 0x301) + /* * Encodes all slices in a container into blocks. * Returns 0 on success @@ -1526,6 +1529,12 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { s->hdr->ref_seq_id = -2; s->hdr->ref_seq_start = 0; s->hdr->ref_seq_span = 0; + } else if (c->ref_id == -1 && CRAM_ge31(fd->version)) { + // Spec states span=0, but it broke our range queries. + // See commit message for this and prior. + s->hdr->ref_seq_id = -1; + s->hdr->ref_seq_start = 0; + s->hdr->ref_seq_span = 0; } else { s->hdr->ref_seq_id = c->ref_id; s->hdr->ref_seq_start = first_base; @@ -1923,8 +1932,15 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } c->ref_seq_id = c->slices[0]->hdr->ref_seq_id; - c->ref_seq_start = c->slices[0]->hdr->ref_seq_start; - c->ref_seq_span = c->slices[0]->hdr->ref_seq_span; + if (c->ref_seq_id == -1 && CRAM_ge31(fd->version)) { + // Spec states span=0, but it broke our range queries. 
+ // See commit message for this and prior. + c->ref_seq_start = 0; + c->ref_seq_span = 0; + } else { + c->ref_seq_start = c->slices[0]->hdr->ref_seq_start; + c->ref_seq_span = c->slices[0]->hdr->ref_seq_span; + } for (i = 0; i < c->curr_slice; i++) { cram_slice *s = c->slices[i]; @@ -2592,12 +2608,18 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, * * See cram_next_container() and cram_close(). */ -void cram_update_curr_slice(cram_container *c) { +void cram_update_curr_slice(cram_container *c, int version) { cram_slice *s = c->slice; if (c->multi_seq) { s->hdr->ref_seq_id = -2; s->hdr->ref_seq_start = 0; s->hdr->ref_seq_span = 0; + } else if (c->curr_ref == -1 && CRAM_ge31(version)) { + // Spec states span=0, but it broke our range queries. + // See commit message for this and prior. + s->hdr->ref_seq_id = -1; + s->hdr->ref_seq_start = 0; + s->hdr->ref_seq_span = 0; } else { s->hdr->ref_seq_id = c->curr_ref; s->hdr->ref_seq_start = c->first_base; @@ -2632,7 +2654,7 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { c->curr_ref = bam_ref(b); if (c->slice) - cram_update_curr_slice(c); + cram_update_curr_slice(c, fd->version); /* Flush container */ if (c->curr_slice == c->max_slice || diff --git a/cram/cram_encode.h b/cram/cram_encode.h index c779b46a7..7cccae9af 100644 --- a/cram/cram_encode.h +++ b/cram/cram_encode.h @@ -106,7 +106,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c); * * See cram_next_container() and cram_close(). 
*/ -void cram_update_curr_slice(cram_container *c); +void cram_update_curr_slice(cram_container *c, int version); #ifdef __cplusplus } diff --git a/cram/cram_io.c b/cram/cram_io.c index 3045b59dd..77a33db83 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -5342,7 +5342,7 @@ int cram_flush(cram_fd *fd) { if (fd->mode == 'w' && fd->ctr) { if(fd->ctr->slice) - cram_update_curr_slice(fd->ctr); + cram_update_curr_slice(fd->ctr, fd->version); if (-1 == cram_flush_container_mt(fd, fd->ctr)) return -1; @@ -5449,7 +5449,7 @@ int cram_close(cram_fd *fd) { if (fd->mode == 'w' && fd->ctr) { if(fd->ctr->slice) - cram_update_curr_slice(fd->ctr); + cram_update_curr_slice(fd->ctr, fd->version); if (-1 == cram_flush_container_mt(fd, fd->ctr)) return -1; From 3f7e13e0688e50fca61e6f043e6655a165d596a2 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 13 Jan 2022 17:37:59 +0000 Subject: [PATCH 236/488] Improve windows build. Specifically we create the extra files needed for MSVC linkage, and document the MSYS2/MINGW setup process. Also added a win-dist target which attempts to produce a directory structure suitable for binary distribution. This isn't executed by default, but is a good aide-memoire and to simplify testing compatibility with things like MSVC. Ideally we'd have a similar mechanism for all platforms to permit easy creation of binary distributions (see #533). 
--- .appveyor.yml | 2 +- INSTALL | 17 +++++++++++++++++ Makefile | 39 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 55 insertions(+), 3 deletions(-) diff --git a/.appveyor.yml b/.appveyor.yml index eb3059750..8fe288094 100644 --- a/.appveyor.yml +++ b/.appveyor.yml @@ -27,7 +27,7 @@ install: - set MSYSTEM=MINGW64 - set PATH=C:/msys64/usr/bin;C:/msys64/mingw64/bin;%PATH% - set MINGWPREFIX=x86_64-w64-mingw32 - - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-autotools mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl\"" + - "sh -lc \"pacman -S --noconfirm --needed base-devel mingw-w64-x86_64-toolchain mingw-w64-x86_64-autotools mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-tools-git\"" build_script: - set HOME=. diff --git a/INSTALL b/INSTALL index d0eb638c5..68ea086f7 100644 --- a/INSTALL +++ b/INSTALL @@ -269,3 +269,20 @@ OpenSUSE -------- sudo zypper install autoconf automake make gcc perl zlib-devel libbz2-devel xz-devel libcurl-devel libopenssl-devel + +Windows MSYS2/MINGW64 +--------------------- + +Follow MSYS2 installation instructions at +https://www.msys2.org/wiki/MSYS2-installation/ + +Then relaunch to MSYS2 shell using the "MSYS2 MinGW x64" executable. +Once in that environment (check $MSYSTEM equals "MINGW64") install the +compilers using pacman -S and the following package list: + +base-devel mingw-w64-x86_64-toolchain +mingw-w64-x86_64-libdeflate mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 +mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-autotools +mingw-w64-x86_64-tools-git + +(The last is only needed for building libraries compatible with MSVC.) 
diff --git a/Makefile b/Makefile index 37ae76565..2995adbeb 100644 --- a/Makefile +++ b/Makefile @@ -282,10 +282,10 @@ SHLIB_FLAVOUR = cygdll lib-shared: cyghts-$(LIBHTS_SOVERSION).dll else ifeq "$(findstring MSYS,$(PLATFORM))" "MSYS" SHLIB_FLAVOUR = dll -lib-shared: hts-$(LIBHTS_SOVERSION).dll +lib-shared: hts-$(LIBHTS_SOVERSION).dll hts-$(LIBHTS_SOVERSION).def hts-$(LIBHTS_SOVERSION).lib else ifeq "$(findstring MINGW,$(PLATFORM))" "MINGW" SHLIB_FLAVOUR = dll -lib-shared: hts-$(LIBHTS_SOVERSION).dll +lib-shared: hts-$(LIBHTS_SOVERSION).dll hts-$(LIBHTS_SOVERSION).def hts-$(LIBHTS_SOVERSION).lib else SHLIB_FLAVOUR = so lib-shared: libhts.so @@ -330,6 +330,41 @@ cyghts-$(LIBHTS_SOVERSION).dll libhts.dll.a: $(LIBHTS_OBJS) hts-$(LIBHTS_SOVERSION).dll hts.dll.a: $(LIBHTS_OBJS) $(CC) -shared -Wl,--out-implib=hts.dll.a -Wl,--enable-auto-import -Wl,--exclude-all-symbols $(LDFLAGS) -o $@ -Wl,--whole-archive $(LIBHTS_OBJS) -Wl,--no-whole-archive $(LIBS) -lpthread +hts-$(LIBHTS_SOVERSION).def: hts-$(LIBHTS_SOVERSION).dll + gendef hts-$(LIBHTS_SOVERSION).dll + +hts-$(LIBHTS_SOVERSION).lib: hts-$(LIBHTS_SOVERSION).def + dlltool -m i386:x86-64 -d hts-$(LIBHTS_SOVERSION).def -l hts-$(LIBHTS_SOVERSION).lib + +# Bundling libraries, binaries, dll dependencies, and licenses into a +# single directory. NB: This is not needed for end-users, but a test bed +# for maintainers building binary distributions. +# +# NOTE: only tested on the supported MSYS2/MINGW64 environment. 
+dist-windows: DESTDIR= +dist-windows: prefix=dist-windows +dist-windows: install + cp hts-$(LIBHTS_SOVERSION).def hts-$(LIBHTS_SOVERSION).lib dist-windows/lib + cp `ldd hts-$(LIBHTS_SOVERSION).dll| awk '/mingw64/ {print $$3}'` dist-windows/bin + mkdir -p dist-windows/share/licenses/htslib + -cp -r /mingw64/share/licenses/mingw-w64-libraries \ + /mingw64/share/licenses/brotli \ + /mingw64/share/licenses/bzip2 \ + /mingw64/share/licenses/gcc-libs \ + /mingw64/share/licenses/libdeflate \ + /mingw64/share/licenses/libpsl \ + /mingw64/share/licenses/libtre \ + /mingw64/share/licenses/libwinpthread \ + /mingw64/share/licenses/openssl \ + /mingw64/share/licenses/xz \ + /mingw64/share/licenses/zlib \ + /mingw64/share/licenses/zstd \ + dist-windows/share/licenses/ + -cp -r /usr/share/licenses/curl \ + dist-windows/share/licenses/ + cp LICENSE dist-windows/share/licenses/htslib/ + + # Target to allow htslib.mk to build all the object files before it # links the shared and static libraries. hts-object-files: $(LIBHTS_OBJS) From 1c8829a9a189004c6229bab923753aa5569cff0c Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Thu, 10 Feb 2022 10:46:39 +0000 Subject: [PATCH 237/488] Update copyright dates for February 2022 release. --- Makefile | 2 +- cram/cram_decode.c | 2 +- cram/cram_encode.c | 2 +- cram/cram_io.c | 2 +- vcfutils.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 2995adbeb..f88cc4f4e 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile for htslib, a C library for high-throughput sequencing data formats. # -# Copyright (C) 2013-2021 Genome Research Ltd. +# Copyright (C) 2013-2022 Genome Research Ltd. # # Author: John Marshall # diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 835cbdb1c..b352fc633 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020 Genome Research Ltd. +Copyright (c) 2012-2020, 2022 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 03738f65d..d35643a92 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020 Genome Research Ltd. +Copyright (c) 2012-2020, 2022 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/cram/cram_io.c b/cram/cram_io.c index 77a33db83..c9dcb5014 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2021 Genome Research Ltd. +Copyright (c) 2012-2022 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/vcfutils.c b/vcfutils.c index d99e78ffb..890c50a16 100644 --- a/vcfutils.c +++ b/vcfutils.c @@ -1,6 +1,6 @@ /* vcfutils.c -- allele-related utility functions. - Copyright (C) 2012-2018, 2020-2021 Genome Research Ltd. + Copyright (C) 2012-2018, 2020-2022 Genome Research Ltd. Author: Petr Danecek From 89574a8010a1eba550c4b7c50ca2aea258639f4a Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 15 Feb 2022 10:09:39 +0000 Subject: [PATCH 238/488] Update to htscodecs v1.2.1 - Speed up rle decode by a minor amount. - Improve compute_shift estimates. - Speed up r4x16pr -o0 encode on gcc11. - Fix small bug in hist1_4. - Small optimisations to hist8 and hist1_4 functions. - Optimise the rans stripe and unstripe code. - Bug fix to bounds checking in fqzcomp read_array. - Work around issues with Mingw64 having a tiny stack. 
--- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index b6615cdf5..c6a459a44 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit b6615cdf542a83badc805d2195bc514220f19ce6 +Subproject commit c6a459a4488624d5e4b2d4d642febbd55a78a9b1 From 6c1265c02e583f434dcaa81312beecd45eaefd2b Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 14 Feb 2022 10:20:18 +0000 Subject: [PATCH 239/488] Fix hts_parse_decimal and make synced BCF targets parsing more robust The function bcf_sr_regions_init() autodetects the format of input regions, trying to decide between CHR,POS vs CHR,BEG,END formats. However, the function hts_parse_decimal() was recognising standalone `G` as `0` which resulted in `chr1 1234 G` being interpreted as `chr1 1234 0`. This commit makes the digit parsing more strict, requiring at least one digit to appear before the exponent or unit part. See also samtools/bcftools#1598 --- hts.c | 13 ++++++++++--- synced_bcf_reader.c | 9 ++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/hts.c b/hts.c index 4bbe41eaa..839ec7113 100644 --- a/hts.c +++ b/hts.c @@ -3482,7 +3482,7 @@ static inline long long push_digit(long long i, char c) long long hts_parse_decimal(const char *str, char **strend, int flags) { long long n = 0; - int decimals = 0, e = 0, lost = 0; + int decimals = 0, e = 0, lost = 0, has_digit = 0; char sign = '+', esign = '+'; const char *s; @@ -3491,13 +3491,20 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) if (*s == '+' || *s == '-') sign = *s++; while (*s) - if (isdigit_c(*s)) n = push_digit(n, *s++); + if (isdigit_c(*s)) n = push_digit(n, *s++), has_digit = 1; else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++; else break; if (*s == '.') { s++; - while (isdigit_c(*s)) decimals++, n = push_digit(n, *s++); + while (isdigit_c(*s)) decimals++, n = push_digit(n, *s++), has_digit = 1; + } + + // there must have been a digit or else
cannot be a valid number + if ( !has_digit ) + { + if ( strend ) *strend = (char*)str; + return 0; } if (*s == 'E' || *s == 'e') { diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index ce2d7c780..c980e3b45 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1107,7 +1107,7 @@ static int _regions_parse_line(char *line, int ichr, int ifrom, int ito, char ** if ( k==l ) { *from = *to = hts_parse_decimal(ss, &tmp, 0); - if ( tmp==ss ) return -1; + if ( tmp==ss || (*tmp && *tmp!='\t') ) return -1; } else { @@ -1115,7 +1115,7 @@ static int _regions_parse_line(char *line, int ichr, int ifrom, int ito, char ** *from = hts_parse_decimal(ss, &tmp, 0); else *to = hts_parse_decimal(ss, &tmp, 0); - if ( ss==tmp ) return -1; + if ( ss==tmp || (*tmp && *tmp!='\t') ) return -1; for (i=k; ifile); reg->file = NULL; free(reg); return NULL; } + ito = ifrom; } + else if ( ito<0 ) + ito = abs(ito); if ( !ret ) continue; if ( is_bed ) from++; *chr_end = 0; From c903d331056c11afa18c58a466896d4c1de10848 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 26 Jan 2022 14:24:01 +0000 Subject: [PATCH 240/488] Check for incorrect contig names and INFO/FORMAT tags This commit will only result in warnings printed on stderr. This should be improved to propagate the error all the way up to the caller and let it decide whether to throw an error or proceed despite the error. See https://github.com/samtools/htslib/issues/1065 https://github.com/samtools/samtools/issues/1573 --- vcf.c | 146 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 136 insertions(+), 10 deletions(-) diff --git a/vcf.c b/vcf.c index 0d59be6e6..18c97662a 100644 --- a/vcf.c +++ b/vcf.c @@ -1,7 +1,7 @@ /* vcf.c -- VCF/BCF API functions. Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. Portions copyright (C) 2014 Intel Corporation. 
Author: Heng Li @@ -376,6 +376,126 @@ int bcf_hrec_find_key(bcf_hrec_t *hrec, const char *key) return -1; } +static void bcf_hrec_set_type(bcf_hrec_t *hrec) +{ + if ( !strcmp(hrec->key, "contig") ) hrec->type = BCF_HL_CTG; + else if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO; + else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT; + else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT; + else if ( hrec->nkeys>0 ) hrec->type = BCF_HL_STR; + else hrec->type = BCF_HL_GEN; +} + + +/** + The arrays were generated with + + valid_ctg: + perl -le '@v = (split(//,q[!#$%&*+./:;=?@^_|~-]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48 + + valid_tag: + perl -le '@v = (split(//,q[_.]),"a"..."z","A"..."Z","0"..."9"); @a = (0) x 256; foreach $c (@v) { $a[ord($c)] = 1; } print join(", ",@a)' | fold -w 48 +*/ +static const uint8_t valid_ctg[256] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +static const uint8_t valid_tag[256] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 
+ 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/** + bcf_hrec_check() - check the validity of structured header lines + + Returns 0 on success or negative value on error. + + Currently the return status is not checked by the caller + and only a warning is printed on stderr. This should be improved + to propagate the error all the way up to the caller and let it + decide what to do: throw an error or proceed anyway. + */ +static int bcf_hrec_check(bcf_hrec_t *hrec) +{ + int i; + bcf_hrec_set_type(hrec); + + if ( hrec->type==BCF_HL_CTG ) + { + i = bcf_hrec_find_key(hrec,"ID"); + if ( i<0 ) goto err_missing_id; + char *val = hrec->vals[i]; + if ( val[0]=='*' || val[0]=='=' || !valid_ctg[(uint8_t)val[0]] ) goto err_invalid_ctg; + while ( *(++val) ) + if ( !valid_ctg[(uint8_t)*val] ) goto err_invalid_ctg; + return 0; + } + if ( hrec->type==BCF_HL_INFO ) + { + i = bcf_hrec_find_key(hrec,"ID"); + if ( i<0 ) goto err_missing_id; + char *val = hrec->vals[i]; + if ( !strcmp(val,"1000G") ) return 0; + if ( val[0]=='.' || (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag; + while ( *(++val) ) + if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag; + return 0; + } + if ( hrec->type==BCF_HL_FMT ) + { + i = bcf_hrec_find_key(hrec,"ID"); + if ( i<0 ) goto err_missing_id; + char *val = hrec->vals[i]; + if ( val[0]=='.' 
|| (val[0]>='0' && val[0]<='9') || !valid_tag[(uint8_t)val[0]] ) goto err_invalid_tag; + while ( *(++val) ) + if ( !valid_tag[(uint8_t)*val] ) goto err_invalid_tag; + return 0; + } + return 0; + + err_missing_id: + hts_log_warning("Missing ID attribute in one or more header lines"); + return -1; + + err_invalid_ctg: + hts_log_warning("Invalid contig name: \"%s\"", hrec->vals[i]); + return -1; + + err_invalid_tag: + hts_log_warning("Invalid tag name: \"%s\"", hrec->vals[i]); + return -1; +} + static inline int is_escaped(const char *min, const char *str) { int n = 0; @@ -402,6 +522,7 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) if (!hrec->key) goto fail; memcpy(hrec->key,p,n); hrec->key[n] = 0; + hrec->type = -1; p = ++q; if ( *p!='<' ) // generic field, e.g. ##samtoolsVersion=0.1.18-r579 @@ -483,8 +604,14 @@ bcf_hrec_t *bcf_hdr_parse_line(const bcf_hdr_t *h, const char *line, int *len) if (bcf_hrec_set_val(hrec, hrec->nkeys-1, p, r-p, quoted) < 0) goto fail; if ( quoted && *q==ending ) q++; - if ( *q=='>' ) { nopen--; q++; } + if ( *q=='>' ) + { + if (nopen) nopen--; // this can happen with nested angle brackets <> + q++; + } } + if ( nopen ) + hts_log_warning("Incomplete header line, trying to proceed anyway:\n\t[%s]\n\t[%d]",line,q[0]); // Skip to end of line int nonspace = 0; @@ -555,10 +682,11 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) khint_t k; char *str = NULL; - if ( !strcmp(hrec->key, "contig") ) + bcf_hrec_set_type(hrec); + + if ( hrec->type==BCF_HL_CTG ) { hts_pos_t len = 0; - hrec->type = BCF_HL_CTG; // Get the contig ID ($str) and length ($j) i = bcf_hrec_find_key(hrec,"length"); @@ -623,11 +751,8 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) return 1; } - if ( !strcmp(hrec->key, "INFO") ) hrec->type = BCF_HL_INFO; - else if ( !strcmp(hrec->key, "FILTER") ) hrec->type = BCF_HL_FLT; - else if ( !strcmp(hrec->key, "FORMAT") ) hrec->type = BCF_HL_FMT; - else if ( 
hrec->nkeys>0 ) { hrec->type = BCF_HL_STR; return 1; } - else return 0; + if ( hrec->type==BCF_HL_STR ) return 1; + if ( hrec->type!=BCF_HL_INFO && hrec->type!=BCF_HL_FLT && hrec->type!=BCF_HL_FMT ) return 0; // INFO/FILTER/FORMAT char *id = NULL; @@ -738,7 +863,8 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) int res; if ( !hrec ) return 0; - hrec->type = BCF_HL_GEN; + bcf_hrec_check(hrec); // todo: check return status and propagate errors up + res = bcf_hdr_register_hrec(hdr,hrec); if (res < 0) return -1; if ( !res ) From fb82bae5607c7ccc49de2662f52a6f2d54bb9a52 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 14 Feb 2022 21:04:12 +0000 Subject: [PATCH 241/488] Check number syntax is valid in hts_parse_decimal() Follow strtol() et al's conventions, setting strend to the original str (or emitting a warning) if the input string does not form a valid numeric constant. In particular, require it to have a non-zero number of digits. Merge E and k/M/G handling, so that at most one can be used. In particular, "1e+3k" is non-standard and ambiguous, and should be rejected. Document the numeric syntax parsed and the two modes of operation (as determined by whether strend is non-NULL). Add test cases, in particular verifying that a sole "G" is rejected. 
--- hts.c | 26 +++++++++++--------------- htslib/hts.h | 24 ++++++++++++++++++++---- test/sam.c | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 63 insertions(+), 19 deletions(-) diff --git a/hts.c b/hts.c index 839ec7113..ddaa60bfb 100644 --- a/hts.c +++ b/hts.c @@ -3482,39 +3482,32 @@ static inline long long push_digit(long long i, char c) long long hts_parse_decimal(const char *str, char **strend, int flags) { long long n = 0; - int decimals = 0, e = 0, lost = 0, has_digit = 0; + int digits = 0, decimals = 0, e = 0, lost = 0; char sign = '+', esign = '+'; - const char *s; + const char *s, *str_orig = str; while (isspace_c(*str)) str++; s = str; if (*s == '+' || *s == '-') sign = *s++; while (*s) - if (isdigit_c(*s)) n = push_digit(n, *s++), has_digit = 1; + if (isdigit_c(*s)) digits++, n = push_digit(n, *s++); else if (*s == ',' && (flags & HTS_PARSE_THOUSANDS_SEP)) s++; else break; if (*s == '.') { s++; - while (isdigit_c(*s)) decimals++, n = push_digit(n, *s++), has_digit = 1; - } - - // there must have been a digit or else cannot be a valid number - if ( !has_digit ) - { - if ( strend ) *strend = (char*)str; - return 0; + while (isdigit_c(*s)) decimals++, digits++, n = push_digit(n, *s++); } - if (*s == 'E' || *s == 'e') { + switch (*s) { + case 'e': case 'E': s++; if (*s == '+' || *s == '-') esign = *s++; while (isdigit_c(*s)) e = push_digit(e, *s++); if (esign == '-') e = -e; - } + break; - switch (*s) { case 'k': case 'K': e += 3; s++; break; case 'm': case 'M': e += 6; s++; break; case 'g': case 'G': e += 9; s++; break; @@ -3529,7 +3522,10 @@ long long hts_parse_decimal(const char *str, char **strend, int flags) } if (strend) { - *strend = (char *)s; + // Set to the original input str pointer if not valid number syntax + *strend = (digits > 0)? 
(char *)s : (char *)str_orig; + } else if (digits == 0) { + hts_log_warning("Invalid numeric value %.8s[truncated]", str); } else if (*s) { if ((flags & HTS_PARSE_THOUSANDS_SEP) || (!(flags & HTS_PARSE_THOUSANDS_SEP) && *s != ',')) hts_log_warning("Ignoring unknown characters after %.*s[%s]", (int)(s - str), str, s); diff --git a/htslib/hts.h b/htslib/hts.h index 48c263cab..43dfb5f97 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -1137,10 +1137,26 @@ int hts_idx_nseq(const hts_idx_t *idx); @param strend If non-NULL, set on return to point to the first character in @a str after those forming the parsed number @param flags Or'ed-together combination of HTS_PARSE_* flags - @return Converted value of the parsed number. - - When @a strend is NULL, a warning will be printed (if hts_verbose is HTS_LOG_WARNING - or more) if there are any trailing characters after the number. + @return Integer value of the parsed number, or 0 if no valid number + + The input string is parsed as: optional whitespace; an optional '+' or + '-' sign; decimal digits possibly including ',' characters (if @a flags + includes HTS_PARSE_THOUSANDS_SEP) and a '.' decimal point; and an optional + case-insensitive suffix, which may be either 'k', 'M', 'G', or scientific + notation consisting of 'e'/'E' followed by an optional '+' or '-' sign and + decimal digits. To be considered a valid numeric value, the main part (not + including any suffix or scientific notation) must contain at least one + digit (either before or after the decimal point). + + When @a strend is NULL, @a str is expected to contain only (optional + whitespace followed by) the numeric value. A warning will be printed + (if hts_verbose is HTS_LOG_WARNING or more) if no valid parsable number + is found or if there are any unused characters after the number. + + When @a strend is non-NULL, @a str starts with (optional whitespace + followed by) the numeric value. 
On return, @a strend is set to point + to the first unused character after the numeric value, or to @a str + if no valid parsable number is found. */ HTSLIB_EXPORT long long hts_parse_decimal(const char *str, char **strend, int flags); diff --git a/test/sam.c b/test/sam.c index b6f6c0e04..cc5bfe77a 100644 --- a/test/sam.c +++ b/test/sam.c @@ -1655,6 +1655,37 @@ static int read_data_block(const char *in_name, samFile *fp_in, return ret; } +static void test_parse_decimal1(long long exp, const char *str, size_t exp_consumed, int flags, const char *warning) +{ + if (warning) fprintf(stderr, "(Expect %s message for \"%s\")\n", warning, str); + + long long val = hts_parse_decimal(str, NULL, flags); + if (val != exp) fail("hts_parse_decimal(\"%s\", NULL, %d) returned %lld, expected %lld", str, flags, val, exp); + + char *end; + val = hts_parse_decimal(str, &end, flags); + if (val != exp) fail("hts_parse_decimal(\"%s\", ..., %d) returned %lld, expected %lld", str, flags, val, exp); + size_t consumed = end - str; + if (consumed != exp_consumed) fail("hts_parse_decimal(\"%s\", ..., %d) consumed %zu chars, expected %zu", str, flags, consumed, exp_consumed); +} + +static void test_parse_decimal(void) +{ + test_parse_decimal1(37, "+37", 3, 0, NULL); + test_parse_decimal1(-1001, " \t -1,001x", 9, HTS_PARSE_THOUSANDS_SEP, "trailing 'x'"); + test_parse_decimal1(LLONG_MAX, "+9223372036854775807", 20, 0, NULL); + test_parse_decimal1(LLONG_MIN, "-9,223,372,036,854,775,808", 26, HTS_PARSE_THOUSANDS_SEP, NULL); + test_parse_decimal1(1500, "1.5e3", 5, 0, NULL); + test_parse_decimal1(1500, "1.5e+3k", 6, 0, "trailing 'k'"); + test_parse_decimal1(1500000000, "1.5G", 4, 0, NULL); + test_parse_decimal1(12345, "12.345k", 7, 0, NULL); + test_parse_decimal1(12345, "12.3456k", 8, 0, "dropped fraction"); + test_parse_decimal1(0, "A", 0, 0, "invalid numeric"); + test_parse_decimal1(0, "G", 0, 0, "invalid numeric"); + test_parse_decimal1(0, " +/-", 0, 0, "invalid numeric"); + test_parse_decimal1(0, 
" \t -.e+9999", 0, 0, "invalid numeric"); +} + static void test_mempolicy(void) { size_t bufsz = MAX_RECS * REC_LENGTH, nrecs = 0, i; @@ -2194,6 +2225,7 @@ int main(int argc, char **argv) check_cigar_tab(); check_big_ref(0); check_big_ref(1); + test_parse_decimal(); test_mempolicy(); set_qname(); for (i = 1; i < argc; i++) faidx1(argv[i]); From f1efcc35f806bff5380919776fb02e9309770076 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 16 Feb 2022 17:06:34 +0000 Subject: [PATCH 242/488] NEWS updates for 1.15. (PR #1393) --- NEWS | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/NEWS b/NEWS index 7b489a753..bc5616af3 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,86 @@ Noteworthy changes in release a.b ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Features and Updates +-------------------- + +* Bgzip now has a --keep option to not remove the input file after + compressing. (PR#1331) + +* Improved file format detection so some BED files are no longer + detected as FASTQ or FASTA. (PR#1350, thanks to John Marshall) + +* Added xz (lzma), zstd and D4 formats to the file type detection + functions. We don't actively support reading these data types, but + function calls and htsfile can detect them. (PR#1340, thanks to + John Marshall) + +* CRAM now also uses libdeflate for read-names if the libdeflate + version is new enough (1.9 onwards). Previously we used zlib for + this due to poor performance of libdeflate. This gives a slight + speed up and reduction in file size. (PR#1383) + +* The VCF and BCF readers will now issue a warning if contig, INFO + or FORMAT IDs do not match the formats described in the VCFv4.3 + specification. Note that while the invalid names will mostly still + be accepted, future updates will convert the warnings to errors + causing files including invalid names to be rejected. (PR#1389) + +Build changes +------------- + +These are compiler, configuration and makefile based changes. 
+ +* HTSlib now uses libhtscodecs release 1.2.1. + +* Improved support for compiling and linking against HTSlib with + Microsoft Visual Studio. (PR#1380, #1377, #1375. Thanks to + Aidan Bickford and John Marshall) + +* Various internal CI improvements. + +Bug fixes +--------- + +* Fixed CRAM index queries for HTSJDK output (PR#1388, reported by + Chris Norman). Note this also fixes CRAM writing, to match + the specification (and HTSJDK), from version 3.1 onwards. + +* Fixed CRAM index queries when required-fields settings are selected + to ignore CIGARs (PR#1372, reported by Giulio Genovese). + +* Unmapped but placed reads (having chr/pos) are now included in the BAM + indices. (PR#1352, thanks to John Marshall) + +* CRAM now honours the filename##idx##index nomenclature for + specifying non-standard index locations. (PR#1360, reported by + Michael Cariaso) + +* Minor CRAM v1.0 read-group fix (PR#1349, thanks to John Marshall) + +* Permit .fa and .fq file type detection as synonyms for FASTA and + FASTQ. (PR#1386). + +* Empty VCF format fields are now output as ":.:" instead of "::". + (PR#1370) + +* Repeated bcf_sr_seek calls now work. (PR#1363, reported by + Giulio Genovese) + +* Bcf_remove_allele_set now works on unpacked BCF records. (PR#1358, + reported by Brent Pedersen). + +* The hts_parse_decimal() function used to read numbers in region lists + is now better at rejecting non-numeric values. In particular it + now rejects a lone 'G' instead of interpreting it as '0G', i.e. zero. + (PR#1396, PR#1400, reported by SSSimon Yang; thanks to John Marshall). + +* Improve support for GPU issues listed by -Wdouble-promotion. + (PR#1365, reported by David Seisert) + +* Fix example code in header file documentation. 
(PR#1381, Thanks to + Aidan Bickford) + Noteworthy changes in release 1.14 (22nd October 2021) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 3e6f13c2f4a74f9b2561fa1dd4f8c22b4c4dbbd6 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 17 Feb 2022 12:51:14 +0000 Subject: [PATCH 243/488] Improve install notes for Windows. --- INSTALL | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/INSTALL b/INSTALL index 68ea086f7..9e73a36f8 100644 --- a/INSTALL +++ b/INSTALL @@ -53,11 +53,9 @@ library is used. Systems that do not have CCHmac will get this from libcrypto. libcrypto is part of OpenSSL or one of its derivatives (LibreSSL or BoringSSL). -On Microsoft Windows we recommend use of Mingw64/Msys2. Note that -currently for the test harness to work you will need to override the -test temporary directory with e.g.: make check TEST_OPTS="-t C:/msys64/tmp/_" -Whilst the code may work on Windows with other environments, these have -not been verified. +On Microsoft Windows we recommend use of Mingw64/Msys2. Whilst the +code may work on Windows with other environments, these have not been +verified. Use of the configure script is a requirement too. Update htscodecs submodule ========================== @@ -273,6 +271,9 @@ sudo zypper install autoconf automake make gcc perl zlib-devel libbz2-devel xz-d Windows MSYS2/MINGW64 --------------------- +The configure script must be used as without it the compilation will +likely fail. + Follow MSYS2 installation instructions at https://www.msys2.org/wiki/MSYS2-installation/ From dde5d45c01eb4b50c2291d5ea47b7268e8dd5898 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Fri, 18 Feb 2022 11:50:11 +0000 Subject: [PATCH 244/488] CentOS and Alpine Linux install changes. Changed sudo to doas for modern Alpine Linux installations and added a Perl package that is missing in CentOS 9. 
--- INSTALL | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/INSTALL b/INSTALL index 9e73a36f8..2f6deacb2 100644 --- a/INSTALL +++ b/INSTALL @@ -257,11 +257,15 @@ RedHat / CentOS sudo yum install autoconf automake make gcc perl-Data-Dumper zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel +Note: On some versions perl FindBin will need to be installed to make the tests work. + +sudo yum install perl-FindBin + Alpine Linux ------------ -sudo apk update # Ensure the package list is up to date -sudo apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev +doas apk update # Ensure the package list is up to date +doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev OpenSUSE -------- From 7981bc9356899ea5dbf076d432e9645ee55df1d6 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 22 Feb 2022 11:21:16 +0000 Subject: [PATCH 245/488] Fix missing parameter in hts_log() calls The context parameter was omitted, causing the format string to be used for it instead, and the next one to be used as the format. This resulted in not very useful error messages and possible issues with incorrect interpretation of varargs parameters. 
--- realn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/realn.c b/realn.c index f05db6ac4..5354dee78 100644 --- a/realn.c +++ b/realn.c @@ -91,12 +91,12 @@ int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres) static int realn_check_tag(const uint8_t *tg, enum htsLogLevel severity, const char *type, const bam1_t *b) { if (*tg != 'Z') { - hts_log(severity, "Incorrect %s tag type (%c) for read %s", + hts_log(severity, __func__, "Incorrect %s tag type (%c) for read %s", type, *tg, bam_get_qname(b)); return -1; } if (b->core.l_qseq != strlen((const char *) tg + 1)) { - hts_log(severity, "Read %s %s tag is wrong length", + hts_log(severity, __func__, "Read %s %s tag is wrong length", bam_get_qname(b), type); return -1; } From 0d83a7b2d7aa961b5589edbd2a244ddaa6520282 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 23 Feb 2022 09:54:22 +0000 Subject: [PATCH 246/488] Use a constant printf format string in hts_feature_string() [minor] Refactor this code to avoid a -Wformat-nonliteral warning. --- hts.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/hts.c b/hts.c index ddaa60bfb..30c0c6890 100644 --- a/hts.c +++ b/hts.c @@ -168,7 +168,7 @@ const char *hts_test_feature(unsigned int id) { // to find the configuration parameters. 
const char *hts_feature_string(void) { static char config[1200]; - const char *fmt= + const char *flags= #ifdef PACKAGE_URL "build=configure " @@ -176,12 +176,6 @@ const char *hts_feature_string(void) { "build=Makefile " #endif -#ifdef ENABLE_PLUGINS - "plugins=yes, plugin-path=%.1000s " -#else - "plugins=no " -#endif - #ifdef HAVE_LIBCURL "libcurl=yes " #else @@ -218,13 +212,21 @@ const char *hts_feature_string(void) { "bzip2=no " #endif - "htscodecs=%.40s"; +// "plugins=" must stay at the end as it is followed by "plugin-path=" +#ifdef ENABLE_PLUGINS + "plugins=yes"; +#else + "plugins=no"; +#endif #ifdef ENABLE_PLUGINS - snprintf(config, sizeof(config), fmt, - hts_plugin_path(), htscodecs_version()); + snprintf(config, sizeof(config), + "%s plugin-path=%.1000s htscodecs=%.40s", + flags, hts_plugin_path(), htscodecs_version()); #else - snprintf(config, sizeof(config), fmt, htscodecs_version()); + snprintf(config, sizeof(config), + "%s htscodecs=%.40s", + flags, htscodecs_version()); #endif return config; } From d5a00db4049a4aff80d332f812863f7e501a0125 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 20 Jan 2022 16:28:18 +0000 Subject: [PATCH 247/488] Add more calls to bgzf_flush_try. This makes bgzipped SAM, VCF, FASTA and FASTQ start blocks on a new record (except for the case of a single record being too large to fit in a single block). It is a companion PR to #1369 --- sam.c | 75 ++++++++++++++++++++++++++++++++++++++++-- test/index.vcf.gz.csi | Bin 159 -> 168 bytes test/index.vcf.gz.tbi | Bin 213 -> 206 bytes vcf.c | 13 +++++--- 4 files changed, 82 insertions(+), 6 deletions(-) diff --git a/sam.c b/sam.c index 393a3b22e..55851134b 100644 --- a/sam.c +++ b/sam.c @@ -2910,6 +2910,10 @@ ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { // Number of BAM records (writing) #define NB 1000 +// FIXME: this is too large for ONT data. 
+// We should have NB as a maximum for allocation purposes, but bail out +// early if it grows beyond NM so we have constant memory usage. + struct SAM_state; // Output job - a block of BAM records @@ -3444,6 +3448,8 @@ static void *sam_dispatcher_write(void *vp) { i++; if (fp->is_bgzf) { + if (bgzf_flush_try(fp->fp.bgzf, i-j) < 0) + goto err; if (bgzf_write(fp->fp.bgzf, &gl->data[j], i-j) != i-j) goto err; } else { @@ -3483,8 +3489,69 @@ static void *sam_dispatcher_write(void *vp) { pthread_mutex_unlock(&fd->lines_m); } else { if (fp->is_bgzf) { - if (bgzf_write(fp->fp.bgzf, gl->data, gl->data_size) != gl->data_size) - goto err; + // We keep track of how much in the current block we have + // remaining => R. We look for the last newline in input + // [i] to [i+R], backwards => position N. + // + // If we find a newline, we write out bytes i to N. + // We know we cannot fit the next record in this bgzf block, + // so we flush what we have and copy input N to i+R into + // the start of a new block, and recompute a new R for that. + // + // If we don't find a newline (i==N) then we cannot extend + // the current block at all, so flush whatever is in it now + // if it ends on a newline. + // We still copy i(==N) to i+R to the next block and + // continue as before with a new R. + // + // The only exception on the flush is when we run out of + // data in the input. In that case we skip it as we don't + // yet know if the next record will fit. 
+ // + // Both conditions share the same code here: + // - Look for newline (pos N) + // - Write i to N (which maybe 0) + // - Flush if block ends on newline and not end of input + // - write N to i+R + + int i = 0; + BGZF *fb = fp->fp.bgzf; + while (i < gl->data_size) { + // remaining space in block + int R = BGZF_BLOCK_SIZE - fb->block_offset; + int eod = 0; + if (R > gl->data_size-i) + R = gl->data_size-i, eod = 1; + + // Find last newline in input data + int N = i + R; + while (--N > i) { + if (gl->data[N] == '\n') + break; + } + + if (N != i) { + // Found a newline + N++; + if (bgzf_write(fb, &gl->data[i], N-i) != N-i) + goto err; + } + + // Flush bgzf block + int b_off = fb->block_offset; + if (!eod && b_off && + ((char *)fb->uncompressed_block)[b_off-1] == '\n') + if (bgzf_flush_try(fb, BGZF_BLOCK_SIZE) < 0) + goto err; + + // Copy from N onwards into next block + if (i+R > N) + if (bgzf_write(fb, &gl->data[N], i+R - N) + != i+R - N) + goto err; + + i = i+R; + } } else { if (hwrite(fp->fp.hfile, gl->data, gl->data_size) != gl->data_size) goto err; @@ -4348,6 +4415,8 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) if (sam_format1(h, b, &fp->line) < 0) return -1; kputc('\n', &fp->line); if (fp->is_bgzf) { + if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) + return -1; if ( bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l ) return -1; } else { if ( hwrite(fp->fp.hfile, fp->line.s, fp->line.l) != fp->line.l ) return -1; @@ -4387,6 +4456,8 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) if (fastq_format1(fp->state, b, &fp->line) < 0) return -1; if (fp->is_bgzf) { + if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) + return -1; if (bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l) != fp->line.l) return -1; } else { diff --git a/test/index.vcf.gz.csi b/test/index.vcf.gz.csi index 644832d838ef520e66657129e63c51fda22d1cfe..250339624cc84613d53e0ad22fa389dfecf655ac 100644 GIT binary patch delta 145 
zcmV;C0B--E0jL3gABzYC000000RIL6LPG)oivVrTu?@mN5Jb^;2H^-IDu4zsEfAR$ ziDD=Nle++lpa?>mLE;XbeY#0TyMMH^r%N9;SYeL_A-S}J7M-K+V3F|ozVug&w_Dit z(O>3zK5`~h@(pf7p2LKVWAw7KfxspOJqcj@&^_ literal 159 zcmb2|=3rp}f&Xj_PR>jWO$^0+-%_3=CnO{=C#5Q+F(tWaoMjMYW-L=MOEE|?FiA*g z^Q!huN^D>h_}^;Ka#oak;dkQ&O;>{jvLrUHV4oRiDtR#7Wm5Kio`o0Rx=i2So2Gt4 tda~fB9*-jW8yVi-+{k;#K*aUp>wqPTl!Om3sVHA)mDjWI~d;H+{k;#K%n*E^8f|Kz*gRfh7=a7AZ079Z3&E3*A^^U zn9SiNmca99j{m_p2E~8sk@Jh}cG|^md+j^JW7RL=NrtDNY_ac+GF|;}m38##)!Sa* zythaDd}`wPskQA{YZR5B;NRrJ^<`^v^Pm0RbouI+z0r~NTK>|<E?2qd8offf?)%5CH(~Csr!} diff --git a/vcf.c b/vcf.c index 18c97662a..6e3f05388 100644 --- a/vcf.c +++ b/vcf.c @@ -2226,10 +2226,12 @@ int vcf_hdr_write(htsFile *fp, const bcf_hdr_t *h) } while (htxt.l && htxt.s[htxt.l-1] == '\0') --htxt.l; // kill trailing zeros int ret; - if ( fp->format.compression!=no_compression ) + if ( fp->format.compression!=no_compression ) { ret = bgzf_write(fp->fp.bgzf, htxt.s, htxt.l); - else + if (bgzf_flush(fp->fp.bgzf) != 0) return -1; + } else { ret = hwrite(fp->fp.hfile, htxt.s, htxt.l); + } free(htxt.s); return ret<0 ? -1 : 0; } @@ -3401,10 +3403,13 @@ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) fp->line.l = 0; if (vcf_format1(h, v, &fp->line) != 0) return -1; - if ( fp->format.compression!=no_compression ) + if ( fp->format.compression!=no_compression ) { + if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) + return -1; ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); - else + } else { ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); + } if (fp->idx) { int tid; From 5ccdc356d34aacc857937270fa938ce376120b8f Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 24 Jan 2022 15:51:22 +0000 Subject: [PATCH 248/488] Fix excessive memory used by multi-threaded SAM output on long reads. The previous code here buffered up 1000 BAM records before dispatching a job to convert to SAM. We now buffer up either 1000 BAM records or (estimated) ~240KB of encoded SAM, whichever is hit first. 
Note these figures were previously written in macros NB and NM, which have been replaced with more meaningful names. I tested it with data having an average length of 148bp (Illumina) 15949bp (ONT) and 13441 (PB CCS), using the GIAB HG002 chr1 public data sets. Maximum lengths were 148, 1044875 and 19288. /usr/bin/time real time and maxresident figures, before: Illumina ONT PB -@8 43,128/2m43 6,040,568/1m39 1,593,932/1m15 -@16 70,880/1m42 8,710,132/1m08 3,026,604/0m52 -@32 70,272/1m29 11,940,036/0m55 5,722,952/0m47 -@64 190,584/1m21 17,007,840/0m56 10,835,512/0m48 After: Illumina ONT PB -@16 50,208/1m36 696,276/1m09 63,496/0m57 -@32 86,044/1m21 1,054,524/0m53 109,696/0m44 -@64 149,024/1m24 1,616,720/0m55 195,676/0m48 The effect on memory (KB) is vast, although it's still a bit higher on ONT. This is probably related to the maximum lengths being used and the reuse of BAM structs (never shrinking them) rather than the average, so a long tail on the distribution causes memory growth. We could address that in later updates, but this is still a huge improvement. Obviously as we get to very long records, we'll be dispatching very commonly (maybe every alignment), but I don't yet know how inefficient the threading becomes then. Memory usage will grow as we cannot store half an alignment, but it won't explode so fast. We may wish to try larger values of SAM_NBYTES, but note I tried 100KB to 1MB and this came out fairly optimal on CPU so there was little reason to trade more memory for CPU. --- sam.c | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/sam.c b/sam.c index 55851134b..b0ca2e974 100644 --- a/sam.c +++ b/sam.c @@ -2906,13 +2906,10 @@ ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { * SAM threading */ // Size of SAM text block (reading) -#define NM 240000 -// Number of BAM records (writing) -#define NB 1000 +#define SAM_NBYTES 240000 -// FIXME: this is too large for ONT data. 
-// We should have NB as a maximum for allocation purposes, but bail out -// early if it grows beyond NM so we have constant memory usage. +// Number of BAM records (writing, up to NB_mem in size) +#define SAM_NBAM 1000 struct SAM_state; @@ -2922,7 +2919,8 @@ typedef struct sp_bams { int serial; bam1_t *bams; - int nbams, abams; // used and alloc + int nbams, abams; // used and alloc for bams[] array + size_t bam_mem; // very approximate total size struct SAM_state *fd; } sp_bams; @@ -3173,6 +3171,7 @@ static void *sam_parse_worker(void *arg) { goto err; } gb->nbams = 0; + gb->bam_mem = 0; } gb->serial = gl->serial; gb->next = NULL; @@ -3225,6 +3224,7 @@ static void *sam_parse_worker(void *arg) { cleanup_sp_lines(gl); goto err; } + cp = nl; i++; } @@ -3294,7 +3294,7 @@ static void *sam_dispatcher_read(void *vp) { l = calloc(1, sizeof(*l)); if (!l) goto err; - l->alloc = NM; + l->alloc = SAM_NBYTES; l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1 if (!l->data) { free(l); @@ -3305,11 +3305,11 @@ static void *sam_dispatcher_read(void *vp) { } l->next = NULL; - if (l->alloc < line_frag+NM/2) { - char *rp = realloc(l->data, line_frag+NM/2 +8); + if (l->alloc < line_frag+SAM_NBYTES/2) { + char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8); if (!rp) goto err; - l->alloc = line_frag+NM/2; + l->alloc = line_frag+SAM_NBYTES/2; l->data = rp; } memcpy(l->data, line.s, line_frag); @@ -4369,16 +4369,18 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) fd->bams = gb->next; gb->next = NULL; gb->nbams = 0; + gb->bam_mem = 0; pthread_mutex_unlock(&fd->lines_m); } else { pthread_mutex_unlock(&fd->lines_m); if (!(gb = calloc(1, sizeof(*gb)))) return -1; - if (!(gb->bams = calloc(NB, sizeof(*gb->bams)))) { + if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) { free(gb); return -1; } gb->nbams = 0; - gb->abams = NB; + gb->abams = SAM_NBAM; + gb->bam_mem = 0; gb->fd = fd; fd->curr_idx = 0; fd->curr_bam = gb; @@ -4387,11 +4389,11 @@ int 
sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) if (!bam_copy1(&gb->bams[gb->nbams++], b)) return -2; + gb->bam_mem += b->l_data + sizeof(*b); // Dispatch if full - if (gb->nbams == NB) { + if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) { gb->serial = fd->serial++; - //fprintf(stderr, "Dispatch another %d bams\n", NB); pthread_mutex_lock(&fd->command_m); if (fd->errcode != 0) { pthread_mutex_unlock(&fd->command_m); From b92aac55b18e8dff8847707352b8b474519d97fb Mon Sep 17 00:00:00 2001 From: Alexandr Chernov Date: Fri, 25 Feb 2022 15:50:15 +0100 Subject: [PATCH 249/488] Adjusted types sam.h to avoid compiler warnings --- htslib/sam.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index ea585a45a..45dd51f0a 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1524,7 +1524,7 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, ++s; } else if (type == 'B') { uint8_t sub_type = *(s++); - int sub_type_size; + unsigned sub_type_size; // or externalise sam.c's aux_type2size function? switch (sub_type) { @@ -1547,7 +1547,7 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, goto bad_aux; n = le_to_u32(s); s += 4; // now points to the start of the array - if ((end - s) / sub_type_size < n) + if ((size_t)(end - s) / sub_type_size < n) goto bad_aux; r |= kputsn_("B:", 2, ks) < 0; r |= kputc(sub_type, ks) < 0; // write the type From 8caa1db4cfd8dc626a909f27a3051cb2c7b44a1a Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 23 Feb 2022 14:13:29 +0000 Subject: [PATCH 250/488] Add stricter format string checking to a gcc CI build Following a tidy up in 7981bc93 and 0d83a7b2, all format strings in HTSlib are now literals. Prevent non-literal strings from coming back by using -Wformat=2 in one of the CI tests, which enables -Wformat-nonliteral. 
This has to be a gcc test as it excludes functions that take a va_list from the format-nonliteral warning while clang doesn't, and we need to be able to pass nonliteral format strings to these functions. While this might be slightly inconvenient for developers, any annoyances are far outweighed by being able to automatically detect a class of nasty and otherwise difficult to spot bugs. --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 83e36af9c..1ba352063 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -72,7 +72,7 @@ gcc_task: USE_CONFIG: no - environment: USE_CONFIG: yes - CFLAGS: -std=c99 -pedantic + CFLAGS: -std=c99 -pedantic -Wformat=2 USE_LIBDEFLATE: yes << : *LIBDEFLATE From 328c3f5771695d32fcd7a9fc45899140879174d0 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 22 Feb 2022 11:21:16 +0000 Subject: [PATCH 251/488] Fix missing parameter in hts_log() calls The context parameter was omitted, causing the format string to be used for it instead, and the next one to be used as the format. This resulted in not very useful error messages and possible issues with incorrect interpretation of varargs parameters. 
--- realn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/realn.c b/realn.c index f05db6ac4..5354dee78 100644 --- a/realn.c +++ b/realn.c @@ -91,12 +91,12 @@ int sam_cap_mapq(bam1_t *b, const char *ref, hts_pos_t ref_len, int thres) static int realn_check_tag(const uint8_t *tg, enum htsLogLevel severity, const char *type, const bam1_t *b) { if (*tg != 'Z') { - hts_log(severity, "Incorrect %s tag type (%c) for read %s", + hts_log(severity, __func__, "Incorrect %s tag type (%c) for read %s", type, *tg, bam_get_qname(b)); return -1; } if (b->core.l_qseq != strlen((const char *) tg + 1)) { - hts_log(severity, "Read %s %s tag is wrong length", + hts_log(severity, __func__, "Read %s %s tag is wrong length", bam_get_qname(b), type); return -1; } From a2db7ed2d508991e4adf289622e4a58234a0aeb0 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 1 Apr 2022 09:14:51 +0100 Subject: [PATCH 252/488] Improve coordinate parsing, particularly with tabix. (PR #1411) Double check the file coordinates are 1-based unless tabix -0 is specified. We treat a 0 coordinate as a warning now. The documentation is also more explicit that -0 also implied a half-open coordinate system. Also improve region parsing. The code supports regions such as "x"/"x:" (all of "x"), "x:10-20" "x:-20" (up to pos 20) and "x:20-" (position 20 onwards). However as genome coordinates are 1-based and internally we have 0-based, we subtract one during parsing. The "x:-20" is done by negative value detection, but this was also triggered with the "x:0" region (treated as "up to 1"). Also, illegal regions such as "x:-10-20" were treated as "x:-20". This is now a hard error. 
Fixes #1409 --- hts.c | 9 ++++++--- tabix.1 | 4 ++-- tbx.c | 6 +++++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/hts.c b/hts.c index 30c0c6890..d06c10891 100644 --- a/hts.c +++ b/hts.c @@ -3713,14 +3713,17 @@ const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg, char *hyphen; *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1; if (*beg < 0) { + if (*beg != -1 && *hyphen == '-' && colon[1] != '\0') { + // User specified zero, but we're 1-based. + hts_log_error("Coordinates must be > 0"); + return NULL; + } if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') { // interpret chr:-100 as chr:1-100 *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1); *beg = 0; return s_end; - } else if (*hyphen == '-') { - *beg = 0; - } else { + } else if (*beg < -1) { hts_log_error("Unexpected string \"%s\" after region", hyphen); return NULL; } diff --git a/tabix.1 b/tabix.1 index e4bf4b7cb..27428bd52 100644 --- a/tabix.1 +++ b/tabix.1 @@ -81,8 +81,8 @@ greater than that, you will need to use a CSI index. .SH INDEXING OPTIONS .TP 10 .B -0, --zero-based -Specify that the position in the data file is 0-based (e.g. UCSC files) -rather than 1-based. +Specify that the position in the data file is 0-based half-open +(e.g. UCSC files) rather than 1-based. .TP .BI "-b, --begin " INT Column of start chromosomal position. [4] diff --git a/tbx.c b/tbx.c index 0d486ad9c..f0310a257 100644 --- a/tbx.c +++ b/tbx.c @@ -107,7 +107,11 @@ int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) if ( s==line+b ) return -1; // expected int if (!(conf->preset&TBX_UCSC)) --intv->beg; else ++intv->end; - if (intv->beg < 0) intv->beg = 0; + if (intv->beg < 0) { + hts_log_warning("Coordinate <= 0 detected. 
" + "Did you forget to use the -0 option?"); + intv->beg = 0; + } if (intv->end < 1) intv->end = 1; } else { if ((conf->preset&0xffff) == TBX_GENERIC) { From a1cd8b8335f554d5be71fa0a75d304bbe3a8a682 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 29 Mar 2022 16:04:56 +0100 Subject: [PATCH 253/488] Detect and fix invalid Type=Flag && Number!=0 header definitions. Invalid definitions are fixed internally and warning such as [W::bcf_hdr_register_hrec] The definition of Flag "INFO/SNP" is invalid, forcing Number=0 are printed so that downstream analyses can work (e.g. `bcftools merge`). However, output VCF headers are not fixed. This could go one step further and also modify the headers. See also https://github.com/samtools/bcftools/issues/1685 --- vcf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vcf.c b/vcf.c index 6e3f05388..f868cc738 100644 --- a/vcf.c +++ b/vcf.c @@ -809,6 +809,12 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) *hrec->key == 'I' ? "An" : "A", hrec->key); var = BCF_VL_VAR; } + if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) ) + { + hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id); + var = BCF_VL_FIXED; + num = 0; + } } uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 | (var & 0xf) << 8 | From 9bcb2d2e3c0a84b1ba6c0e7cc89a4e82c055f9fa Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 1 Apr 2022 14:22:54 +0100 Subject: [PATCH 254/488] Update to htscodecs v1.2.2 - The name tokeniser now validates the stored length in the data stream matches the decoded length. - Make data types consistent in rans_compress_bound4x16. - Fix an endless loop in arith_dynamic and rans4x16pr involving X_STRIPE with 0 stripes. - Prevent memcpy(dest, NULL, 0) calls. - Reject attempts to duplicate non-existent name tokeniser streams. - Fix possible use of uninitialised memory in rans_uncompress_O1_4x16. - Improve error detection from fqzcomp's read_array function. 
- Reject fqzcomp parameters with inconsistent "sel" parameters. - Fix uninitialised access to element 4095 in r4x8 O0 arrays. --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index c6a459a44..1395d7306 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit c6a459a4488624d5e4b2d4d642febbd55a78a9b1 +Subproject commit 1395d730651fdfa39cd916be3b3ef4dd9b1ab895 From edd3dab51e0e36e74eb322e0d1ee67ca0a118aaa Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 24 Jan 2022 15:51:22 +0000 Subject: [PATCH 255/488] Fix excessive memory used by multi-threaded SAM output on long reads. The previous code here buffered up 1000 BAM records before dispatching a job to convert to SAM. We now buffer up either 1000 BAM records or (estimated) ~240KB of encoded SAM, whichever is hit first. Note these figures were previously written in macros NB and NM, which have been replaced with more meaningful names. I tested it with data having an average length of 148bp (Illumina) 15949bp (ONT) and 13441 (PB CCS), using the GIAB HG002 chr1 public data sets. Maximum lengths were 148, 1044875 and 19288. /usr/bin/time real time and maxresident figures, before: Illumina ONT PB -@8 43,128/2m43 6,040,568/1m39 1,593,932/1m15 -@16 70,880/1m42 8,710,132/1m08 3,026,604/0m52 -@32 70,272/1m29 11,940,036/0m55 5,722,952/0m47 -@64 190,584/1m21 17,007,840/0m56 10,835,512/0m48 After: Illumina ONT PB -@16 50,208/1m36 696,276/1m09 63,496/0m57 -@32 86,044/1m21 1,054,524/0m53 109,696/0m44 -@64 149,024/1m24 1,616,720/0m55 195,676/0m48 The effect on memory (KB) is vast, although it's still a bit higher on ONT. This is probably related to the maximum lengths being used and the reuse of BAM structs (never shrinking them) rather than the average, so a long tail on the distribution causes memory growth. We could address that in later updates, but this is still a huge improvement. 
Obviously as we get to very long records, we'll be dispatching very commonly (maybe every alignment), but I don't yet know how inefficient the threading becomes then. Memory usage will grow as we cannot store half an alignment, but it won't explode so fast. We may wish to try larger values of SAM_NBYTES, but note I tried 100KB to 1MB and this came out fairly optimal on CPU so there was little reason to trade more memory for CPU. --- sam.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/sam.c b/sam.c index 393a3b22e..0ce3a42cb 100644 --- a/sam.c +++ b/sam.c @@ -2906,9 +2906,10 @@ ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { * SAM threading */ // Size of SAM text block (reading) -#define NM 240000 -// Number of BAM records (writing) -#define NB 1000 +#define SAM_NBYTES 240000 + +// Number of BAM records (writing, up to NB_mem in size) +#define SAM_NBAM 1000 struct SAM_state; @@ -2918,7 +2919,8 @@ typedef struct sp_bams { int serial; bam1_t *bams; - int nbams, abams; // used and alloc + int nbams, abams; // used and alloc for bams[] array + size_t bam_mem; // very approximate total size struct SAM_state *fd; } sp_bams; @@ -3169,6 +3171,7 @@ static void *sam_parse_worker(void *arg) { goto err; } gb->nbams = 0; + gb->bam_mem = 0; } gb->serial = gl->serial; gb->next = NULL; @@ -3221,6 +3224,7 @@ static void *sam_parse_worker(void *arg) { cleanup_sp_lines(gl); goto err; } + cp = nl; i++; } @@ -3290,7 +3294,7 @@ static void *sam_dispatcher_read(void *vp) { l = calloc(1, sizeof(*l)); if (!l) goto err; - l->alloc = NM; + l->alloc = SAM_NBYTES; l->data = malloc(l->alloc+8); // +8 for optimisation in sam_parse1 if (!l->data) { free(l); @@ -3301,11 +3305,11 @@ static void *sam_dispatcher_read(void *vp) { } l->next = NULL; - if (l->alloc < line_frag+NM/2) { - char *rp = realloc(l->data, line_frag+NM/2 +8); + if (l->alloc < line_frag+SAM_NBYTES/2) { + char *rp = realloc(l->data, line_frag+SAM_NBYTES/2 +8); if 
(!rp) goto err; - l->alloc = line_frag+NM/2; + l->alloc = line_frag+SAM_NBYTES/2; l->data = rp; } memcpy(l->data, line.s, line_frag); @@ -4302,16 +4306,18 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) fd->bams = gb->next; gb->next = NULL; gb->nbams = 0; + gb->bam_mem = 0; pthread_mutex_unlock(&fd->lines_m); } else { pthread_mutex_unlock(&fd->lines_m); if (!(gb = calloc(1, sizeof(*gb)))) return -1; - if (!(gb->bams = calloc(NB, sizeof(*gb->bams)))) { + if (!(gb->bams = calloc(SAM_NBAM, sizeof(*gb->bams)))) { free(gb); return -1; } gb->nbams = 0; - gb->abams = NB; + gb->abams = SAM_NBAM; + gb->bam_mem = 0; gb->fd = fd; fd->curr_idx = 0; fd->curr_bam = gb; @@ -4320,11 +4326,11 @@ int sam_write1(htsFile *fp, const sam_hdr_t *h, const bam1_t *b) if (!bam_copy1(&gb->bams[gb->nbams++], b)) return -2; + gb->bam_mem += b->l_data + sizeof(*b); // Dispatch if full - if (gb->nbams == NB) { + if (gb->nbams == SAM_NBAM || gb->bam_mem > SAM_NBYTES*0.8) { gb->serial = fd->serial++; - //fprintf(stderr, "Dispatch another %d bams\n", NB); pthread_mutex_lock(&fd->command_m); if (fd->errcode != 0) { pthread_mutex_unlock(&fd->command_m); From f893208602b9a12f4a2fa21cae23fcda840dbedb Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 1 Apr 2022 09:14:51 +0100 Subject: [PATCH 256/488] Improve coordinate parsing, particularly with tabix. (PR #1411) Double check the file coordinates are 1-based unless tabix -0 is specified. We treat a 0 coordinate as a warning now. The documentation is also more explicit that -0 also implied a half-open coordinate system. Also improve region parsing. The code supports regions such as "x"/"x:" (all of "x"), "x:10-20" "x:-20" (up to pos 20) and "x:20-" (position 20 onwards). However as genome coordinates are 1-based and internally we have 0-based, we subtract one during parsing. The "x:-20" is done by negative value detection, but this was also triggered with the "x:0" region (treated as "up to 1"). 
Also, illegal regions such as "x:-10-20" were treated as "x:-20". This is now a hard error. Fixes #1409 --- hts.c | 9 ++++++--- tabix.1 | 4 ++-- tbx.c | 6 +++++- 3 files changed, 13 insertions(+), 6 deletions(-) diff --git a/hts.c b/hts.c index ddaa60bfb..431d337a8 100644 --- a/hts.c +++ b/hts.c @@ -3711,14 +3711,17 @@ const char *hts_parse_region(const char *s, int *tid, hts_pos_t *beg, char *hyphen; *beg = hts_parse_decimal(colon+1, &hyphen, flags) - 1; if (*beg < 0) { + if (*beg != -1 && *hyphen == '-' && colon[1] != '\0') { + // User specified zero, but we're 1-based. + hts_log_error("Coordinates must be > 0"); + return NULL; + } if (isdigit_c(*hyphen) || *hyphen == '\0' || *hyphen == ',') { // interpret chr:-100 as chr:1-100 *end = *beg==-1 ? HTS_POS_MAX : -(*beg+1); *beg = 0; return s_end; - } else if (*hyphen == '-') { - *beg = 0; - } else { + } else if (*beg < -1) { hts_log_error("Unexpected string \"%s\" after region", hyphen); return NULL; } diff --git a/tabix.1 b/tabix.1 index e4bf4b7cb..27428bd52 100644 --- a/tabix.1 +++ b/tabix.1 @@ -81,8 +81,8 @@ greater than that, you will need to use a CSI index. .SH INDEXING OPTIONS .TP 10 .B -0, --zero-based -Specify that the position in the data file is 0-based (e.g. UCSC files) -rather than 1-based. +Specify that the position in the data file is 0-based half-open +(e.g. UCSC files) rather than 1-based. .TP .BI "-b, --begin " INT Column of start chromosomal position. [4] diff --git a/tbx.c b/tbx.c index 0d486ad9c..f0310a257 100644 --- a/tbx.c +++ b/tbx.c @@ -107,7 +107,11 @@ int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) if ( s==line+b ) return -1; // expected int if (!(conf->preset&TBX_UCSC)) --intv->beg; else ++intv->end; - if (intv->beg < 0) intv->beg = 0; + if (intv->beg < 0) { + hts_log_warning("Coordinate <= 0 detected. 
" + "Did you forget to use the -0 option?"); + intv->beg = 0; + } if (intv->end < 1) intv->end = 1; } else { if ((conf->preset&0xffff) == TBX_GENERIC) { From d66cb320a18e48d70d86c51db3b05da723842402 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 29 Mar 2022 16:04:56 +0100 Subject: [PATCH 257/488] Detect and fix invalid Type=Flag && Number!=0 header definitions. Invalid definitions are fixed internally and warning such as [W::bcf_hdr_register_hrec] The definition of Flag "INFO/SNP" is invalid, forcing Number=0 are printed so that downstream analyses can work (e.g. `bcftools merge`). However, output VCF headers are not fixed. This could go one step further and also modify the headers. See also https://github.com/samtools/bcftools/issues/1685 --- vcf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/vcf.c b/vcf.c index 18c97662a..d83c30e3f 100644 --- a/vcf.c +++ b/vcf.c @@ -809,6 +809,12 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) *hrec->key == 'I' ? "An" : "A", hrec->key); var = BCF_VL_VAR; } + if ( type==BCF_HT_FLAG && (var!=BCF_VL_FIXED || num!=0) ) + { + hts_log_warning("The definition of Flag \"%s/%s\" is invalid, forcing Number=0", hrec->key,id); + var = BCF_VL_FIXED; + num = 0; + } } uint32_t info = ((((uint32_t)num) & 0xfffff)<<12 | (var & 0xf) << 8 | From 2e12175874c263b4f583a85ed9e0efc8c87fa5e8 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 1 Apr 2022 14:22:54 +0100 Subject: [PATCH 258/488] Update to htscodecs v1.2.2 - The name tokeniser now validates the stored length in the data stream matches the decoded length. - Make data types consistent in rans_compress_bound4x16. - Fix an endless loop in arith_dynamic and rans4x16pr involving X_STRIPE with 0 stripes. - Prevent memcpy(dest, NULL, 0) calls. - Reject attempts to duplicate non-existent name tokeniser streams. - Fix possible use of uninitialised memory in rans_uncompress_O1_4x16. - Improve error detection from fqzcomp's read_array function. 
- Reject fqzcomp parameters with inconsistent "sel" parameters. - Fix uninitialised access to element 4095 in r4x8 O0 arrays. --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index c6a459a44..1395d7306 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit c6a459a4488624d5e4b2d4d642febbd55a78a9b1 +Subproject commit 1395d730651fdfa39cd916be3b3ef4dd9b1ab895 From e005af56ba2a0c9131061032d164867e3093241c Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 22 Mar 2022 16:46:54 +0000 Subject: [PATCH 259/488] NEWS updates --- NEWS | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/NEWS b/NEWS index 51f65559a..61b6065f9 100644 --- a/NEWS +++ b/NEWS @@ -1,3 +1,27 @@ +Noteworthy changes in release a.b +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +* Security fix: Fixed broken error reporting in the sam_cap_mapq() + function, due to a missing hts_log() parameter. Prior to this fix + it was possible to abuse the log message format string by passing + a specially crafted alignment record to this function. (PR#1406) + +* HTSlib now uses libhtscodecs release 1.2.2. This fixes a number + of bugs where invalid compressed data could trigger usage of + uninitialised values. (PR#1416) + +* Fixed excessive memory used by multi-threaded SAM output on + long reads. (Part of PR#1384) + +* Fixed a bug where tabix would misinterpret region specifiers + starting at position 0. It will also now warn if the file + being indexed is supposed to be 1-based but has positions + less than or equal to 0. (PR#1411) + +* The VCF header parser will now issue a warning if it finds an + INFO header with Type=Flag but Number not equal to 0. It will + also ignore the incorrect Number so the flag can be used. 
(PR#1415) + Noteworthy changes in release 1.15 (21st February 2022) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 0785d8565b76f848d338b590ac01b03eae1d2ccf Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 5 Apr 2022 11:10:44 +0100 Subject: [PATCH 260/488] Prevent `@p masks earlier declaration` warning [trivial] --- test/test.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test.pl b/test/test.pl index ff5601b0d..9912a1d3e 100755 --- a/test/test.pl +++ b/test/test.pl @@ -614,7 +614,7 @@ sub test_view ## Experimental CRAM 4.0 support. # SAM -> CRAM40 -> SAM - my @p = $sam eq "ce#large_seq.sam" || $sam eq "xx#large_aux.sam" + @p = $sam eq "ce#large_seq.sam" || $sam eq "xx#large_aux.sam" ? (qw/fast normal small archive/) : (qw/archive/); foreach my $profile (@p) { From fb226846194c063744ea3efbe5cd4a490f9edf89 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 5 Apr 2022 11:42:54 +0100 Subject: [PATCH 261/488] Use constant srand() seed for repeatability Add -s, --random-seed option and a constant default, and seed srand() with it. Currently test_bcf_sr_sort() is the only user of rand(), which it uses to seed test_bcf_sr_sort.pl's random number generator. 
--- test/test.pl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test.pl b/test/test.pl index 9912a1d3e..7a396e22a 100755 --- a/test/test.pl +++ b/test/test.pl @@ -32,6 +32,7 @@ use IO::Handle; my $opts = parse_params(); +srand($$opts{seed}); test_bgzip($opts, 0); test_bgzip($opts, 4); @@ -79,6 +80,7 @@ sub error "Usage: test.pl [OPTIONS]\n", "Options:\n", " -r, --redo-outputs Recreate expected output files.\n", + " -s, --random-seed Initialise rand() with a different seed.\n", " -t, --temp-dir When given, temporary files will not be removed.\n", " -f, --fail-fast Fail-fast mode: exit as soon as a test fails.\n", " -h, -?, --help This help message.\n", @@ -104,12 +106,13 @@ sub safe_tempdir sub parse_params { - my $opts = { keep_files=>0, nok=>0, nfailed=>0 }; + my $opts = { keep_files=>0, nok=>0, nfailed=>0, seed=>42 }; my $help; Getopt::Long::Configure('bundling'); my $ret = GetOptions ( 't|temp-dir:s' => \$$opts{keep_files}, 'r|redo-outputs' => \$$opts{redo_outputs}, + 's|random-seed=i' => \$$opts{seed}, 'f|fail-fast' => \$$opts{fail_fast}, 'h|?|help' => \$help ); @@ -974,7 +977,7 @@ sub test_bcf_sr_sort my ($opts, %args) = @_; for (my $i=0; $i<10; $i++) { - my $seed = int(rand(time)); + my $seed = int(rand(100000000)); my $test = 'test-bcf-sr'; my $cmd = "$$opts{path}/test-bcf-sr.pl -t $$opts{tmp} -s $seed"; print "$test:\n"; From 2eddc214f26504e6d0308b654ebcf705afb6b6af Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 8 Apr 2022 15:59:52 +0100 Subject: [PATCH 262/488] Permit BAM headers between 2GB and 4GB in size once more. This isn't permitted by the BAM specification, but was accepted by earlier htslib releases. 62f9909 added code to check the maximum length. This now has a warning at 2GB and a hard failure at 4GB. Fixes #1420.
Fixes samtools/samtools#1613 --- sam.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/sam.c b/sam.c index b0ca2e974..95e8b1d42 100644 --- a/sam.c +++ b/sam.c @@ -339,17 +339,23 @@ int bam_hdr_write(BGZF *fp, const sam_hdr_t *h) if (h->hrecs) { if (sam_hrecs_rebuild_text(h->hrecs, &hdr_ks) != 0) return -1; - if (hdr_ks.l > INT32_MAX) { + if (hdr_ks.l > UINT32_MAX) { hts_log_error("Header too long for BAM format"); free(hdr_ks.s); return -1; + } else if (hdr_ks.l > INT32_MAX) { + hts_log_warning("Header too long for BAM specification (>2GB)"); + hts_log_warning("Output file may not be portable"); } text = hdr_ks.s; l_text = hdr_ks.l; } else { - if (h->l_text > INT32_MAX) { + if (h->l_text > UINT32_MAX) { hts_log_error("Header too long for BAM format"); return -1; + } else if (h->l_text > INT32_MAX) { + hts_log_warning("Header too long for BAM specification (>2GB)"); + hts_log_warning("Output file may not be portable"); } text = h->text; l_text = h->l_text; From e51f72f0a025ffa03806a5fd6559e03b4c6438c9 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 12 Apr 2022 12:16:01 +0100 Subject: [PATCH 263/488] Permit MM tags containing "." and "?" suffixes. These define explicit vs implicit coordinates. They are now part of the MM specification, but we don't do anything with this data yet. This PR simply permits them to be parsed without choking, and ignores the additional markup. A subsequent PR will improve on this. 
Fixes #1418 --- sam.c | 20 ++++++- test/base_mods/MM-explicit.out | 100 +++++++++++++++++++++++++++++++++ test/base_mods/MM-explicit.sam | 13 +++++ test/base_mods/base-mods.tst | 7 ++- 4 files changed, 134 insertions(+), 6 deletions(-) create mode 100644 test/base_mods/MM-explicit.out create mode 100644 test/base_mods/MM-explicit.sam diff --git a/sam.c b/sam.c index 95e8b1d42..04f3435f7 100644 --- a/sam.c +++ b/sam.c @@ -6162,7 +6162,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { int mod_num = 0; while (*cp) { for (; *cp; cp++) { - // cp should be [ACGTNU][+-][^,]*(,\d+)*; + // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*; unsigned char btype = *cp++; if (btype != 'A' && btype != 'C' && @@ -6182,18 +6182,32 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { char *ms = cp, *me; // mod code start and end char *cp_end = NULL; int chebi = 0; - if (isdigit(*cp)) { + if (isdigit_c(*cp)) { chebi = strtol(cp, &cp_end, 10); cp = cp_end; ms = cp-1; } else { - while (*cp && *cp != ',' && *cp != ';') + while (*cp && isalpha_c(*cp)) cp++; if (*cp == '\0') return -1; } me = cp; + // Optional explicit vs implicit marker. + // Right now we ignore this field. A proper API for + // querying it will follow later. 
+ if (*cp == '.') { + // implicit = 1; + cp++; + } else if (*cp == '?') { + // implicit = 0; + cp++; + } else if (*cp != ',' && *cp != ';') { + // parse error + return -1; + } + long delta; int n = 0; // nth symbol in a multi-mod string int stride = me-ms; diff --git a/test/base_mods/MM-explicit.out b/test/base_mods/MM-explicit.out new file mode 100644 index 000000000..05e2828c4 --- /dev/null +++ b/test/base_mods/MM-explicit.out @@ -0,0 +1,100 @@ +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C +14 C C+m160 C+h20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 + +=== + +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C C+m10 C+h5 +14 C C+m160 C+h20 +15 G +16 C C+m10 C+h5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 + +=== + +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+h170 +11 T +12 A +13 C C+h5 +14 C C+m160 C+h20 +15 G +16 C C+h5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 + +=== + diff --git a/test/base_mods/MM-explicit.sam b/test/base_mods/MM-explicit.sam new file mode 100644 index 000000000..e4e37103d --- /dev/null +++ b/test/base_mods/MM-explicit.sam @@ -0,0 +1,13 @@ +@CO Testing explicit vs implicit base modifications. +@CO This covers the case where a lack of a signal could be either +@CO implicitly assumed to be no-mod (default) or assumed to be +@CO unchecked and require an explicit statement to indicate it was +@CO looked at and no base modification was observed. +@CO +@CO 0 1 23 45 6 78 +@CO ATCATCATTCCTACCGCTATAGCCT +@CO . . m. .m . .. m +@CO ? ? .h .. . ?? 
h +r1 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh,2,0,1; Ml:B:C,200,10,50,170,160,20 +r2 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh?,2,0,0,0,0; Ml:B:C,200,10,50,170,10,5,160,20,10,5 +r3 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+m.,2,2;C+h?,2,0,0,0,0; Ml:B:C,200,160,10,170,5,20,5 diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index 865a539c7..c64128ee2 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -33,9 +33,10 @@ # samtools binary. This can be useful for testing older versions. # Test files from SAM spec -P MM-chebi.out $test_mod MM-chebi.sam -P MM-double.out $test_mod MM-double.sam -P MM-multi.out $test_mod MM-multi.sam +P MM-chebi.out $test_mod MM-chebi.sam +P MM-double.out $test_mod MM-double.sam +P MM-multi.out $test_mod MM-multi.sam +P MM-explicit.out $test_mod MM-explicit.sam # Pileup testing P MM-pileup.out $pileup_mod < MM-pileup.sam From e6065cb6b777333077b1ee6e0e7e50974dd32739 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 27 Apr 2022 10:30:21 +0100 Subject: [PATCH 264/488] Improve error messages for CRAM reference mismatches. If the user specifies the wrong reference, the CRAM slice header MD5sum checks fail. We now report the SQ line M5 string too so it is possible to validate against the whole chr in the ref.fa file. The error message has also been improved to report the reference name instead of #num. Finally, we now hint at the likely cause, which counters the misleading samtools supplied error of "truncated or corrupt" file. See samtools/samtools#1640. 
--- cram/cram_decode.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index b352fc633..51f1b765c 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -2423,10 +2423,17 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, if ((!s->ref && s->hdr->ref_base_id < 0) || memcmp(digest, s->hdr->md5, 16) != 0) { char M[33]; - hts_log_error("MD5 checksum reference mismatch at #%d:%d-%d", - ref_id, s->ref_start, s->ref_end); - hts_log_error("CRAM: %s", md5_print(s->hdr->md5, M)); - hts_log_error("Ref : %s", md5_print(digest, M)); + const char *rname = sam_hdr_tid2name(sh, ref_id); + if (!rname) rname="?"; // cannot happen normally + hts_log_error("MD5 checksum reference mismatch at %s:%d-%d", + rname, s->ref_start, s->ref_end); + hts_log_error("CRAM : %s", md5_print(s->hdr->md5, M)); + hts_log_error("Ref : %s", md5_print(digest, M)); + kstring_t ks = KS_INITIALIZE; + if (sam_hdr_find_tag_id(sh, "SQ", "SN", rname, "M5", &ks) == 0) + hts_log_error("@SQ M5: %s", ks.s); + hts_log_error("Please check the reference given is correct"); + ks_free(&ks); return -1; } } From d7cc10de075735d07eb8da0538cbdc0f331f7bd1 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 27 Apr 2022 11:39:01 +0100 Subject: [PATCH 265/488] Make test-logging less pedantic. It's trying to spot error messages starting with lowercase letters, but in doing so forbids things like "@SQ" as it's not capital. 
--- test/test-logging.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test-logging.pl b/test/test-logging.pl index 1040b0e47..2f22560b5 100755 --- a/test/test-logging.pl +++ b/test/test-logging.pl @@ -33,7 +33,7 @@ sub check_log_message my ($message, $filename, $line_num) = @_; $log_message_count++; - unless ($message =~ /^\"([A-Z]|%s)/) + unless ($message =~ /^\"([A-Z!-@]|%s)/) { print "$filename line $line_num:\n"; print "Log message should begin with a capital letter: $message.\n"; From e868dea9ebd9b5d4d47bbeb1b77cd20267abf00e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 3 Nov 2021 14:45:14 +0000 Subject: [PATCH 266/488] Improve support for MM .? modifiers. The previous commit permitted these to exist, but didn't make the data available to the caller. This extends the API with additional queries to distinguish the specifics about the modification types present. --- htslib/sam.h | 38 ++++++++++++ sam.c | 53 ++++++++++++++-- test/base_mods/MM-chebi.out | 1 + test/base_mods/MM-double.out | 1 + test/base_mods/MM-explicit-x.out | 103 +++++++++++++++++++++++++++++++ test/base_mods/MM-explicit.out | 3 + test/base_mods/MM-explicit.sam | 22 +++++-- test/base_mods/MM-multi.out | 2 + test/base_mods/base-mods.tst | 9 +-- test/test_mod.c | 97 ++++++++++++++++++++++++++--- 10 files changed, 309 insertions(+), 20 deletions(-) create mode 100644 test/base_mods/MM-explicit-x.out diff --git a/htslib/sam.h b/htslib/sam.h index 45dd51f0a..a6e64fbb2 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -2271,6 +2271,44 @@ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, hts_base_mod *mods, int n_mods); +/// Returns data about a specific modification type for the alignment record. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param code Modification code. If positive this is a character code, + * if negative it is a -ChEBI code. 
+ * + * @param strand Boolean for top (0) or bottom (1) strand + * @param implicit Boolean for whether unlisted positions should be + * implicitly assumed to be unmodified, or require an + * explicit score and should be considered as unknown. + * Returned. + * @param canonical Canonical base type associated with this modification + * Returned. + * + * @return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. + */ +HTSLIB_EXPORT +int bam_mods_query_type(hts_base_mod_state *state, int code, + int *strand, int *implicit, char *canonical); + +/// Returns the list of base modification codes provided for this +/// alignment record as an array of character codes (+ve) or ChEBI numbers +/// (negative). +/* + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param ntype Filled out with the number of array elements returned + * + * @return the type array, with *ntype filled out with the size. + * The array returned should not be freed. + * It is a valid pointer until the state is freed using + * hts_base_mod_free(). + */ +HTSLIB_EXPORT +int *bam_mods_recorded(hts_base_mod_state *state, int *ntype); + #ifdef __cplusplus } #endif diff --git a/sam.c b/sam.c index 04f3435f7..dd1b7d9fd 100644 --- a/sam.c +++ b/sam.c @@ -6092,6 +6092,7 @@ struct hts_base_mod_state { char *MMend[MAX_BASE_MOD]; // end of pos-delta string uint8_t *ML[MAX_BASE_MOD]; // next qual int MLstride[MAX_BASE_MOD]; // bytes between quals for this type + int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified? int seq_pos; // current position along sequence int nmods; // used array size (0 to MAX_BASE_MOD-1). 
}; @@ -6160,6 +6161,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { char *cp = (char *)mm+1; int mod_num = 0; + int implicit = 1; while (*cp) { for (; *cp; cp++) { // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*; @@ -6192,16 +6194,15 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { if (*cp == '\0') return -1; } + me = cp; - // Optional explicit vs implicit marker. - // Right now we ignore this field. A proper API for - // querying it will follow later. + // Optional explicit vs implicit marker if (*cp == '.') { - // implicit = 1; + // default is implicit = 1; cp++; } else if (*cp == '?') { - // implicit = 0; + implicit = 0; cp++; } else if (*cp != ',' && *cp != ';') { // parse error @@ -6257,6 +6258,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { state->strand [mod_num] = (strand == '-'); state->canonical[mod_num] = btype; state->MLstride [mod_num] = stride; + state->implicit [mod_num] = implicit; state->MMcount [mod_num] = delta; if (b->core.flag & BAM_FREVERSE) { @@ -6473,3 +6475,44 @@ int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, return r; } + +/* + * Returns the list of base modification codes provided for this + * alignment record as an array of character codes (+ve) or ChEBI numbers + * (negative). + * + * Returns the array, with *ntype filled out with the size. + * The array returned should not be freed. + * It is a valid pointer until the state is freed using + * hts_base_mod_free(). + */ +int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) { + *ntype = state->nmods; + return state->type; +} + +/* + * Returns data about a specific modification type for the alignment record. + * Code is either positive (eg 'm') or negative for ChEBI numbers. + * + * Return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. 
+ */ +int bam_mods_query_type(hts_base_mod_state *state, int code, + int *strand, int *implicit, char *canonical) { + // Find code entry + int i; + for (i = 0; i < state->nmods; i++) { + if (state->type[i] == code) + break; + } + if (i == state->nmods) + return -1; + + // Return data + if (strand) *strand = state->strand[i]; + if (implicit) *implicit = state->implicit[i]; + if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]]; + + return 0; +} diff --git a/test/base_mods/MM-chebi.out b/test/base_mods/MM-chebi.out index cefdc545c..a6e7654cf 100644 --- a/test/base_mods/MM-chebi.out +++ b/test/base_mods/MM-chebi.out @@ -35,6 +35,7 @@ 34 C C+m204 C+(76792)33 35 A --- +Present: m #-76792 n 6 C C+m102 15 N N+n212 17 C C+m128 diff --git a/test/base_mods/MM-double.out b/test/base_mods/MM-double.out index 82d086a2f..e21ae314e 100644 --- a/test/base_mods/MM-double.out +++ b/test/base_mods/MM-double.out @@ -35,6 +35,7 @@ 34 A 35 T --- +Present: m m o 1 G G-m115 7 C C+m128 12 G G-m141 diff --git a/test/base_mods/MM-explicit-x.out b/test/base_mods/MM-explicit-x.out new file mode 100644 index 000000000..4abedc719 --- /dev/null +++ b/test/base_mods/MM-explicit-x.out @@ -0,0 +1,103 @@ +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m.200 C+h.10 +10 C C+m.50 C+h.170 +11 T +12 A +13 C +14 C C+m.160 C+h.20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m h +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 + +=== + +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m?200 C+h?10 +10 C C+m?50 C+h?170 +11 T +12 A +13 C C+m?10 C+h?5 +14 C C+m?160 C+h?20 +15 G +16 C C+m?10 C+h?5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m h +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 + +=== + +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m.200 C+h?10 +10 C C+h?170 +11 T +12 A +13 C C+h?5 +14 C C+m.160 C+h?20 +15 G +16 C C+h?5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- 
+Present: m h +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 + +=== + diff --git a/test/base_mods/MM-explicit.out b/test/base_mods/MM-explicit.out index 05e2828c4..f28b25f83 100644 --- a/test/base_mods/MM-explicit.out +++ b/test/base_mods/MM-explicit.out @@ -24,6 +24,7 @@ 23 C 24 T --- +Present: m h 9 C C+m200 C+h10 10 C C+m50 C+h170 14 C C+m160 C+h20 @@ -56,6 +57,7 @@ 23 C 24 T --- +Present: m h 9 C C+m200 C+h10 10 C C+m50 C+h170 13 C C+m10 C+h5 @@ -90,6 +92,7 @@ 23 C 24 T --- +Present: m h 9 C C+m200 C+h10 10 C C+h170 13 C C+h5 diff --git a/test/base_mods/MM-explicit.sam b/test/base_mods/MM-explicit.sam index e4e37103d..e85afa293 100644 --- a/test/base_mods/MM-explicit.sam +++ b/test/base_mods/MM-explicit.sam @@ -4,10 +4,24 @@ @CO unchecked and require an explicit statement to indicate it was @CO looked at and no base modification was observed. @CO -@CO 0 1 23 45 6 78 -@CO ATCATCATTCCTACCGCTATAGCCT -@CO . . m. .m . .. m -@CO ? ? .h .. . ?? h +@CO ATCATCATTCCTACCGCTATAGCCT r1; implicit +@CO - - .. -. - -- +@CO Mm M +@CO - - .. -. - -- +@CO hH h +@CO +@CO ATCATCATTCCTACCGCTATAGCCT r2; explicit to a small region +@CO - - ?? ?? ? -- +@CO Mm mM m +@CO - - ?? ?? ? -- +@CO hH hh h +@CO +@CO ATCATCATTCCTACCGCTATAGCCT r3; mixture +@CO - - . -. - -- +@CO M M +@CO - - ?? ?? ? 
-- +@CO hH hh h -- +@CO r1 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh,2,0,1; Ml:B:C,200,10,50,170,160,20 r2 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+mh?,2,0,0,0,0; Ml:B:C,200,10,50,170,10,5,160,20,10,5 r3 0 * 0 0 * * 0 0 ATCATCATTCCTACCGCTATAGCCT * Mm:Z:C+m.,2,2;C+h?,2,0,0,0,0; Ml:B:C,200,160,10,170,5,20,5 diff --git a/test/base_mods/MM-multi.out b/test/base_mods/MM-multi.out index 23c98d97b..e411a81ee 100644 --- a/test/base_mods/MM-multi.out +++ b/test/base_mods/MM-multi.out @@ -35,6 +35,7 @@ 34 C C+m230 C+h6 35 A --- +Present: m h n 6 C C+m128 15 N N+n215 17 C C+m153 @@ -83,6 +84,7 @@ 34 C C+m204 C+h31 35 A --- +Present: m h n 6 C C+m77 C+h159 15 N N+n240 17 C C+m103 C+h133 diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index c64128ee2..3809c0e6e 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -33,10 +33,11 @@ # samtools binary. This can be useful for testing older versions. # Test files from SAM spec -P MM-chebi.out $test_mod MM-chebi.sam -P MM-double.out $test_mod MM-double.sam -P MM-multi.out $test_mod MM-multi.sam -P MM-explicit.out $test_mod MM-explicit.sam +P MM-chebi.out $test_mod MM-chebi.sam +P MM-double.out $test_mod MM-double.sam +P MM-multi.out $test_mod MM-multi.sam +P MM-explicit.out $test_mod MM-explicit.sam +P MM-explicit-x.out $test_mod -x MM-explicit.sam # Pileup testing P MM-pileup.out $pileup_mod < MM-pileup.sam diff --git a/test/test_mod.c b/test/test_mod.c index aade3733c..f6f5b0718 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -22,6 +22,52 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/* +This tests multiple APIs. The simplest is to parse the MM/ML tags with +bam_parse_basemod and then call bam_mods_at_next_pos once for each base in +the bam sequence to check for modifications. 
+ +Ie: + + hts_base_mod_state *m = hts_base_mod_state_alloc(); + bam_parse_basemod(b, m); // b=bam1_t pointer + hts_base_mod mods[5]; + for (i = 0; i < b->core.l_qseq; i++) { + n = bam_mods_at_next_pos(b, m, mods, 5); + for (j = 0; j < n && j < 5; j++) { + // Report 'n'th mod at seq pos 'i'. + // mods[j].modified_base holds the base mod itself, with + // mods[j].canonical_base, mods[j].strand and mods[j].qual + // also present in hts_base_mod struct. + // ... + } + } + hts_base_mod_state_free(m); + +The extended mode has the same loop above, but calls bam_mods_query_type +to return additional meta-data including the strand, canonical base and +whether the base modification is recorded implicitly or explicitly: + + int ret = bam_mods_query_type(m, mods[j].modified_base, + &m_strand, &m_implicit, + &m_canonical); + +Looping over every base in the sequence is not particularly efficient +however unless this fits your natural processing order. The alternative +is to call bam_next_basemod to iterate only over modified locations: + + hts_base_mod_state *m = hts_base_mod_state_alloc(); + bam_parse_basemod(b, m); // b=bam1_t pointer + hts_base_mod mods[5]; + while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) { + for (j = 0; j < n && j < 5; j++) { + // Report 'n'th mod at sequence position 'pos' + } + } + hts_base_mod_state_free(m); + +*/ + #include #include @@ -41,6 +87,14 @@ static char *code(int id) { int main(int argc, char **argv) { char out[1024] = {0}; + int extended = 0; + + if (argc > 1 && strcmp(argv[1], "-x") == 0) { + extended = 1; + argv++; + argc--; + } + if (argc < 2) return 1; @@ -69,12 +123,31 @@ int main(int argc, char **argv) { n = bam_mods_at_next_pos(b, m, mods, 5); lp += sprintf(lp, "%d\t%c\t", i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); - for (j = 0; j < n && j < 5; j++) - lp += sprintf(lp, "%c%c%s%d ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - mods[j].qual); + for (j = 0; j < n && j < 5; j++) { + if
(extended) { + int m_strand, m_implicit; + char m_canonical; + int ret = bam_mods_query_type(m, mods[j].modified_base, + &m_strand, &m_implicit, + &m_canonical); + if (ret < 0 || + m_canonical != mods[j].canonical_base || + m_strand != mods[j].strand) + goto err; + lp += sprintf(lp, "%c%c%s%c%d ", + mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + "?."[m_implicit], + mods[j].qual); + } else { + lp += sprintf(lp, "%c%c%s%d ", + mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + mods[j].qual); + } + } *lp++ = '\n'; *lp++ = 0; @@ -88,17 +161,27 @@ int main(int argc, char **argv) { bam_parse_basemod(b, m); + // List possible mod choices. + int *all_mods; + int all_mods_n = 0; + all_mods = bam_mods_recorded(m, &all_mods_n); + printf("Present:"); + for (i = 0; i < all_mods_n; i++) + printf(all_mods[i] > 0 ? " %c" : " #%d", all_mods[i]); + putchar('\n'); + int pos; while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) { char line[8192]={0}, *lp = line; lp += sprintf(lp, "%d\t%c\t", pos, seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); - for (j = 0; j < n && j < 5; j++) + for (j = 0; j < n && j < 5; j++) { lp += sprintf(lp, "%c%c%s%d ", mods[j].canonical_base, "+-"[mods[j].strand], code(mods[j].modified_base), mods[j].qual); + } *lp++ = '\n'; *lp++ = 0; From 3c44c0b9eabcb1f6e111ec1b94e287155085e523 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 5 May 2022 16:19:12 +0100 Subject: [PATCH 267/488] Switch cirrus ubuntu image to ubuntu:latest Done to fix a problem where clang fails to install due to an inconsistency in the apt sources used by the ubuntu kinetic (a.k.a devel) Docker image. This means an update to clang via the kinetic-proposed source makes it uninstallable on that image until the proposed change makes it to the kinetic one. Switching to ubuntu:latest means we won't be quite as leading-edge but it's less likely to break unexpectedly. 
--- .cirrus.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 1ba352063..2740ce05e 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -84,8 +84,8 @@ gcc_task: ubuntu_task: name: ubuntu-clang container: - #image: ubuntu:latest # use << : *LIBDEFLATE - image: ubuntu:devel + image: ubuntu:latest + # image: ubuntu:devel cpu: 2 memory: 1G From 46c56fcc2300dcac123edc670689e3bd2f241df6 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 4 May 2022 17:13:49 +0100 Subject: [PATCH 268/488] Fix buffer overrun in bam_plp_insertion_mod. This attempted to grow memory by the maximum amount of space a base modification would take up, but due to a misunderstanding of kstring it kept adding this to the original size rather than actually growing the allocated size. (Probably) fixes samtools/samtools#1652 --- sam.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sam.c b/sam.c index dd1b7d9fd..865b55fa6 100644 --- a/sam.c +++ b/sam.c @@ -5306,6 +5306,7 @@ int bam_plp_insertion_mod(const bam_pileup1_t *p, hts_base_mod mod[256]; if (m && (nm = bam_mods_at_qpos(p->b, p->qpos + j - p->is_del, m, mod, 256)) > 0) { + int o_indel = indel; if (ks_resize(ins, ins->l + nm*16+3) < 0) return -1; ins->s[indel++] = '['; @@ -5329,6 +5330,7 @@ int bam_plp_insertion_mod(const bam_pileup1_t *p, qual); } ins->s[indel++] = ']'; + ins->l += indel - o_indel; // grow by amount we used } } break; From 7e2df7ea27138058bae94baaf28cf2ca12f1ec0d Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 19 May 2022 16:11:56 +0100 Subject: [PATCH 269/488] Ensure str_class is set before using it in bcf_hdr_get_hrec gcc-12.1 produced a warning that NULL could be passed to strcmp() via str_class. I'm not sure if that can actually happen, but just in case add a check. 
--- vcf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vcf.c b/vcf.c index f868cc738..e1d386c7a 100644 --- a/vcf.c +++ b/vcf.c @@ -930,6 +930,8 @@ bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, co } else if ( type==BCF_HL_STR ) { + if (!str_class) + return NULL; for (i=0; i<hdr->nhrec; i++) { if ( hdr->hrec[i]->type!=type ) continue; From 5a302ffbb4f76b30a3fe555433581d10da47cf30 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 25 Apr 2022 11:21:21 +0100 Subject: [PATCH 270/488] Update htscodecs including associated Makefile changes Update htscodecs to bring in the rANS 32x16 codecs Add new htscodecs source files and dependencies into htslib makefiles. Some htscodecs functions have changed name slightly so a couple of cram source files are updated to reflect that. These changes are enough to build the non-SIMD versions of the new codecs, but don't enable the accelerated versions yet. --- Makefile | 13 +++++++++---- cram/cram_codecs.c | 16 ++++++++-------- cram/cram_io.c | 4 ++-- htscodecs | 2 +- htscodecs_bundled.mk | 11 ++++++++++- 5 files changed, 30 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 0871580ce..e52d9128b 100644 --- a/Makefile +++ b/Makefile @@ -435,14 +435,19 @@ cram/string_alloc.o cram/string_alloc.pico: cram/string_alloc.c config.h cram/st thread_pool.o thread_pool.pico: thread_pool.c config.h $(thread_pool_internal_h) $(htslib_hts_log_h) htscodecs/htscodecs/arith_dynamic.o htscodecs/htscodecs/arith_dynamic.pico: htscodecs/htscodecs/arith_dynamic.c config.h $(htscodecs_arith_dynamic_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_utils_h) $(htscodecs_c_simple_model_h) -htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_c_simple_model_h) +htscodecs/htscodecs/fqzcomp_qual.o htscodecs/htscodecs/fqzcomp_qual.pico: htscodecs/htscodecs/fqzcomp_qual.c config.h
$(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htscodecs_utils_h) $(htscodecs_c_simple_model_h) htscodecs/htscodecs/htscodecs.o htscodecs/htscodecs/htscodecs.pico: htscodecs/htscodecs/htscodecs.c $(htscodecs_htscodecs_h) $(htscodecs_version_h) htscodecs/htscodecs/pack.o htscodecs/htscodecs/pack.pico: htscodecs/htscodecs/pack.c config.h $(htscodecs_pack_h) -htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static32x16pr.o htscodecs/htscodecs/rANS_static32x16pr.pico: htscodecs/htscodecs/rANS_static32x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static32x16pr_avx2.o htscodecs/htscodecs/rANS_static32x16pr_avx2.pico: htscodecs/htscodecs/rANS_static32x16pr_avx2.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) $(htscodecs_permute_h) +htscodecs/htscodecs/rANS_static32x16pr_avx512.o htscodecs/htscodecs/rANS_static32x16pr_avx512.pico: htscodecs/htscodecs/rANS_static32x16pr_avx512.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static32x16pr_neon.o htscodecs/htscodecs/rANS_static32x16pr_neon.pico: htscodecs/htscodecs/rANS_static32x16pr_neon.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static32x16pr_sse4.o htscodecs/htscodecs/rANS_static32x16pr_sse4.pico: htscodecs/htscodecs/rANS_static32x16pr_sse4.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) 
$(htscodecs_rANS_static16_int_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/rANS_static4x16pr.o htscodecs/htscodecs/rANS_static4x16pr.pico: htscodecs/htscodecs/rANS_static4x16pr.c config.h $(htscodecs_rANS_word_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_rANS_static16_int_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(htscodecs_utils_h) $(htscodecs_rANS_static32x16pr_h) htscodecs/htscodecs/rANS_static.o htscodecs/htscodecs/rANS_static.pico: htscodecs/htscodecs/rANS_static.c config.h $(htscodecs_rANS_byte_h) $(htscodecs_utils_h) $(htscodecs_rANS_static_h) htscodecs/htscodecs/rle.o htscodecs/htscodecs/rle.pico: htscodecs/htscodecs/rle.c config.h $(htscodecs_varint_h) $(htscodecs_rle_h) -htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h) - +htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs/htscodecs/utils.o htscodecs/htscodecs/utils.pico: htscodecs/htscodecs/utils.c config.h $(htscodecs_utils_h) bgzip: bgzip.o libhts.a $(CC) $(LDFLAGS) -o $@ bgzip.o libhts.a $(LIBS) -lpthread diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 9f112863e..33e1b5bf8 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -2030,10 +2030,10 @@ static int cram_xrle_decode_expand_char(cram_slice *slice, cram_codec *c) { int nb = var_get_u64(len_dat, len_dat+len_sz, &out_sz); if (!(b->data = malloc(out_sz))) return -1; - rle_decode(lit_dat, lit_sz, - len_dat+nb, len_sz-nb, - rle_syms, rle_nsyms, - b->data, &out_sz); + hts_rle_decode(lit_dat, lit_sz, + len_dat+nb, len_sz-nb, + rle_syms, rle_nsyms, + b->data, 
&out_sz); b->uncomp_size = out_sz; return 0; @@ -2200,10 +2200,10 @@ int cram_xrle_encode_flush(cram_codec *c) { int nb = var_put_u64(out_len, NULL, c->u.e_xrle.to_flush_size); - out_lit = rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size, - out_len+nb, &out_len_size, - rle_syms, &rle_nsyms, - NULL, &out_lit_size); + out_lit = hts_rle_encode((uint8_t *)c->u.e_xrle.to_flush, c->u.e_xrle.to_flush_size, + out_len+nb, &out_len_size, + rle_syms, &rle_nsyms, + NULL, &out_lit_size); out_len_size += nb; diff --git a/cram/cram_io.c b/cram/cram_io.c index c9dcb5014..60a568b7b 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1722,7 +1722,7 @@ int cram_uncompress_block(cram_block *b) { case TOK3: { uint32_t out_len; - uint8_t *cp = decode_names(b->data, b->comp_size, &out_len); + uint8_t *cp = tok3_decode_names(b->data, b->comp_size, &out_len); if (!cp) return -1; b->orig_method = TOK3; @@ -1875,7 +1875,7 @@ static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, int lev = level; if (method == TOK3 && lev > 3) lev = 3; - uint8_t *cp = encode_names(in, in_size, lev, strat, &out_len, NULL); + uint8_t *cp = tok3_encode_names(in, in_size, lev, strat, &out_len, NULL); *out_size = out_len; return (char *)cp; } diff --git a/htscodecs b/htscodecs index 1395d7306..65bb347f6 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 1395d730651fdfa39cd916be3b3ef4dd9b1ab895 +Subproject commit 65bb347f6b0ea7f4a00cb768b3d8004f24ae03c3 diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index 7242e210b..4a862f3d1 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -28,9 +28,15 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ $(HTSPREFIX)htscodecs/htscodecs/htscodecs.c \ $(HTSPREFIX)htscodecs/htscodecs/pack.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx2.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx512.c \ + 
$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_sse4.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \ $(HTSPREFIX)htscodecs/htscodecs/rle.c \ - $(HTSPREFIX)htscodecs/htscodecs/tokenise_name3.c + $(HTSPREFIX)htscodecs/htscodecs/tokenise_name3.c \ + $(HTSPREFIX)htscodecs/htscodecs/utils.c + HTSCODECS_OBJS = $(HTSCODECS_SOURCES:.c=.o) @@ -49,8 +55,11 @@ htscodecs_varint_h = htscodecs/htscodecs/varint.h htscodecs_htscodecs_endian_h = htscodecs/htscodecs/htscodecs_endian.h htscodecs_c_range_coder_h = htscodecs/htscodecs/c_range_coder.h htscodecs_c_simple_model_h = htscodecs/htscodecs/c_simple_model.h $(htscodecs_c_range_coder_h) +htscodecs_permute_h = htscodecs/htscodecs/permute.h htscodecs_pooled_alloc_h = htscodecs/htscodecs/pooled_alloc.h htscodecs_rANS_byte_h = htscodecs/htscodecs/rANS_byte.h +htscodecs_rANS_static16_int_h = htscodecs/htscodecs/rANS_static16_int.h $(htscodecs_varint_h) $(htscodecs_utils_h) +htscodecs_rANS_static32x16pr_h = htscodecs/htscodecs/rANS_static32x16pr.h htscodecs_rANS_word_h = htscodecs/htscodecs/rANS_word.h $(htscodecs_htscodecs_endian_h) htscodecs_utils_h = htscodecs/htscodecs/utils.h htscodecs_version_h = htscodecs/htscodecs/version.h From b0cc5bdd86c6aebc9684f22b28fcf6b079192bb9 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 25 Apr 2022 12:08:28 +0100 Subject: [PATCH 271/488] Add configure changes to enable htscodecs SIMD code Adds the checks necessary to detect x86_64 SIMD support and turn it on the htscodecs if it's available. As ssse3, popcnt and sse4.1 are used together, they're tested for as a group. 
--- Makefile | 14 ++++++++-- config.mk.in | 5 ++++ configure.ac | 51 +++++++++++++++++++++++++++++++++++ m4/ax_check_compile_flag.m4 | 53 +++++++++++++++++++++++++++++++++++++ 4 files changed, 121 insertions(+), 2 deletions(-) create mode 100644 m4/ax_check_compile_flag.m4 diff --git a/Makefile b/Makefile index e52d9128b..68734026d 100644 --- a/Makefile +++ b/Makefile @@ -37,6 +37,7 @@ CPPFLAGS = #CFLAGS = -g -Wall -O2 -pedantic -std=c99 -D_XOPEN_SOURCE=600 CFLAGS = -g -Wall -O2 -fvisibility=hidden EXTRA_CFLAGS_PIC = -fpic +TARGET_CFLAGS = LDFLAGS = -fvisibility=hidden LIBS = $(htslib_default_libs) @@ -161,10 +162,10 @@ config_vars.h: .SUFFIXES: .bundle .c .cygdll .dll .o .pico .so .c.o: - $(CC) $(CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $< + $(CC) $(CFLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $< .c.pico: - $(CC) $(CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< + $(CC) $(CFLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< LIBHTS_OBJS = \ @@ -215,6 +216,10 @@ NONCONFIGURE_OBJS = hfile_libcurl.o PLUGIN_EXT = PLUGIN_OBJS = +HTS_CFLAGS_AVX2 = +HTS_CFLAGS_AVX512 = +HTS_CFLAGS_SSE4 = + cram_h = cram/cram.h $(cram_samtools_h) $(header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h $(htslib_cram_h) cram_io_h = cram/cram_io.h $(cram_misc_h) cram_misc_h = cram/misc.h @@ -449,6 +454,11 @@ htscodecs/htscodecs/rle.o htscodecs/htscodecs/rle.pico: htscodecs/htscodecs/rle. 
htscodecs/htscodecs/tokenise_name3.o htscodecs/htscodecs/tokenise_name3.pico: htscodecs/htscodecs/tokenise_name3.c config.h $(htscodecs_pooled_alloc_h) $(htscodecs_arith_dynamic_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_tokenise_name3_h) $(htscodecs_varint_h) $(htscodecs_utils_h) htscodecs/htscodecs/utils.o htscodecs/htscodecs/utils.pico: htscodecs/htscodecs/utils.c config.h $(htscodecs_utils_h) +# Extra CFLAGS for specific files +htscodecs/htscodecs/rANS_static32x16pr_avx2.o htscodecs/htscodecs/rANS_static32x16pr_avx2.pico: TARGET_CFLAGS = $(HTS_CFLAGS_AVX2) +htscodecs/htscodecs/rANS_static32x16pr_avx512.o htscodecs/htscodecs/rANS_static32x16pr_avx512.pico: TARGET_CFLAGS = $(HTS_CFLAGS_AVX512) +htscodecs/htscodecs/rANS_static32x16pr_sse4.o htscodecs/htscodecs/rANS_static32x16pr_sse4.pico: TARGET_CFLAGS = $(HTS_CFLAGS_SSE4) + bgzip: bgzip.o libhts.a $(CC) $(LDFLAGS) -o $@ bgzip.o libhts.a $(LIBS) -lpthread diff --git a/config.mk.in b/config.mk.in index f8decf0a2..35392bf0d 100644 --- a/config.mk.in +++ b/config.mk.in @@ -112,3 +112,8 @@ LDFLAGS += $(noplugin_LDFLAGS) LIBS += $(noplugin_LIBS) endif + +# Extra CFLAGS for specific files +HTS_CFLAGS_AVX2 = @hts_cflags_avx2@ +HTS_CFLAGS_AVX512 = @hts_cflags_avx512@ +HTS_CFLAGS_SSE4 = @hts_cflags_sse4@ diff --git a/configure.ac b/configure.ac index 1216ecc21..9c8cca480 100644 --- a/configure.ac +++ b/configure.ac @@ -30,6 +30,7 @@ AC_CONFIG_SRCDIR(hts.c) AC_CONFIG_HEADERS(config.h) m4_include([m4/hts_prog_cc_warnings.m4]) +m4_include([m4/ax_check_compile_flag.m4]) m4_include([m4/hts_hide_dynamic_syms.m4]) m4_include([m4/pkg.m4]) @@ -69,6 +70,56 @@ dnl Flags to treat warnings as errors. 
These need to be applied to CFLAGS dnl later as they can interfere with some of the tests (notably AC_SEARCH_LIBS) HTS_PROG_CC_WERROR(hts_late_cflags) +dnl Check for various compiler flags to enable SIMD features +dnl Options for rANS32x16 sse4.1 version +AX_CHECK_COMPILE_FLAG([-mssse3 -mpopcnt -msse4.1], [ + hts_cflags_sse4="-mssse3 -mpopcnt -msse4.1" + AC_SUBST([hts_cflags_sse4]) + AC_DEFINE([HAVE_SSSE3],1, + [Defined to 1 if the compiler can issue SSSE3 instructions.]) + AC_DEFINE([HAVE_POPCNT],1, + [Defined to 1 if the compiler can issue popcnt instructions.]) + AC_DEFINE([HAVE_SSE4_1],1, + [Defined to 1 if the compiler can issue SSE4.1 instructions.]) + ], [], [], [AC_LANG_PROGRAM([[ + #include "x86intrin.h" + ]],[[ + unsigned int i = _mm_popcnt_u32(1); + __m128i a = _mm_set_epi32(1, 2, 3, i), b = _mm_set_epi32(4, 3, 2, 1); + __m128i c = _mm_max_epu32(a, b); + b = _mm_shuffle_epi8(a, c); + return *((char *) &b); + ]])]) + +dnl Options for rANS32x16 avx2 version +AX_CHECK_COMPILE_FLAG([-mavx2], [ + hts_cflags_avx2="-mavx2" + AC_SUBST([hts_cflags_avx2]) + AC_DEFINE([HAVE_AVX2],1, + [Defined to 1 if the compiler can issue AVX2 instructions.]) + ], [], [], [AC_LANG_PROGRAM([[ + #include "x86intrin.h" + ]],[[ + __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + __m256i b = _mm256_add_epi32(a, a); + return *((char *) &b); + ]])]) + +dnl Options for rANS32x16 avx512 version +AX_CHECK_COMPILE_FLAG([-mavx512f], [ + hts_cflags_avx512="-mavx512f" + AC_SUBST([hts_cflags_avx512]) + AC_DEFINE([HAVE_AVX512],1, + [Defined to 1 if the compiler can issue AVX512 instructions.]) + ], [], [], [AC_LANG_PROGRAM([[ + #include "x86intrin.h" + ]],[[ + __m512i a = _mm512_set1_epi32(1); + __m512i b = _mm512_add_epi32(a, a); + return *((char *) &b); + ]])]) + + dnl Avoid chicken-and-egg problem where pkg-config supplies the dnl PKG_PROG_PKG_CONFIG macro, but we want to use it to check dnl for pkg-config... 
diff --git a/m4/ax_check_compile_flag.m4 b/m4/ax_check_compile_flag.m4 new file mode 100644 index 000000000..bd753b34d --- /dev/null +++ b/m4/ax_check_compile_flag.m4 @@ -0,0 +1,53 @@ +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT]) +# +# DESCRIPTION +# +# Check whether the given FLAG works with the current language's compiler +# or gives an error. (Warnings, however, are ignored) +# +# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on +# success/failure. +# +# If EXTRA-FLAGS is defined, it is added to the current language's default +# flags (e.g. CFLAGS) when the check is done. The check is thus made with +# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to +# force the compiler to issue an error when a bad flag is given. +# +# INPUT gives an alternative input source to AC_COMPILE_IFELSE. +# +# NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this +# macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG. +# +# LICENSE +# +# Copyright (c) 2008 Guido U. Draheim +# Copyright (c) 2011 Maarten Bosmans +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. 
+ +#serial 6 + +AC_DEFUN([AX_CHECK_COMPILE_FLAG], +[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF +AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl +AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [ + ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS + _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1" + AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])], + [AS_VAR_SET(CACHEVAR,[yes])], + [AS_VAR_SET(CACHEVAR,[no])]) + _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags]) +AS_VAR_IF(CACHEVAR,yes, + [m4_default([$2], :)], + [m4_default([$3], :)]) +AS_VAR_POPDEF([CACHEVAR])dnl +])dnl AX_CHECK_COMPILE_FLAGS From 587b4d05600d6708295c3bce6db519d887368636 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 20 May 2022 15:46:25 +0100 Subject: [PATCH 272/488] Make htscodecs SIMD code build without configure For the benefit of htslib embeds that don't want to create and run a configure script. Adds a small script that does a similar job by probing a few compiler options and then outputs makefile lines to set variables if they succeed. These lines are added to the default 'htscodecs.mk' file that gets built if configure hasn't already made one. Adding them here means the probing will be remembered until the next "make distclean". The script fragment that builds the default 'config.h' checks to see if the variables have been set, and if so adds the appropriate 'HAVE_' lines for the feature. --- Makefile | 21 +++++++++-- hts_probe_cc.sh | 98 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 4 deletions(-) create mode 100755 hts_probe_cc.sh diff --git a/Makefile b/Makefile index 68734026d..f2d6c9cff 100644 --- a/Makefile +++ b/Makefile @@ -115,10 +115,16 @@ ALL_CPPFLAGS = -I. 
$(CPPFLAGS) htscodecs.mk: echo '# Default htscodecs.mk generated by Makefile' > $@ echo 'include $$(HTSPREFIX)htscodecs_bundled.mk' >> $@ + $(srcdir)/hts_probe_cc.sh '$(CC)' '$(CFLAGS) $(CPPFLAGS)' '$(LDFLAGS)' >> $@ srcdir = . srcprefix = HTSPREFIX = + +HTS_CFLAGS_AVX2 = +HTS_CFLAGS_AVX512 = +HTS_CFLAGS_SSE4 = + include htslib_vars.mk include htscodecs.mk @@ -216,10 +222,6 @@ NONCONFIGURE_OBJS = hfile_libcurl.o PLUGIN_EXT = PLUGIN_OBJS = -HTS_CFLAGS_AVX2 = -HTS_CFLAGS_AVX512 = -HTS_CFLAGS_SSE4 = - cram_h = cram/cram.h $(cram_samtools_h) $(header_h) $(cram_structs_h) $(cram_io_h) cram/cram_encode.h cram/cram_decode.h cram/cram_stats.h cram/cram_codecs.h cram/cram_index.h $(htslib_cram_h) cram_io_h = cram/cram_io.h $(cram_misc_h) cram_misc_h = cram/misc.h @@ -258,6 +260,17 @@ config.h: echo '#endif' >> $@ echo '#define HAVE_DRAND48 1' >> $@ echo '#define HAVE_LIBCURL 1' >> $@ + if [ "x$(HTS_CFLAGS_SSE4)" != "x" ] ; then \ + echo '#define HAVE_POPCNT 1' >> $@ ; \ + echo '#define HAVE_SSE4_1 1' >> $@ ; \ + echo '#define HAVE_SSSE3 1' >> $@ ; \ + fi + if [ "x$(HTS_CFLAGS_AVX2)" != "x" ] ; then \ + echo '#define HAVE_AVX2 1' >> $@ ; \ + fi + if [ "x$(HTS_CFLAGS_AVX512)" != "x" ] ; then \ + echo '#define HAVE_AVX512 1' >> $@ ; \ + fi # And similarly for htslib.pc.tmp ("pkg-config template"). No dependency # on htslib.pc.in listed, as if that file is newer the usual way to regenerate diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh new file mode 100755 index 000000000..0f6ddede8 --- /dev/null +++ b/hts_probe_cc.sh @@ -0,0 +1,98 @@ +#!/bin/sh + +# Check compiler options for non-configure builds and create Makefile fragment +# +# Copyright (C) 2022 Genome Research Ltd. 
+# +# Author: Rob Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Arguments are: +# 1. C compiler command +# 2. Initial CFLAGS +# 3. LDFLAGS + +CC=$1 +CFLAGS=$2 +LDFLAGS=$3 + +# Try running the compiler. Uses the same conftest.* names as +# configure for temporary files. +run_compiler () +{ + "$CC" $CFLAGS $1 $LDFLAGS -o conftest conftest.c 2> conftest.err + retval=$? + rm -f conftest.err conftest + return $retval +} + +echo "# Compiler probe results, generated by $0" + +# Check for sse4.1 etc.
support + +rm -f conftest conftest.err conftest.c +cat - <<'EOF' > conftest.c +#include "x86intrin.h" +int main(int argc, char **argv) { + unsigned int i = _mm_popcnt_u32(1); + __m128i a = _mm_set_epi32(1, 2, 3, i), b = _mm_set_epi32(4, 3, 2, 1); + __m128i c = _mm_max_epu32(a, b); + b = _mm_shuffle_epi8(a, c); + return *((char *) &b); +} +EOF +FLAGS="-mpopcnt -msse4.1 -mssse3" +if run_compiler "$FLAGS" ; then + echo "HTS_CFLAGS_SSE4 = $FLAGS" +fi + +# Check for avx2 + +rm -f conftest.c +cat - <<'EOF' > conftest.c +#include "x86intrin.h" +int main(int argc, char **argv) { + __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + __m256i b = _mm256_add_epi32(a, a); + return *((char *) &b); +} +EOF +FLAGS="-mavx2" +if run_compiler "$FLAGS" ; then + echo "HTS_CFLAGS_AVX2 = $FLAGS" +fi + +# Check for avx512 + +rm -f conftest.c +cat - <<'EOF' > conftest.c +#include "x86intrin.h" +int main(int argc, char **argv) { + __m512i a = _mm512_set1_epi32(1); + __m512i b = _mm512_add_epi32(a, a); + return *((char *) &b); +} +EOF +FLAGS="-mavx512f" +if run_compiler "$FLAGS" ; then + echo "HTS_CFLAGS_AVX512 = $FLAGS" +fi + +rm -f conftest.c From aada31476c534322bcdb7616ad748341dd5a5ad9 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 20 May 2022 18:53:16 +0100 Subject: [PATCH 273/488] Disable unaligned access in htscodecs if it is in HTSlib Mainly to ensure that the fuzzer build doesn't start complaining about unaligned access. 
--- Makefile | 3 +++ configure.ac | 8 ++++++++ 2 files changed, 11 insertions(+) diff --git a/Makefile b/Makefile index f2d6c9cff..c8e394830 100644 --- a/Makefile +++ b/Makefile @@ -264,6 +264,9 @@ config.h: echo '#define HAVE_POPCNT 1' >> $@ ; \ echo '#define HAVE_SSE4_1 1' >> $@ ; \ echo '#define HAVE_SSSE3 1' >> $@ ; \ + echo '#if defined(HTS_ALLOW_UNALIGNED) && HTS_ALLOW_UNALIGNED == 0' >> $@ ; \ + echo '#define UBSAN 1' >> $@ ; \ + echo '#endif' >> $@ ; \ fi if [ "x$(HTS_CFLAGS_AVX2)" != "x" ] ; then \ echo '#define HAVE_AVX2 1' >> $@ ; \ diff --git a/configure.ac b/configure.ac index 9c8cca480..c1578d6e4 100644 --- a/configure.ac +++ b/configure.ac @@ -81,6 +81,14 @@ AX_CHECK_COMPILE_FLAG([-mssse3 -mpopcnt -msse4.1], [ [Defined to 1 if the compiler can issue popcnt instructions.]) AC_DEFINE([HAVE_SSE4_1],1, [Defined to 1 if the compiler can issue SSE4.1 instructions.]) +dnl Propagate HTSlib's unaligned access preference to htscodecs + AH_VERBATIM([UBSAN],[ +/* Prevent unaligned access in htscodecs SSE4 rANS codec */ +#if defined(HTS_ALLOW_UNALIGNED) && HTS_ALLOW_UNALIGNED == 0 +#undef UBSAN +#endif + ]) + AC_DEFINE([UBSAN],1,[]) ], [], [], [AC_LANG_PROGRAM([[ #include "x86intrin.h" ]],[[ From 958e6fa708d1914bc46d9f8e9411987402468153 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 23 May 2022 12:08:43 +0100 Subject: [PATCH 274/488] Pull in extra htscodecs pointer aliasing fixes --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 65bb347f6..9cd552e17 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 65bb347f6b0ea7f4a00cb768b3d8004f24ae03c3 +Subproject commit 9cd552e173055730eb7701ebdbd13f6c579088e4 From e16c4224174a7581cf6f6b39705d62bbde007261 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 23 May 2022 17:37:25 +0100 Subject: [PATCH 275/488] Fix NEWS entry for function affected by realn_check_tag() fix PR #1406 fixed which arguments were used as printf() format strings in 
realn_check_tag(), which is a subroutine of sam_prob_realn(). Correct the name of the function affected, and add a note of which HTSlib releases were affected. --- NEWS | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/NEWS b/NEWS index 5f30d8878..75d9ce79e 100644 --- a/NEWS +++ b/NEWS @@ -4,10 +4,11 @@ Noteworthy changes in release a.b Noteworthy changes in release 1.15.1 (7th April 2022) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* Security fix: Fixed broken error reporting in the sam_cap_mapq() +* Security fix: Fixed broken error reporting in the sam_prob_realn() function, due to a missing hts_log() parameter. Prior to this fix - it was possible to abuse the log message format string by passing - a specially crafted alignment record to this function. (PR#1406) + (i.e., in HTSlib versions 1.8 to 1.15) it was possible to abuse + the log message format string by passing a specially crafted + alignment record to this function. (PR#1406) * HTSlib now uses libhtscodecs release 1.2.2. This fixes a number of bugs where invalid compressed data could trigger usage of From 008eabd3b38600a187b8d2fe94be9f8f9260545e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 13 May 2022 16:25:09 +0100 Subject: [PATCH 276/488] Fix clang builds under mingw. Under mingw clang requires dllexport to be applied to both function declarations and function definitions. (Gcc is happy for the definition only to have the dllexport attribute.) Note this exports several "internal" functions, including some apparently unused anywhere (hts_json_*). Perhaps they were once used in htslib-plugins, but no more. I haven't taken the decision to remove these though, but it's worth considering for whenever we next do an ABI breaking change. Either that or stop pretending that they're internal only when clearly they are not, and move their API to the external htslib/*.h files instead. These appear to be somewhat in Limbo right now.
Fixes #1433 --- hfile_internal.h | 6 ++++++ htslib/hfile.h | 6 ++++++ htslib/hts.h | 3 +++ htslib/hts_log.h | 1 + htslib/sam.h | 1 + htslib/tbx.h | 1 + htslib/vcf.h | 3 +++ textutils_internal.h | 8 ++++++++ 8 files changed, 29 insertions(+) diff --git a/hfile_internal.h b/hfile_internal.h index 70cc99c57..2e365ae7d 100644 --- a/hfile_internal.h +++ b/hfile_internal.h @@ -90,11 +90,13 @@ struct hFILE_backend { /* May be called by hopen_*() functions to decode a fopen()-style mode into open(2)-style flags. */ +HTSLIB_EXPORT int hfile_oflags(const char *mode); /* Must be called by hopen_*() functions to allocate the hFILE struct and set up its base. Capacity is a suggested buffer size (e.g., via fstat(2)) or 0 for a default-sized buffer. */ +HTSLIB_EXPORT hFILE *hfile_init(size_t struct_size, const char *mode, size_t capacity); /* Alternative to hfile_init() for in-memory backends for which the base @@ -107,6 +109,7 @@ hFILE *hfile_init_fixed(size_t struct_size, const char *mode, /* May be called by hopen_*() functions to undo the effects of hfile_init() in the event opening the stream subsequently fails. (This is safe to use even if fp is NULL. This takes care to preserve errno.) */ +HTSLIB_EXPORT void hfile_destroy(hFILE *fp); @@ -138,10 +141,13 @@ struct hFILE_scheme_handler { }; /* May be used as an isremote() function in simple cases. */ +HTSLIB_EXPORT extern int hfile_always_local (const char *fname); +HTSLIB_EXPORT extern int hfile_always_remote(const char *fname); /* Should be called by plugins for each URL scheme they wish to handle. */ +HTSLIB_EXPORT void hfile_add_scheme_handler(const char *scheme, const struct hFILE_scheme_handler *handler); diff --git a/htslib/hfile.h b/htslib/hfile.h index 038591cbc..92b789acd 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -158,6 +158,7 @@ static inline off_t htell(hFILE *fp) */ static inline int hgetc(hFILE *fp) { + HTSLIB_EXPORT extern int hgetc2(hFILE *); return (fp->end > fp->begin)? 
(unsigned char) *(fp->begin++) : hgetc2(fp); } @@ -229,6 +230,7 @@ or I/O errors. static inline ssize_t HTS_RESULT_USED hread(hFILE *fp, void *buffer, size_t nbytes) { + HTSLIB_EXPORT extern ssize_t hread2(hFILE *, void *, size_t, size_t); size_t n = fp->end - fp->begin; @@ -243,6 +245,7 @@ hread(hFILE *fp, void *buffer, size_t nbytes) */ static inline int hputc(int c, hFILE *fp) { + HTSLIB_EXPORT extern int hputc2(int, hFILE *); if (fp->begin < fp->limit) *(fp->begin++) = c; else c = hputc2(c, fp); @@ -254,6 +257,7 @@ static inline int hputc(int c, hFILE *fp) */ static inline int hputs(const char *text, hFILE *fp) { + HTSLIB_EXPORT extern int hputs2(const char *, size_t, size_t, hFILE *); size_t nbytes = strlen(text), n = fp->limit - fp->begin; @@ -271,7 +275,9 @@ In the absence of I/O errors, the full _nbytes_ will be written. static inline ssize_t HTS_RESULT_USED hwrite(hFILE *fp, const void *buffer, size_t nbytes) { + HTSLIB_EXPORT extern ssize_t hwrite2(hFILE *, const void *, size_t, size_t); + HTSLIB_EXPORT extern int hfile_set_blksize(hFILE *fp, size_t bufsiz); if (!fp->mobile) { diff --git a/htslib/hts.h b/htslib/hts.h index 8f11266fa..801506b95 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -456,16 +456,19 @@ The input character may be either an IUPAC ambiguity code, '=' for 0, or '0'/'1'/'2'/'3' for a result of 1/2/4/8. The result is encoded as 1/2/4/8 for A/C/G/T or combinations of these bits for ambiguous bases. */ +HTSLIB_EXPORT extern const unsigned char seq_nt16_table[256]; /*! @abstract Table for converting a 4-bit encoded nucleotide to an IUPAC ambiguity code letter (or '=' when given 0). */ +HTSLIB_EXPORT extern const char seq_nt16_str[]; /*! @abstract Table for converting a 4-bit encoded nucleotide to about 2 bits. Returns 0/1/2/3 for 1/2/4/8 (i.e., A/C/G/T), or 4 otherwise (0 or ambiguous). */ +HTSLIB_EXPORT extern const int seq_nt16_int[]; /*! 
diff --git a/htslib/hts_log.h b/htslib/hts_log.h index b2336a4df..f6a50b333 100644 --- a/htslib/hts_log.h +++ b/htslib/hts_log.h @@ -58,6 +58,7 @@ enum htsLogLevel hts_get_log_level(void); * One of the HTS_LOG_* values. The default is HTS_LOG_WARNING. * \note Avoid direct use of this variable. Use hts_set_log_level and hts_get_log_level instead. */ +HTSLIB_EXPORT extern int hts_verbose; /*! Logs an event. diff --git a/htslib/sam.h b/htslib/sam.h index a6e64fbb2..f0a191a28 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -118,6 +118,7 @@ typedef sam_hdr_t bam_hdr_t; Result is operator code or -1. Be sure to cast the index if it is a plain char: int op = bam_cigar_table[(unsigned char) ch]; */ +HTSLIB_EXPORT extern const int8_t bam_cigar_table[256]; #define bam_cigar_op(c) ((c)&BAM_CIGAR_MASK) diff --git a/htslib/tbx.h b/htslib/tbx.h index 9b9e111b9..3d2037cbb 100644 --- a/htslib/tbx.h +++ b/htslib/tbx.h @@ -52,6 +52,7 @@ typedef struct tbx_t { void *dict; } tbx_t; +HTSLIB_EXPORT extern const tbx_conf_t tbx_conf_gff, tbx_conf_bed, tbx_conf_psltbl, tbx_conf_sam, tbx_conf_vcf; #define tbx_itr_destroy(iter) hts_itr_destroy(iter) diff --git a/htslib/vcf.h b/htslib/vcf.h index 7a001aca6..8f7d79fe3 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -123,6 +123,7 @@ typedef struct bcf_hdr_t { int32_t m[3]; // m: allocated size of the dictionary block in use (see n above) } bcf_hdr_t; +HTSLIB_EXPORT extern uint8_t bcf_type_shift[]; /************** @@ -1341,7 +1342,9 @@ which works for both BCF and VCF. 
#define BCF_MIN_BT_INT16 (-32760) /* INT16_MIN + 8 */ #define BCF_MIN_BT_INT32 (-2147483640) /* INT32_MIN + 8 */ +HTSLIB_EXPORT extern uint32_t bcf_float_vector_end; +HTSLIB_EXPORT extern uint32_t bcf_float_missing; static inline void bcf_float_set(float *ptr, uint32_t value) { diff --git a/textutils_internal.h b/textutils_internal.h index 4b120bdbc..1ad096494 100644 --- a/textutils_internal.h +++ b/textutils_internal.h @@ -65,9 +65,11 @@ typedef struct hts_json_token hts_json_token; /// Allocate an empty JSON token structure, for use with hts_json_* functions /** @return An empty token on success; NULL on failure */ +HTSLIB_EXPORT hts_json_token *hts_json_alloc_token(void); /// Free a JSON token +HTSLIB_EXPORT void hts_json_free_token(hts_json_token *token); /// Accessor function to get JSON token type @@ -85,6 +87,7 @@ as follows: - `!` other errors (e.g. out of memory) - `\0` terminator at end of input */ +HTSLIB_EXPORT char hts_json_token_type(hts_json_token *token); /// Accessor function to get JSON token in string form @@ -98,6 +101,7 @@ will point at the kstring_t buffer passed as the third parameter to hts_json_fnext(). In that case, the value will only be valid until the next call to hts_json_fnext(). */ +HTSLIB_EXPORT char *hts_json_token_str(hts_json_token *token); /// Read one JSON token from a string @@ -111,6 +115,7 @@ is modified by having token-terminating characters overwritten as NULs. The `state` argument records the current position within `str` after each `hts_json_snext()` call, and should be set to 0 before the first call. */ +HTSLIB_EXPORT char hts_json_snext(char *str, size_t *state, hts_json_token *token); /// Read and discard a complete JSON value from a string @@ -123,6 +128,7 @@ char hts_json_snext(char *str, size_t *state, hts_json_token *token); Skips a complete JSON value, which may be a single token or an entire object or array. 
*/ +HTSLIB_EXPORT char hts_json_sskip_value(char *str, size_t *state, char type); struct hFILE; @@ -137,6 +143,7 @@ The `kstr` buffer is used to store the string value of the token read, so `token->str` is only valid until the next time `hts_json_fnext()` is called with the same `kstr` argument. */ +HTSLIB_EXPORT char hts_json_fnext(struct hFILE *fp, hts_json_token *token, kstring_t *kstr); /// Read and discard a complete JSON value from a file @@ -148,6 +155,7 @@ char hts_json_fnext(struct hFILE *fp, hts_json_token *token, kstring_t *kstr); Skips a complete JSON value, which may be a single token or an entire object or array. */ +HTSLIB_EXPORT char hts_json_fskip_value(struct hFILE *fp, char type); // The functions operate on ints such as are returned by fgetc(), From 88ccb034fca1f8e140f554cccc53d355f0a06da2 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 30 May 2022 18:34:13 +0100 Subject: [PATCH 277/488] Fix curl type warning on Windows with gcc 12.1 curl_easy_setopt(handle, CURLOPT_POSTFIELDSIZE, val) takes a long, but was being passed a size_t. This works on Linux where they're the same size for both 64 and 32-bit platforms, but not on 64-bit Windows which has 32-bit longs and 64-bit size_t. Casting the value to long should be fine. The POST data is for the completion message which should not get that big due to the limit of 10000 parts when uploading data. 
--- hfile_s3_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hfile_s3_write.c b/hfile_s3_write.c index eec56696b..d54945839 100644 --- a/hfile_s3_write.c +++ b/hfile_s3_write.c @@ -321,7 +321,7 @@ static int complete_upload(hFILE_s3_write *fp, kstring_t *resp) { curl_easy_reset(fp->curl); curl_easy_setopt(fp->curl, CURLOPT_POST, 1L); curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDS, fp->completion_message.s); - curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDSIZE, fp->completion_message.l); + curl_easy_setopt(fp->curl, CURLOPT_POSTFIELDSIZE, (long) fp->completion_message.l); curl_easy_setopt(fp->curl, CURLOPT_WRITEFUNCTION, response_callback); curl_easy_setopt(fp->curl, CURLOPT_WRITEDATA, (void *)resp); curl_easy_setopt(fp->curl, CURLOPT_URL, url.s); From 7fef9d3acaf4ea51dcd6a9fd74742e17c90ec44a Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 27 May 2022 12:19:38 +0100 Subject: [PATCH 278/488] Add an "sclen" expression keyword. This is the total length of soft-clips at both the left and right ends. It may be combined with qlen (qlen-sclen) to obtain the number of bases in the query sequence that have been aligned to the genome. I.e. it provides a way to compare local-alignment vs global-alignment length.
Fixes #1436 --- sam.c | 27 +++++++++++++++++++++++++++ test/sam_filter/filter.tst | 3 +++ test/sam_filter/func5.out | 5 +++++ test/sam_filter/func6.out | 2 ++ test/sam_filter/func7.out | 3 +++ 5 files changed, 40 insertions(+) create mode 100644 test/sam_filter/func5.out create mode 100644 test/sam_filter/func6.out create mode 100644 test/sam_filter/func7.out diff --git a/sam.c b/sam.c index 865b55fa6..64c08a43a 100644 --- a/sam.c +++ b/sam.c @@ -1354,6 +1354,33 @@ static int bam_sym_lookup(void *data, char *str, char **end, res->s.l = b->core.l_qseq; res->is_str = 1; return 0; + } else if (memcmp(str, "sclen", 5) == 0) { + int sclen = 0; + uint32_t *cigar = bam_get_cigar(b); + int ncigar = b->core.n_cigar; + int left = 0; + + // left + if (ncigar > 0 + && bam_cigar_op(cigar[0]) == BAM_CSOFT_CLIP) + left = 0, sclen += bam_cigar_oplen(cigar[0]); + else if (ncigar > 1 + && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP + && bam_cigar_op(cigar[1]) == BAM_CSOFT_CLIP) + left = 1, sclen += bam_cigar_oplen(cigar[1]); + + // right + if (ncigar-1 > left + && bam_cigar_op(cigar[ncigar-1]) == BAM_CSOFT_CLIP) + sclen += bam_cigar_oplen(cigar[ncigar-1]); + else if (ncigar-2 > left + && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP + && bam_cigar_op(cigar[ncigar-2]) == BAM_CSOFT_CLIP) + sclen += bam_cigar_oplen(cigar[ncigar-2]); + + *end = str+5; + res->d = sclen; + return 0; } break; diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst index effb77a26..129516b24 100644 --- a/test/sam_filter/filter.tst +++ b/test/sam_filter/filter.tst @@ -53,3 +53,6 @@ P func1.out $tv -i 'filter=length(seq) != qlen' ../ce#5b.sam | egrep -cv '^@' P func2.out $tv -i 'filter=min(qual) >= 20' ../ce#1000.sam | egrep -cv '^@' P func3.out $tv -i 'filter=max(qual) <= 20' ../ce#1000.sam | egrep -cv '^@' P func4.out $tv -i 'filter=avg(qual) >= 20 && avg(qual) <= 30' ../ce#1000.sam | egrep -cv '^@' +P func5.out $tv -i 'filter=sclen>=20' ../realn02.sam | egrep -v '^@' +P func6.out $tv -i 
'filter=rlen<50' ../realn02.sam | egrep -v '^@' +P func7.out $tv -i 'filter=qlen>100' ../realn02.sam | egrep -v '^@' diff --git a/test/sam_filter/func5.out b/test/sam_filter/func5.out new file mode 100644 index 000000000..6c2e2bc64 --- /dev/null +++ b/test/sam_filter/func5.out @@ -0,0 +1,5 @@ +ERR013140.3521432 99 17 1 29 22S86M = 226 313 AGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGACAATTGCCTTGTCCCTGCTGAATGTGCTCTGGGGTCTCTGGGGTCTCA @AEDGBHIIIIIFJGIKHGHIJJJEJKHJKJKGKLLIFHKLLCJJIDEFFHKHEHHJIIIDJEEEJEIKGJIHCGKHFKFE9BBDIAJAHF4?DE@I:DD48(86D=> MD:Z:86 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR156632.12704932 163 17 1 29 36S64M = 195 293 TGGAGAAGGGGACAAGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGACAATTGCCTTGTCCCTGCTGAATGTG BFAFGFEIGFEFHHEIDKJGHHHJIIE=@KKGGKJGIBLLMFKMDIIHJKKHFELLLKFIHMHIHHIHLKJFCHFJIJAID=JHKFGHJIHKKCH:@HD? MD:Z:64 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR156632.9601178 99 17 1 29 62S38M = 279 377 CTATGACAGGGAGGTCATGTGCAGGCTGGAGAAGGGGACAAGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGA DEEEIIHHKIJILKHLHIKEKHHMKLKKJGKKKKLKLFIHEKIKL=KLJLKIILHKMH9LJJJJLHLHJJKJJKMLKJD>MJKLEHIGHIH=FFCHF>BE MD:Z:38 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR013140.13475139 99 17 2401 60 88M20S = 2680 386 AAATACAAAAAACAACTAGCCAGGCGTGGTGGTGCACACCTGTAGTCCCAGCTACTCAGGAGGCTGAGGGGGAAGGACTGCTTGAGCCCAGGCGTTTGAGGCTGCTGT @CEBEEIHHHICFJIFKGHIKJHII>DBC:CE>A8C>C>7DBA=BEDDB4=9;:@=;@D@@=B@E.3?972<>6@8=>?1$0:95%5%*1=8;0%4<228% X0:i:1 X1:i:0 XC:i:88 MD:Z:88 RG:Z:rg AM:i:37 NM:i:0 SM:i:37 MQ:i:60 XT:A:U +ERR013140.23480670 133 17 3771 0 35M73S = 3771 0 TTCTCATCAATCCCTCATCTCTTATAACCATTTCGGTCCTTTCGGCCCTACAGCCACCTTGTTTATACTTGGTAAGACCCACACCACTCGCCAACTTACTCTACTCCC 8+7?5>09:),/%81,$,7<+?)+1+*+),3%5+)#%(4B%$&'%'/*@,)*%%&,%(/0%-&$$*$-,$3*.%/$:%$+.$*%&+.,.%%,%(%7(-.-',1*6%&$ XC:i:35 RG:Z:rg diff --git a/test/sam_filter/func6.out b/test/sam_filter/func6.out new file mode 100644 index 000000000..de091ed96 --- /dev/null +++ 
b/test/sam_filter/func6.out @@ -0,0 +1,2 @@ +ERR156632.9601178 99 17 1 29 62S38M = 279 377 CTATGACAGGGAGGTCATGTGCAGGCTGGAGAAGGGGACAAGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGA DEEEIIHHKIJILKHLHIKEKHHMKLKKJGKKKKLKLFIHEKIKL=KLJLKIILHKMH9LJJJJLHLHJJKJJKMLKJD>MJKLEHIGHIH=FFCHF>BE MD:Z:38 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR013140.23480670 133 17 3771 0 35M73S = 3771 0 TTCTCATCAATCCCTCATCTCTTATAACCATTTCGGTCCTTTCGGCCCTACAGCCACCTTGTTTATACTTGGTAAGACCCACACCACTCGCCAACTTACTCTACTCCC 8+7?5>09:),/%81,$,7<+?)+1+*+),3%5+)#%(4B%$&'%'/*@,)*%%&,%(/0%-&$$*$-,$3*.%/$:%$+.$*%&+.,.%%,%(%7(-.-',1*6%&$ XC:i:35 RG:Z:rg diff --git a/test/sam_filter/func7.out b/test/sam_filter/func7.out new file mode 100644 index 000000000..1fe2500bf --- /dev/null +++ b/test/sam_filter/func7.out @@ -0,0 +1,3 @@ +ERR013140.3521432 99 17 1 29 22S86M = 226 313 AGAGGTCCCCAACTTCTTTGCAAAGCTTCTCACCCTGTTCCTGCATAGATAATTGCATGACAATTGCCTTGTCCCTGCTGAATGTGCTCTGGGGTCTCTGGGGTCTCA @AEDGBHIIIIIFJGIKHGHIJJJEJKHJKJKGKLLIFHKLLCJJIDEFFHKHEHHJIIIDJEEEJEIKGJIHCGKHFKFE9BBDIAJAHF4?DE@I:DD48(86D=> MD:Z:86 RG:Z:rg AM:i:29 NM:i:0 SM:i:29 MQ:i:29 XT:A:M +ERR013140.13475139 99 17 2401 60 88M20S = 2680 386 AAATACAAAAAACAACTAGCCAGGCGTGGTGGTGCACACCTGTAGTCCCAGCTACTCAGGAGGCTGAGGGGGAAGGACTGCTTGAGCCCAGGCGTTTGAGGCTGCTGT @CEBEEIHHHICFJIFKGHIKJHII>DBC:CE>A8C>C>7DBA=BEDDB4=9;:@=;@D@@=B@E.3?972<>6@8=>?1$0:95%5%*1=8;0%4<228% X0:i:1 X1:i:0 XC:i:88 MD:Z:88 RG:Z:rg AM:i:37 NM:i:0 SM:i:37 MQ:i:60 XT:A:U +ERR013140.23480670 133 17 3771 0 35M73S = 3771 0 TTCTCATCAATCCCTCATCTCTTATAACCATTTCGGTCCTTTCGGCCCTACAGCCACCTTGTTTATACTTGGTAAGACCCACACCACTCGCCAACTTACTCTACTCCC 8+7?5>09:),/%81,$,7<+?)+1+*+),3%5+)#%(4B%$&'%'/*@,)*%%&,%(/0%-&$$*$-,$3*.%/$:%$+.$*%&+.,.%%,%(%7(-.-',1*6%&$ XC:i:35 RG:Z:rg From 33ff2bcc49ddc47b7eb9ced7bf06446b1984746e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 4 May 2022 16:45:54 +0100 Subject: [PATCH 279/488] Add an embed_ref=2 mode which starts with ref=N and edits on the fly. 
We compute reference from seq + cigar + MD tag. This gets used immediately as it's part of the reference-based encoding in process_one_read. If we have inconsistent records where the inferred reference differs, then we would end up with an invalid embedded reference and decode errors. Instead we just accept the first one as we cannot go back and correct earlier mistakes. It's a little tricky when the sequence may contain "N" characters, as that's also the initial value of unassigned reference, so we have to be careful there. This also opens the way for handling CRAM files without any MD tags. In that scenario we just assume there are no SNPs and create a fake MD tag (just numbers merging match/mismatch and ^N* for deletions). So we can now do embedded reference on files with no reference and no MD. It's not quite consensus (which would be better), but that would require a two-pass encoding strategy in place of process_one_read. So it's a bit poor on high-error technologies. Note, when embedding an auto-generated reference where MD tags may be absent, the auto-decoded MD may be incorrect as the reference we have embedded is now the sequence as-is rather than the real sequence. --- cram/cram_encode.c | 254 +++++++++++++++++++++++++++++++++++++++++--- cram/cram_io.c | 8 +- cram/cram_structs.h | 2 + 3 files changed, 250 insertions(+), 14 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index d35643a92..828ce089a 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include #include #include +#include #include "cram.h" #include "os.h" @@ -1438,19 +1439,29 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { goto_err; bam_seq_t *b = c->bams[0]; - char *ref = cram_get_ref(fd, bam_ref(b), 1, 0); - if (!ref && bam_ref(b) >= 0) { - hts_log_error("Failed to load reference #%d", bam_ref(b)); - return -1; - } - if ((c->ref_id = bam_ref(b)) >= 0) { - c->ref_seq_id = c->ref_id; - c->ref = fd->refs->ref_id[c->ref_seq_id]->seq; - c->ref_start = 1; - c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; + if (fd->embed_ref <= 1) { + char *ref = cram_get_ref(fd, bam_ref(b), 1, 0); + if (!ref && bam_ref(b) >= 0) { + hts_log_error("Failed to load reference #%d", bam_ref(b)); + return -1; + } + if ((c->ref_id = bam_ref(b)) >= 0) { + c->ref_seq_id = c->ref_id; + c->ref = fd->refs->ref_id[c->ref_seq_id]->seq; + c->ref_start = 1; + c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; + } } else { - c->ref_seq_id = c->ref_id; // FIXME remove one var! + // Auto-embed ref. + // This starts as 'N' and is amended on-the-fly as we go + // based on MD:Z tags. + if ((c->ref_id = bam_ref(b)) >= 0) { + c->ref_free = 1; + if (c->ref) abort(); + c->ref = NULL; + } } + c->ref_seq_id = c->ref_id; } else { c->ref_id = bam_ref(c->bams[0]); cram_ref_incr(fd->refs, c->ref_id); @@ -2722,6 +2733,216 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { return c; } +// Returns the next cigar op code: one of the BAM_C* codes, +// or -1 if no more are present. 
+static inline +int next_cigar_op(uint32_t *cigar, int *ncigar, int *skip, int *spos, + uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) { + for(;;) { + while (*cig_len == 0) { + if (*cig_ind < *ncigar) { + *cig_op = cigar[*cig_ind] & BAM_CIGAR_MASK; + *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT; + (*cig_ind)++; + } else { + return -1; + } + } + + if (skip[*cig_op]) { + *spos += (bam_cigar_type(*cig_op)&1) * *cig_len; + *cig_len = 0; + continue; + } + + (*cig_len)--; + break; + } + + return *cig_op; +} + +// Set a base in the computed reference. +// As we fill this out record by record as we go, and we encode the +// sequence against the reference we've computed so far, once we set a +// reference is must never change. So if the reference inferred by one +// SEQ+MD differs to the reference inferred by another SEQ+MD, the latter +// is warned about and the reference remains unchanged to ensure +// round-trips. +// +// In order to spot N->N->G type edits, where "N" and "G" are two inferred +// ref from two sequences, we use N->n->G and patch up the lowercase n later. +// Similarly where the data is unvalidated (faked up MD tag) also get +// assigned lowercase letters. This prevents false warnings when mixing data +// with and without MD tags. +static inline void assign_ref(char *ref, char *set, int pos, + unsigned char base, int validate) { + base = base & ~0x20; // fast toupper for ASCII +#if 1 + if (!set[pos] || ref[pos] == base) { + ref[pos] = base; + set[pos] = 1; + } +#else + // Optional reporting. It's 7% additional CPU cost in process_one_read, + // and maybe not appropriate anyway given there's nothing we can do to + // correct this either than ignore it. It'd need update to explain the + // position too. + if (!set[pos] || ref[pos] == base) { + ref[pos] = base; + set[pos] = validate ? 
1 : 2; // actual MD:Z or guesswork/fake + } else if (validate && set[pos] == 1) { + hts_log_warning("Incompatible MD:Z tags between records"); + } +#endif +} + +static int cram_extend_ref(cram_container *c, bam1_t *b) { + hts_pos_t end = bam_endpos(b); + + if (!c->ref) + c->ref_start = b->core.pos+1; + //c->ref_start = 1; // FIXME, needs to be b->core.pos, but fails + + if (end >= c->ref_end) { + hts_pos_t old_end = c->ref ? c->ref_end : c->ref_start; + c->ref_end = end + 1000 + (end - c->ref_start)*1.5; + + char *r = realloc(c->ref, c->ref_end+1 - c->ref_start); + if (!r) return -1; + c->ref = r; + + r = realloc(c->ref_set, c->ref_end+1 - c->ref_start); + if (!r) return -1; + c->ref_set = r; + + memset(c->ref + old_end - c->ref_start, 'N', c->ref_end - old_end); + memset(c->ref_set + old_end - c->ref_start, 0, c->ref_end - old_end); + } + + return 0; +} + +// Converts a bam object with SEQ, POS/CIGAR and MD:Z to a reference. +// Updates ref[] array. +// +// Returns >0 on success, +// 0 on no-MD found, +// -1 on failure (eg inconsistent data) +static int cram_build_ref(bam1_t *b, const uint8_t *MD, + char *ref, char *ref_set, + hts_pos_t ref_start, hts_pos_t ref_len) { + uint8_t *seq = bam_get_seq(b); + uint32_t *cigar = bam_get_cigar(b); + int ncigar = b->core.n_cigar; + uint32_t cig_op = 0, cig_len = 0, cig_ind = 0; + kstring_t fake_MD = KS_INITIALIZE; + int validate = 1; + + if (!MD || *MD != 'Z') { + // Fake it! 
+ int i, err = 0; + int run_len = 0; + for (i = 0; i < ncigar; i++) { + switch(cigar[i] & BAM_CIGAR_MASK) { + case BAM_CMATCH: + case BAM_CEQUAL: + case BAM_CDIFF: + run_len += cigar[i] >> BAM_CIGAR_SHIFT; + break; + + case BAM_CDEL: + err |= ksprintf(&fake_MD, "%d", run_len) < 0; + run_len = 0; + err |= kputc('^', &fake_MD); + for (int j = 0; j < cigar[i] >> BAM_CIGAR_SHIFT; j++) + err |= kputc('N', &fake_MD); + break; + } + } + if (run_len) + err |= ksprintf(&fake_MD, "%d", run_len) < 0; + MD = (uint8_t *)fake_MD.s; + if (err < 0) + return -1; + + validate = 0; + } else { + MD++; + } + + // Walk through MD + seq to generate ref + int iseq = 0, next_op; + hts_pos_t iref = b->core.pos+1 - ref_start; + int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1}; + while (iseq < b->core.l_qseq && MD && *MD) { + if (isdigit(*MD)) { + // match + int len = strtol((char *)MD, (char **)&MD, 10); + while (iseq < b->core.l_qseq && len) { + if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CMATCH && + next_op != BAM_CEQUAL) { + hts_log_warning("MD:Z and CIGAR are incompatible for " + "record %s", bam_get_qname(b)); + return -1; + } + + if (iref < ref_len) + assign_ref(ref, ref_set, iref, + seq_nt16_str[bam_seqi(seq, iseq)], validate); + iseq++; + iref++; + len--; + } + } else if (*MD == '^') { + // deletion + MD++; + while (*MD && isalpha(*MD)) { + if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CDEL) { + hts_log_warning("MD:Z and CIGAR are incompatible"); + return -1; + } + + if (iref < ref_len) + assign_ref(ref, ref_set, iref, toupper(*MD), validate); + + MD++; + iref++; + } + } else { + // substitution + if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) { + hts_log_warning("MD:Z and CIGAR 
are incompatible"); + return -1; + } + if (iref < ref_len) + assign_ref(ref, ref_set, iref, toupper(*MD), validate); + + MD++; + iref++; + iseq++; + } + } + + ks_free(&fake_MD); + return 1; +} + /* * Converts a single bam record into a cram record. * Possibly used within a thread. @@ -2746,16 +2967,23 @@ static int process_one_read(cram_fd *fd, cram_container *c, // FIXME: multi-ref containers - ref = c->ref; cr->flags = bam_flag(b); cr->len = bam_seq_len(b); - if (!bam_aux_get(b, "MD")) + uint8_t *md; + if (!(md = bam_aux_get(b, "MD"))) MD = NULL; else MD->l = 0; + if (/*md &&*/ fd->embed_ref == 2) { + // Auto-generate and embed ref + cram_extend_ref(c, b); + cram_build_ref(b, md, c->ref, c->ref_set, c->ref_start, c->ref_end); + } + //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg); + ref = c->ref ? c->ref - (c->ref_start-1) : NULL; cr->ref_id = bam_ref(b); if (cram_stats_add(c->stats[DS_RI], cr->ref_id) < 0) goto block_err; diff --git a/cram/cram_io.c b/cram/cram_io.c index 60a568b7b..e0d203469 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -3639,6 +3639,7 @@ cram_container *cram_new_container(int nrec, int nslice) { if (!(c->tags_used = kh_init(m_tagmap))) goto err; c->refs_used = 0; + c->ref_free = 0; return c; @@ -3711,6 +3712,11 @@ void cram_free_container(cram_container *c) { kh_destroy(m_tagmap, c->tags_used); } + if (c->ref_free) { + free(c->ref); + free(c->ref_set); + } + free(c); } @@ -4820,7 +4826,7 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { } /* Fix M5 strings */ - if (fd->refs && !fd->no_ref) { + if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) { int i; for (i = 0; i < hdr->hrecs->nref; i++) { sam_hrec_type_t *ty; diff --git a/cram/cram_structs.h b/cram/cram_structs.h index ce27bc1a4..e03a34e11 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -473,6 +473,8 @@ struct cram_container { uint64_t s_num_bases; // number of bases in this slice uint32_t n_mapped; // Number of mapped reads + int ref_free; // whether 
'ref' is owned by us and must be freed. + char *ref_set; // same size as ref. Only set for auto embed_ref }; /* From 764edbd24adf20b938a7f70160802d4cc5846224 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 30 May 2022 14:27:26 +0100 Subject: [PATCH 280/488] Add a cF CRAM specific tag. This is used to indicate when MD and NM were not present and should not be regenerated during decode. Bit 1 is set when MD shouldn't be produced and bit 2 when NM shouldn't be. In both cases this tag is only created when embed_ref=2 and MD and/or NM is absent from the input data. In this scenario we cannot reproduce the reference from SEQ+MD and cannot therefore be certain that the value reproduced is correct. E.g. if the reference is produced by consensus alone, then MD is a diff of this read vs consensus and not this read vs the original reference used by the aligner. The cF tag is automatically stripped out during decode, but only with this version of htslib and above. Older versions will just emit a private-space aux tag (which hopefully is harmless except for the unlikely event of it clashing with another private name-space tool). --- cram/cram_decode.c | 18 ++++++++++++++++++ cram/cram_encode.c | 31 +++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 51f1b765c..1f8d60f12 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -2037,12 +2037,30 @@ static int cram_decode_aux(cram_fd *fd, m = map_find(c->comp_hdr->tag_encoding_map, tag_data, id); if (!m) return -1; + BLOCK_APPEND(s->aux_blk, (char *)tag_data, 3); if (!m->codec) return -1; r |= m->codec->decode(s, m->codec, blk, (char *)s->aux_blk, &out_sz); if (r) break; cr->aux_size += out_sz + 3; + + // cF CRAM flags.
+ if (TN[-3]=='c' && TN[-2]=='F' && TN[-1]=='C' && out_sz == 1) { + // Remove cF tag + uint8_t cF = BLOCK_END(s->aux_blk)[-1]; + BLOCK_SIZE(s->aux_blk) -= out_sz+3; + cr->aux_size -= out_sz+3; + + // bit 1 => don't auto-decode MD. + // Pretend MD is present verbatim, so we don't auto-generate + if ((cF & 1) && has_MD && *has_MD == 0) + *has_MD = 1; + + // bit 1 => don't auto-decode NM + if ((cF & 2) && has_NM && *has_NM == 0) + *has_NM = 1; + } } } diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 828ce089a..72f21c575 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -2229,7 +2229,7 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r, static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, cram_slice *s, cram_record *cr, int verbatim_NM, int verbatim_MD, - int NM, kstring_t *MD, + int NM, kstring_t *MD, int cf_tag, int *err) { char *aux, *orig, *rg = NULL; int aux_size = bam_get_l_aux(b); @@ -2242,6 +2242,24 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, orig = aux = (char *)bam_aux(b); + + // cF:i => Extra CRAM bit flags. + // 1: Don't auto-decode MD (may be invalid) + // 2: Don't auto-decode NM (may be invalid) + if (cf_tag && CRAM_MAJOR_VERS(fd->version) < 4) { + // Temporary copy of aux so we can ammend it. 
+ aux = malloc(aux_size+4); + if (!aux) + return NULL; + + memcpy(aux, orig, aux_size); + aux[aux_size++] = 'c'; + aux[aux_size++] = 'F'; + aux[aux_size++] = 'C'; + aux[aux_size++] = cf_tag; + orig = aux; + } + // Copy aux keys to td_b and aux values to slice aux blocks while (aux - orig < aux_size && aux[0] != 0) { int r; @@ -2604,11 +2622,16 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, if (cram_stats_add(c->stats[DS_TL], cr->TL) < 0) goto block_err; + if (orig != (char *)bam_aux(b)) + free(orig); + if (err) *err = 0; return rg; err: block_err: + if (orig != (char *)bam_aux(b)) + free(orig); return NULL; } @@ -2975,10 +2998,13 @@ static int process_one_read(cram_fd *fd, cram_container *c, else MD->l = 0; + int cf_tag = 0; if (/*md &&*/ fd->embed_ref == 2) { // Auto-generate and embed ref cram_extend_ref(c, b); cram_build_ref(b, md, c->ref, c->ref_set, c->ref_start, c->ref_end); + cf_tag = MD ? 0 : 1; // No MD + cf_tag |= bam_aux_get(b, "NM") ? 0 : 2; // No NM } //fprintf(stderr, "%s => %d\n", rg ? rg : "\"\"", cr->rg); @@ -3297,7 +3323,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags); int err = 0; - rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, &err); + rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, + cf_tag, &err); if (err) goto block_err; From e97816474e5facdb0ef23df8be7e446eea474f24 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 1 Jun 2022 14:43:13 +0100 Subject: [PATCH 281/488] Remove unneeded abort check in cram encoding. Also fixed a potential illegal memory access caused by the return value of cram_encode_aux, and added more belt and braces memory free requests for the new embed_ref=2 option. 
--- cram/cram_encode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 72f21c575..368f6e0b4 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1457,7 +1457,6 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // based on MD:Z tags. if ((c->ref_id = bam_ref(b)) >= 0) { c->ref_free = 1; - if (c->ref) abort(); c->ref = NULL; } } @@ -2626,7 +2625,9 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, free(orig); if (err) *err = 0; - return rg; + + // rg from within bam_aux, not rg from our aux copy. + return rg ? (char *)bam_aux(b) + (rg - orig) : NULL; err: block_err: @@ -2841,6 +2842,7 @@ static int cram_extend_ref(cram_container *c, bam1_t *b) { memset(c->ref + old_end - c->ref_start, 'N', c->ref_end - old_end); memset(c->ref_set + old_end - c->ref_start, 0, c->ref_end - old_end); + c->ref_free = 1; } return 0; From 2978708540a17a66fd83133ed4aca34faa2ff6bd Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 7 Jun 2022 18:22:00 +0100 Subject: [PATCH 282/488] Remove limit on returned size from fai_retrieve() This was probably a left-over from the transition to 64-bit positions in HTSlib. Having the limit in fai_retrieve() caused very long references to be truncated even though programs like `samtools faidx` should be able to support them (see issue samtools/samtools#1660 - samtools faidx fails to retrieve large scaffolds). The limit is useful for legacy faidx interfaces that return the size in an `int *`, so tests for sizes over INT_MAX have been applied to them. --- faidx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/faidx.c b/faidx.c index 4b25d3918..f3be5e57c 100644 --- a/faidx.c +++ b/faidx.c @@ -731,7 +731,7 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, } s[l] = '\0'; - *len = l < INT_MAX ? 
l : INT_MAX; + *len = l; return s; } @@ -784,7 +784,7 @@ char *fai_fetch(const faidx_t *fai, const char *str, int *len) { hts_pos_t len64; char *ret = fai_fetch64(fai, str, &len64); - *len = len64; // trunc + *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc return ret; } @@ -803,7 +803,7 @@ char *fai_fetchqual64(const faidx_t *fai, const char *str, hts_pos_t *len) { char *fai_fetchqual(const faidx_t *fai, const char *str, int *len) { hts_pos_t len64; char *ret = fai_fetchqual64(fai, str, &len64); - *len = len64; // trunc + *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc return ret; } @@ -876,7 +876,7 @@ char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p { hts_pos_t len64; char *ret = faidx_fetch_seq64(fai, c_name, p_beg_i, p_end_i, &len64); - *len = len64; // trunc + *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc return ret; } @@ -897,7 +897,7 @@ char *faidx_fetch_qual(const faidx_t *fai, const char *c_name, int p_beg_i, int { hts_pos_t len64; char *ret = faidx_fetch_qual64(fai, c_name, p_beg_i, p_end_i, &len64); - *len = len64; // trunc + *len = len64 < INT_MAX ? len64 : INT_MAX; // trunc return ret; } From f46597e7ec497e20903c423298bef61e83d8f6b4 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 29 Apr 2022 12:03:33 +0100 Subject: [PATCH 283/488] Expose a bit more of the CRAM API. This is to enable samtools cram2ref. 
--- cram/cram_external.c | 13 +++++++++++++ htslib/cram.h | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+) diff --git a/cram/cram_external.c b/cram/cram_external.c index 314826932..098accde9 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -188,6 +188,19 @@ int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr) { return hdr->num_blocks; } +int cram_slice_hdr_get_embed_ref_id(cram_block_slice_hdr *h) { + return h->ref_base_id; +} + +void cram_slice_hdr_get_coords(cram_block_slice_hdr *h, + int *refid, hts_pos_t *start, hts_pos_t *span) { + if (refid) + *refid = h->ref_seq_id; + if (start) + *start = h->ref_seq_start; + if (span) + *span = h->ref_seq_span; +} /* *----------------------------------------------------------------------------- diff --git a/htslib/cram.h b/htslib/cram.h index dab666345..afeeb3711 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -247,6 +247,46 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out, HTSLIB_EXPORT int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice); +/* + *----------------------------------------------------------------------------- + * cram slice interrogation + */ + +/* + * Returns the number of cram blocks within this slice. + */ +HTSLIB_EXPORT +int32_t cram_slice_hdr_get_num_blocks(cram_block_slice_hdr *hdr); + +/* + * Returns the block content_id for the block containing an embedded reference + * sequence. If none is present, -1 is returned. + */ +HTSLIB_EXPORT +int cram_slice_hdr_get_embed_ref_id(cram_block_slice_hdr *h); + +/* + * Returns slice reference ID, start and span (length) coordinates. + * Return parameters may be NULL in which case they are ignored. + */ +HTSLIB_EXPORT +void cram_slice_hdr_get_coords(cram_block_slice_hdr *h, + int *refid, hts_pos_t *start, hts_pos_t *span); + +/* + * Decodes a slice header from a cram block. + * Returns the opaque cram_block_slice_hdr pointer on success, + * NULL on failure. 
+ */ +HTSLIB_EXPORT +cram_block_slice_hdr *cram_decode_slice_header(cram_fd *fd, cram_block *b); + +/* + * Frees a cram_block_slice_hdr structure. + */ +HTSLIB_EXPORT +void cram_free_slice_header(cram_block_slice_hdr *hdr); + /* *----------------------------------------------------------------------------- * cram_io basics From fee3bbbe58a9bc4af716b1af4934c1f76faa8f56 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 9 Jun 2022 13:01:48 +0100 Subject: [PATCH 284/488] CVE-2020-36403 affected all older versions of HTSlib This issue was fixed in 1.11 by PRs #1044 and #1104. It was detected via fuzz testing (https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=24097) but the Reproducer Testcase also has an invalid `#CHROM` line which resulted in an error message in HTSlib versions <= 1.9. This error message masked the segfault caused by the actual issue, namely a VCF record whose in-memory representation requires more than 2GiB. A clean test case produces a segfault all the way back to HTSlib 1.0. --- NEWS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS b/NEWS index 75d9ce79e..53a0d3c34 100644 --- a/NEWS +++ b/NEWS @@ -723,7 +723,7 @@ Bug fixes * Fixed potential integer overflows in the VCF parser and ensured that the total length of FORMAT fields cannot go over 2Gbytes. [fuzz] (#1044, - #1104; latter is CVE-2020-36403 affecting HTSlib versions 1.10 to 1.10.2) + #1104; latter is CVE-2020-36403 affecting all HTSlib versions up to 1.10.2) * Download index files atomically in idx_test_and_fetch(). This prevents corruption when running parallel jobs on S3 files. Thanks to John Marshall. From 1109c8bce1248df001e1b3550a6b1bb58dfaa1c0 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 10 Jun 2022 13:20:51 +0100 Subject: [PATCH 285/488] Cap hts_getline() return value at INT_MAX On success, hts_getline() returns the length of the string read. 
Its return type is int, so when plain int is 32 bits, trouble ensues for very long lines exceeding 2GiB: the return value wraps to negative and is misinterpreted as error. Rather than changing the return type to e.g. ssize_t, clamp the return value for very long lines. In test/sam.c's test cases, check the return value is indeed the expected length. --- hts.c | 2 +- htslib/hts.h | 2 +- test/sam.c | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/hts.c b/hts.c index d06c10891..0e9257e76 100644 --- a/hts.c +++ b/hts.c @@ -1899,7 +1899,7 @@ int hts_getline(htsFile *fp, int delimiter, kstring_t *str) case no_compression: str->l = 0; ret = kgetline2(str, (kgets_func2 *) hgetln, fp->fp.hfile); - if (ret >= 0) ret = str->l; + if (ret >= 0) ret = (str->l <= INT_MAX)? (int) str->l : INT_MAX; else if (herrno(fp->fp.hfile)) ret = -2, errno = herrno(fp->fp.hfile); else ret = -1; break; diff --git a/htslib/hts.h b/htslib/hts.h index 801506b95..d354b2e2c 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -676,7 +676,7 @@ int hts_set_opt(htsFile *fp, enum hts_fmt_option opt, ...); @param fp The file handle @param delimiter Unused, but must be '\n' (or KS_SEP_LINE) @param str The line (not including the terminator) is written here - @return Length of the string read; + @return Length of the string read (capped at INT_MAX); -1 on end-of-file; <= -2 on error */ HTSLIB_EXPORT diff --git a/test/sam.c b/test/sam.c index cc5bfe77a..49d9210c3 100644 --- a/test/sam.c +++ b/test/sam.c @@ -1525,7 +1525,11 @@ static void test_text_file(const char *filename, int nexp) if (in) { kstring_t str = KS_INITIALIZE; int ret, n = 0; - while ((ret = hts_getline(in, '\n', &str)) >= 0) n++; + while ((ret = hts_getline(in, '\n', &str)) >= 0) { + size_t len = strlen(str.s); + n++; + if (ret != len) fail("hts_getline read length %d (expected %zu)", ret, len); + } if (ret != -1) fail("hts_getline got an error from %s", filename); if (n != nexp) fail("hts_getline read %d lines from %s 
(expected %d)", n, filename, nexp); From 41f9aaa19ece0f151d10e98423c173d3e95c9c66 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Mon, 13 Jun 2022 12:51:33 +0100 Subject: [PATCH 286/488] Detect ARM Neon support and only build appropriate SIMD object files Add test compilations to detect ARM Neon support to configure.ac and hts_probe_cc.sh. If compiler support is present, add rANS_static32x16pr_neon.c to $(HTSCODECS_SOURCES) in htscodecs_bundled.mk. Fixes #1450. In htscodecs_bundled.mk, only add rANS_static32x16pr_avx2.c et al to $(HTSCODECS_SOURCES) if the respective AVX2, AVX512, SSE4 support is present. As building these files already uses GNU Make-specific constructs and the $(HTS_CFLAGS_AVX2) variables are either empty or an option string, this is easily achieved via `$(if $(HTS_CFLAGS_AVX2),...)`. There is no compiler flag required for Neon, so invent HTS_HAVE_NEON and use it to control building rANS_static32x16pr_neon.c without adding any bespoke compilation options for it. --- config.mk.in | 1 + configure.ac | 15 +++++++++++++++ hts_probe_cc.sh | 15 +++++++++++++++ htscodecs_bundled.mk | 7 ++++--- 4 files changed, 35 insertions(+), 3 deletions(-) diff --git a/config.mk.in b/config.mk.in index 35392bf0d..82af49850 100644 --- a/config.mk.in +++ b/config.mk.in @@ -117,3 +117,4 @@ endif HTS_CFLAGS_AVX2 = @hts_cflags_avx2@ HTS_CFLAGS_AVX512 = @hts_cflags_avx512@ HTS_CFLAGS_SSE4 = @hts_cflags_sse4@ +HTS_HAVE_NEON = @hts_have_neon@ diff --git a/configure.ac b/configure.ac index c1578d6e4..a53c08fe3 100644 --- a/configure.ac +++ b/configure.ac @@ -127,6 +127,21 @@ AX_CHECK_COMPILE_FLAG([-mavx512f], [ return *((char *) &b); ]])]) +dnl Detect ARM Neon availability +AC_CACHE_CHECK([whether C compiler supports ARM Neon], [hts_cv_have_neon], [ + AC_COMPILE_IFELSE([ + AC_LANG_PROGRAM([[ + #include "arm_neon.h" + ]], [[ + int32x4_t a = vdupq_n_s32(1); + int32x4_t b = vaddq_s32(a, a); + return *((char *) &b); + ]])], [hts_cv_have_neon=yes], [hts_cv_have_neon=no])]) +if 
test "$hts_cv_have_neon" = yes; then + hts_have_neon=yes + AC_SUBST([hts_have_neon]) +fi + dnl Avoid chicken-and-egg problem where pkg-config supplies the dnl PKG_PROG_PKG_CONFIG macro, but we want to use it to check diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh index 0f6ddede8..905279099 100755 --- a/hts_probe_cc.sh +++ b/hts_probe_cc.sh @@ -95,4 +95,19 @@ if run_compiler "$FLAGS" ; then echo "HTS_CFLAGS_AVX512 = $FLAGS" fi +# Check for neon + +rm -f conftest.c +cat - <<'EOF' > conftest.c +#include "arm_neon.h" +int main(int argc, char **argv) { + int32x4_t a = vdupq_n_s32(1); + int32x4_t b = vaddq_s32(a, a); + return *((char *) &b); +} +EOF +if run_compiler "" ; then + echo "HTS_HAVE_NEON = yes" +fi + rm -f conftest.c diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index 4a862f3d1..64192f085 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -28,9 +28,10 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ $(HTSPREFIX)htscodecs/htscodecs/htscodecs.c \ $(HTSPREFIX)htscodecs/htscodecs/pack.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \ - $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx2.c \ - $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx512.c \ - $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_sse4.c \ + $(if $(HTS_CFLAGS_AVX2),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx2.c) \ + $(if $(HTS_CFLAGS_AVX512),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx512.c) \ + $(if $(HTS_CFLAGS_SSE4),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_sse4.c) \ + $(if $(HTS_HAVE_NEON),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_neon.c) \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \ $(HTSPREFIX)htscodecs/htscodecs/rle.c \ From 58d9f604a0b9de5be021c9e9a79d1dc5d5892503 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 13 Jun 2022 13:54:49 +0100 Subject: [PATCH 287/488] Add an ARM CI test Tries to be as strict as possible. 
Unfortunately Address Sanitizer appears to be very slow on this platform at the moment, so has been left out for now. It would be good to add it later should its performance improve. --- .cirrus.yml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/.cirrus.yml b/.cirrus.yml index 2740ce05e..79aa2f99b 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -140,6 +140,33 @@ rocky_task: << : *COMPILE << : *TEST +# Arm Linux +arm_ubuntu_task: + name: ubuntu-arm + arm_container: + image: ubuntu:latest + cpu: 2 + memory: 1G + + environment: + LC_ALL: C + CIRRUS_CLONE_DEPTH: 1 + DO_UNTRACKED_FILE_CHECK: yes + USE_CONFIG: yes + CFLAGS: -g -Wall -O3 -std=c99 -pedantic + + # NB: we could consider building a docker image with these + # preinstalled and specifying that instead, to speed up testing. + install_script: | + apt-get update + apt-get install -y --no-install-suggests --no-install-recommends \ + ca-certificates clang libc-dev make git autoconf automake \ + zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev \ + libdeflate-dev + + << : *COMPILE + << : *TEST + #-------------------------------------------------- # Task: macOS builds From 226c1a813bc5d0582f7e0b0bdb4b3ea9e3ee4ce4 Mon Sep 17 00:00:00 2001 From: Martin Pollard Date: Tue, 21 Jun 2022 18:06:57 +0100 Subject: [PATCH 288/488] Fix breakend detection and test bcf_set_variant_type (PR #1456) --- .gitignore | 1 + Makefile | 7 +- test/test-bcf_set_variant_type.c | 135 +++++++++++++++++++++++++++++++ test/test.pl | 15 ++++ vcf.c | 9 ++- 5 files changed, 165 insertions(+), 2 deletions(-) create mode 100644 test/test-bcf_set_variant_type.c diff --git a/.gitignore b/.gitignore index 1573a5bf7..527bc227f 100644 --- a/.gitignore +++ b/.gitignore @@ -56,6 +56,7 @@ shlib-exports-*.txt /test/tabix/FAIL* /test/test-bcf-sr /test/test-bcf-translate +/test/test-bcf_set_variant_type /test/test_bgzf /test/test_expr /test/test_index diff --git a/Makefile b/Makefile index c8e394830..540fa58f3 100644 
--- a/Makefile +++ b/Makefile @@ -94,7 +94,8 @@ BUILT_TEST_PROGRAMS = \ test/fuzz/hts_open_fuzzer.o \ test/test-bcf-translate \ test/test-parse-reg \ - test/test_introspection + test/test_introspection \ + test/test-bcf_set_variant_type BUILT_THRASH_PROGRAMS = \ test/thrash_threads1 \ @@ -649,6 +650,9 @@ test/test-bcf-translate: test/test-bcf-translate.o libhts.a test/test_introspection: test/test_introspection.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_introspection.o libhts.a $(LIBS) -lpthread +test/test-bcf_set_variant_type: test/test-bcf_set_variant_type.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test-bcf_set_variant_type.o libhts.a $(LIBS) -lpthread + # Extra tests for bundled htscodecs test_htscodecs_rans4x8: htscodecs/tests/rans4x8 cd htscodecs/tests && srcdir=. && export srcdir && ./rans4x8.test @@ -723,6 +727,7 @@ test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h) test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h) test/test-bcf-translate.o: test/test-bcf-translate.c config.h $(htslib_vcf_h) test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hfile_h) +test/test-bcf_set_variant_type.o: test/test-bcf_set_variant_type.c config.h $(htslib_hts_h) vcf.c test/thrash_threads1: test/thrash_threads1.o libhts.a diff --git a/test/test-bcf_set_variant_type.c b/test/test-bcf_set_variant_type.c new file mode 100644 index 000000000..fef212dbb --- /dev/null +++ b/test/test-bcf_set_variant_type.c @@ -0,0 +1,135 @@ +/* test/test-bcf_set_variant_type.c -- bcf_set_variant_type test harness. + + Copyright (C) 2022 Genome Research Ltd. 
+ + Author: Martin Pollard + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include + +#include + +#include "../htslib/hts.h" +#include "../vcf.c" + +void error(const char *format, ...) 
+{ + va_list ap; + va_start(ap, format); + vfprintf(stderr, format, ap); + va_end(ap); + if (strrchr(format, '\n') == NULL) fputc('\n', stderr); + exit(-1); +} + +static void test_bcf_set_variant_type() +{ + // Test SNVs + bcf_variant_t var1; + bcf_set_variant_type("A", "T", &var1); + if ( var1.type != VCF_SNP) + { + error("A -> T was not detected as a breakend"); + } + + // Test INDEL + bcf_variant_t var2a; + bcf_set_variant_type("A", "AA", &var2a); + if ( var2a.type != VCF_INDEL) + { + error("A -> AA was not detected as an INDEL"); + } + bcf_variant_t var2b; + bcf_set_variant_type("AA", "A", &var2b); + if ( var2b.type != VCF_INDEL) + { + error("AA -> A was not detected as a INDEL"); + } + + // Test breakends + bcf_variant_t var3a; + bcf_set_variant_type("N", "N]16:33625444]", &var3a); + if ( var3a.type != VCF_BND) + { + error("N]16:33625444] was not detected as a breakend"); + } + + bcf_variant_t var3b; + bcf_set_variant_type("N", "N[16:33625444[", &var3b); + if (var3b.type != VCF_BND) + { + error("N[16:33625444[ was not detected as a breakend"); + } + + bcf_variant_t var3c; + bcf_set_variant_type("N", "]16:33625444]N", &var3c); + if ( var3c.type != VCF_BND) + { + error("]16:33625444]N was not detected as a breakend"); + } + + bcf_variant_t var3d; + bcf_set_variant_type("N", "[16:33625444[N", &var3d); + if ( var3d.type != VCF_BND) + { + error("[16:33625444[N was not detected as a breakend"); + } + // Test special reference alleles + bcf_variant_t var4a; + bcf_set_variant_type("A", "", &var4a); + if ( var4a.type != VCF_REF) + { + error(" was not detected as a special reference allele"); + } + bcf_variant_t var4b; + bcf_set_variant_type("A", "<*>", &var4b); + if ( var4b.type != VCF_REF) + { + error("<*> was not detected as a special reference allele"); + } + // Test MNP + bcf_variant_t var5; + bcf_set_variant_type("AA", "TT", &var5); + if ( var5.type != VCF_MNP) + { + error("AA->TT was not detected as a MNP"); + } + // Test Overlapping allele + bcf_variant_t var6; 
+ bcf_set_variant_type("A", "*", &var6); + if ( var6.type != VCF_OVERLAP) + { + error("A->* was not detected as an overlap"); + } + // Test . + bcf_variant_t var7; + bcf_set_variant_type("A", ".", &var7); + if ( var7.type != VCF_REF) + { + error("A->. was not detected as a special reference allele"); + } +} + +int main(int argc, char **argv) +{ + test_bcf_set_variant_type(); + return 0; +} + diff --git a/test/test.pl b/test/test.pl index 7a396e22a..514f2508a 100755 --- a/test/test.pl +++ b/test/test.pl @@ -60,6 +60,7 @@ test_logging($opts); test_plugin_loading($opts); test_realn($opts); +test_bcf_set_variant_type($opts); print "\nNumber of tests:\n"; printf " total .. %d\n", $$opts{nok}+$$opts{nfailed}; @@ -1055,3 +1056,17 @@ sub test_realn { # Revert quality values (using data in ZQ tags) test_cmd($opts, cmd => "$test_realn -f $$opts{path}/realn02.fa -i $$opts{path}/realn02_exp-a.sam -o -", out => "realn02_exp.sam"); } + +sub test_bcf_set_variant_type +{ + my ($opts) = @_; + my $test = 'test-bcf_set_variant_type'; + my $cmd = "$$opts{path}/test-bcf_set_variant_type"; + print "$test:\n"; + print "\t$cmd\n"; + my ($ret,$out) = _cmd($cmd); + if ( $ret ) { + print $out; + failed($opts,$test); + } else { passed($opts,$test); } +} diff --git a/vcf.c b/vcf.c index e1d386c7a..ab2477861 100644 --- a/vcf.c +++ b/vcf.c @@ -4179,12 +4179,19 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t return; } + // Catch "joined before" breakend case + if ( alt[0]==']' || alt[0] == '[' ) + { + var->type = VCF_BND; return; + } + + // Iterate through alt characters that match the reference const char *r = ref, *a = alt; while (*r && *a && toupper_c(*r)==toupper_c(*a) ) { r++; a++; } // unfortunately, matching REF,ALT case is not guaranteed if ( *a && !*r ) { - if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } + if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend while ( *a ) a++; var->n = (a-alt)-(r-ref); 
var->type = VCF_INDEL; return; } From dd5ee808d58011e58a158c8863aa1e074498a061 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 1 Jul 2022 13:05:47 +0100 Subject: [PATCH 289/488] Don't use `register` in public header file The `register` storage class specifier has been removed in C++17. HTSlib itself is a C project so is unaffected, but this header may be used from third-party C++ projects. --- htslib/ksort.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/ksort.h b/htslib/ksort.h index 755010951..ad19fc47a 100644 --- a/htslib/ksort.h +++ b/htslib/ksort.h @@ -88,7 +88,7 @@ typedef struct { int depth; } ks_isort_stack_t; -#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } +#define KSORT_SWAP(type_t, a, b) { type_t t=(a); (a)=(b); (b)=t; } #define KSORT_INIT(name, type_t, __sort_lt) KSORT_INIT_(_ ## name, , type_t, __sort_lt) #define KSORT_INIT_STATIC(name, type_t, __sort_lt) KSORT_INIT_(_ ## name, static klib_unused, type_t, __sort_lt) From ca34d9e3e7e6e5dd4416fe3cca74cb4088d8888b Mon Sep 17 00:00:00 2001 From: Colin Diesh Date: Mon, 4 Jul 2022 10:07:23 -0600 Subject: [PATCH 290/488] Set tab delimiter in manpage for tabix GFF3 sort (PR #1457) This can help if there are spaces in the GFF3 file e.g. in column 2 or 3 --- tabix.1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tabix.1 b/tabix.1 index 2c012442c..6950593e5 100644 --- a/tabix.1 +++ b/tabix.1 @@ -169,7 +169,7 @@ The default is 3, which turns on error and warning messages; Values higher than 3 produce additional informational and debugging messages. 
.PP .SH EXAMPLE -(grep ^"#" in.gff; grep -v ^"#" in.gff | sort -k1,1 -k4,4n) | bgzip > sorted.gff.gz; +(grep "^#" in.gff; grep -v "^#" in.gff | sort -t"`printf '\(rst'`" -k1,1 -k4,4n) | bgzip > sorted.gff.gz; tabix -p gff sorted.gff.gz; From 185521aebda549487cee0124ca20907de2b0fcf5 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 6 Jul 2022 09:52:09 +0100 Subject: [PATCH 291/488] Improve error message when failing to load an index. If errno is non-zero we now use strerror to report the system error message when reporting on failure to load an index. See samtools/samtools#1637 for an example where we believe this would aid the user to diagnose the problem. --- hts.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hts.c b/hts.c index 0e9257e76..c2c4acb89 100644 --- a/hts.c +++ b/hts.c @@ -4588,7 +4588,9 @@ hts_idx_t *hts_idx_load3(const char *fn, const char *fnidx, int fmt, int flags) hts_idx_t *idx = idx_read(fnidx); if (!idx && !(flags & HTS_IDX_SILENT_FAIL)) - hts_log_error("Could not load local index file '%s'", fnidx); + hts_log_error("Could not load local index file '%s'%s%s", fnidx, + errno ? " : " : "", errno ? strerror(errno) : ""); + free(local_fnidx); From b0148048382a62113e08e9954dca7c008df3db63 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 12 Apr 2022 17:37:59 +0100 Subject: [PATCH 292/488] Make hfile_s3 refresh AWS credentials on expiry This is to make HTSlib work better with AWS IAM credentials, which have a limited lifespan, and so may need to be refreshed. To allow this, hfile_s3 is made to look for an unofficial 'expiry_time' entry in the AWS_SHARED_CREDENTIALS_FILE. If present, the file will be re-read if the current time is within one minute of the given expiry (new credentials are available five minutes before expiry, according to https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/iam-roles-for-amazon-ec2.html).
Currently no effort is made to understand the JSON format emitted by the AWS security-credentials endpoint. It's up to the end user to reformat this into the style of the normal '.aws/credentials' file. An example of how this can be done for one source of credentials on AWS is added to the manual page. Fixes bug where parse_ini would append to rather than replace existing values. Moves x-amz-security-token to the set of headers updated via callback, as it can now change when the credentials are updated. Includes an implementation of the timegm() function, which is not portable (e.g. mingw doesn't have it) but needed to convert the expiry time to a time_t. This is put in a separate header so that it can be more easily reused elsewhere if we want. Includes tests to check that details like leap years and normalisation work properly. --- .gitignore | 1 + Makefile | 9 +- hfile_s3.c | 214 ++++++++++++++++++++++++++++++----------- hts_time_funcs.h | 167 ++++++++++++++++++++++++++++++++ htslib-s3-plugin.7 | 80 ++++++++++++++- test/test_time_funcs.c | 122 +++++++++++++++++++++++ 6 files changed, 534 insertions(+), 59 deletions(-) create mode 100644 hts_time_funcs.h create mode 100644 test/test_time_funcs.c diff --git a/.gitignore b/.gitignore index 527bc227f..6b58e8439 100644 --- a/.gitignore +++ b/.gitignore @@ -68,6 +68,7 @@ shlib-exports-*.txt /test/test_realn /test/test-regidx /test/test_str2int +/test/test_time_funcs /test/test-vcf-api /test/test-vcf-sweep /test/test_view diff --git a/Makefile b/Makefile index 540fa58f3..621e8a11a 100644 --- a/Makefile +++ b/Makefile @@ -86,6 +86,7 @@ BUILT_TEST_PROGRAMS = \ test/test_realn \ test/test-regidx \ test/test_str2int \ + test/test_time_funcs \ test/test_view \ test/test_index \ test/test-vcf-api \ @@ -234,6 +235,7 @@ bcf_sr_sort_h = bcf_sr_sort.h $(htslib_synced_bcf_reader_h) $(htslib_kbitset_h) header_h = header.h cram/string_alloc.h cram/pooled_alloc.h $(htslib_khash_h) $(htslib_kstring_h) $(htslib_sam_h) hfile_internal_h 
= hfile_internal.h $(htslib_hts_defs_h) $(htslib_hfile_h) $(textutils_internal_h) hts_internal_h = hts_internal.h $(htslib_hts_h) $(textutils_internal_h) +hts_time_funcs_h = hts_time_funcs.h config.h sam_internal_h = sam_internal.h $(htslib_sam_h) textutils_internal_h = textutils_internal.h $(htslib_kstring_h) thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) @@ -421,7 +423,7 @@ hfile.o hfile.pico: hfile.c config.h $(htslib_hfile_h) $(hfile_internal_h) $(hts hfile_gcs.o hfile_gcs.pico: hfile_gcs.c config.h $(htslib_hts_h) $(htslib_kstring_h) $(hfile_internal_h) hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) -hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) +hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(hts_time_funcs_h) hts.o hts.pico: hts.c config.h os/lzma_stub.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_expr_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h) hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c @@ -563,6 +565,7 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) $(HTSCODEC test/test_kfunc test/test_kstring test/test_str2int + test/test_time_funcs test/fieldarith test/fieldarith.sam test/hfile HTS_PATH=. 
test/with-shlib.sh test/plugins-dlhts -g ./libhts.$(SHLIB_FLAVOUR) @@ -629,6 +632,9 @@ test/test-parse-reg: test/test-parse-reg.o libhts.a test/test_str2int: test/test_str2int.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_str2int.o libhts.a $(LIBS) -lpthread +test/test_time_funcs: test/test_time_funcs.o + $(CC) $(LDFLAGS) -o $@ test/test_time_funcs.o + test/test_view: test/test_view.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_view.o libhts.a $(LIBS) -lpthread @@ -720,6 +726,7 @@ test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_s test/test_realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(textutils_internal_h) test/test_str2int.o: test/test_str2int.c config.h $(textutils_internal_h) +test/test_time_funcs.o: test/test_time_funcs.c $(htslib_time_funcs_h) test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_hts_log_h) test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h) test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h) diff --git a/hfile_s3.c b/hfile_s3.c index e8e505e2a..c9bed1fe1 100644 --- a/hfile_s3.c +++ b/hfile_s3.c @@ -40,6 +40,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #endif #include "htslib/hts.h" // for hts_version() and hts_verbose #include "htslib/kstring.h" +#include "hts_time_funcs.h" typedef struct s3_auth_data { kstring_t id; @@ -49,6 +50,8 @@ typedef struct s3_auth_data { kstring_t canonical_query_string; kstring_t user_query_string; kstring_t host; + kstring_t profile; + time_t creds_expiry_time; char *bucket; kstring_t auth_hdr; time_t auth_time; @@ -57,11 +60,12 @@ typedef struct s3_auth_data { char date_short[9]; kstring_t date_html; char mode; - char *headers[4]; + char *headers[5]; int refcount; } s3_auth_data; -#define AUTH_LIFETIME 60 +#define AUTH_LIFETIME 60 // Regenerate auth headers if older than this +#define CREDENTIAL_LIFETIME 60 // Seconds before expiry to reread credentials #if defined HAVE_COMMONCRYPTO @@ -235,7 +239,10 @@ static void parse_ini(const char *fname, const char *section, ...) va_start(args, section); while ((akey = va_arg(args, const char *)) != NULL) { kstring_t *avar = va_arg(args, kstring_t *); - if (strcmp(key, akey) == 0) { kputs(value, avar); break; } + if (strcmp(key, akey) == 0) { + avar->l = 0; + kputs(value, avar); + break; } } va_end(args); } @@ -270,17 +277,37 @@ static void parse_simple(const char *fname, kstring_t *id, kstring_t *secret) static int copy_auth_headers(s3_auth_data *ad, char ***hdrs) { char **hdr = &ad->headers[0]; + int idx = 0; *hdrs = hdr; - *hdr = strdup(ad->date); - if (!*hdr) return -1; - hdr++; + + hdr[idx] = strdup(ad->date); + if (!hdr[idx]) return -1; + idx++; + + if (ad->token.l) { + kstring_t token_hdr = KS_INITIALIZE; + kputs("X-Amz-Security-Token: ", &token_hdr); + kputs(ad->token.s, &token_hdr); + if (token_hdr.s) { + hdr[idx++] = token_hdr.s; + } else { + goto fail; + } + } + if (ad->auth_hdr.l) { - *hdr = strdup(ad->auth_hdr.s); - if (!*hdr) { free(ad->headers[0]); return -1; } - hdr++; + hdr[idx] = strdup(ad->auth_hdr.s); + if (!hdr[idx]) goto fail; + idx++; } - *hdr = NULL; + + hdr[idx] = NULL; return 0; + + fail: + for (--idx; idx >= 0; 
--idx) + free(hdr[idx]); + return -1; } static void free_auth_data(s3_auth_data *ad) { @@ -288,6 +315,7 @@ static void free_auth_data(s3_auth_data *ad) { --ad->refcount; return; } + free(ad->profile.s); free(ad->id.s); free(ad->token.s); free(ad->secret.s); @@ -301,6 +329,67 @@ static void free_auth_data(s3_auth_data *ad) { free(ad); } +static time_t parse_rfc3339_date(kstring_t *datetime) +{ + int offset = 0; + time_t when; + int num; + char should_be_t = '\0', timezone[10] = { '\0' }; + unsigned int year, mon, day, hour, min, sec; + + if (!datetime->s) + return 0; + + // It should be possible to do this with strptime(), but it seems + // to not get on with our feature definitions. + num = sscanf(datetime->s, "%4u-%2u-%2u%c%2u:%2u:%2u%9s", + &year, &mon, &day, &should_be_t, &hour, &min, &sec, timezone); + if (num < 8) + return 0; + if (should_be_t != 'T' && should_be_t != 't' && should_be_t != ' ') + return 0; + struct tm parsed = { sec, min, hour, day, mon - 1, year - 1900, 0, 0, 0 }; + + switch (timezone[0]) { + case 'Z': + case 'z': + case '\0': + break; + case '+': + case '-': { + unsigned hr_off, min_off; + if (sscanf(timezone + 1, "%2u:%2u", &hr_off, &min_off)) { + if (hr_off < 24 && min_off <= 60) { + offset = ((hr_off * 60 + min_off) + * (timezone[0] == '+' ? -60 : 60)); + } + } + break; + } + default: + return 0; + } + + when = hts_time_gm(&parsed); + return when >= 0 ? when + offset : 0; +} + +static void refresh_auth_data(s3_auth_data *ad) { + // Basically a copy of the AWS_SHARED_CREDENTIALS_FILE part of + // setup_auth_data(), but this only reads the authorisation parts. + const char *v = getenv("AWS_SHARED_CREDENTIALS_FILE"); + kstring_t expiry_time = KS_INITIALIZE; + parse_ini(v? 
v : "~/.aws/credentials", ad->profile.s, + "aws_access_key_id", &ad->id, + "aws_secret_access_key", &ad->secret, + "aws_session_token", &ad->token, + "expiry_time", &expiry_time); + if (expiry_time.l) { + ad->creds_expiry_time = parse_rfc3339_date(&expiry_time); + } + ks_free(&expiry_time); +} + static int auth_header_callback(void *ctx, char ***hdrs) { s3_auth_data *ad = (s3_auth_data *) ctx; @@ -320,7 +409,10 @@ static int auth_header_callback(void *ctx, char ***hdrs) { return 0; } - if (now - ad->auth_time < AUTH_LIFETIME) { + if (ad->creds_expiry_time > 0 + && ad->creds_expiry_time - now < CREDENTIAL_LIFETIME) { + refresh_auth_data(ad); + } else if (now - ad->auth_time < AUTH_LIFETIME) { // Last auth string should still be valid *hdrs = NULL; return 0; @@ -499,7 +591,6 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, s3_auth_data *ad = calloc(1, sizeof(*ad)); const char *bucket, *path; char *escaped = NULL; - kstring_t profile = { 0, 0, NULL }; size_t url_path_pos; ptrdiff_t bucket_len; int is_https = 1, dns_compliant; @@ -532,7 +623,7 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, if (*path == '@') { const char *colon = strpbrk(bucket, ":@"); if (*colon != ':') { - urldecode_kput(bucket, colon - bucket, &profile); + urldecode_kput(bucket, colon - bucket, &ad->profile); } else { const char *colon2 = strpbrk(&colon[1], ":@"); @@ -554,9 +645,9 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, if ((v = getenv("AWS_DEFAULT_REGION")) != NULL) kputs(v, &ad->region); if ((v = getenv("HTS_S3_HOST")) != NULL) kputs(v, &ad->host); - if ((v = getenv("AWS_DEFAULT_PROFILE")) != NULL) kputs(v, &profile); - else if ((v = getenv("AWS_PROFILE")) != NULL) kputs(v, &profile); - else kputs("default", &profile); + if ((v = getenv("AWS_DEFAULT_PROFILE")) != NULL) kputs(v, &ad->profile); + else if ((v = getenv("AWS_PROFILE")) != NULL) kputs(v, &ad->profile); + else kputs("default", &ad->profile); 
if ((v = getenv("HTS_S3_ADDRESS_STYLE")) != NULL) { if (strcasecmp(v, "virtual") == 0) { @@ -569,13 +660,15 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, if (ad->id.l == 0) { kstring_t url_style = KS_INITIALIZE; + kstring_t expiry_time = KS_INITIALIZE; const char *v = getenv("AWS_SHARED_CREDENTIALS_FILE"); - parse_ini(v? v : "~/.aws/credentials", profile.s, + parse_ini(v? v : "~/.aws/credentials", ad->profile.s, "aws_access_key_id", &ad->id, "aws_secret_access_key", &ad->secret, "aws_session_token", &ad->token, "region", &ad->region, "addressing_style", &url_style, + "expiry_time", &expiry_time, NULL); if (url_style.l) { @@ -587,14 +680,23 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, address_style = s3_auto; } } + if (expiry_time.l) { + // Not a real part of the AWS configuration file, but it allows + // support for short-term credentials like those for the IAM + // service. The botocore library uses the key "expiry_time" + // internally for this purpose. + // See https://github.com/boto/botocore/blob/develop/botocore/credentials.py + ad->creds_expiry_time = parse_rfc3339_date(&expiry_time); + } ks_free(&url_style); + ks_free(&expiry_time); } if (ad->id.l == 0) { kstring_t url_style = KS_INITIALIZE; const char *v = getenv("HTS_S3_S3CFG"); - parse_ini(v? v : "~/.s3cfg", profile.s, "access_key", &ad->id, + parse_ini(v? 
v : "~/.s3cfg", ad->profile.s, "access_key", &ad->id, "secret_key", &ad->secret, "access_token", &ad->token, "host_base", &ad->host, "bucket_location", &ad->region, @@ -699,13 +801,11 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, *query_start = 0; } - free(profile.s); free(escaped); return ad; error: - free(profile.s); free(escaped); free_auth_data(ad); return NULL; @@ -713,23 +813,13 @@ static s3_auth_data * setup_auth_data(const char *s3url, const char *mode, static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp) { - char *header_list[4], **header = header_list; - kstring_t url = { 0, 0, NULL }; - kstring_t token_hdr = { 0, 0, NULL }; s3_auth_data *ad = setup_auth_data(s3url, mode, 2, &url); if (!ad) return NULL; - if (ad->token.l > 0) { - kputs("X-Amz-Security-Token: ", &token_hdr); - kputs(ad->token.s, &token_hdr); - *header++ = token_hdr.s; - } - - *header = NULL; - hFILE *fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list, + hFILE *fp = hopen(url.s, mode, "va_list", argsp, "httphdr_callback", auth_header_callback, "httphdr_callback_data", ad, "redirect_callback", redirect_endpoint_callback, @@ -738,12 +828,10 @@ static hFILE * s3_rewrite(const char *s3url, const char *mode, va_list *argsp) if (!fp) goto fail; free(url.s); - free(token_hdr.s); return fp; fail: free(url.s); - free(token_hdr.s); free_auth_data(ad); return NULL; } @@ -895,9 +983,8 @@ static int make_authorisation(s3_auth_data *ad, char *http_request, char *conten } -static int update_time(s3_auth_data *ad) { +static int update_time(s3_auth_data *ad, time_t now) { int ret = -1; - time_t now = time(NULL); #ifdef HAVE_GMTIME_R struct tm tm_buffer; struct tm *tm = gmtime_r(&now, &tm_buffer); @@ -988,6 +1075,7 @@ static int write_authorisation_callback(void *auth, char *request, kstring_t *co kstring_t *token, int uqs) { s3_auth_data *ad = (s3_auth_data *)auth; char content_hash[HASH_LENGTH_SHA256]; + time_t now; if (request == 
NULL) { // signal to free auth data @@ -995,9 +1083,15 @@ static int write_authorisation_callback(void *auth, char *request, kstring_t *co return 0; } - if (update_time(ad)) { + now = time(NULL); + + if (update_time(ad, now)) { return -1; } + if (ad->creds_expiry_time > 0 + && ad->creds_expiry_time - now < CREDENTIAL_LIFETIME) { + refresh_auth_data(ad); + } if (content) { hash_string(content->s, content->l, content_hash); @@ -1045,19 +1139,29 @@ static int write_authorisation_callback(void *auth, char *request, kstring_t *co static int v4_auth_header_callback(void *ctx, char ***hdrs) { s3_auth_data *ad = (s3_auth_data *) ctx; char content_hash[HASH_LENGTH_SHA256]; - kstring_t content = {0, 0, NULL}; - kstring_t authorisation = {0, 0, NULL}; + kstring_t content = KS_INITIALIZE; + kstring_t authorisation = KS_INITIALIZE; + kstring_t token_hdr = KS_INITIALIZE; char *date_html = NULL; + time_t now; + int idx; if (!hdrs) { // Closing connection free_auth_data(ad); return 0; } - if (update_time(ad)) { + now = time(NULL); + + if (update_time(ad, now)) { return -1; } + if (ad->creds_expiry_time > 0 + && ad->creds_expiry_time - now < CREDENTIAL_LIFETIME) { + refresh_auth_data(ad); + } + if (!ad->id.l || !ad->secret.l) { return copy_auth_headers(ad, hdrs); } @@ -1083,18 +1187,27 @@ static int v4_auth_header_callback(void *ctx, char ***hdrs) { ksprintf(&content, "x-amz-content-sha256: %s", content_hash); date_html = strdup(ad->date_html.s); + if (ad->token.l > 0) { + kputs("X-Amz-Security-Token: ", &token_hdr); + kputs(ad->token.s, &token_hdr); + } + if (content.l == 0 || date_html == NULL) { ksfree(&authorisation); ksfree(&content); + ksfree(&token_hdr); free(date_html); return -1; } *hdrs = &ad->headers[0]; - ad->headers[0] = ks_release(&authorisation); - ad->headers[1] = date_html; - ad->headers[2] = ks_release(&content); - ad->headers[3] = NULL; + idx = 0; + ad->headers[idx++] = ks_release(&authorisation); + ad->headers[idx++] = date_html; + ad->headers[idx++] = 
ks_release(&content); + if (token_hdr.s) + ad->headers[idx++] = ks_release(&token_hdr); + ad->headers[idx++] = NULL; return 0; } @@ -1167,9 +1280,7 @@ static int http_status_errno(int status) static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { kstring_t url = { 0, 0, NULL }; - kstring_t token_hdr = { 0, 0, NULL }; - char *header_list[4], **header = header_list; s3_auth_data *ad = setup_auth_data(s3url, mode, 4, &url); hFILE *fp = NULL; @@ -1180,14 +1291,7 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { if (ad->mode == 'r') { long http_response = 0; - if (ad->token.l > 0) { - kputs("x-amz-security-token: ", &token_hdr); - kputs(ad->token.s, &token_hdr); - *header++ = token_hdr.s; - } - - *header = NULL; - fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list, + fp = hopen(url.s, mode, "va_list", argsp, "httphdr_callback", v4_auth_header_callback, "httphdr_callback_data", ad, "redirect_callback", redirect_endpoint_callback, @@ -1204,7 +1308,7 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { goto error; } hclose_abruptly(fp); - fp = hopen(url.s, mode, "va_list", argsp, "httphdr:v", header_list, + fp = hopen(url.s, mode, "va_list", argsp, "httphdr_callback", v4_auth_header_callback, "httphdr_callback_data", ad, "redirect_callback", redirect_endpoint_callback, @@ -1237,7 +1341,6 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { } free(url.s); - free(token_hdr.s); return fp; @@ -1245,7 +1348,6 @@ static hFILE *s3_open_v4(const char *s3url, const char *mode, va_list *argsp) { if (fp) hclose_abruptly(fp); free(url.s); - free(token_hdr.s); free_auth_data(ad); return NULL; diff --git a/hts_time_funcs.h b/hts_time_funcs.h new file mode 100644 index 000000000..bc3de14f8 --- /dev/null +++ b/hts_time_funcs.h @@ -0,0 +1,167 @@ +/* hts_time_funcs.h -- Implementations of non-standard time functions + + Copyright (C) 2022 Genome Research Ltd. 
+ + Author: Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +/* + This mainly exists because timegm() is not a standard function, and so + cannot be used in portable code. Unfortunately the standard one (mktime) + always takes the local timezone into account so doing a UTC conversion + with it involves changing the TZ environment variable, which is rather + messy and not likely to go well with threaded code. + + The code here is a much simplified version of the BSD timegm() implementation. + It currently rejects dates before 1970, avoiding problems with -ve time_t. + It also works strictly in UTC, so doesn't have to worry about tm_isdst + which makes the calculation much easier. + + Some of this is derived from BSD sources, for example + https://github.com/NetBSD/src/blob/trunk/lib/libc/time/localtime.c + which state: + + ** This file is in the public domain, so clarified as of + ** 1996-06-05 by Arthur David Olson. + + Non-derived code is copyright as above.
+*/ + +#include + +static inline int hts_time_normalise(int *tens, int *units, int base) { + if (*units < 0 || *units >= base) { + int delta = *units >= 0 ? *units / base : (-1 - (-1 - *units) / base); + int64_t tmp = (int64_t) (*tens) + delta; + if (tmp < INT_MIN || tmp > INT_MAX) return 1; + *tens = tmp; + *units -= delta * base; + } + return 0; +} + +static inline int hts_year_is_leap(int64_t year) { + return ((year % 4 == 0) && (year % 100 != 0)) || (year % 400 == 0); +} + +// Number of leap years to start of year +// Only works for year >= 1. +static inline int64_t hts_leaps_to_year_start(int64_t year) { + --year; + return year / 4 - year / 100 + year / 400; +} + +static inline int hts_time_normalise_tm(struct tm *t) +{ + const int days_per_mon[2][12] = { + { 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 }, + { 31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 } + }; + const int year_days[2] = { 365, 366 }; + int overflow = 0; + int64_t year; + + if (t->tm_sec > 62) { + overflow |= hts_time_normalise(&t->tm_min, &t->tm_sec, 60); + } + overflow |= hts_time_normalise(&t->tm_hour, &t->tm_min, 60); + overflow |= hts_time_normalise(&t->tm_mday, &t->tm_hour, 24); + overflow |= hts_time_normalise(&t->tm_year, &t->tm_mon, 12); + if (overflow) + return 1; + + year = (int64_t) t->tm_year + 1900LL; + while (t->tm_mday <= 0) { + --year; + t->tm_mday += year_days[hts_year_is_leap(year + (1 < t->tm_mon))]; + } + while (t->tm_mday > 366) { + t->tm_mday -= year_days[hts_year_is_leap(year + (1 < t->tm_mon))]; + ++year; + } + for (;;) { + int mdays = days_per_mon[hts_year_is_leap(year)][t->tm_mon]; + if (t->tm_mday <= mdays) + break; + t->tm_mday -= mdays; + t->tm_mon++; + if (t->tm_mon >= 12) { + year++; + t->tm_mon = 0; + } + } + year -= 1900; + if (year != t->tm_year) { + if (year < INT_MIN || year > INT_MAX) + return 1; + t->tm_year = year; + } + return 0; +} + +/** + * Convert broken-down time to an equivalent time_t value + * @param target Target broken-down time structure 
+ * @return Equivalent time_t value on success; -1 on failure + * + * This function first normalises the time in @p target so that the + * structure members are in the valid range. It then calculates the + * number of seconds (ignoring leap seconds) between midnight Jan 1st 1970 + * and the target date. + * + * If @p target is outside the range that can be represented in a time_t, + * or tm_year is less than 70 (which would return a negative value) then + * it returns -1 and sets errno to EOVERFLOW. + */ + +static inline time_t hts_time_gm(struct tm *target) +{ + int month_start[2][12] = { + { 0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334 }, + { 0, 31, 60, 91, 121, 152, 182, 213, 244, 274, 305, 335 } + }; + int years_from_epoch, leaps, days; + int64_t secs; + + if (hts_time_normalise_tm(target) != 0) + goto overflow; + + if (target->tm_year < 70) + goto overflow; + + years_from_epoch = target->tm_year - 70; + leaps = (hts_leaps_to_year_start(target->tm_year + 1900) + - hts_leaps_to_year_start(1970)); + days = ((365 * (years_from_epoch - leaps) + 366 * leaps) + + month_start[hts_year_is_leap(target->tm_year + 1900)][target->tm_mon] + + target->tm_mday - 1); + secs = ((int64_t) days * 86400LL + + target->tm_hour * 3600 + + target->tm_min * 60 + + target->tm_sec); + if (sizeof(time_t) < 8 && secs > INT_MAX) + goto overflow; + + return (time_t) secs; + + overflow: + errno = EOVERFLOW; + return (time_t) -1; +} diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7 index 279661053..359c0fc35 100644 --- a/htslib-s3-plugin.7 +++ b/htslib-s3-plugin.7 @@ -24,6 +24,21 @@ s3 plugin \- htslib AWS S3 plugin .\" FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER .\" DEALINGS IN THE SOFTWARE. .\" +. +.\" For code blocks and examples (cf groff's Ultrix-specific man macros) +.de EX + +. in +\\$1 +. nf +. ft CR +.. +.de EE +. ft +. fi +. in + +.. 
+ .SH DESCRIPTION The S3 plugin allows htslib file functions to communicate with servers that use the AWS S3 protocol. Files are identified by their bucket and object key in a @@ -114,14 +129,73 @@ files will be used. The default file locations are either \fI~/.aws/credentials\fR or \fI~/.s3cfg\fR (in that order). Entries used in aws style credentials file are aws_access_key_id, -aws_secret_access_key, aws_session_token, region and addressing_style. Only the -first two are usually needed. +aws_secret_access_key, aws_session_token, region, addressing_style and +expiry_time (unofficial, see SHORT-LIVED CREDENTIALS below). +Only the first two are usually needed. Entries used in s3cmd style config files are access_key, secret_key, access_token, host_base, bucket_location and host_bucket. Again only the first two are usually needed. The host_bucket option is only used to set a path-style URL, see below. +.SH SHORT-LIVED CREDENTIALS + +Some cloud identity and access management (IAM) systems can make short-lived +credentials that allow access to resources. +These credentials will expire after a time and need to be renewed to +give continued access. +To enable this, the S3 plugin allows an \fIexpiry_time\fR entry to be set in the +\fI.aws/credentials\fR file. +The value for this entry should be the time when the token expires, +following the format in RFC3339 section 5.6, which takes the form: + + 2012-04-29T05:20:48Z + +That is, year - month - day, the letter "T", hour : minute : second. +The time can be followed by the letter "Z", indicating the UTC timezone, +or an offset from UTC which is a "+" or "-" sign followed by two digits for +the hours offset, ":", and two digits for the minutes. + +The S3 plugin will attempt to re-read the credentials file up to 1 minute +before the given expiry time, which means the file needs to be updated with +new credentials before then. 
+As the exact way of doing this can vary between services and IAM providers, +the S3 plugin expects this to be done by an external user-supplied process. +This may be achieved by running a program that replaces the file as new +credentials become available. +The following script shows how it might be done for AWS instance credentials: +.EX 2 +#!/bin/sh +instance='http://169.254.169.254' +tok_url="$instance/latest/api/token" +ttl_hdr='X-aws-ec2-metadata-token-ttl-seconds: 10' +creds_url="$instance/latest/meta-data/iam/security-credentials" +key1='aws_access_key_id = \(rs(.AccessKeyId)\(rsn' +key2='aws_secret_access_key = \(rs(.SecretAccessKey)\(rsn' +key3='aws_session_token = \(rs(.Token)\(rsn' +key4='expiry_time = \(rs(.Expiration)\(rsn' +while true; do + token=`curl -X PUT -H "$ttl_hdr" "$tok_url"` + tok_hdr="X-aws-ec2-metadata-token: $token" + role=`curl -H "$tok_hdr" "$creds_url/"` + expires='now' + ( curl -H "$tok_hdr" "$creds_url/$role" \(rs + | jq -r "\(rs"${key1}${key2}${key3}${key4}\(rs"" > credentials.new ) \(rs + && mv -f credentials.new credentials \(rs + && expires=`grep expiry_time credentials | cut -d ' ' -f 3-` + if test $? -ne 0 ; then break ; fi + expiry=`date -d "$expires - 3 minutes" '+%s'` + now=`date '+%s'` + test "$expiry" -gt "$now" && sleep $((($expiry - $now) / 2)) + sleep 30 +done +.EE + +Note that the \fIexpiry_time\fR key is currently only supported for the +\fI.aws/credentials\fR file (or the file referred to in the +.B AWS_SHARED_CREDENTIALS_FILE +environment variable). + .SH NOTES In most cases this plugin transforms the given URL into a virtual host-style format e.g. \fIhttps://bucket.host/path/to/file\fR. A path-style format is used @@ -136,4 +210,6 @@ host_bucket must \fBnot\fR include the \fB%(bucket).s\fR string. 
.BR htsfile (1) .BR samtools (1) .PP +RFC 3339: +.PP htslib website: diff --git a/test/test_time_funcs.c b/test/test_time_funcs.c new file mode 100644 index 000000000..e8c2600cc --- /dev/null +++ b/test/test_time_funcs.c @@ -0,0 +1,122 @@ +/* test_time_funcs.c -- Test time functions + + Copyright (C) 2022 Genome Research Ltd. + + Author: Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE.
*/ + +#include +#include +#include +#include +#include +#include +#include + +#include "../hts_time_funcs.h" + +int test_normalised(time_t start, time_t end, time_t incr) { + time_t i, j; + struct tm *utc; + + for (i = start; i < end; i += incr) { + utc = gmtime(&i); + j = hts_time_gm(utc); + if (i != j) { + fprintf(stderr, + "hts_time_gm() failed, got %"PRId64" expected %"PRId64"\n", + (int64_t) j, (int64_t) i); + return 1; + } + } + return 0; +} + +int test_specific(int year, int mon, int mday, int hour, int min, int sec, + time_t expected) { + struct tm utc = { sec, min, hour, mday, mon - 1, year - 1900, 0, 0, 0 }; + time_t res = hts_time_gm(&utc); + if (res != expected) { + fprintf(stderr, + "hts_time_gm() failed for %4d/%02d/%02d %02d:%02d:%02d :" + " got %"PRId64" expected %"PRId64"\n", + year, mon, mday, hour, min, sec, + (int64_t) res, (int64_t) expected); + return 1; + } + return 0; +} + +int main(int argc, char **argv) { + int res = 0; + + if (test_normalised(0, INT_MAX - 1000, 1000) != 0) + return EXIT_FAILURE; + if (sizeof(time_t) >= 8) { + if (test_normalised(INT_MAX - 1000, (time_t) INT_MAX * 2, 1000) != 0) + return EXIT_FAILURE; + } + + // 2022-06-14 12:32:10 + res |= test_specific(2022, 6, 14, 12, 32, 10, 1655209930); + // 2022-06-14 12:32:10 + res |= test_specific(1993, 9, 10514, 12, 32, 10, 1655209930); + // 2020-02-28 12:00:00 + res |= test_specific(2020, 2, 28, 12, 0, 0, 1582891200); + // 2020-02-29 12:00:00 + res |= test_specific(2020, 2, 29, 12, 0, 0, 1582977600); + // 2020-03-01 12:00:00 + res |= test_specific(2020, 2, 30, 12, 0, 0, 1583064000); + // 2020-02-29 12:00:00 + res |= test_specific(2020, 3, 0, 12, 0, 0, 1582977600); + // 2020-02-01 12:00:00 + res |= test_specific(2019, 14, 1, 12, 0, 0, 1580558400); + // 2020-03-01 12:00:00 + res |= test_specific(2019, 15, 1, 12, 0, 0, 1583064000); + // 2021-03-01 12:00:00 + res |= test_specific(2019, 27, 1, 12, 0, 0, 1614600000); + // 2024-02-01 12:00:00 + res |= test_specific(2019, 62, 1, 12, 0, 
0, 1706788800); + // 2024-03-01 12:00:00 + res |= test_specific(2019, 63, 1, 12, 0, 0, 1709294400); + // 2020-12-31 23:59:59 + res |= test_specific(2021, 0, 31, 23, 59, 59, 1609459199); + // 2020-03-01 12:00:00 + res |= test_specific(2021, -9, 1, 12, 0, 0, 1583064000); + // 2020-02-01 12:00:00 + res |= test_specific(2021, -10, 1, 12, 0, 0, 1580558400); + // 2019-02-01 12:00:00 + res |= test_specific(2021, -22, 1, 12, 0, 0, 1549022400); + // 1970-01-01 00:00:00 + res |= test_specific(1970, 1, 1, 0, 0, 0, 0); + // 2038-01-19 03:14:07 + res |= test_specific(1970, 1, 1, 0, 0, INT_MAX, INT_MAX); + // 2038-01-19 03:14:07 + res |= test_specific(2038, 1, 19, 3, 14, 7, INT_MAX); + if (sizeof(time_t) < 8) { + // 2038-01-19 03:14:08 + res |= test_specific(2038, 1, 19, 3, 14, 8, (time_t) -1); + } else { + // 2038-01-19 03:14:08 + res |= test_specific(2038, 1, 19, 3, 14, 8, (time_t) INT_MAX + 1); + } + + return res == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} From 506f47913cbc8ffbc5cdfb28218b00b84e23de80 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 6 Jul 2022 16:26:09 +0100 Subject: [PATCH 293/488] Further improve MM tag consistency checking. If we have an MM tag with base-type specific coordinates beyond the end of the sequence as there are too few bases of that type, then we now detect this within bam_parse_basemod. This was already checked within bam_next_basemod for forward reads, but not spotted in reverse complemented ones. 
Fixes #1466 --- sam.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/sam.c b/sam.c index 64c08a43a..a84ac2eaa 100644 --- a/sam.c +++ b/sam.c @@ -6289,6 +6289,11 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { state->MLstride [mod_num] = stride; state->implicit [mod_num] = implicit; + if (delta < 0) { + hts_log_error("MM tag refers to bases beyond sequence " + "length"); + return -1; + } state->MMcount [mod_num] = delta; if (b->core.flag & BAM_FREVERSE) { state->MM [mod_num] = cp+1; From ff49203760f02f5cf4ede7d045d509970f2f8063 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 29 Jun 2022 10:34:23 +0100 Subject: [PATCH 294/488] Fix and_expr expression bug Prevent old is_true values from being carried over, which could cause incorrect results from '&&' expressions. --- hts_expr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hts_expr.c b/hts_expr.c index 599d7a54a..23f7c402b 100644 --- a/hts_expr.c +++ b/hts_expr.c @@ -622,8 +622,8 @@ static int and_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { if (eq_expr(filt, data, fn, str, end, res)) return -1; - hts_expr_val_t val = HTS_EXPR_VAL_INIT; for (;;) { + hts_expr_val_t val = HTS_EXPR_VAL_INIT; str = ws(*end); if (str[0] == '&' && str[1] == '&') { if (eq_expr(filt, data, fn, str+2, end, &val)) return -1; @@ -640,8 +640,8 @@ static int and_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, } else { break; } + hts_expr_val_free(&val); } - hts_expr_val_free(&val); return 0; } From 4f9a8ba866bfdcd10cf77d1be2a307f85d28d608 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 29 Jun 2022 11:09:30 +0100 Subject: [PATCH 295/488] Add truth checks to test_expr, and test empty-but-true strings --- test/test_expr.c | 232 +++++++++++++++++++++++++---------------------- 1 file changed, 124 insertions(+), 108 deletions(-) diff --git a/test/test_expr.c b/test/test_expr.c index 606a9b3b5..237b4e1ce 100644 --- 
a/test/test_expr.c +++ b/test/test_expr.c @@ -51,6 +51,12 @@ int lookup(void *data, char *str, char **end, hts_expr_val_t *res) { *end = str+5; res->is_str = 1; kputs("plugh", ks_clear(&res->s)); + } else if (strncmp(str, "empty-but-true", 14) == 0) { + // empty string + *end = str+14; + res->is_true = 1; + res->is_str = 1; + kputs("", ks_clear(&res->s)); } else if (strncmp(str, "empty", 5) == 0) { // empty string *end = str+5; @@ -70,6 +76,7 @@ int lookup(void *data, char *str, char **end, hts_expr_val_t *res) { } typedef struct { + int truth_val; double dval; char *sval; char *str; @@ -78,108 +85,112 @@ typedef struct { int test(void) { // These are all valid expressions that should work test_ev tests[] = { - { 1, NULL, "1"}, - { 1, NULL, "+1"}, - { -1, NULL, "-1"}, - { 0, NULL, "!7"}, - { 1, NULL, "!0"}, - { 1, NULL, "!(!7)"}, - { 1, NULL, "!!7"}, - - { 5, NULL, "2+3"}, - { -1, NULL, "2+-3"}, - { 6, NULL, "1+2+3"}, - { 1, NULL, "-2+3"}, - - { 6, NULL, "2*3"}, - { 6, NULL, "1*2*3"}, - { 0, NULL, "2*0"}, - - { 7, NULL, "(7)"}, - { 7, NULL, "((7))"}, - { 21, NULL, "(1+2)*(3+4)"}, - { 14, NULL, "(4*5)-(-2*-3)"}, - - { 1, NULL, "(1+2)*3==9"}, - { 1, NULL, "(1+2)*3!=8"}, - { 0, NULL, "(1+2)*3!=9"}, - { 0, NULL, "(1+2)*3==8"}, - - { 0, NULL, "1>2"}, - { 1, NULL, "1<2"}, - { 0, NULL, "3<3"}, - { 0, NULL, "3>3"}, - { 1, NULL, "9<=9"}, - { 1, NULL, "9>=9"}, - { 1, NULL, "2*4==8"}, - { 1, NULL, "16==0x10"}, - { 1, NULL, "15<0x10"}, - { 1, NULL, "17>0x10"}, - { 0, NULL, "2*4!=8"}, - { 1, NULL, "4+2<3+4"}, - { 0, NULL, "4*2<3+4"}, - { 8, NULL, "4*(2<3)+4"}, // boolean; 4*(1)+4 - - { 1, NULL, "(1<2) == (3>2)"}, - { 1, NULL, "1<2 == 3>2"}, - - { 1, NULL, "2 && 1"}, - { 0, NULL, "2 && 0"}, - { 0, NULL, "0 && 2"}, - { 1, NULL, "2 || 1"}, - { 1, NULL, "2 || 0"}, - { 1, NULL, "0 || 2"}, - { 1, NULL, "1 || 2 && 3"}, - { 1, NULL, "2 && 3 || 1"}, - { 1, NULL, "0 && 3 || 2"}, - { 0, NULL, "0 && 3 || 0"}, - - { 1, NULL, "3 & 1"}, - { 2, NULL, "3 & 2"}, - { 3, NULL, "1 | 2"}, - { 3, 
NULL, "1 | 3"}, - { 7, NULL, "1 | 6"}, - { 2, NULL, "1 ^ 3"}, - - { 1, NULL, "(1^0)&(4^3)"}, - { 2, NULL, "1 ^(0&4)^ 3"}, - { 2, NULL, "1 ^ 0&4 ^ 3"}, // precedence, & before ^ - - { 6, NULL, "(1|0)^(4|3)"}, - { 7, NULL, "1 |(0^4)| 3"}, - { 7, NULL, "1 | 0^4 | 3"}, // precedence, ^ before | - - { 1, NULL, "4 & 2 || 1"}, - { 1, NULL, "(4 & 2) || 1"}, - { 0, NULL, "4 & (2 || 1)"}, - { 1, NULL, "1 || 4 & 2"}, - { 1, NULL, "1 || (4 & 2)"}, - { 0, NULL, "(1 || 4) & 2"}, - - { 1, NULL, " (2*3)&7 > 4"}, - { 0, NULL, " (2*3)&(7 > 4)"}, // C precedence equiv - { 1, NULL, "((2*3)&7) > 4"}, // Python precedence equiv - { 1, NULL, "((2*3)&7) > 4 && 2*2 <= 4"}, - - { 1, "plugh", "magic"}, - { 1, "", "empty"}, - { 1, NULL, "magic == \"plugh\""}, - { 1, NULL, "magic != \"xyzzy\""}, - - { 1, NULL, "\"abc\" < \"def\""}, - { 1, NULL, "\"abc\" <= \"abc\""}, - { 0, NULL, "\"abc\" < \"ab\""}, - { 0, NULL, "\"abc\" <= \"ab\""}, - - { 0, NULL, "\"abc\" > \"def\""}, - { 1, NULL, "\"abc\" >= \"abc\""}, - { 1, NULL, "\"abc\" > \"ab\""}, - { 1, NULL, "\"abc\" >= \"ab\""}, - - { 1, NULL, "\"abbc\" =~ \"^a+b+c+$\""}, - { 0, NULL, "\"aBBc\" =~ \"^a+b+c+$\""}, - { 1, NULL, "\"aBBc\" !~ \"^a+b+c+$\""}, - { 1, NULL, "\"xyzzy plugh abracadabra\" =~ magic"}, + { 1, 1, NULL, "1"}, + { 1, 1, NULL, "+1"}, + { 1, -1, NULL, "-1"}, + { 0, 0, NULL, "!7"}, + { 1, 1, NULL, "!0"}, + { 1, 1, NULL, "!(!7)"}, + { 1, 1, NULL, "!!7"}, + + { 1, 5, NULL, "2+3"}, + { 1, -1, NULL, "2+-3"}, + { 1, 6, NULL, "1+2+3"}, + { 1, 1, NULL, "-2+3"}, + + { 1, 6, NULL, "2*3"}, + { 1, 6, NULL, "1*2*3"}, + { 0, 0, NULL, "2*0"}, + + { 1, 7, NULL, "(7)"}, + { 1, 7, NULL, "((7))"}, + { 1, 21, NULL, "(1+2)*(3+4)"}, + { 1, 14, NULL, "(4*5)-(-2*-3)"}, + + { 1, 1, NULL, "(1+2)*3==9"}, + { 1, 1, NULL, "(1+2)*3!=8"}, + { 0, 0, NULL, "(1+2)*3!=9"}, + { 0, 0, NULL, "(1+2)*3==8"}, + + { 0, 0, NULL, "1>2"}, + { 1, 1, NULL, "1<2"}, + { 0, 0, NULL, "3<3"}, + { 0, 0, NULL, "3>3"}, + { 1, 1, NULL, "9<=9"}, + { 1, 1, NULL, "9>=9"}, + { 1, 1, NULL, 
"2*4==8"}, + { 1, 1, NULL, "16==0x10"}, + { 1, 1, NULL, "15<0x10"}, + { 1, 1, NULL, "17>0x10"}, + { 0, 0, NULL, "2*4!=8"}, + { 1, 1, NULL, "4+2<3+4"}, + { 0, 0, NULL, "4*2<3+4"}, + { 1, 8, NULL, "4*(2<3)+4"}, // boolean; 4*(1)+4 + + { 1, 1, NULL, "(1<2) == (3>2)"}, + { 1, 1, NULL, "1<2 == 3>2"}, + + { 1, 1, NULL, "2 && 1"}, + { 0, 0, NULL, "2 && 0"}, + { 0, 0, NULL, "0 && 2"}, + { 1, 1, NULL, "2 || 1"}, + { 1, 1, NULL, "2 || 0"}, + { 1, 1, NULL, "0 || 2"}, + { 1, 1, NULL, "1 || 2 && 3"}, + { 1, 1, NULL, "2 && 3 || 1"}, + { 1, 1, NULL, "0 && 3 || 2"}, + { 0, 0, NULL, "0 && 3 || 0"}, + + { 1, 1, NULL, "3 & 1"}, + { 1, 2, NULL, "3 & 2"}, + { 1, 3, NULL, "1 | 2"}, + { 1, 3, NULL, "1 | 3"}, + { 1, 7, NULL, "1 | 6"}, + { 1, 2, NULL, "1 ^ 3"}, + + { 1, 1, NULL, "(1^0)&(4^3)"}, + { 1, 2, NULL, "1 ^(0&4)^ 3"}, + { 1, 2, NULL, "1 ^ 0&4 ^ 3"}, // precedence, & before ^ + + { 1, 6, NULL, "(1|0)^(4|3)"}, + { 1, 7, NULL, "1 |(0^4)| 3"}, + { 1, 7, NULL, "1 | 0^4 | 3"}, // precedence, ^ before | + + { 1, 1, NULL, "4 & 2 || 1"}, + { 1, 1, NULL, "(4 & 2) || 1"}, + { 0, 0, NULL, "4 & (2 || 1)"}, + { 1, 1, NULL, "1 || 4 & 2"}, + { 1, 1, NULL, "1 || (4 & 2)"}, + { 0, 0, NULL, "(1 || 4) & 2"}, + + { 1, 1, NULL, " (2*3)&7 > 4"}, + { 0, 0, NULL, " (2*3)&(7 > 4)"}, // C precedence equiv + { 1, 1, NULL, "((2*3)&7) > 4"}, // Python precedence equiv + { 1, 1, NULL, "((2*3)&7) > 4 && 2*2 <= 4"}, + + { 1, 1, "plugh", "magic"}, + { 1, 1, "", "empty"}, + { 1, 1, NULL, "magic == \"plugh\""}, + { 1, 1, NULL, "magic != \"xyzzy\""}, + + { 1, 1, NULL, "\"abc\" < \"def\""}, + { 1, 1, NULL, "\"abc\" <= \"abc\""}, + { 0, 0, NULL, "\"abc\" < \"ab\""}, + { 0, 0, NULL, "\"abc\" <= \"ab\""}, + + { 0, 0, NULL, "\"abc\" > \"def\""}, + { 1, 1, NULL, "\"abc\" >= \"abc\""}, + { 1, 1, NULL, "\"abc\" > \"ab\""}, + { 1, 1, NULL, "\"abc\" >= \"ab\""}, + + { 1, 1, NULL, "\"abbc\" =~ \"^a+b+c+$\""}, + { 0, 0, NULL, "\"aBBc\" =~ \"^a+b+c+$\""}, + { 1, 1, NULL, "\"aBBc\" !~ \"^a+b+c+$\""}, + { 1, 1, NULL, "\"xyzzy plugh 
abracadabra\" =~ magic"}, + + { 1, 1, "", "empty-but-true" }, + { 1, 1, NULL, "1 && empty-but-true && 1" }, + { 0, 0, NULL, "1 && empty-but-true && 0" }, }; int i; @@ -195,13 +206,18 @@ int test(void) { } if (r.is_str && (strcmp(r.s.s, tests[i].sval) != 0 - || r.d != tests[i].dval)) { - fprintf(stderr, "Failed test: %s == %s, got %s, %f\n", - tests[i].str, tests[i].sval, r.s.s, r.d); + || r.d != tests[i].dval + || r.is_true != tests[i].truth_val)) { + fprintf(stderr, + "Failed test: \"%s\" == \"%s\", got %s, \"%s\", %f\n", + tests[i].str, tests[i].sval, + r.is_true ? "true" : "false", r.s.s, r.d); return 1; - } else if (!r.is_str && r.d != tests[i].dval) { - fprintf(stderr, "Failed test: %s == %f, got %f\n", - tests[i].str, tests[i].dval, r.d); + } else if (!r.is_str && (r.d != tests[i].dval + || r.is_true != tests[i].truth_val)) { + fprintf(stderr, "Failed test: %s == %f, got %s, %f\n", + tests[i].str, tests[i].dval, + r.is_true ? "true" : "false", r.d); return 1; } From 494eca899e5f20c71f753a7db31dad2ccf2bb07f Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 29 Jun 2022 12:26:04 +0100 Subject: [PATCH 296/488] Fix unary-not on null strings in expressions Toggling hts_expr_val::is_true on strings could get it out of phase with hts_expr_val::d on null strings (which are false), which caused double-unary-not to give the wrong value. Instead, make unary not always return false if is_true is true, so empty-but-true works; and for strings return true for null ones, and false for non-null. Numbers are handled as before. 
--- hts_expr.c | 8 +++++--- test/test_expr.c | 29 ++++++++++++++++++++++------- 2 files changed, 27 insertions(+), 10 deletions(-) diff --git a/hts_expr.c b/hts_expr.c index 23f7c402b..cdbcec8a3 100644 --- a/hts_expr.c +++ b/hts_expr.c @@ -296,10 +296,12 @@ static int unary_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, res->is_true = res->d != 0; } else if (*str == '!') { err = unary_expr(filt, data, fn, str+1, end, res); - if (res->is_str) { + if (res->is_true) { + res->d = res->is_true = 0; res->is_str = 0; - res->d = 0; - res->is_true = !res->is_true; + } else if (res->is_str) { + res->is_str = 0; + res->d = res->is_true = (res->s.s == NULL); } else { res->d = !(int64_t)res->d; res->is_true = res->d != 0; diff --git a/test/test_expr.c b/test/test_expr.c index 237b4e1ce..673fdf807 100644 --- a/test/test_expr.c +++ b/test/test_expr.c @@ -82,6 +82,13 @@ typedef struct { char *str; } test_ev; +static inline int strcmpnull(const char *a, const char *b) { + if (!a && !b) return 0; + if (!a && b) return -1; + if (a && !b) return 1; + return strcmp(a, b); +} + int test(void) { // These are all valid expressions that should work test_ev tests[] = { @@ -188,12 +195,18 @@ int test(void) { { 1, 1, NULL, "\"aBBc\" !~ \"^a+b+c+$\""}, { 1, 1, NULL, "\"xyzzy plugh abracadabra\" =~ magic"}, - { 1, 1, "", "empty-but-true" }, + { 1, 1, "", "empty-but-true" }, + { 0, 0, NULL, "!empty-but-true" }, + { 1, 1, NULL, "!!empty-but-true" }, { 1, 1, NULL, "1 && empty-but-true && 1" }, { 0, 0, NULL, "1 && empty-but-true && 0" }, + + { 0, 0, NULL, "null" }, + { 1, 1, NULL, "!null" }, + { 0, 0, NULL, "!!null", }, }; - int i; + int i, res = 0; hts_expr_val_t r; for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) { hts_filter_t *filt = hts_filter_init(tests[i].str); @@ -202,30 +215,32 @@ int test(void) { if (hts_filter_eval(filt, NULL, lookup, &r)) { fprintf(stderr, "Failed to parse filter string %s\n", tests[i].str); - return 1; + res = 1; + hts_filter_free(filt); + continue; } 
- if (r.is_str && (strcmp(r.s.s, tests[i].sval) != 0 + if (r.is_str && (strcmpnull(r.s.s, tests[i].sval) != 0 || r.d != tests[i].dval || r.is_true != tests[i].truth_val)) { fprintf(stderr, "Failed test: \"%s\" == \"%s\", got %s, \"%s\", %f\n", tests[i].str, tests[i].sval, r.is_true ? "true" : "false", r.s.s, r.d); - return 1; + res = 1; } else if (!r.is_str && (r.d != tests[i].dval || r.is_true != tests[i].truth_val)) { fprintf(stderr, "Failed test: %s == %f, got %s, %f\n", tests[i].str, tests[i].dval, r.is_true ? "true" : "false", r.d); - return 1; + res = 1; } hts_expr_val_free(&r); hts_filter_free(filt); } - return 0; + return res; } int main(int argc, char **argv) { From 5e426826d0b9ad5bd3d47cff5e18ae7ab2981de1 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 29 Jun 2022 16:41:18 +0100 Subject: [PATCH 297/488] Set hts_filter_t::is_true in mul_expr() and add_expr() Ensures that "5 - 5 && 1" and "+5 - 5 && 1" give the same answer. The latter sets is_true in the unary +, so it has to be reset after the subtraction. 
--- hts_expr.c | 4 ++++ test/test_expr.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/hts_expr.c b/hts_expr.c index cdbcec8a3..b38a006bc 100644 --- a/hts_expr.c +++ b/hts_expr.c @@ -352,8 +352,10 @@ static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, else break; + res->is_true = res->d != 0; str = *end; } + hts_expr_val_free(&val); return 0; @@ -390,8 +392,10 @@ static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, else break; + res->is_true = res->d != 0; str = *end; } + hts_expr_val_free(&val); return 0; diff --git a/test/test_expr.c b/test/test_expr.c index 673fdf807..0d99c1121 100644 --- a/test/test_expr.c +++ b/test/test_expr.c @@ -147,6 +147,8 @@ int test(void) { { 1, 1, NULL, "2 && 3 || 1"}, { 1, 1, NULL, "0 && 3 || 2"}, { 0, 0, NULL, "0 && 3 || 0"}, + { 0, 0, NULL, " 5 - 5 && 1"}, + { 0, 0, NULL, "+5 - 5 && 1"}, { 1, 1, NULL, "3 & 1"}, { 1, 2, NULL, "3 & 2"}, From 6c733244f5359064f0126e6fbfa9002aafc006fc Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 29 Jun 2022 17:23:10 +0100 Subject: [PATCH 298/488] Ensure is_true is propagated for null-but-true in expressions So "null-but-true" and "null-but-true && 1" return the same value. --- hts_expr.c | 13 ++++++++----- test/test_expr.c | 12 ++++++++++++ 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/hts_expr.c b/hts_expr.c index b38a006bc..8d1245350 100644 --- a/hts_expr.c +++ b/hts_expr.c @@ -700,12 +700,15 @@ int hts_filter_eval(hts_filter_t *filt, } // Strings evaluate to true. An empty string is also true, but an - // absent (null) string is false. An empty string has kstring length - // of zero, but a pointer as it's nul-terminated. + // absent (null) string is false, unless overridden by is_true. An + // empty string has kstring length of zero, but a pointer as it's + // nul-terminated. 
+ if (res->is_str) { + res->is_true |= res->s.s != NULL; + res->d = res->is_true; + } else { res->is_true |= res->d != 0; + } return 0; } diff --git a/test/test_expr.c b/test/test_expr.c index 0d99c1121..8944ca927 100644 --- a/test/test_expr.c +++ b/test/test_expr.c @@ -62,6 +62,11 @@ int lookup(void *data, char *str, char **end, hts_expr_val_t *res) { *end = str+5; res->is_str = 1; kputs("", ks_clear(&res->s)); + } else if (strncmp(str, "null-but-true", 13) == 0) { + *end = str+13; + res->is_true = 1; + res->is_str = 1; + ks_clear(&res->s); } else if (strncmp(str, "null", 4) == 0) { // null string (eg aux:Z tag is absent) *end = str+4; @@ -206,6 +211,13 @@ int test(void) { { 0, 0, NULL, "null" }, { 1, 1, NULL, "!null" }, { 0, 0, NULL, "!!null", }, + + { 1, 1, NULL, "null-but-true" }, + { 0, 0, NULL, "!null-but-true" }, + { 1, 1, NULL, "!!null-but-true" }, + + { 0, 0, NULL, "null || 0" }, + { 1, 1, NULL, "null-but-true && 1" }, }; int i, res = 0; From 1fba06c2c086290e21bd50ef6f8244dcde80c764 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 29 Jun 2022 17:35:46 +0100 Subject: [PATCH 299/488] Deprecate hts_filter_eval() in favour of hts_filter_eval2(). Due to hts_filter_eval() calling memset() on its res parameter, it's not possible to pass in an allocated kstring_t in res->s without leaking memory. Historically it was also possible to get away with passing in an uninitialised structure, so not many assumptions can be made about the contents of res on entry. In particular, it is not guaranteed that free(res->s.s) would work. To ensure the function is being used safely, check that the string part of *res is NULL on entry and fail if not. Also added a documentation note about calling hts_expr_val_free() after hts_filter_eval(). Add hts_filter_eval2() and deprecate hts_filter_eval(). The new function clears its `res` parameter properly, allowing it to be called repeatedly a bit more easily than the original.
--- Makefile | 2 +- hts_expr.c | 35 ++++++++++++++++++++++++++++++----- htslib/hts_expr.h | 22 +++++++++++++++++++++- sam.c | 4 ++-- test/test_expr.c | 8 ++++---- 5 files changed, 58 insertions(+), 13 deletions(-) diff --git a/Makefile b/Makefile index 621e8a11a..88d1f56b6 100644 --- a/Makefile +++ b/Makefile @@ -425,7 +425,7 @@ hfile_libcurl.o hfile_libcurl.pico: hfile_libcurl.c config.h $(hfile_internal_h) hfile_s3_write.o hfile_s3_write.pico: hfile_s3_write.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(htslib_khash_h) hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h) $(htslib_kstring_h) $(hts_time_funcs_h) hts.o hts.pico: hts.c config.h os/lzma_stub.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) $(htslib_hts_expr_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h) -hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(textutils_internal_h) +hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(htslib_hts_log_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) diff --git a/hts_expr.c b/hts_expr.c index 8d1245350..74fe85fce 100644 --- a/hts_expr.c +++ b/hts_expr.c @@ -39,6 +39,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include "htslib/hts_expr.h" +#include "htslib/hts_log.h" #include "textutils_internal.h" // Could also cache hts_expr_val_t stack here for kstring reuse? @@ -683,13 +684,11 @@ void hts_filter_free(hts_filter_t *filt) { free(filt); } -int hts_filter_eval(hts_filter_t *filt, - void *data, hts_expr_sym_func *fn, - hts_expr_val_t *res) { +static int hts_filter_eval_(hts_filter_t *filt, + void *data, hts_expr_sym_func *fn, + hts_expr_val_t *res) { char *end = NULL; - memset(res, 0, sizeof(*res)); - filt->curr_regex = 0; if (expression(filt, data, fn, filt->str, &end, res)) return -1; @@ -712,3 +711,29 @@ int hts_filter_eval(hts_filter_t *filt, return 0; } + +int hts_filter_eval(hts_filter_t *filt, + void *data, hts_expr_sym_func *fn, + hts_expr_val_t *res) { + if (res->s.l != 0 || res->s.m != 0 || res->s.s != NULL) { + // As *res is cleared below, it's not safe to call this function + // with res->s.s set, as memory would be leaked. It's also not + // possible to know is res was initialised correctly, so in + // either case we fail. + hts_log_error("Results structure must be cleared before calling this function"); + return -1; + } + + memset(res, 0, sizeof(*res)); + + return hts_filter_eval_(filt, data, fn, res); +} + +int hts_filter_eval2(hts_filter_t *filt, + void *data, hts_expr_sym_func *fn, + hts_expr_val_t *res) { + ks_free(&res->s); + memset(res, 0, sizeof(*res)); + + return hts_filter_eval_(filt, data, fn, res); +} diff --git a/htslib/hts_expr.h b/htslib/hts_expr.h index d66a8edd8..7e6a9ed2b 100644 --- a/htslib/hts_expr.h +++ b/htslib/hts_expr.h @@ -87,11 +87,31 @@ typedef int (hts_expr_sym_func)(void *data, char *str, char **end, * the is_str member. It can also be explicitly defined to be true even * for a null value. This may be used to check for the existence of * something, irrespective of whether that something evaluates to zero. 
+ * + * @p res must be initialized using HTS_EXPR_VAL_INIT before passing it + * to this function for the first time. + */ +HTSLIB_EXPORT +int hts_filter_eval2(hts_filter_t *filt, + void *data, hts_expr_sym_func *sym_func, + hts_expr_val_t *res); + +/// Evaluate a filter expression (derecated API) +/** + * @copydetails hts_filter_eval2() + * + * If calling this function more than once with the same @p res + * parameter, hts_expr_val_free(res) must be used between invocations + * to clear any allocated memory prior to reuse. + * + * @deprecated This function has been replaced by hts_filter_eval2(), + * which clears @p res properly itself. */ HTSLIB_EXPORT int hts_filter_eval(hts_filter_t *filt, void *data, hts_expr_sym_func *sym_func, - hts_expr_val_t *res); + hts_expr_val_t *res) + HTS_DEPRECATED("Please use hts_filter_eval2 instead"); #endif /* HTS_EXPR_H */ diff --git a/sam.c b/sam.c index a84ac2eaa..fccff262b 100644 --- a/sam.c +++ b/sam.c @@ -1454,8 +1454,8 @@ static int bam_sym_lookup(void *data, char *str, char **end, int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, hts_filter_t *filt) { hb_pair hb = {h, b}; - hts_expr_val_t res; - if (hts_filter_eval(filt, &hb, bam_sym_lookup, &res)) { + hts_expr_val_t res = HTS_EXPR_VAL_INIT; + if (hts_filter_eval2(filt, &hb, bam_sym_lookup, &res)) { hts_log_error("Couldn't process filter expression"); hts_expr_val_free(&res); return -1; diff --git a/test/test_expr.c b/test/test_expr.c index 8944ca927..15e25cf4b 100644 --- a/test/test_expr.c +++ b/test/test_expr.c @@ -221,12 +221,12 @@ int test(void) { }; int i, res = 0; - hts_expr_val_t r; + hts_expr_val_t r = HTS_EXPR_VAL_INIT; for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) { hts_filter_t *filt = hts_filter_init(tests[i].str); if (!filt) return 1; - if (hts_filter_eval(filt, NULL, lookup, &r)) { + if (hts_filter_eval2(filt, NULL, lookup, &r)) { fprintf(stderr, "Failed to parse filter string %s\n", tests[i].str); res = 1; @@ -259,9 +259,9 @@ int test(void) 
{ int main(int argc, char **argv) { if (argc > 1) { - hts_expr_val_t v; + hts_expr_val_t v = HTS_EXPR_VAL_INIT; hts_filter_t *filt = hts_filter_init(argv[1]); - if (hts_filter_eval(filt, NULL, lookup, &v)) + if (hts_filter_eval2(filt, NULL, lookup, &v)) return 1; if (v.is_str) From 19c72628618eb1aaeb965ef06d81255fe0af1c17 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 18 Jul 2022 11:34:38 +0100 Subject: [PATCH 300/488] Switch to rockylinux:9 Change rockylinux docker image to rockylinux:9 following deprecation of rockylinux:latest. Add perl-FindBin to installation list, as it's now in its own package. --- .cirrus.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 79aa2f99b..3a7b910a5 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -121,7 +121,7 @@ ubuntu_task: rocky_task: name: rockylinux-gcc container: - image: rockylinux:latest + image: rockylinux:9 cpu: 2 memory: 1G @@ -133,9 +133,9 @@ rocky_task: # NB: we could consider building a docker image with these # preinstalled and specifying that instead, to speed up testing. install_script: | - yum install -y autoconf automake make gcc perl-Data-Dumper zlib-devel \ - bzip2 bzip2-devel xz-devel curl-devel openssl-devel ncurses-devel \ - diffutils git + yum install -y autoconf automake make gcc perl-Data-Dumper perl-FindBin \ + zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel \ + ncurses-devel diffutils git << : *COMPILE << : *TEST From 9562aebaa39265b7c96e6a6f5cf6f78727c41221 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 19 Jul 2022 18:16:45 +0100 Subject: [PATCH 301/488] Improve embed_ref=2 and auto-enable it when missing external refs (PR #1449) * Improve embed_ref=2 when no MD:Z tags are present. When these tags are present, the reference is inferred so it's much the same as before. 
However when not present, we previously took a first-come first-serve approach to consensus generation as the just-in-time reference generation was in the process_one_read loop. As we'd already updated the reference and delta-encoded against it, if a later read comes along with conflicts then we couldn't correct anything. 33ff2bc incorrectly made the assumption that we were being fed a bunch of bam records and didn't have all the data available up-front. However this isn't true. For the sake of threading efficiency, the encoder buffers up all the BAMs for a container before dispatching the cram_encode_container call, so we can do a consensus generation stage prior to the process_one_read loop. This improves consensus accuracy and also simplifies the MD:Z using code too as it doesn't have to worry about inconsistencies any more. On 10MB worth of ONT data we had the following total file and SEQ sizes: Total SEQ only no_ref 205899946 74416907 embed_ref=0 135303919 27583376 use external ref embed_ref=1 139709759 32008185 embed external ref, from file embed_ref=2+MD 139710386 32007405 embed external ref, inferred embed_ref=2-MD 149371457 41670856 OLD: first-come first-serve cons embed_ref=2-MD 139598781 31898253 NEW: proper consensus The old embed_ref=2 needed 30% more space to delta-encode seqs, although it was still a major win over using no_ref mode. The new consensus is a significant improvement and it's now a better match for the sequence (as we'd expect) than the external reference. Obviously this can't be done when we have MD tags as otherwise we'd need to store them verbatim which then takes more space up again.
On Illumina NovaSeq we also see a significant reduction to the embedding overheads: Total SEQ only no_ref 256252015 87171589 48.4s embed_ref=0 207793068 38761828 34.7s embed_ref=1 219064424 50021186 35.3s embed_ref=2-MD 231012254 61929861 46.2s OLD embed_ref=2-MD 218014361 48935989 38.9s NEW The code has also been refactored in a few places to speed it up. Specifically with the newer consensus building strategy, we no longer produce fake MD tags and just call the earlier algorithm, and instead simply add directly to consensus histograms. TODO: figure out how to enable this automatically, without penalty when we do have the external reference available. * Add code to enable embed_ref=2 when external reference isn't found, and when converting BAMs with no UR or M5 tags. This choice is debatable, as it means people may get larger files than they expect, but it also means the CRAM can be built even if suboptimal. A warning is given notifying the user of this action. If a user explicitly specifies embed_ref=0 then this disables this automatic code and will turn the warning back into a hard failure. Also added more checks for handling things like unsorted data or multi_seq_per_slice mode, neither of which are compatible with embedding references (both the original =1 mode or =2). * Work around embed_ref=2 failures when MD is inconsistent. It's too harsh to simply bail out with invalid data. Instead we just nullify the portion of reference computed with this record and drop back to the consensus method. Also replaced hts_log_warning messages with hts_log_info, as typically we either get no messages or a huge proportion, as MD being wrong quite often means some large scale reprocessing happened. Tested using soft-clip adjusted covid19 samples where almost every MD tag was incorrect. * Make fd->embed_ref a local variable to many functions. This field can be changed on-the-fly by cram_encode_container when it discovers it cannot find a reference.
It switches to embed_ref=2 in this case. Due to the multi-threaded nature, it is important that all running code elsewhere is using their embed_ref parameter at the time the container was created. This opens up an avenue for downgrading embed_ref again if we require (eg a reference is absent, but subsequent ones can be found), but for now we don't support this. --- Makefile | 2 +- cram/cram_encode.c | 572 +++++++++++++++++++++++++------------------ cram/cram_encode.h | 3 +- cram/cram_external.c | 2 +- cram/cram_io.c | 26 +- cram/cram_structs.h | 1 - 6 files changed, 361 insertions(+), 245 deletions(-) diff --git a/Makefile b/Makefile index 88d1f56b6..dfb666c91 100644 --- a/Makefile +++ b/Makefile @@ -447,7 +447,7 @@ textutils.o textutils.pico: textutils.c config.h $(htslib_hfile_h) $(htslib_kstr cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(htslib_hts_endian_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(cram_h) cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) -cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) +cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) $(textutils_internal_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htslib_hfile_h) $(htslib_bgzf_h) 
$(htslib_faidx_h) $(hts_internal_h) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 368f6e0b4..d3dd7a134 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -49,6 +49,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "../sam_internal.h" // for nibble2base #include "../htslib/hts.h" #include "../htslib/hts_endian.h" +#include "../textutils_internal.h" KHASH_MAP_INIT_STR(m_s2u64, uint64_t) @@ -59,7 +60,8 @@ KHASH_MAP_INIT_STR(m_s2u64, uint64_t) static int process_one_read(cram_fd *fd, cram_container *c, cram_slice *s, cram_record *cr, - bam_seq_t *b, int rnum, kstring_t *MD); + bam_seq_t *b, int rnum, kstring_t *MD, + int embed_ref); /* * Returns index of val into key. @@ -79,7 +81,8 @@ static int sub_idx(char *key, char val) { * NULL on failure */ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, - cram_block_compression_hdr *h) { + cram_block_compression_hdr *h, + int embed_ref) { cram_block *cb = cram_new_block(COMPRESSION_HEADER, 0); cram_block *map = cram_new_block(COMPRESSION_HEADER, 0); int i, mc, r = 0; @@ -159,7 +162,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, kh_val(h->preservation_map, k).i = h->qs_seq_orient; } - if (fd->no_ref || fd->embed_ref) { + if (fd->no_ref || embed_ref>0) { // Reference Required == No k = kh_put(map, h->preservation_map, "RR", &r); if (-1 == r) return NULL; @@ -1076,14 +1079,12 @@ static int cram_allocate_block(cram_codec *codec, cram_slice *s, int ds_id) { * -1 on failure */ static int cram_encode_slice(cram_fd *fd, cram_container *c, - cram_block_compression_hdr *h, cram_slice *s) { + cram_block_compression_hdr *h, cram_slice *s, + int embed_ref) { int rec, r = 0; int64_t last_pos; - int embed_ref; enum cram_DS_ID id; - embed_ref = fd->embed_ref && s->hdr->ref_seq_id != -1 ? 
1 : 0; - /* * Slice external blocks: * ID 0 => base calls (insertions, soft-clip) @@ -1096,7 +1097,7 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, */ /* Create cram slice header */ - s->hdr->ref_base_id = embed_ref && s->hdr->ref_seq_span > 0 + s->hdr->ref_base_id = embed_ref>0 && s->hdr->ref_seq_span > 0 ? DS_ref : (CRAM_MAJOR_VERS(fd->version) >= 4 ? 0 : -1); s->hdr->record_counter = c->num_records + c->record_counter; @@ -1124,7 +1125,7 @@ static int cram_encode_slice(cram_fd *fd, cram_container *c, } // Embedded reference - if (embed_ref) { + if (embed_ref>0) { if (!(s->block[DS_ref] = cram_new_block(EXTERNAL, DS_ref))) return -1; s->ref_id = DS_ref; // needed? @@ -1401,6 +1402,292 @@ static int add_read_names(cram_fd *fd, cram_container *c, cram_slice *s, // CRAM version >= 3.1 #define CRAM_ge31(v) ((v) >= 0x301) +// Returns the next cigar op code: one of the BAM_C* codes, +// or -1 if no more are present. +static inline +int next_cigar_op(uint32_t *cigar, uint32_t ncigar, int *skip, int *spos, + uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) { + for(;;) { + while (*cig_len == 0) { + if (*cig_ind < ncigar) { + *cig_op = cigar[*cig_ind] & BAM_CIGAR_MASK; + *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT; + (*cig_ind)++; + } else { + return -1; + } + } + + if (skip[*cig_op]) { + *spos += (bam_cigar_type(*cig_op)&1) * *cig_len; + *cig_len = 0; + continue; + } + + (*cig_len)--; + break; + } + + return *cig_op; +} + +// Ensure ref and hist are large enough. +static inline int extend_ref(char **ref, uint32_t (**hist)[5], hts_pos_t pos, + hts_pos_t ref_start, hts_pos_t *ref_end) { + if (pos < ref_start) + return -1; + if (pos < *ref_end) + return 0; + + // realloc + hts_pos_t old_end = *ref_end ? 
*ref_end : ref_start; + hts_pos_t new_end = *ref_end = ref_start + 1000 + (pos-ref_start)*1.5; + + char *tmp = realloc(*ref, *ref_end-ref_start); + if (!tmp) + return -1; + *ref = tmp; + + uint32_t (*tmp5)[5] = realloc(**hist, + (*ref_end - ref_start)*sizeof(**hist)); + if (!tmp5) + return -1; + *hist = tmp5; + *ref_end = new_end; + + // initialise + old_end -= ref_start; + new_end -= ref_start; + memset(&(*ref)[old_end], 0, new_end-old_end); + memset(&(*hist)[old_end], 0, (new_end-old_end)*sizeof(**hist)); + + return 0; +} + +// Walk through MD + seq to generate ref +static int cram_add_to_ref_MD(bam1_t *b, char **ref, uint32_t (**hist)[5], + hts_pos_t ref_start, hts_pos_t *ref_end, + const uint8_t *MD) { + uint8_t *seq = bam_get_seq(b); + uint32_t *cigar = bam_get_cigar(b); + uint32_t ncigar = b->core.n_cigar; + uint32_t cig_op = 0, cig_len = 0, cig_ind = 0; + + int iseq = 0, next_op; + hts_pos_t iref = b->core.pos - ref_start; + + // Skip INS, REF_SKIP, *CLIP, PAD. and BACK. + static int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1}; + while (iseq < b->core.l_qseq && *MD) { + if (isdigit(*MD)) { + // match + int overflow = 0; + int len = hts_str2uint((char *)MD, (char **)&MD, 31, &overflow); + if (overflow || + extend_ref(ref, hist, iref+ref_start + len, + ref_start, ref_end) < 0) + return -1; + while (iseq < b->core.l_qseq && len) { + // rewrite to have internal loops? 
+ if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CMATCH && + next_op != BAM_CEQUAL) { + hts_log_info("MD:Z and CIGAR are incompatible for " + "record %s", bam_get_qname(b)); + return -1; + } + + // Short-cut loop over same cigar op for efficiency + cig_len++; + do { + cig_len--; + (*ref)[iref++] = seq_nt16_str[bam_seqi(seq, iseq)]; + iseq++; + len--; + } while (cig_len && iseq < b->core.l_qseq && len); + } + if (len > 0) + return -1; // MD is longer than seq + } else if (*MD == '^') { + // deletion + MD++; + while (isalpha(*MD)) { + if (extend_ref(ref, hist, iref+ref_start, ref_start, + ref_end) < 0) + return -1; + if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CDEL) { + hts_log_info("MD:Z and CIGAR are incompatible"); + return -1; + } + + (*ref)[iref++] = *MD++ & ~0x20; + } + } else { + // substitution + if (extend_ref(ref, hist, iref+ref_start, ref_start, ref_end) < 0) + return -1; + if ((next_op = next_cigar_op(cigar, ncigar, cig_skip, + &iseq, &cig_ind, &cig_op, + &cig_len)) < 0) + return -1; + + if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) { + hts_log_info("MD:Z and CIGAR are incompatible"); + return -1; + } + + (*ref)[iref++] = *MD++ & ~0x20; + iseq++; + } + } + + return 1; +} + +// Append a sequence to a ref/consensus structure. +// We maintain both an absolute refefence (ACGTN where MD:Z is +// present) and a 5-way frequency array for when no MD:Z is known. +// We then subsequently convert the 5-way frequencies to a consensus +// ref in a second pass. 
+// +// Returns >=0 on success, +// -1 on failure (eg inconsistent data) +static int cram_add_to_ref(bam1_t *b, char **ref, uint32_t (**hist)[5], + hts_pos_t ref_start, hts_pos_t *ref_end) { + const uint8_t *MD = bam_aux_get(b, "MD"); + int ret = 0; + if (MD && *MD == 'Z') { + // We can use MD to directly compute the reference + int ret = cram_add_to_ref_MD(b, ref, hist, ref_start, ref_end, MD+1); + + if (ret > 0) + return ret; + } + + // Otherwise we just use SEQ+CIGAR and build a consensus which we later + // turn into a fake reference + uint32_t *cigar = bam_get_cigar(b); + uint32_t ncigar = b->core.n_cigar; + uint32_t i, j; + hts_pos_t iseq = 0, iref = b->core.pos - ref_start; + uint8_t *seq = bam_get_seq(b); + for (i = 0; i < ncigar; i++) { + switch (bam_cigar_op(cigar[i])) { + case BAM_CSOFT_CLIP: + case BAM_CINS: + iseq += bam_cigar_oplen(cigar[i]); + break; + + case BAM_CMATCH: + case BAM_CEQUAL: + case BAM_CDIFF: { + int len = bam_cigar_oplen(cigar[i]); + // Maps an nt16 (A=1 C=2 G=4 T=8 bits) to 0123 plus N=4 + static uint8_t L16[16] = {4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4}; + + if (extend_ref(ref, hist, iref+ref_start + len, + ref_start, ref_end) < 0) + return -1; + if (iseq + len <= b->core.l_qseq) { + // Nullify failed MD:Z if appropriate + if (ret < 0) + memset(&(*ref)[iref], 0, len); + + for (j = 0; j < len; j++, iref++, iseq++) + (*hist)[iref][L16[bam_seqi(seq, iseq)]]++; + } else { + // Probably a 2ndary read with seq "*" + iseq += len; + iref += len; + } + break; + } + + case BAM_CDEL: + case BAM_CREF_SKIP: + iref += bam_cigar_oplen(cigar[i]); + } + } + + return 1; +} + +// Automatically generates the reference and stashed it in c->ref, also +// setting c->ref_start and c->ref_end. +// +// If we have MD:Z tags then we use them to directly infer the reference, +// along with SEQ + CIGAR. Otherwise we use SEQ/CIGAR only to build up +// a consensus and then assume the reference as the majority rule. 
+// +// In this latter scenario we need to be wary of auto-generating MD and NM +// during decode, but that's handled elsewhere via an additional aux tag. +// +// Returns 0 on success, +// -1 on failure +static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { + // TODO: if we can find an external reference then use it, even if the + // user told us to do embed_ref=2. + char *ref = NULL; + uint32_t (*hist)[5] = NULL; + hts_pos_t ref_start = c->bams[r1]->core.pos, ref_end = 0; + + // initial allocation + if (extend_ref(&ref, &hist, + c->bams[r1 + s->hdr->num_records-1]->core.pos + + c->bams[r1 + s->hdr->num_records-1]->core.l_qseq, + ref_start, &ref_end) < 0) + return -1; + + // Add each bam file to the reference/consensus arrays + int r2; + hts_pos_t last_pos = -1; + for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) { + if (c->bams[r1]->core.pos < last_pos) { + hts_log_error("Cannot build reference with unsorted data"); + goto err; + } + last_pos = c->bams[r1]->core.pos; + if (cram_add_to_ref(c->bams[r1], &ref, &hist, ref_start, &ref_end) < 0) + goto err; + } + + // Compute the consensus + hts_pos_t i; + for (i = 0; i < ref_end-ref_start; i++) { + if (!ref[i]) { + int max_v = 0, max_j = 4, j; + for (j = 0; j < 4; j++) + // don't call N (j==4) unless no coverage + if (max_v < hist[i][j]) + max_v = hist[i][j], max_j = j; + ref[i] = "ACGTN"[max_j]; + } + } + free(hist); + + // Put the reference in place so it appears to be an external + // ref file. + c->ref = ref; + c->ref_start = ref_start+1; + c->ref_end = ref_end+1; + + return 0; + + err: + free(ref); + free(hist); + return -1; +} + /* * Encodes all slices in a container into blocks. 
* Returns 0 on success @@ -1411,7 +1698,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { cram_block_compression_hdr *h = c->comp_hdr; cram_block *c_hdr; int multi_ref = 0; - int r1, r2, sn, nref; + int r1, r2, sn, nref, embed_ref; spare_bams *spares; if (CRAM_MAJOR_VERS(fd->version) == 1) @@ -1423,6 +1710,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { /* Cache references up-front if we have unsorted access patterns */ pthread_mutex_lock(&fd->ref_lock); nref = fd->refs->nref; + embed_ref = fd->embed_ref; pthread_mutex_unlock(&fd->ref_lock); if (!fd->no_ref && c->refs_used) { @@ -1439,11 +1727,23 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { goto_err; bam_seq_t *b = c->bams[0]; - if (fd->embed_ref <= 1) { + if (embed_ref <= 1) { char *ref = cram_get_ref(fd, bam_ref(b), 1, 0); if (!ref && bam_ref(b) >= 0) { - hts_log_error("Failed to load reference #%d", bam_ref(b)); - return -1; + if (c->multi_seq || embed_ref == 0 || !c->pos_sorted) { + hts_log_error("Failed to load reference #%d", bam_ref(b)); + return -1; + } + hts_log_warning("Failed to load reference #%d", bam_ref(b)); + hts_log_warning("Enabling embed_ref=2 mode to auto-generate" + " reference"); + if (embed_ref <= 0) + hts_log_warning("NOTE: the CRAM file will be bigger than" + " using an external reference"); + pthread_mutex_lock(&fd->ref_lock); + embed_ref = fd->embed_ref = 2; + pthread_mutex_unlock(&fd->ref_lock); + goto auto_ref; } if ((c->ref_id = bam_ref(b)) >= 0) { c->ref_seq_id = c->ref_id; @@ -1452,6 +1752,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { c->ref_end = fd->refs->ref_id[c->ref_seq_id]->length; } } else { + auto_ref: // Auto-embed ref. // This starts as 'N' and is amended on-the-fly as we go // based on MD:Z tags. @@ -1487,6 +1788,14 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // is done within process_one_read(). 
kstring_t MD = {0}; + // Embed consensus / MD-generated ref + if (embed_ref == 2) { + if (cram_generate_reference(c, s, r1) < 0) { + hts_log_error("Failed to build reference"); + return -1; + } + } + // Iterate through records creating the cram blocks for some // fields and just gathering stats for others. for (r2 = 0; r1 < c->curr_c_rec && r2 < s->hdr->num_records; r1++, r2++) { @@ -1514,7 +1823,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } } - if (process_one_read(fd, c, s, cr, b, r2, &MD) != 0) { + if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref) != 0) { free(MD.s); return -1; } @@ -1525,6 +1834,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (last_base < cr->aend) last_base = cr->aend; } + free(MD.s); // Process_one_read doesn't add read names as it can change @@ -1901,7 +2211,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { for (i = 0; i < c->curr_slice; i++) { hts_log_info("Encode slice %d", i); - if (cram_encode_slice(fd, c, h, c->slices[i]) != 0) + int local_embed_ref = + embed_ref>0 && c->slices[i]->hdr->ref_seq_id != -1 ? 1 : 0; + if (cram_encode_slice(fd, c, h, c->slices[i], local_embed_ref) != 0) return -1; } @@ -1916,7 +2228,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { h->AP_delta = c->pos_sorted; memcpy(h->substitution_matrix, CRAM_SUBST_MATRIX, 20); - if (!(c_hdr = cram_encode_compression_header(fd, c, h))) + if (!(c_hdr = cram_encode_compression_header(fd, c, h, embed_ref))) return -1; } @@ -2757,216 +3069,6 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { return c; } -// Returns the next cigar op code: one of the BAM_C* codes, -// or -1 if no more are present. 
-static inline -int next_cigar_op(uint32_t *cigar, int *ncigar, int *skip, int *spos, - uint32_t *cig_ind, uint32_t *cig_op, uint32_t *cig_len) { - for(;;) { - while (*cig_len == 0) { - if (*cig_ind < *ncigar) { - *cig_op = cigar[*cig_ind] & BAM_CIGAR_MASK; - *cig_len = cigar[*cig_ind] >> BAM_CIGAR_SHIFT; - (*cig_ind)++; - } else { - return -1; - } - } - - if (skip[*cig_op]) { - *spos += (bam_cigar_type(*cig_op)&1) * *cig_len; - *cig_len = 0; - continue; - } - - (*cig_len)--; - break; - } - - return *cig_op; -} - -// Set a base in the computed reference. -// As we fill this out record by record as we go, and we encode the -// sequence against the reference we've computed so far, once we set a -// reference is must never change. So if the reference inferred by one -// SEQ+MD differs to the reference inferred by another SEQ+MD, the latter -// is warned about and the reference remains unchanged to ensure -// round-trips. -// -// In order to spot N->N->G type edits, where "N" and "G" are two inferred -// ref from two sequences, we use N->n->G and patch up the lowercase n later. -// Similarly where the data is unvalidated (faked up MD tag) also get -// assigned lowercase letters. This prevents false warnings when mixing data -// with and without MD tags. -static inline void assign_ref(char *ref, char *set, int pos, - unsigned char base, int validate) { - base = base & ~0x20; // fast toupper for ASCII -#if 1 - if (!set[pos] || ref[pos] == base) { - ref[pos] = base; - set[pos] = 1; - } -#else - // Optional reporting. It's 7% additional CPU cost in process_one_read, - // and maybe not appropriate anyway given there's nothing we can do to - // correct this either than ignore it. It'd need update to explain the - // position too. - if (!set[pos] || ref[pos] == base) { - ref[pos] = base; - set[pos] = validate ? 
1 : 2; // actual MD:Z or guesswork/fake - } else if (validate && set[pos] == 1) { - hts_log_warning("Incompatible MD:Z tags between records"); - } -#endif -} - -static int cram_extend_ref(cram_container *c, bam1_t *b) { - hts_pos_t end = bam_endpos(b); - - if (!c->ref) - c->ref_start = b->core.pos+1; - //c->ref_start = 1; // FIXME, needs to be b->core.pos, but fails - - if (end >= c->ref_end) { - hts_pos_t old_end = c->ref ? c->ref_end : c->ref_start; - c->ref_end = end + 1000 + (end - c->ref_start)*1.5; - - char *r = realloc(c->ref, c->ref_end+1 - c->ref_start); - if (!r) return -1; - c->ref = r; - - r = realloc(c->ref_set, c->ref_end+1 - c->ref_start); - if (!r) return -1; - c->ref_set = r; - - memset(c->ref + old_end - c->ref_start, 'N', c->ref_end - old_end); - memset(c->ref_set + old_end - c->ref_start, 0, c->ref_end - old_end); - c->ref_free = 1; - } - - return 0; -} - -// Converts a bam object with SEQ, POS/CIGAR and MD:Z to a reference. -// Updates ref[] array. -// -// Returns >0 on success, -// 0 on no-MD found, -// -1 on failure (eg inconsistent data) -static int cram_build_ref(bam1_t *b, const uint8_t *MD, - char *ref, char *ref_set, - hts_pos_t ref_start, hts_pos_t ref_len) { - uint8_t *seq = bam_get_seq(b); - uint32_t *cigar = bam_get_cigar(b); - int ncigar = b->core.n_cigar; - uint32_t cig_op = 0, cig_len = 0, cig_ind = 0; - kstring_t fake_MD = KS_INITIALIZE; - int validate = 1; - - if (!MD || *MD != 'Z') { - // Fake it! 
- int i, err = 0; - int run_len = 0; - for (i = 0; i < ncigar; i++) { - switch(cigar[i] & BAM_CIGAR_MASK) { - case BAM_CMATCH: - case BAM_CEQUAL: - case BAM_CDIFF: - run_len += cigar[i] >> BAM_CIGAR_SHIFT; - break; - - case BAM_CDEL: - err |= ksprintf(&fake_MD, "%d", run_len) < 0; - run_len = 0; - err |= kputc('^', &fake_MD); - for (int j = 0; j < cigar[i] >> BAM_CIGAR_SHIFT; j++) - err |= kputc('N', &fake_MD); - break; - } - } - if (run_len) - err |= ksprintf(&fake_MD, "%d", run_len) < 0; - MD = (uint8_t *)fake_MD.s; - if (err < 0) - return -1; - - validate = 0; - } else { - MD++; - } - - // Walk through MD + seq to generate ref - int iseq = 0, next_op; - hts_pos_t iref = b->core.pos+1 - ref_start; - int cig_skip[16] = {0,1,0,1,1,1,1,0,0,1,1,1,1,1,1,1}; - while (iseq < b->core.l_qseq && MD && *MD) { - if (isdigit(*MD)) { - // match - int len = strtol((char *)MD, (char **)&MD, 10); - while (iseq < b->core.l_qseq && len) { - if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip, - &iseq, &cig_ind, &cig_op, - &cig_len)) < 0) - return -1; - - if (next_op != BAM_CMATCH && - next_op != BAM_CEQUAL) { - hts_log_warning("MD:Z and CIGAR are incompatible for " - "record %s", bam_get_qname(b)); - return -1; - } - - if (iref < ref_len) - assign_ref(ref, ref_set, iref, - seq_nt16_str[bam_seqi(seq, iseq)], validate); - iseq++; - iref++; - len--; - } - } else if (*MD == '^') { - // deletion - MD++; - while (*MD && isalpha(*MD)) { - if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip, - &iseq, &cig_ind, &cig_op, - &cig_len)) < 0) - return -1; - - if (next_op != BAM_CDEL) { - hts_log_warning("MD:Z and CIGAR are incompatible"); - return -1; - } - - if (iref < ref_len) - assign_ref(ref, ref_set, iref, toupper(*MD), validate); - - MD++; - iref++; - } - } else { - // substitution - if ((next_op = next_cigar_op(cigar, &ncigar, cig_skip, - &iseq, &cig_ind, &cig_op, - &cig_len)) < 0) - return -1; - - if (next_op != BAM_CMATCH && next_op != BAM_CDIFF) { - hts_log_warning("MD:Z and CIGAR 
are incompatible"); - return -1; - } - if (iref < ref_len) - assign_ref(ref, ref_set, iref, toupper(*MD), validate); - - MD++; - iref++; - iseq++; - } - } - - ks_free(&fake_MD); - return 1; -} /* * Converts a single bam record into a cram record. @@ -2977,7 +3079,8 @@ static int cram_build_ref(bam1_t *b, const uint8_t *MD, */ static int process_one_read(cram_fd *fd, cram_container *c, cram_slice *s, cram_record *cr, - bam_seq_t *b, int rnum, kstring_t *MD) { + bam_seq_t *b, int rnum, kstring_t *MD, + int embed_ref) { int i, fake_qual = -1, NM = 0; char *cp, *rg; char *ref, *seq, *qual; @@ -3001,10 +3104,8 @@ static int process_one_read(cram_fd *fd, cram_container *c, MD->l = 0; int cf_tag = 0; - if (/*md &&*/ fd->embed_ref == 2) { - // Auto-generate and embed ref - cram_extend_ref(c, b); - cram_build_ref(b, md, c->ref, c->ref_set, c->ref_start, c->ref_end); + + if (embed_ref == 2) { cf_tag = MD ? 0 : 1; // No MD cf_tag |= bam_aux_get(b, "NM") ? 0 : 2; // No NM } @@ -3638,9 +3739,12 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { * The multi_seq var here refers to our intention for the next slice. * This slice has already been encoded so we output as-is. */ + pthread_mutex_lock(&fd->ref_lock); + int embed_ref = fd->embed_ref; + pthread_mutex_unlock(&fd->ref_lock); if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 && fd->last_slice && fd->last_slice < c->max_rec/4+10 && - !fd->embed_ref) { + embed_ref<=0) { if (!c->multi_seq) hts_log_info("Multi-ref enabled for next container"); multi_seq = 1; @@ -3698,8 +3802,8 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { c->slice_rec = c->curr_rec; // Have we seen this reference before? 
- if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref && !fd->embed_ref && - !fd->unsorted && multi_seq) { + if (bam_ref(b) >= 0 && curr_ref >= 0 && bam_ref(b) != curr_ref && + embed_ref<=0 && !fd->unsorted && multi_seq) { if (!c->refs_used) { pthread_mutex_lock(&fd->ref_lock); diff --git a/cram/cram_encode.h b/cram/cram_encode.h index 7cccae9af..03b8054e8 100644 --- a/cram/cram_encode.h +++ b/cram/cram_encode.h @@ -74,7 +74,8 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b); * NULL on failure */ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, - cram_block_compression_hdr *h); + cram_block_compression_hdr *h, + int embed_ref); /*! INTERNAL: * Encodes a slice compression header. diff --git a/cram/cram_external.c b/cram/cram_external.c index 098accde9..329f1ec63 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -331,7 +331,7 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out, return -1; if (cram_block_compression_hdr_decoder2encoder(in, ch) != 0) return -1; - n_blk = cram_encode_compression_header(in, c, ch); + n_blk = cram_encode_compression_header(in, c, ch, in->embed_ref); cram_free_compression_header(ch); /* diff --git a/cram/cram_io.c b/cram/cram_io.c index e0d203469..968b7b119 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -3561,7 +3561,7 @@ int cram_load_reference(cram_fd *fd, char *fn) { if (fn) { fd->refs = refs_load_fai(fd->refs, fn, - !(fd->embed_ref && fd->mode == 'r')); + !(fd->embed_ref>0 && fd->mode == 'r')); fn = fd->refs ? 
fd->refs->fn : NULL; if (!fn) ret = -1; @@ -3712,10 +3712,8 @@ void cram_free_container(cram_container *c) { kh_destroy(m_tagmap, c->tags_used); } - if (c->ref_free) { + if (c->ref_free) free(c->ref); - free(c->ref_set); - } free(c); } @@ -4850,7 +4848,21 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { if (!(md5 = hts_md5_init())) return -1; ref = cram_get_ref(fd, i, 1, rlen); - if (NULL == ref) return -1; + if (NULL == ref) { + if (fd->embed_ref == -1) { + // auto embed-ref + hts_log_warning("No M5 tags present and could not " + "find reference"); + hts_log_warning("Enabling embed_ref=2 option"); + hts_log_warning("NOTE: the CRAM file will be bigger " + "than using an external reference"); + pthread_mutex_lock(&fd->ref_lock); + fd->embed_ref = 2; + pthread_mutex_lock(&fd->ref_lock); + break; + } + return -1; + } rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */ hts_md5_update(md5, ref, rlen); hts_md5_final(buf, md5); @@ -5251,7 +5263,7 @@ cram_fd *cram_dopen(hFILE *fp, const char *filename, const char *mode) { fd->seqs_per_slice = SEQS_PER_SLICE; fd->bases_per_slice = BASES_PER_SLICE; fd->slices_per_container = SLICE_PER_CNT; - fd->embed_ref = 0; + fd->embed_ref = -1; // automatic selection fd->no_ref = 0; fd->ap_delta = 0; fd->ignore_md5 = 0; @@ -5398,7 +5410,7 @@ int cram_write_eof_block(cram_fd *fd) { // block CRC cram_block_compression_hdr ch; memset(&ch, 0, sizeof(ch)); - c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch); + c.comp_hdr_block = cram_encode_compression_header(fd, &c, &ch, 0); c.length = c.comp_hdr_block->byte // Landmark[0] + 5 // block struct diff --git a/cram/cram_structs.h b/cram/cram_structs.h index e03a34e11..16739c2c6 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -474,7 +474,6 @@ struct cram_container { uint32_t n_mapped; // Number of mapped reads int ref_free; // whether 'ref' is owned by us and must be freed. - char *ref_set; // same size as ref. 
Only set for auto embed_ref }; /* From c72eee6465147278b9dee97044474bca69f89dcb Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 19 Jul 2022 12:03:35 +0100 Subject: [PATCH 302/488] time_funcs include and dependency adjustments Fix dependency typo in the Makefile Remove config.h from hts_time_funcs.h as it's likely to be too late to include it by the time that file it read. Add config.h include to test/test_time_funcs.c instead. Add extra includes to hts_time_funcs.h so it stands on its own and doesn't rely on the right headers having been included before it is. --- Makefile | 4 ++-- hts_time_funcs.h | 5 ++++- test/test_time_funcs.c | 1 + 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index dfb666c91..42ad91814 100644 --- a/Makefile +++ b/Makefile @@ -235,7 +235,7 @@ bcf_sr_sort_h = bcf_sr_sort.h $(htslib_synced_bcf_reader_h) $(htslib_kbitset_h) header_h = header.h cram/string_alloc.h cram/pooled_alloc.h $(htslib_khash_h) $(htslib_kstring_h) $(htslib_sam_h) hfile_internal_h = hfile_internal.h $(htslib_hts_defs_h) $(htslib_hfile_h) $(textutils_internal_h) hts_internal_h = hts_internal.h $(htslib_hts_h) $(textutils_internal_h) -hts_time_funcs_h = hts_time_funcs.h config.h +hts_time_funcs_h = hts_time_funcs.h sam_internal_h = sam_internal.h $(htslib_sam_h) textutils_internal_h = textutils_internal.h $(htslib_kstring_h) thread_pool_internal_h = thread_pool_internal.h $(htslib_thread_pool_h) @@ -726,7 +726,7 @@ test/test-parse-reg.o: test/test-parse-reg.c config.h $(htslib_hts_h) $(htslib_s test/test_realn.o: test/test_realn.c config.h $(htslib_hts_h) $(htslib_sam_h) $(htslib_faidx_h) test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(textutils_internal_h) test/test_str2int.o: test/test_str2int.c config.h $(textutils_internal_h) -test/test_time_funcs.o: test/test_time_funcs.c $(htslib_time_funcs_h) +test/test_time_funcs.o: test/test_time_funcs.c config.h $(hts_time_funcs_h) 
test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_hts_log_h) test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h) test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h) diff --git a/hts_time_funcs.h b/hts_time_funcs.h index bc3de14f8..2a0508412 100644 --- a/hts_time_funcs.h +++ b/hts_time_funcs.h @@ -44,7 +44,10 @@ DEALINGS IN THE SOFTWARE. */ Non-derived code is copyright as above. */ -#include +#include +#include +#include +#include static inline int hts_time_normalise(int *tens, int *units, int base) { if (*units < 0 || *units >= base) { diff --git a/test/test_time_funcs.c b/test/test_time_funcs.c index e8c2600cc..9ca292f7a 100644 --- a/test/test_time_funcs.c +++ b/test/test_time_funcs.c @@ -22,6 +22,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +#include #include #include #include From b7addd31513fd301bc3f44fff9bcdf87239523db Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 20 Jul 2022 13:08:36 +0100 Subject: [PATCH 303/488] Support 0 coordinate in BCF The 0 coordinate is valid in VCF specification, but the round-trip VCF -> BCF -> VCF turns MT:0 into MT:4294967296. Add a check to detect this overflow. See #1475 and https://github.com/samtools/bcftools/issues/1753 --- vcf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/vcf.c b/vcf.c index ab2477861..012db0934 100644 --- a/vcf.c +++ b/vcf.c @@ -1413,6 +1413,7 @@ static inline int bcf_read1_core(BGZF *fp, bcf1_t *v) if (ks_resize(&v->indiv, indiv_len ? indiv_len : 1) != 0) return -2; v->rid = le_to_i32(x + 8); v->pos = le_to_u32(x + 12); + if ( v->pos==UINT32_MAX ) v->pos = -1; // this is for telomere coordinate, e.g. 
MT:0 v->rlen = le_to_i32(x + 16); v->qual = le_to_float(x + 20); v->n_info = le_to_u16(x + 24); From c5508d59eefb97eac2af6bb00e97b7e64f81f8da Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 21 Jul 2022 15:27:13 +0100 Subject: [PATCH 304/488] Minimal support for CRAM files with missing @RG headers. The SAMtags spec states that RG:Z: lines should point match an RG ID if RG headers are present, but doesn't explicitly *require* them to be present. The SAM spec itself recommends that RG headers are present. Sadly this means CRAM may need to cope with this semantically inconsistent edge case. Given CRAM stores RG as an integer data series as an index into the corresponding header, in much the same way that BAM stores chromosomes as numeric "tid" values, this makes things challenging. However CRAM can also store text tags, so it's possible to round-trip with missing headers by claiming RG is -1 (unspecified) and then adding a verbatim RG:Z string tag. This is perhaps a bit of a CRAM spec loop hole so it's questionable if this is the correct solution. This works and is decodable by both htslib and htsjdk, but it'll break things like cram_transcode_rg as used by samtools cat. I think this is a pretty unlikely combination of events. Note picard's SamFormatConverter also drops these RG fields. This code also whinges, *once for each and every problematic alignment record*, when RG is absent in the SAM header. It's considerably more work to track which ones we've warned about before and to track all that meta-data across threads in a robust manner, plus this really could be considered to be a poor SAM file. Were it not for the SAM spec explicitly permitting such things (even if recommending against it) I'd reject it outright. Instead brow-beating the SAM creators into fixing the headers could be considered to be a positive outcome. 
Fixes #1479 --- cram/cram_encode.c | 47 ++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index d3dd7a134..1ba1988f4 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -2534,15 +2534,17 @@ static int cram_add_insertion(cram_container *c, cram_slice *s, cram_record *r, * Encodes auxiliary data. Largely duplicated from above, but done so to * keep it simple and avoid a myriad of version ifs. * - * Returns the read-group parsed out of the BAM aux fields on success + * Returns the RG header line pointed to by the BAM aux fields on success, * NULL on failure or no rg present, also sets "*err" to non-zero */ -static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, - cram_slice *s, cram_record *cr, - int verbatim_NM, int verbatim_MD, - int NM, kstring_t *MD, int cf_tag, - int *err) { - char *aux, *orig, *rg = NULL; +static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b, + cram_container *c, + cram_slice *s, cram_record *cr, + int verbatim_NM, int verbatim_MD, + int NM, kstring_t *MD, int cf_tag, + int *err) { + char *aux, *orig; + sam_hrec_rg_t *brg = NULL; int aux_size = bam_get_l_aux(b); cram_block *td_b = c->comp_hdr->TD_blk; int TD_blk_size = BLOCK_SIZE(td_b), new; @@ -2577,11 +2579,17 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_container *c, // RG:Z if (aux[0] == 'R' && aux[1] == 'G' && aux[2] == 'Z') { - rg = &aux[3]; - while (*aux++); - if (CRAM_MAJOR_VERS(fd->version) >= 4) - BLOCK_APPEND(td_b, "RG*", 3); - continue; + char *rg = &aux[3]; + brg = sam_hrecs_find_rg(fd->header->hrecs, rg); + if (brg) { + while (*aux++); + if (CRAM_MAJOR_VERS(fd->version) >= 4) + BLOCK_APPEND(td_b, "RG*", 3); + continue; + } else { + // RG:Z tag will be stored verbatim + hts_log_warning("Missing @RG header for RG \"%s\"", rg); + } } // MD:Z @@ -2938,8 +2946,7 @@ static char *cram_encode_aux(cram_fd *fd, bam_seq_t *b, 
cram_container *c, if (err) *err = 0; - // rg from within bam_aux, not rg from our aux copy. - return rg ? (char *)bam_aux(b) + (rg - orig) : NULL; + return brg; err: block_err: @@ -3082,7 +3089,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, bam_seq_t *b, int rnum, kstring_t *MD, int embed_ref) { int i, fake_qual = -1, NM = 0; - char *cp, *rg; + char *cp; char *ref, *seq, *qual; // Any places with N in seq and/or reference can lead to ambiguous @@ -3426,15 +3433,15 @@ static int process_one_read(cram_fd *fd, cram_container *c, cr->ntags = 0; //cram_stats_add(c->stats[DS_TC], cr->ntags); int err = 0; - rg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, - cf_tag, &err); + sam_hrec_rg_t *brg = + cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, + cf_tag, &err); if (err) goto block_err; /* Read group, identified earlier */ - if (rg) { - sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, rg); - cr->rg = brg ? brg->id : -1; + if (brg) { + cr->rg = brg->id; } else if (CRAM_MAJOR_VERS(fd->version) == 1) { sam_hrec_rg_t *brg = sam_hrecs_find_rg(fd->header->hrecs, "UNKNOWN"); if (!brg) goto block_err; From 542dfb81afd6ebef5a5ea5651555f930d43d572e Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 27 Jul 2022 16:12:29 +0100 Subject: [PATCH 305/488] Fix (un)locking bug introduced in commit 9562aeba --- cram/cram_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 968b7b119..bc5fea915 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -4858,7 +4858,7 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { "than using an external reference"); pthread_mutex_lock(&fd->ref_lock); fd->embed_ref = 2; - pthread_mutex_lock(&fd->ref_lock); + pthread_mutex_unlock(&fd->ref_lock); break; } return -1; From 72dfa6d79fe3a42b81ea35e572031dbd3176b1bd Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 27 Jul 2022 16:54:41 +0100 Subject: [PATCH 306/488] Fix leak of MD5 
context when entering embed_ref=2 mode Delay initialising the context until it's actually needed. --- cram/cram_io.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index bc5fea915..5d01e1318 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -4845,8 +4845,6 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { return -1; } rlen = fd->refs->ref_id[i]->length; - if (!(md5 = hts_md5_init())) - return -1; ref = cram_get_ref(fd, i, 1, rlen); if (NULL == ref) { if (fd->embed_ref == -1) { @@ -4864,6 +4862,8 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { return -1; } rlen = fd->refs->ref_id[i]->length; /* In case it just loaded */ + if (!(md5 = hts_md5_init())) + return -1; hts_md5_update(md5, ref, rlen); hts_md5_final(buf, md5); hts_md5_destroy(md5); From 66a7fad49a8d11d6f66a6cf805d7820dece97a53 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 27 Jul 2022 16:28:32 +0100 Subject: [PATCH 307/488] Add an embed_ref=2 auto-mode test --- test/test.pl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/test/test.pl b/test/test.pl index 514f2508a..455aef18e 100755 --- a/test/test.pl +++ b/test/test.pl @@ -637,6 +637,14 @@ sub test_view testv $opts, "./compare_sam.pl -Baux $md $sam $jsam"; } + # embed_ref=2 mode + my $ersam = "ce#1000.sam"; + my $ercram = "ce#1000_er.tmp.cram"; + my $ersam2 = "${ercram}.sam"; + testv $opts, "./test_view $tv_args -C -p $ercram $ersam"; + testv $opts, "./test_view $tv_args -p $ersam2 $ercram"; + testv $opts, "./compare_sam.pl $ersam $ersam2"; + if ($test_view_failures == 0) { passed($opts, "$sam conversions"); From e92076cd1491545f822d1455c1aa7c4286635541 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 29 Jul 2022 10:26:16 +0100 Subject: [PATCH 308/488] Report HTS_CFLAGS_AVX2 et al in `make print-config` This information (probed for by configure) may also be of use to third parties. 
--- Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Makefile b/Makefile index 42ad91814..bd020bd21 100644 --- a/Makefile +++ b/Makefile @@ -326,6 +326,10 @@ libhts.a: $(LIBHTS_OBJS) -$(RANLIB) $@ print-config: + @echo HTS_CFLAGS_AVX2 = $(HTS_CFLAGS_AVX2) + @echo HTS_CFLAGS_AVX512 = $(HTS_CFLAGS_AVX512) + @echo HTS_CFLAGS_SSE4 = $(HTS_CFLAGS_SSE4) + @echo HTS_HAVE_NEON = $(HTS_HAVE_NEON) @echo LDFLAGS = $(LDFLAGS) @echo LIBHTS_OBJS = $(LIBHTS_OBJS) @echo LIBS = $(LIBS) From 1858eed9c959a0cbb51b71bd8468dd78ca0f82e8 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Wed, 27 Jul 2022 11:31:09 +0100 Subject: [PATCH 309/488] Summer 2022 copyright corrections. --- configure.ac | 2 +- cram/cram_external.c | 2 +- hfile_s3.c | 2 +- hts.c | 2 +- hts_expr.c | 2 +- htscodecs_bundled.mk | 2 +- htslib-s3-plugin.7 | 2 +- htslib/bgzf.h | 2 +- htslib/cram.h | 2 +- htslib/hfile.h | 2 +- htslib/hts.h | 2 +- htslib/hts_expr.h | 2 +- htslib/knetfile.h | 2 +- htslib/kstring.h | 2 +- htslib/sam.h | 2 +- sam.c | 2 +- test/sam.c | 2 +- test/test.pl | 2 +- test/test_expr.c | 2 +- test/test_mod.c | 2 +- version.sh | 2 +- 21 files changed, 21 insertions(+), 21 deletions(-) diff --git a/configure.ac b/configure.ac index a53c08fe3..b848dc633 100644 --- a/configure.ac +++ b/configure.ac @@ -1,6 +1,6 @@ # Configure script for htslib, a C library for high-throughput sequencing data. # -# Copyright (C) 2015-2021 Genome Research Ltd. +# Copyright (C) 2015-2022 Genome Research Ltd. # # Author: John Marshall # diff --git a/cram/cram_external.c b/cram/cram_external.c index 329f1ec63..e88ff838b 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2015, 2018-2020 Genome Research Ltd. +Copyright (c) 2015, 2018-2020, 2022 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/hfile_s3.c b/hfile_s3.c index c9bed1fe1..ce83875c9 100644 --- a/hfile_s3.c +++ b/hfile_s3.c @@ -1,6 +1,6 @@ /* hfile_s3.c -- Amazon S3 backend for low-level file streams. - Copyright (C) 2015-2017, 2019-2021 Genome Research Ltd. + Copyright (C) 2015-2017, 2019-2022 Genome Research Ltd. Author: John Marshall diff --git a/hts.c b/hts.c index c2c4acb89..03809a11b 100644 --- a/hts.c +++ b/hts.c @@ -1,6 +1,6 @@ /* hts.c -- format-neutral I/O, indexing, and iterator API functions. - Copyright (C) 2008, 2009, 2012-2021 Genome Research Ltd. + Copyright (C) 2008, 2009, 2012-2022 Genome Research Ltd. Copyright (C) 2012, 2013 Broad Institute. Author: Heng Li diff --git a/hts_expr.c b/hts_expr.c index 74fe85fce..21d768d67 100644 --- a/hts_expr.c +++ b/hts_expr.c @@ -1,6 +1,6 @@ /* hts_expr.c -- filter expression parsing and processing. - Copyright (C) 2020-2021 Genome Research Ltd. + Copyright (C) 2020-2022 Genome Research Ltd. Author: James Bonfield diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index 64192f085..91a9c39e9 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -1,6 +1,6 @@ # Makefile fragment to add settings needed when bundling htscodecs functions # -# Copyright (C) 2021 Genome Research Ltd. +# Copyright (C) 2021-2022 Genome Research Ltd. # # Author: Rob Davies # diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7 index 359c0fc35..d70626832 100644 --- a/htslib-s3-plugin.7 +++ b/htslib-s3-plugin.7 @@ -2,7 +2,7 @@ .SH NAME s3 plugin \- htslib AWS S3 plugin .\" -.\" Copyright (C) 2021 Genome Research Ltd. +.\" Copyright (C) 2021-2022 Genome Research Ltd. 
.\" .\" Author: Andrew Whitwham .\" diff --git a/htslib/bgzf.h b/htslib/bgzf.h index 24d787bdf..c4ba85679 100644 --- a/htslib/bgzf.h +++ b/htslib/bgzf.h @@ -3,7 +3,7 @@ /* Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos - Copyright (C) 2009, 2013, 2014, 2017, 2018-2019 Genome Research Ltd + Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022 Genome Research Ltd Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/htslib/cram.h b/htslib/cram.h index afeeb3711..8dc6fe1b3 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -1,7 +1,7 @@ /// @file htslib/cram.h /// CRAM format-specific API functions. /* - Copyright (C) 2015, 2016, 2018-2020 Genome Research Ltd. + Copyright (C) 2015, 2016, 2018-2020, 2022 Genome Research Ltd. Author: James Bonfield diff --git a/htslib/hfile.h b/htslib/hfile.h index 92b789acd..6e3a2a22a 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -1,7 +1,7 @@ /// @file htslib/hfile.h /// Buffered low-level input/output streams. /* - Copyright (C) 2013-2021 Genome Research Ltd. + Copyright (C) 2013-2022 Genome Research Ltd. Author: John Marshall diff --git a/htslib/hts.h b/htslib/hts.h index d354b2e2c..5fb968fd0 100644 --- a/htslib/hts.h +++ b/htslib/hts.h @@ -1,7 +1,7 @@ /// @file htslib/hts.h /// Format-neutral I/O, indexing, and iterator API functions. /* - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2022 Genome Research Ltd. Copyright (C) 2010, 2012 Broad Institute. Portions copyright (C) 2003-2006, 2008-2010 by Heng Li diff --git a/htslib/hts_expr.h b/htslib/hts_expr.h index 7e6a9ed2b..2abdddc24 100644 --- a/htslib/hts_expr.h +++ b/htslib/hts_expr.h @@ -1,6 +1,6 @@ /* expr.c -- filter expression parsing and processing. - Copyright (C) 2020 Genome Research Ltd. + Copyright (C) 2020, 2022 Genome Research Ltd. 
Author: James Bonfield diff --git a/htslib/knetfile.h b/htslib/knetfile.h index cfddd6b67..0f2adec83 100644 --- a/htslib/knetfile.h +++ b/htslib/knetfile.h @@ -1,6 +1,6 @@ /* The MIT License - Copyright (c) 2008, 2012, 2014, 2021 Genome Research Ltd (GRL). + Copyright (c) 2008, 2012, 2014, 2021-2022 Genome Research Ltd (GRL). 2010 by Attractive Chaos Permission is hereby granted, free of charge, to any person obtaining diff --git a/htslib/kstring.h b/htslib/kstring.h index 09bc9e3d9..53a19806d 100644 --- a/htslib/kstring.h +++ b/htslib/kstring.h @@ -1,7 +1,7 @@ /* The MIT License Copyright (C) 2011 by Attractive Chaos - Copyright (C) 2013-2014, 2016, 2018-2020 Genome Research Ltd. + Copyright (C) 2013-2014, 2016, 2018-2020, 2022 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/htslib/sam.h b/htslib/sam.h index f0a191a28..5f8c0a554 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1,7 +1,7 @@ /// @file htslib/sam.h /// High-level SAM/BAM/CRAM sequence file operations. /* - Copyright (C) 2008, 2009, 2013-2021 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013-2022 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. Author: Heng Li diff --git a/sam.c b/sam.c index fccff262b..3d479e1ed 100644 --- a/sam.c +++ b/sam.c @@ -1,6 +1,6 @@ /* sam.c -- SAM and BAM file I/O and manipulation. - Copyright (C) 2008-2010, 2012-2021 Genome Research Ltd. + Copyright (C) 2008-2010, 2012-2022 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. Author: Heng Li diff --git a/test/sam.c b/test/sam.c index 49d9210c3..036349f2b 100644 --- a/test/sam.c +++ b/test/sam.c @@ -1,6 +1,6 @@ /* test/sam.c -- SAM/BAM/CRAM API test cases. - Copyright (C) 2014-2020 Genome Research Ltd. + Copyright (C) 2014-2020, 2022 Genome Research Ltd. 
Author: John Marshall diff --git a/test/test.pl b/test/test.pl index 455aef18e..d6c01786a 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1,6 +1,6 @@ #!/usr/bin/env perl # -# Copyright (C) 2012-2021 Genome Research Ltd. +# Copyright (C) 2012-2022 Genome Research Ltd. # # Author: Petr Danecek # diff --git a/test/test_expr.c b/test/test_expr.c index 15e25cf4b..641e89041 100644 --- a/test/test_expr.c +++ b/test/test_expr.c @@ -1,6 +1,6 @@ /* test-expr.c -- Testing: filter expression parsing and processing. - Copyright (C) 2020 Genome Research Ltd. + Copyright (C) 2020, 2022 Genome Research Ltd. Author: James Bonfield diff --git a/test/test_mod.c b/test/test_mod.c index f6f5b0718..3facf5dba 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -1,6 +1,6 @@ /* test/test_mod.c -- testing of base modification functions - Copyright (C) 2020 Genome Research Ltd. + Copyright (C) 2020-2021 Genome Research Ltd. Author: James Bonfield diff --git a/version.sh b/version.sh index 6e6eff016..a13c6df88 100755 --- a/version.sh +++ b/version.sh @@ -3,7 +3,7 @@ # # Author : James Bonfield # -# Copyright (C) 2017-2018 Genome Research Ltd. +# Copyright (C) 2017-2018, 2021 Genome Research Ltd. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal From 3e289217be0e26a9722c51fe129dc98281214659 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 1 Aug 2022 17:02:02 +0100 Subject: [PATCH 310/488] Add a mapping of Zlib to Libdeflate compression levels for BGZF. Libdeflate goes up to compression level 12, with the last 3 levels using a much slower optimal parsing technique. We reserve bgzip levels 8 and 9 for two of these slow modes, and spread the remainder out across levels libdeflate 1-9. We map 1-9 to 1,2,3, 5,6,7,8, 10,12. 
This was designed so that the files are generally smaller than their zlib counterparts while still being faster (except for zlib levels 8 and 9 as noted above). This is based on benchmarks (see below) for various data sets. Hence users will find bgzf -l8 and -l9 considerably slower than before. Ideally we'd support bgzip -l10 to -l12, but this complicates several tools and the htslib format string which assumes it's level+'0' in various places (not just the library, but also the command line tools). This was the simpler and safer option. Realistically no one uses level 9 unless they want maximum compression, and now they're getting it once again. Fixes #1477 CPU time (threaded, but total user CPU via time -f "%U") and file size in bytes. 1. 1GB of Illumina NovaSeq BAM (NovaSeq.10m.bam) Libdeflate Zlib 0 0.99 1000474917 0.68 1000474917 1 9.80 183521324 20.29 213827245 >1 2 14.17 179046201 21.87 205485380 >1 3 15.20 175877610 26.67 195469541 >1 4 16.28 172991407 29.80 176019215 ~3 5 19.36 169087724 38.52 169202888 ~4 6 23.27 165900144 56.30 164719424 >7 7 32.50 163766923 72.45 163327258 ~7 8 57.16 161643808 148.61 160866537 ~9 9 74.91 160953697 295.37 159689582 >10 10 303.28 157126803 11 477.66 155323612 12 659.36 153756096 As an experiment I added zstd here too, with various block sizes or none at all, at level 9, 12 and 19: -9 (unblocked) 146938203 -B1048576 -b9 149801386 -B65536 -b9 160527533 (in ~29s) -B5536 -b12 157251800 (best 64k blocked zstd, in ~95s) -B5536 -b19 144923698 (best 64k blocked zstd, in ~617s) ----------------------------------------------------------------------------- 1GB of ~4000 sample VCF ALL.chr20.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz libdeflate zlib 0 0.66 1000474917 0.70 1000474917 1 4.29 23702390 6.50 31624075 >1 2 6.15 22689795 6.96 30023229 >1 3 6.60 22086968 8.37 28216644 >1 4 6.89 21741126 13.88 21861705 ~4 5 7.26 21312709 15.91 20967100 >6 6 8.29 20575904 21.01 19966632 ~7 7 11.67 19817402 27.06 19529414 
>8 8 20.15 19082333 59.78 18349376 >10 9 26.82 18536813 104.85 17957219 >10 10 104.32 17748987 11 168.52 17297625 12 273.45 16916786 ----------------------------------------------------------------------------- 1GB of 1 sample VCF; many verbose INFO fields HG002_GRCh38_1_22_v4.2_benchmark.vcf.gz libdeflate zlib 0 0.66 1000474917 0.73 1000474917 1 5.17 89779592 9.36 88208451 >2 2 7.87 78071190 9.09 76125060 >3 3 9.16 70155649 8.91 71063897 >3 4 8.00 61555631 17.29 67974286 >4 5 9.26 59090268 18.28 59819372 >5 6 11.24 56259793 20.25 55524441 >7 7 13.33 53421543 21.65 54944394 >7 8 19.12 51953892 26.07 53725388 >7 9 21.17 51870998 28.17 53714621 >7 10 140.56 52822252 11 240.26 50724685 12 452.39 50135214 There is some oddity here with libdeflate level 10 being poorer than level 8 while still being much slower! This is probably some quirk of excessive data redundancy. This data really shows the benefit of zstd instead. Such highly redundant data hugely speeds up with zstd -9 taking approx 8s to encode (when using 64KB blocks) at a size of 49792035, so libdeflate lvl 5 speeds at better than libdeflate size. With block sizes of 1MB that drops from ~50MB to ~39MB too. (Zstd doesn't help nearly as much on the other data sets, so this is likely an excessive redundancy thing.) 
----------------------------------------------------------------------------- Single sample GIAB chr1 bcftools output (85MB worth); more succint libdeflate zlib 0 0.04 85086248 0.08 85086248 1 0.84 18635065 1.61 21071717 >1 2 1.38 17914684 1.84 20227900 >1 3 1.44 17581996 2.05 19477263 >1 4 1.57 17231445 2.58 18139642 >2 5 1.87 16065221 3.06 16855325 >5 6 2.24 15550661 4.05 16433775 >5 7 2.90 15265882 4.63 16084926 >5 8 4.99 14650624 7.45 15784360 >6 9 5.86 14615736 7.49 15778425 >6 10 21.15 14238504 11 29.47 14188808 12 35.88 14180049 ----------------------------------------------------------------------------- 1GB of R10 ONT fastq libdeflate zlib 0 0.60 1000474917 0.68 1000474917 1 17.48 507151512 39.93 522788722 >1 2 27.29 490892251 42.81 513743184 >1 3 32.00 486330031 52.95 506652197 ~1 4 38.66 483198216 54.37 501296019 >2 5 43.60 479970547 78.50 498174825 >2 6 60.20 478146566 137.41 494118811 >2 7 87.72 476998683 196.25 493161028 >2 8 109.78 476648496 249.70 493162311 >2 9 110.21 476658031 249.02 493162210 >2 10 213.13 459363967 11 250.79 457698243 12 287.86 457132946 --- bgzf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bgzf.c b/bgzf.c index e72ed566d..a969b1567 100644 --- a/bgzf.c +++ b/bgzf.c @@ -574,6 +574,8 @@ int bgzf_compress(void *_dst, size_t *dlen, const void *src, size_t slen, int le } else { level = level > 0 ? level : 6; // libdeflate doesn't honour -1 as default // NB levels go up to 12 here. + int lvl_map[] = {0,1,2,3,5,6,7,8,10,12}; + level = lvl_map[level>9 ?9 :level]; struct libdeflate_compressor *z = libdeflate_alloc_compressor(level); if (!z) return -1; From ad80f8e8ee582377d998d5aea9f65975f9bb3322 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 29 Jul 2022 11:53:01 +0100 Subject: [PATCH 311/488] Catch arrays of BCF_BT_NULL in bcf_record_check() Adds a check for INFO/FORMAT values with type 0 ("A MISSING value without an explicit type provided") and number of elements > 0. 
Normally type = 0 and number = 0 is used for a completely missing INFO/FORMAT value. In theory you could use number > 0 to make an array of missing values (at least the specification doesn't explicitly disallow it), however trying to do so results in bcf_fmt_array() reporting "Unexpected type 0" and calling exit(1). HTSJDK also appears to reject this encoding, and nothing appears to write it, so it seems reasonable to say that it's not valid. The check for this encoding is added to bcf_record_check() so it's caught well before the data gets near bcf_fmt_array(). It also avoids problems with bcf_type_shift[], which cannot report the correct size of type = 0 encodings (it would need to multiply by 0, which isn't strictly possible with a shift). Credit to OSS-Fuzz Fixes oss-fuzz 49091 --- vcf.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/vcf.c b/vcf.c index 012db0934..aa7c558be 100644 --- a/vcf.c +++ b/vcf.c @@ -1599,7 +1599,8 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { err |= BCF_ERR_TAG_UNDEF; } if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_shared; - if (((1 << type) & is_valid_type) == 0) { + if (((1 << type) & is_valid_type) == 0 + || (type == BCF_BT_NULL && num > 0)) { if (!reports++ || hts_verbose >= HTS_LOG_DEBUG) hts_log_warning("Bad BCF record at %s:%"PRIhts_pos": Invalid %s type %d (%s)", bcf_seqname_safe(hdr,rec), rec->pos+1, "INFO", type, get_type_name(type)); err |= BCF_ERR_TAG_INVALID; @@ -1623,7 +1624,8 @@ static int bcf_record_check(const bcf_hdr_t *hdr, bcf1_t *rec) { err |= BCF_ERR_TAG_UNDEF; } if (bcf_dec_size_safe(ptr, end, &ptr, &num, &type) != 0) goto bad_indiv; - if (((1 << type) & is_valid_type) == 0) { + if (((1 << type) & is_valid_type) == 0 + || (type == BCF_BT_NULL && num > 0)) { bcf_record_check_err(hdr, rec, "type", &reports, type); err |= BCF_ERR_TAG_INVALID; } From c27092616fd0b11ea6955dbc337d0f581ace8d72 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 14
Jul 2022 10:33:38 +0100 Subject: [PATCH 312/488] Sanitize the 3-state logic for hts_expr_t, plus added some functions. 1. We have strings, numerics, and a null/unknown state. The latter was previously only used as a mechanism for reporting the presence or absence of a tag, eg "[X1]" being the value of X1 if found, or null if not. We also had an override of is_true=1 to permit "![X1]", although sadly this wasn't robust (and still isn't) as it has a dual meaning of X1 not existing or X1 existing but being zero. Our unknown state now uses NaN semantics as defined by IEEE 754 for comparisons and mathematics, but not for conditionals (see below). This means unknown+2 is still unknown (and false). Previously "0/0+2" was true/-nan, it's now false/nan. 2. NaN semantics means <, >, == and != all return false with null/nan values, even when comparing null to null. Note this changes the language results slightly. INCOMPATIBILITY: Previously "[X1] != 0" meant X1 tag exists and is non-zero, or X1 tag does not exist. To avoid that second clause the man page recommends "[X1] && [X1] != 0" to add a clause of checking the tag for existence first. This was illogical and almost certainly not the intended outcome. Now != will be false whenever tag [X1] does not exist so the expression is only true when the value is defined. (The man page expression still works, but has a redundant component.) Similarly "![X1]" now means X1 doesn't exist, rather than the previous interpretation of doesn't exist or is zero. 3. Fix arithmetic on non-existent aux tags. Undefined values are now considered to be false. They are defined as either null strings or NaN doubles, although we use the latter ourselves. (The former are treated the same, because the previous code used them and external methods copying our old style may still do so). Note that, for compatibility with the previous behaviour, the empty string is not false.
Previously attempting to use an undefined value gave a warning message, so expressions like "[X0] + [X1] > 10" would spam when X0 or X1 were absent. Added a hts_expr_val_exists() function to simplify testing for defined values, hts_expr_val_existsT() for defined or undef-but-true (useful in conditionals), and hts_expr_val_undef() to set a variable to be undefined (used when invalidating things). 4. Added an exists() function. "[X0]" is a synonym for X0-exists, and "![X0]" as doesn't exist, but sadly previously "![X0]" was interpreted as "X0-doesn't-exist or X0==0". Given this change and for general clarity, a less ambiguous explicit exists function has been added. Note "exists" means has a known value or has an explicit is_true==1. So null-but-true is still "exists". Hence tags XX:f:nan are considered to exist. 5. Added a default(a,b) function where "a" is returned if defined, otherwise "b". As previously explained the expression "[X0] + [X1] > 10" is false whenever X0 or X1 don't exist, but using "default([X0],0) + default([X1],0) > 10" we can use the sum of the values present, or a single value if one is absent. 6. Added mathematical functions of sqrt, log, exp, and pow. 7. Null and boolean operations are largely unchanged, but for clarification they work as follows, with 0/1 also being false/true and symmetric operations. null == NaN (false) null-but-true == NaN (true) null && x == 0 null || 0 == 0 null || 1 == 1 !null == 1 !!null == 0 !null-but-true == 0 Although we're using NaN internally in order to get the arithmetic consistent, it's not good to assume we rigidly follow all NaN semantics. Specifically in C NaN is considered to be true (so "NaN && 1" is true), but for us it is false.
--- hts_expr.c | 296 +++++++++++++++++++++++++++++++-------- htslib/hts_expr.h | 47 ++++++- test/test_expr.c | 344 +++++++++++++++++++++++++++++----------------- 3 files changed, 496 insertions(+), 191 deletions(-) diff --git a/hts_expr.c b/hts_expr.c index 21d768d67..5e5a132ea 100644 --- a/hts_expr.c +++ b/hts_expr.c @@ -23,7 +23,6 @@ FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ // TODO: -// - add maths functions. pow, sqrt, log, ? // - ?: operator for conditionals? #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h @@ -163,10 +162,52 @@ static int func_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, } break; + case 'd': + if (strncmp(str, "default(", 8) == 0) { + if (expression(filt, data, fn, str+8, end, res)) return -1; + if (**end != ',') + return -1; + (*end)++; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; + if (expression(filt, data, fn, ws(*end), end, &val)) return -1; + func_ok = 1; + if (!hts_expr_val_existsT(res)) { + kstring_t swap = res->s; + *res = val; + val.s = swap; + hts_expr_val_free(&val); + } + } + break; + + case 'e': + if (strncmp(str, "exists(", 7) == 0) { + if (expression(filt, data, fn, str+7, end, res)) return -1; + func_ok = 1; + res->is_true = res->d = hts_expr_val_existsT(res); + res->is_str = 0; + } else if (strncmp(str, "exp(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = 1; + res->d = exp(res->d); + res->is_str = 0; + if (isnan(res->d)) + hts_expr_val_undef(res); + } + + break; + case 'l': if (strncmp(str, "length(", 7) == 0) { if (expression(filt, data, fn, str+7, end, res)) return -1; func_ok = expr_func_length(res); + } else if (strncmp(str, "log(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = 1; + res->d = log(res->d); + res->is_str = 0; + if (isnan(res->d)) + hts_expr_val_undef(res); } break; @@ -179,6 +220,44 @@ static int func_expr(hts_filter_t *filt, void 
*data, hts_expr_sym_func *fn, func_ok = expr_func_max(res); } break; + + case 'p': + if (strncmp(str, "pow(", 4) == 0) { + if (expression(filt, data, fn, str+4, end, res)) return -1; + func_ok = 1; + + if (**end != ',') + return -1; + (*end)++; + hts_expr_val_t val = HTS_EXPR_VAL_INIT; + if (expression(filt, data, fn, ws(*end), end, &val)) return -1; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else if (res->is_str || val.is_str) { + hts_expr_val_free(&val); // arith on strings + return -1; + } else { + func_ok = 1; + res->d = pow(res->d, val.d); + hts_expr_val_free(&val); + res->is_str = 0; + } + + if (isnan(res->d)) + hts_expr_val_undef(res); + } + break; + + case 's': + if (strncmp(str, "sqrt(", 5) == 0) { + if (expression(filt, data, fn, str+5, end, res)) return -1; + func_ok = 1; + res->d = sqrt(res->d); + res->is_str = 0; + if (isnan(res->d)) + hts_expr_val_undef(res); + } + break; } if (func_ok < 0) @@ -286,32 +365,46 @@ static int unary_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, char *str, char **end, hts_expr_val_t *res) { int err; str = ws(str); - if (*str == '+') { - err = simple_expr(filt, data, fn, str+1, end, res); - err |= res->is_str; - res->is_true = res->d != 0; - } else if (*str == '-') { + if (*str == '+' || *str == '-') { err = simple_expr(filt, data, fn, str+1, end, res); - err |= res->is_str; - res->d = -res->d; - res->is_true = res->d != 0; + if (!hts_expr_val_exists(res)) { + hts_expr_val_undef(res); + } else { + err |= res->is_str; + if (*str == '-') + res->d = -res->d; + res->is_true = res->d != 0; + } } else if (*str == '!') { err = unary_expr(filt, data, fn, str+1, end, res); if (res->is_true) { + // Any explicitly true value becomes false res->d = res->is_true = 0; - res->is_str = 0; + } else if (!hts_expr_val_exists(res)) { + // We can also still negate undef values by toggling the + // is_true override value. 
+ res->d = res->is_true = !res->is_true; } else if (res->is_str) { - res->is_str = 0; + // !null = true, !"foo" = false, NOTE: !"" = false also res->d = res->is_true = (res->s.s == NULL); } else { res->d = !(int64_t)res->d; res->is_true = res->d != 0; } + res->is_str = 0; } else if (*str == '~') { err = unary_expr(filt, data, fn, str+1, end, res); - err |= res->is_str; - res->d = ~(int64_t)res->d; - res->is_true = res->d != 0; + if (!hts_expr_val_exists(res)) { + hts_expr_val_undef(res); + } else { + err |= res->is_str; + if (!hts_expr_val_exists(res)) { + hts_expr_val_undef(res); + } else { + res->d = ~(int64_t)res->d; + res->is_true = res->d != 0; + } + } } else { err = simple_expr(filt, data, fn, str, end, res); } @@ -338,7 +431,9 @@ static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, str = ws(str); if (*str == '*' || *str == '/' || *str == '%') { if (unary_expr(filt, data, fn, str+1, end, &val)) return -1; - if (val.is_str || res->is_str) { + if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) { + hts_expr_val_undef(res); + } else if (val.is_str || res->is_str) { hts_expr_val_free(&val); return -1; // arith on strings } @@ -348,12 +443,15 @@ static int mul_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, res->d *= val.d; else if (*str == '/') res->d /= val.d; - else if (*str == '%') - res->d = (int64_t)res->d % (int64_t)val.d; - else + else if (*str == '%') { + if (val.d) + res->d = (int64_t)res->d % (int64_t)val.d; + else + hts_expr_val_undef(res); + } else break; - res->is_true = res->d != 0; + res->is_true = hts_expr_val_exists(res) && (res->d != 0); str = *end; } @@ -378,9 +476,12 @@ static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, hts_expr_val_t val = HTS_EXPR_VAL_INIT; while (*str) { str = ws(str); + int undef = 0; if (*str == '+' || *str == '-') { if (mul_expr(filt, data, fn, str+1, end, &val)) return -1; - if (val.is_str || res->is_str) { + if (!hts_expr_val_exists(&val) || 
!hts_expr_val_exists(res)) { + undef = 1; + } else if (val.is_str || res->is_str) { hts_expr_val_free(&val); return -1; // arith on strings } @@ -393,7 +494,11 @@ static int add_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, else break; - res->is_true = res->d != 0; + if (undef) + hts_expr_val_undef(res); + else + res->is_true = res->d != 0; + str = *end; } @@ -412,11 +517,14 @@ static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, if (add_expr(filt, data, fn, str, end, res)) return -1; hts_expr_val_t val = HTS_EXPR_VAL_INIT; + int undef = 0; for (;;) { str = ws(*end); if (*str == '&' && str[1] != '&') { if (add_expr(filt, data, fn, str+1, end, &val)) return -1; - if (res->is_str || val.is_str) { + if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) { + undef = 1; + } else if (res->is_str || val.is_str) { hts_expr_val_free(&val); return -1; } @@ -426,6 +534,8 @@ static int bitand_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, } } hts_expr_val_free(&val); + if (undef) + hts_expr_val_undef(res); return 0; } @@ -440,11 +550,14 @@ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, if (bitand_expr(filt, data, fn, str, end, res)) return -1; hts_expr_val_t val = HTS_EXPR_VAL_INIT; + int undef = 0; for (;;) { str = ws(*end); if (*str == '^') { if (bitand_expr(filt, data, fn, str+1, end, &val)) return -1; - if (res->is_str || val.is_str) { + if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) { + undef = 1; + } else if (res->is_str || val.is_str) { hts_expr_val_free(&val); return -1; } @@ -454,6 +567,8 @@ static int bitxor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, } } hts_expr_val_free(&val); + if (undef) + hts_expr_val_undef(res); return 0; } @@ -468,11 +583,14 @@ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, if (bitxor_expr(filt, data, fn, str, end, res)) return -1; hts_expr_val_t val = HTS_EXPR_VAL_INIT; + int undef = 0; for (;;) 
{ str = ws(*end); if (*str == '|' && str[1] != '|') { if (bitxor_expr(filt, data, fn, str+1, end, &val)) return -1; - if (res->is_str || val.is_str) { + if (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res)) { + undef = 1; + } else if (res->is_str || val.is_str) { hts_expr_val_free(&val); return -1; } @@ -482,6 +600,8 @@ static int bitor_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, } } hts_expr_val_free(&val); + if (undef) + hts_expr_val_undef(res); return 0; } @@ -500,33 +620,60 @@ static int cmp_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, str = ws(*end); hts_expr_val_t val = HTS_EXPR_VAL_INIT; - int err = 0; + int err = 0, cmp_done = 0; if (*str == '>' && str[1] == '=') { + cmp_done = 1; err = cmp_expr(filt, data, fn, str+2, end, &val); - res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s - ? strcmp(res->s.s, val.s.s) >= 0 - : !res->is_str && !val.is_str && res->d >= val.d; - res->is_str = 0; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true=res->d + = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) >= 0 + : !res->is_str && !val.is_str && res->d >= val.d; + res->is_str = 0; + } } else if (*str == '>') { + cmp_done = 1; err = cmp_expr(filt, data, fn, str+1, end, &val); - res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s - ? strcmp(res->s.s, val.s.s) > 0 - : !res->is_str && !val.is_str && res->d > val.d; - res->is_str = 0; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true=res->d + = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) > 0 + : !res->is_str && !val.is_str && res->d > val.d; + res->is_str = 0; + } } else if (*str == '<' && str[1] == '=') { + cmp_done = 1; err = cmp_expr(filt, data, fn, str+2, end, &val); - res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s - ? 
strcmp(res->s.s, val.s.s) <= 0 - : !res->is_str && !val.is_str && res->d <= val.d; - res->is_str = 0; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true=res->d + = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) <= 0 + : !res->is_str && !val.is_str && res->d <= val.d; + res->is_str = 0; + } } else if (*str == '<') { + cmp_done = 1; err = cmp_expr(filt, data, fn, str+1, end, &val); - res->is_true=res->d = res->is_str && res->s.s && val.is_str && val.s.s - ? strcmp(res->s.s, val.s.s) < 0 - : !res->is_str && !val.is_str && res->d < val.d; - res->is_str = 0; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true=res->d + = res->is_str && res->s.s && val.is_str && val.s.s + ? strcmp(res->s.s, val.s.s) < 0 + : !res->is_str && !val.is_str && res->d < val.d; + res->is_str = 0; + } } + + if (cmp_done && (!hts_expr_val_exists(&val) || !hts_expr_val_exists(res))) + hts_expr_val_undef(res); hts_expr_val_free(&val); return err ? -1 : 0; @@ -546,34 +693,45 @@ static int eq_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, str = ws(*end); - int err = 0; + int err = 0, eq_done = 0; hts_expr_val_t val = HTS_EXPR_VAL_INIT; // numeric vs numeric comparison is as expected // string vs string comparison is as expected // numeric vs string is false if (str[0] == '=' && str[1] == '=') { + eq_done = 1; if ((err = eq_expr(filt, data, fn, str+2, end, &val))) { res->is_true = res->d = 0; } else { - res->is_true = res->d = res->is_str - ? (res->s.s && val.s.s ? strcmp(res->s.s, val.s.s)==0 : 0) - : !res->is_str && !val.is_str && res->d == val.d; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true = res->d = res->is_str + ? 
(res->s.s && val.s.s ?strcmp(res->s.s, val.s.s)==0 :0) + : !res->is_str && !val.is_str && res->d == val.d; + } } res->is_str = 0; } else if (str[0] == '!' && str[1] == '=') { + eq_done = 1; if ((err = eq_expr(filt, data, fn, str+2, end, &val))) { res->is_true = res->d = 0; } else { - res->is_true = res->d = res->is_str - ? (res->s.s && val.s.s ? strcmp(res->s.s, val.s.s) != 0 : 1) - : res->is_str != val.is_str || res->d != val.d; + if (!hts_expr_val_exists(res) || !hts_expr_val_exists(&val)) { + hts_expr_val_undef(res); + } else { + res->is_true = res->d = res->is_str + ? (res->s.s && val.s.s ?strcmp(res->s.s, val.s.s) != 0 :1) + : res->is_str != val.is_str || res->d != val.d; + } } res->is_str = 0; } else if ((str[0] == '=' && str[1] == '~') || (str[0] == '!' && str[1] == '~')) { + eq_done = 1; err = eq_expr(filt, data, fn, str+2, end, &val); if (!val.is_str || !res->is_str) { hts_expr_val_free(&val); @@ -614,6 +772,9 @@ static int eq_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, } res->is_str = 0; } + + if (eq_done && ((!hts_expr_val_exists(&val)) || !hts_expr_val_exists(res))) + hts_expr_val_undef(res); hts_expr_val_free(&val); return err ? 
-1 : 0; @@ -634,16 +795,37 @@ static int and_expr(hts_filter_t *filt, void *data, hts_expr_sym_func *fn, str = ws(*end); if (str[0] == '&' && str[1] == '&') { if (eq_expr(filt, data, fn, str+2, end, &val)) return -1; - res->is_true = res->d = - (res->is_true || (res->is_str && res->s.s) || res->d) && - (val.is_true || (val.is_str && val.s.s) || val.d); - res->is_str = 0; + if (!hts_expr_val_existsT(res) || !hts_expr_val_existsT(&val)) { + hts_expr_val_undef(res); + res->d = 0; + } else { + res->is_true = res->d = + (res->is_true || (res->is_str && res->s.s) || res->d) && + (val.is_true || (val.is_str && val.s.s) || val.d); + res->is_str = 0; + } } else if (str[0] == '|' && str[1] == '|') { if (eq_expr(filt, data, fn, str+2, end, &val)) return -1; - res->is_true = res->d = - res->is_true || (res->is_str && res->s.s) || res->d || - val.is_true || (val.is_str && val.s.s ) || val.d; - res->is_str = 0; + if (!hts_expr_val_existsT(res) && !hts_expr_val_existsT(&val)) { + // neither defined + hts_expr_val_undef(res); + res->d = 0; + } else if (!hts_expr_val_existsT(res) && + !(val.is_true || (val.is_str && val.s.s ) || val.d)) { + // LHS undef and RHS false + hts_expr_val_undef(res); + res->d = 0; + } else if (!hts_expr_val_existsT(&val) && + !(res->is_true || (res->is_str && res->s.s) || res->d)){ + // RHS undef and LHS false + hts_expr_val_undef(res); + res->d = 0; + } else { + res->is_true = res->d = + res->is_true || (res->is_str && res->s.s) || res->d || + val.is_true || (val.is_str && val.s.s ) || val.d; + res->is_str = 0; + } } else { break; } @@ -705,7 +887,7 @@ static int hts_filter_eval_(hts_filter_t *filt, if (res->is_str) { res->is_true |= res->s.s != NULL; res->d = res->is_true; - } else { + } else if (hts_expr_val_exists(res)) { res->is_true |= res->d != 0; } diff --git a/htslib/hts_expr.h b/htslib/hts_expr.h index 2abdddc24..43da89d6a 100644 --- a/htslib/hts_expr.h +++ b/htslib/hts_expr.h @@ -25,18 +25,30 @@ DEALINGS IN THE SOFTWARE. 
*/ #ifndef HTS_EXPR_H #define HTS_EXPR_H +#include #include "kstring.h" #include "hts_defs.h" /// Holds a filter variable. This is also used to return the results. /** - * Note we cope with zero-but-true in order to implement a basic - * "exists(something)" check where "something" may even be zero. + * The expression language has 3-states of string, numeric, and unknown. + * The unknown state is either a NaN numeric or a null string, with both + * internally considered to have the same "unknown" meaning. * - * Eg in the aux tag searching syntax, "[NM]" should return true if - * NM tag exists even if zero. - * Take care when negating this. "[NM] != 0" will be true when - * [NM] is absent, thus consider "[NM] && [NM] != 0". + * These largely match the IEE 754 semantics for NaN comparisons: <, >, ==, + * != all fail, (even NaN == NaN). Similarly arithmetic (+,-,/,*,%) with + * unknown values are still unknown (and false). + * + * The departure from NaN semantics though is that our unknown/null state is + * considered to be false while NaN in C is true. Similarly the false nature + * of our unknown state meants !val becomes true, !!val is once again false, + * val && 1 is false, val || 0 is false, and val || 1 is true along with + * !val || 0 and !val && 1. + * + * Note it is possible for empty strings and zero numbers to also be true. + * An example of this is the aux string '[NM]' which returns true if the + * NM tag is found, regardless of whether it is also zero. However the + * better approach added in 1.16 is 'exists([NM])'. */ typedef struct hts_expr_val_t { char is_str; // Use .s vs .d @@ -45,6 +57,29 @@ typedef struct hts_expr_val_t { double d; // otherwise this } hts_expr_val_t; +/// Returns true if an hts_expr_val_t is defined. +/* An example usage of this is in the SAM expression filter where an + * [X0] aux tag will be the value of X0 (string or numeric) if set, or + * a false nul-string (not the same as an empty one) when not set. 
+ */ +static inline int hts_expr_val_exists(hts_expr_val_t *v) { + return v && !(v->is_str == 1 && v->s.s == NULL) + && !(v->is_str == 0 && isnan(v->d)); +} + +/// Returns true if an hts_expr_val_t is defined or is undef-but-true +static inline int hts_expr_val_existsT(hts_expr_val_t *v) { + return (v && v->is_true) || hts_expr_val_exists(v); +} + +/// Set a value to be undefined (nan). +static inline void hts_expr_val_undef(hts_expr_val_t *v) { + ks_clear(&v->s); + v->is_true = 0; + v->is_str = 0; + v->d = NAN; +} + /// Frees a hts_expr_val_t type. static inline void hts_expr_val_free(hts_expr_val_t *f) { ks_free(&f->s); diff --git a/test/test_expr.c b/test/test_expr.c index 641e89041..ecd1232e4 100644 --- a/test/test_expr.c +++ b/test/test_expr.c @@ -62,16 +62,23 @@ int lookup(void *data, char *str, char **end, hts_expr_val_t *res) { *end = str+5; res->is_str = 1; kputs("", ks_clear(&res->s)); + } else if (strncmp(str, "zero-but-true", 13) == 0) { + *end = str+13; + res->d = 0; + res->is_true = 1; } else if (strncmp(str, "null-but-true", 13) == 0) { *end = str+13; + hts_expr_val_undef(res); res->is_true = 1; - res->is_str = 1; - ks_clear(&res->s); } else if (strncmp(str, "null", 4) == 0) { // null string (eg aux:Z tag is absent) *end = str+4; - res->is_str = 1; - ks_clear(&res->s); + hts_expr_val_undef(res); + } else if (strncmp(str, "nan", 3) == 0) { + // sqrt(-1), 0/0 and similar + // Semantically the same operations as null. + *end = str+3; + hts_expr_val_undef(res); } else { return -1; @@ -94,130 +101,200 @@ static inline int strcmpnull(const char *a, const char *b) { return strcmp(a, b); } +// Compare NAN as equal, for testing we returned the correct values +static inline int cmpfloat(double d1, double d2) { + // If needs be, can use DBL_EPSILON in comparisons here. 
+ return d1 == d2 || (isnan(d1) && isnan(d2)); +} + int test(void) { // These are all valid expressions that should work test_ev tests[] = { - { 1, 1, NULL, "1"}, - { 1, 1, NULL, "+1"}, - { 1, -1, NULL, "-1"}, - { 0, 0, NULL, "!7"}, - { 1, 1, NULL, "!0"}, - { 1, 1, NULL, "!(!7)"}, - { 1, 1, NULL, "!!7"}, - - { 1, 5, NULL, "2+3"}, - { 1, -1, NULL, "2+-3"}, - { 1, 6, NULL, "1+2+3"}, - { 1, 1, NULL, "-2+3"}, - - { 1, 6, NULL, "2*3"}, - { 1, 6, NULL, "1*2*3"}, - { 0, 0, NULL, "2*0"}, - - { 1, 7, NULL, "(7)"}, - { 1, 7, NULL, "((7))"}, - { 1, 21, NULL, "(1+2)*(3+4)"}, - { 1, 14, NULL, "(4*5)-(-2*-3)"}, - - { 1, 1, NULL, "(1+2)*3==9"}, - { 1, 1, NULL, "(1+2)*3!=8"}, - { 0, 0, NULL, "(1+2)*3!=9"}, - { 0, 0, NULL, "(1+2)*3==8"}, - - { 0, 0, NULL, "1>2"}, - { 1, 1, NULL, "1<2"}, - { 0, 0, NULL, "3<3"}, - { 0, 0, NULL, "3>3"}, - { 1, 1, NULL, "9<=9"}, - { 1, 1, NULL, "9>=9"}, - { 1, 1, NULL, "2*4==8"}, - { 1, 1, NULL, "16==0x10"}, - { 1, 1, NULL, "15<0x10"}, - { 1, 1, NULL, "17>0x10"}, - { 0, 0, NULL, "2*4!=8"}, - { 1, 1, NULL, "4+2<3+4"}, - { 0, 0, NULL, "4*2<3+4"}, - { 1, 8, NULL, "4*(2<3)+4"}, // boolean; 4*(1)+4 - - { 1, 1, NULL, "(1<2) == (3>2)"}, - { 1, 1, NULL, "1<2 == 3>2"}, - - { 1, 1, NULL, "2 && 1"}, - { 0, 0, NULL, "2 && 0"}, - { 0, 0, NULL, "0 && 2"}, - { 1, 1, NULL, "2 || 1"}, - { 1, 1, NULL, "2 || 0"}, - { 1, 1, NULL, "0 || 2"}, - { 1, 1, NULL, "1 || 2 && 3"}, - { 1, 1, NULL, "2 && 3 || 1"}, - { 1, 1, NULL, "0 && 3 || 2"}, - { 0, 0, NULL, "0 && 3 || 0"}, - { 0, 0, NULL, " 5 - 5 && 1"}, - { 0, 0, NULL, "+5 - 5 && 1"}, - - { 1, 1, NULL, "3 & 1"}, - { 1, 2, NULL, "3 & 2"}, - { 1, 3, NULL, "1 | 2"}, - { 1, 3, NULL, "1 | 3"}, - { 1, 7, NULL, "1 | 6"}, - { 1, 2, NULL, "1 ^ 3"}, - - { 1, 1, NULL, "(1^0)&(4^3)"}, - { 1, 2, NULL, "1 ^(0&4)^ 3"}, - { 1, 2, NULL, "1 ^ 0&4 ^ 3"}, // precedence, & before ^ - - { 1, 6, NULL, "(1|0)^(4|3)"}, - { 1, 7, NULL, "1 |(0^4)| 3"}, - { 1, 7, NULL, "1 | 0^4 | 3"}, // precedence, ^ before | - - { 1, 1, NULL, "4 & 2 || 1"}, - { 1, 1, 
NULL, "(4 & 2) || 1"}, - { 0, 0, NULL, "4 & (2 || 1)"}, - { 1, 1, NULL, "1 || 4 & 2"}, - { 1, 1, NULL, "1 || (4 & 2)"}, - { 0, 0, NULL, "(1 || 4) & 2"}, - - { 1, 1, NULL, " (2*3)&7 > 4"}, - { 0, 0, NULL, " (2*3)&(7 > 4)"}, // C precedence equiv - { 1, 1, NULL, "((2*3)&7) > 4"}, // Python precedence equiv - { 1, 1, NULL, "((2*3)&7) > 4 && 2*2 <= 4"}, - - { 1, 1, "plugh", "magic"}, - { 1, 1, "", "empty"}, - { 1, 1, NULL, "magic == \"plugh\""}, - { 1, 1, NULL, "magic != \"xyzzy\""}, - - { 1, 1, NULL, "\"abc\" < \"def\""}, - { 1, 1, NULL, "\"abc\" <= \"abc\""}, - { 0, 0, NULL, "\"abc\" < \"ab\""}, - { 0, 0, NULL, "\"abc\" <= \"ab\""}, - - { 0, 0, NULL, "\"abc\" > \"def\""}, - { 1, 1, NULL, "\"abc\" >= \"abc\""}, - { 1, 1, NULL, "\"abc\" > \"ab\""}, - { 1, 1, NULL, "\"abc\" >= \"ab\""}, - - { 1, 1, NULL, "\"abbc\" =~ \"^a+b+c+$\""}, - { 0, 0, NULL, "\"aBBc\" =~ \"^a+b+c+$\""}, - { 1, 1, NULL, "\"aBBc\" !~ \"^a+b+c+$\""}, - { 1, 1, NULL, "\"xyzzy plugh abracadabra\" =~ magic"}, - - { 1, 1, "", "empty-but-true" }, - { 0, 0, NULL, "!empty-but-true" }, - { 1, 1, NULL, "!!empty-but-true" }, - { 1, 1, NULL, "1 && empty-but-true && 1" }, - { 0, 0, NULL, "1 && empty-but-true && 0" }, - - { 0, 0, NULL, "null" }, - { 1, 1, NULL, "!null" }, - { 0, 0, NULL, "!!null", }, - - { 1, 1, NULL, "null-but-true" }, - { 0, 0, NULL, "!null-but-true" }, - { 1, 1, NULL, "!!null-but-true" }, - - { 0, 0, NULL, "null || 0" }, - { 1, 1, NULL, "null-but-true && 1" }, + { 1, 1, NULL, "1"}, + { 1, 1, NULL, "+1"}, + { 1, -1, NULL, "-1"}, + { 0, 0, NULL, "!7"}, + { 1, 1, NULL, "!0"}, + { 1, 1, NULL, "!(!7)"}, + { 1, 1, NULL, "!!7"}, + + { 1, 5, NULL, "2+3"}, + { 1, -1, NULL, "2+-3"}, + { 1, 6, NULL, "1+2+3"}, + { 1, 1, NULL, "-2+3"}, + { 0, NAN, NULL, "1+null" }, + { 0, NAN, NULL, "null-1" }, + { 0, NAN, NULL, "-null" }, + + { 1, 6, NULL, "2*3"}, + { 1, 6, NULL, "1*2*3"}, + { 0, 0, NULL, "2*0"}, + + { 1, 7, NULL, "(7)"}, + { 1, 7, NULL, "((7))"}, + { 1, 21, NULL, "(1+2)*(3+4)"}, + { 1, 14, NULL, 
"(4*5)-(-2*-3)"}, + + { 0, NAN, NULL, "2*null"}, + { 0, NAN, NULL, "null/2"}, + { 0, NAN, NULL, "0/0"}, + + { 1, 1, NULL, "(1+2)*3==9"}, + { 1, 1, NULL, "(1+2)*3!=8"}, + { 0, 0, NULL, "(1+2)*3!=9"}, + { 0, 0, NULL, "(1+2)*3==8"}, + + { 0, 0, NULL, "1>2"}, + { 1, 1, NULL, "1<2"}, + { 0, 0, NULL, "3<3"}, + { 0, 0, NULL, "3>3"}, + { 1, 1, NULL, "9<=9"}, + { 1, 1, NULL, "9>=9"}, + { 1, 1, NULL, "2*4==8"}, + { 1, 1, NULL, "16==0x10"}, + { 1, 1, NULL, "15<0x10"}, + { 1, 1, NULL, "17>0x10"}, + { 0, 0, NULL, "2*4!=8"}, + { 1, 1, NULL, "4+2<3+4"}, + { 0, 0, NULL, "4*2<3+4"}, + { 1, 8, NULL, "4*(2<3)+4"}, // boolean; 4*(1)+4 + + { 1, 1, NULL, "(1<2) == (3>2)"}, + { 1, 1, NULL, "1<2 == 3>2"}, + + { 0, NAN, NULL, "null <= 0" }, + { 0, NAN, NULL, "null >= 0" }, + { 0, NAN, NULL, "null < 0" }, + { 0, NAN, NULL, "null > 0" }, + { 0, NAN, NULL, "null == null" }, + { 0, NAN, NULL, "null != null" }, + { 0, NAN, NULL, "null < 10" }, + { 0, NAN, NULL, "10 > null" }, + + { 1, 1, NULL, "2 && 1"}, + { 0, 0, NULL, "2 && 0"}, + { 0, 0, NULL, "0 && 2"}, + { 1, 1, NULL, "2 || 1"}, + { 1, 1, NULL, "2 || 0"}, + { 1, 1, NULL, "0 || 2"}, + { 1, 1, NULL, "1 || 2 && 3"}, + { 1, 1, NULL, "2 && 3 || 1"}, + { 1, 1, NULL, "0 && 3 || 2"}, + { 0, 0, NULL, "0 && 3 || 0"}, + { 0, 0, NULL, " 5 - 5 && 1"}, + { 0, 0, NULL, "+5 - 5 && 1"}, + { 0, 0, NULL, "null && 1"}, // null && x == null + { 0, 0, NULL, "1 && null"}, + { 1, 1, NULL, "!null && 1"}, + { 1, 1, NULL, "1 && !null"}, + { 1, 1, NULL, "1 && null-but-true"}, + { 0, 0, NULL, "null || 0"}, // null || 0 == null + { 0, 0, NULL, "0 || null"}, + { 1, 1, NULL, "!null || 0"}, + { 1, 1, NULL, "0 || !null"}, + { 1, 1, NULL, "0 || null-but-true"}, + { 1, 1, NULL, "null || 1"}, // null || 1 == 1 + { 1, 1, NULL, "1 || null"}, + + { 1, 1, NULL, "3 & 1"}, + { 1, 2, NULL, "3 & 2"}, + { 1, 3, NULL, "1 | 2"}, + { 1, 3, NULL, "1 | 3"}, + { 1, 7, NULL, "1 | 6"}, + { 1, 2, NULL, "1 ^ 3"}, + { 0, NAN, NULL, "1 | null"}, + { 0, NAN, NULL, "null | 1"}, + { 0, NAN, NULL, "1 
& null"}, + { 0, NAN, NULL, "null & 1"}, + { 0, NAN, NULL, "0 ^ null"}, + { 0, NAN, NULL, "null ^ 0"}, + { 0, NAN, NULL, "1 ^ null"}, + { 0, NAN, NULL, "null ^ 1"}, + + { 1, 1, NULL, "(1^0)&(4^3)"}, + { 1, 2, NULL, "1 ^(0&4)^ 3"}, + { 1, 2, NULL, "1 ^ 0&4 ^ 3"}, // precedence, & before ^ + + { 1, 6, NULL, "(1|0)^(4|3)"}, + { 1, 7, NULL, "1 |(0^4)| 3"}, + { 1, 7, NULL, "1 | 0^4 | 3"}, // precedence, ^ before | + + { 1, 1, NULL, "4 & 2 || 1"}, + { 1, 1, NULL, "(4 & 2) || 1"}, + { 0, 0, NULL, "4 & (2 || 1)"}, + { 1, 1, NULL, "1 || 4 & 2"}, + { 1, 1, NULL, "1 || (4 & 2)"}, + { 0, 0, NULL, "(1 || 4) & 2"}, + + { 1, 1, NULL, " (2*3)&7 > 4"}, + { 0, 0, NULL, " (2*3)&(7 > 4)"}, // C precedence equiv + { 1, 1, NULL, "((2*3)&7) > 4"}, // Python precedence equiv + { 1, 1, NULL, "((2*3)&7) > 4 && 2*2 <= 4"}, + + { 1, 1, "plugh", "magic"}, + { 1, 1, "", "empty"}, + { 1, 1, NULL, "magic == \"plugh\""}, + { 1, 1, NULL, "magic != \"xyzzy\""}, + + { 1, 1, NULL, "\"abc\" < \"def\""}, + { 1, 1, NULL, "\"abc\" <= \"abc\""}, + { 0, 0, NULL, "\"abc\" < \"ab\""}, + { 0, 0, NULL, "\"abc\" <= \"ab\""}, + + { 0, 0, NULL, "\"abc\" > \"def\""}, + { 1, 1, NULL, "\"abc\" >= \"abc\""}, + { 1, 1, NULL, "\"abc\" > \"ab\""}, + { 1, 1, NULL, "\"abc\" >= \"ab\""}, + + { 0, NAN, NULL, "null == \"x\"" }, + { 0, NAN, NULL, "null != \"x\"" }, + { 0, NAN, NULL, "null < \"x\"" }, + { 0, NAN, NULL, "null > \"x\"" }, + + { 1, 1, NULL, "\"abbc\" =~ \"^a+b+c+$\""}, + { 0, 0, NULL, "\"aBBc\" =~ \"^a+b+c+$\""}, + { 1, 1, NULL, "\"aBBc\" !~ \"^a+b+c+$\""}, + { 1, 1, NULL, "\"xyzzy plugh abracadabra\" =~ magic"}, + + { 1, 1, "", "empty-but-true" }, + { 0, 0, NULL, "!empty-but-true" }, + { 1, 1, NULL, "!!empty-but-true" }, + { 1, 1, NULL, "1 && empty-but-true && 1" }, + { 0, 0, NULL, "1 && empty-but-true && 0" }, + + { 0, NAN, NULL, "null" }, + { 1, 1, NULL, "!null" }, + { 0, 0, NULL, "!!null", }, + { 0, 0, NULL, "!\"foo\"" }, + { 1, 1, NULL, "!!\"foo\"" }, + + { 1, NAN, NULL, "null-but-true" }, + { 0, 0, NULL, 
"!null-but-true" }, + { 1, 1, NULL, "!!null-but-true" }, + { 1, 0, NULL, "zero-but-true" }, + { 0, 0, NULL, "!zero-but-true" }, + { 1, 1, NULL, "!!zero-but-true" }, + + { 1, log(2), NULL, "log(2)"}, + { 1, exp(9), NULL, "exp(9)"}, + { 1, 9, NULL, "log(exp(9))"}, + { 1, 8, NULL, "pow(2,3)"}, + { 1, 3, NULL, "sqrt(9)"}, + { 0, NAN, NULL, "sqrt(-9)"}, + + { 1, 2, NULL, "default(2,3)"}, + { 1, 3, NULL, "default(null,3)"}, + { 0, 0, NULL, "default(null,0)"}, + { 1, NAN, NULL, "default(null-but-true,0)"}, + { 1, NAN, NULL, "default(null-but-true,null)"}, + { 1, NAN, NULL, "default(null,null-but-true)"}, + + { 1, 1, NULL, "exists(\"foo\")"}, + { 1, 1, NULL, "exists(12)"}, + { 1, 1, NULL, "exists(\"\")"}, + { 1, 1, NULL, "exists(0)"}, + { 0, 0, NULL, "exists(null)"}, + { 1, 1, NULL, "exists(null-but-true)"}, }; int i, res = 0; @@ -234,15 +311,24 @@ int test(void) { continue; } - if (r.is_str && (strcmpnull(r.s.s, tests[i].sval) != 0 - || r.d != tests[i].dval + if (!hts_expr_val_exists(&r)) { + if (r.is_true != tests[i].truth_val || + !cmpfloat(r.d, tests[i].dval)) { + fprintf(stderr, + "Failed test: \"%s\" == \"%f\", got %s, \"%s\", %f\n", + tests[i].str, tests[i].dval, + r.is_true ? "true" : "false", r.s.s, r.d); + res = 1; + } + } else if (r.is_str && (strcmpnull(r.s.s, tests[i].sval) != 0 + || !cmpfloat(r.d, tests[i].dval) || r.is_true != tests[i].truth_val)) { fprintf(stderr, "Failed test: \"%s\" == \"%s\", got %s, \"%s\", %f\n", tests[i].str, tests[i].sval, r.is_true ? "true" : "false", r.s.s, r.d); res = 1; - } else if (!r.is_str && (r.d != tests[i].dval + } else if (!r.is_str && (!cmpfloat(r.d, tests[i].dval) || r.is_true != tests[i].truth_val)) { fprintf(stderr, "Failed test: %s == %f, got %s, %f\n", tests[i].str, tests[i].dval, @@ -264,6 +350,8 @@ int main(int argc, char **argv) { if (hts_filter_eval2(filt, NULL, lookup, &v)) return 1; + printf("%s\t", v.is_true ? 
"true":"false"); + if (v.is_str) puts(v.s.s); else From d77192ae04fe6a57a84b9050d1c0c976f207037d Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 5 Aug 2022 09:58:09 +0100 Subject: [PATCH 313/488] Update to htscodecs 1.3.0 * Substantial reduction to name tokeniser memory. * Improve compiler SIMD support tests * Conditionally build SIMD object files based on configure results * Remove GNU make-ism * Merge Various speed improvements to rANS codecs (PR #53) * Fix compression of data blocks close to 2GB in size. (PR #55) * Rename NEWS to NEWS.md * Switch CI to rocklinux:9 instead of :latest * Preparation for v1.3.0 release * Fix big-endian rans_compress_O0_32x16() to match little-endian * Fix clash with overlapping buffers in 32x16 scalar O1 decoder. * Update README file to acknowledge rans32x16 options * Add a -b (benchmark) option to test/entropy * Add some simple entropy encoder benchmarks * Fix bug with RANS_ORDER_STRIPE on large blocks. --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 9cd552e17..3ef17f6fb 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 9cd552e173055730eb7701ebdbd13f6c579088e4 +Subproject commit 3ef17f6fb5b8b6b0ad2d4c1c562165664f0703f8 From 759f69618b4fcd05d92e6cb46ca9d58e511c0225 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 10 Aug 2022 15:14:52 +0100 Subject: [PATCH 314/488] Recognise FASTA/Q even if the file has a very long header file secondline_is_bases() avoids false positives by verifying that the second line contains only base-encoded alphabetic characters, but if the first line is longer than hts_detect_format2()'s peek buffer, we fail to recognise FASTA or FASTQ. Revise (and rename to is_fastaq()) so the FASTA/Q check is: 1. Check that the (buffered part of the) first line is entirely textual. 2. Return true if we have none of the second line in our peek buffer. 3. 
Check that the (buffered part of the) second line contains only base-encoding alphabetic characters. Fixes samtools/samtools#1689. --- hts.c | 19 ++++++++++++------- test/fastq/fastq.tst | 3 +++ test/fastq/longline.fq | 4 ++++ test/fastq/longline.sam | 1 + 4 files changed, 20 insertions(+), 7 deletions(-) create mode 100644 test/fastq/longline.fq create mode 100644 test/fastq/longline.sam diff --git a/hts.c b/hts.c index 03809a11b..19981d83f 100644 --- a/hts.c +++ b/hts.c @@ -417,12 +417,17 @@ static int is_text_only(const unsigned char *u, const unsigned char *ulim) return 1; } -static int -secondline_is_bases(const unsigned char *u, const unsigned char *ulim) +static int is_fastaq(const unsigned char *u, const unsigned char *ulim) { - // Skip to second line, returning false if there isn't one - u = memchr(u, '\n', ulim - u); - if (u == NULL || ++u == ulim) return 0; + const unsigned char *eol = memchr(u, '\n', ulim - u); + + // Check that the first line is entirely textual + if (! is_text_only(u, eol? 
eol : ulim)) return 0; + + // If the first line is very long, consider the file to indeed be FASTA/Q + if (eol == NULL) return 1; + + u = eol+1; // Now points to the first character of the second line // Scan over all base-encoding letters (including 'N' but not SEQ's '=') while (u < ulim && (seq_nt16_table[*u] != 15 || toupper(*u) == 'N')) { @@ -678,12 +683,12 @@ int hts_detect_format2(hFILE *hfile, const char *fname, htsFormat *fmt) fmt->format = hts_crypt4gh_format; return 0; } - else if (len >= 1 && s[0] == '>' && secondline_is_bases(s, &s[len])) { + else if (len >= 1 && s[0] == '>' && is_fastaq(s, &s[len])) { fmt->category = sequence_data; fmt->format = fasta_format; return 0; } - else if (len >= 1 && s[0] == '@' && secondline_is_bases(s, &s[len])) { + else if (len >= 1 && s[0] == '@' && is_fastaq(s, &s[len])) { fmt->category = sequence_data; fmt->format = fastq_format; return 0; diff --git a/test/fastq/fastq.tst b/test/fastq/fastq.tst index 966f0ed8a..3b5fd9f4f 100644 --- a/test/fastq/fastq.tst +++ b/test/fastq/fastq.tst @@ -44,6 +44,9 @@ P minimal-q.sam $tview minimal.fa P multiline.sam $tview multiline.fq P multiline-q.sam $tview multiline.fa +# FASTQ with a very long header line +P longline.sam $tview -i fastq_aux longline.fq + # Single file, unpaired data, with / without aux tags P single_noaux.sam $tview single.fq P single_noaux-q.sam $tview single.fa diff --git a/test/fastq/longline.fq b/test/fastq/longline.fq new file mode 100644 index 000000000..09cabd1a3 --- /dev/null +++ b/test/fastq/longline.fq @@ -0,0 +1,4 @@ +@readname 
XX:Z:baaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab +ATGC ++ +qqqq diff --git a/test/fastq/longline.sam b/test/fastq/longline.sam new file mode 100644 index 000000000..4dc5e8215 --- /dev/null +++ b/test/fastq/longline.sam @@ -0,0 +1 @@ +readname 4 * 0 0 * * 0 0 ATGC qqqq 
XX:Z:baaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaab From a6ffbf50d55262595a094a46ea41dad3ab59d26d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 10 Aug 2022 12:17:09 +0100 Subject: [PATCH 315/488] Fix SEGV when handling errors from sam_read1_sam without headers --- sam.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sam.c b/sam.c index 3d479e1ed..c95d1c693 100644 --- a/sam.c +++ b/sam.c @@ -4101,7 +4101,7 @@ static inline int sam_read1_sam(htsFile *fp, sam_hdr_t *h, bam1_t *b) { fp->line.l = 0; if (ret < 0) { hts_log_warning("Parse error at line %lld", (long long)fp->lineno); - if (h->ignore_sam_err) goto err_recover; + if (h && h->ignore_sam_err) goto err_recover; } } From 6a043b9e10d6fee9ef5c0a82e3716155b91e1fef Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 5 Jul 2022 17:09:04 +0100 Subject: [PATCH 316/488] Extend VCF API to distinguish between INS and DEL variant types The change is largerly backward API and ABI compatible unless the var_type flag is queried for 
equality in the user program. API alternatives for querying these flags are provided. --- htslib/vcf.h | 32 ++++++++++++++++++++++++-------- test/test-bcf_set_variant_type.c | 6 +++--- vcf.c | 32 ++++++++++++++++++++++++++++---- 3 files changed, 55 insertions(+), 15 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 8f7d79fe3..b8162d7a2 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -138,13 +138,16 @@ extern uint8_t bcf_type_shift[]; #define BCF_BT_FLOAT 5 #define BCF_BT_CHAR 7 -#define VCF_REF 0 -#define VCF_SNP 1 -#define VCF_MNP 2 -#define VCF_INDEL 4 -#define VCF_OTHER 8 -#define VCF_BND 16 // breakend -#define VCF_OVERLAP 32 // overlapping deletion, ALT=* +#define VCF_REF 0 +#define VCF_SNP (1<<0) +#define VCF_MNP (1<<1) +#define VCF_INDEL (1<<2) +#define VCF_OTHER (1<<3) +#define VCF_BND (1<<4) // breakend +#define VCF_OVERLAP (1<<5) // overlapping deletion, ALT=* +#define VCF_INS (1<<6) // implies VCF_INDEL +#define VCF_DEL (1<<7) // implies VCF_INDEL +#define VCF_ANY (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_INS|VCF_DEL) // any variant type (but not VCF_REF) typedef struct bcf_variant_t { int type, n; // variant type and the number of bases affected, negative for deletions @@ -751,7 +754,11 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line); /** - * bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc + * bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc.
(DEPRECATED) + * bcf_has_variant_type[s]() - the preferred way to query the presence of variant types + * @bitmask: combination of VCF_* variant type above, VCF_INDEL implies VCF_INS|VCF_DEL + * @mode: `exact` for an exact match, `overlap` for at least one matching variant, + * `subset` for the listed variants only */ HTSLIB_EXPORT int bcf_get_variant_types(bcf1_t *rec); @@ -759,6 +766,15 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). HTSLIB_EXPORT int bcf_get_variant_type(bcf1_t *rec, int ith_allele); + enum bcf_variant_match { exact, overlap, subset }; + + HTSLIB_EXPORT + int bcf_has_variant_types(bcf1_t *rec, int bitmask, enum bcf_variant_match mode); + + HTSLIB_EXPORT + int bcf_has_variant_type(bcf1_t *rec, int ith_allele, int bitmask, enum bcf_variant_match mode); + + HTSLIB_EXPORT int bcf_is_snp(bcf1_t *v); diff --git a/test/test-bcf_set_variant_type.c b/test/test-bcf_set_variant_type.c index fef212dbb..e5092084e 100644 --- a/test/test-bcf_set_variant_type.c +++ b/test/test-bcf_set_variant_type.c @@ -46,19 +46,19 @@ static void test_bcf_set_variant_type() bcf_set_variant_type("A", "T", &var1); if ( var1.type != VCF_SNP) { - error("A -> T was not detected as a breakend"); + error("A -> T was not detected as a SNP"); } // Test INDEL bcf_variant_t var2a; bcf_set_variant_type("A", "AA", &var2a); - if ( var2a.type != VCF_INDEL) + if ( var2a.type != (VCF_INDEL|VCF_INS) ) { error("A -> AA was not detected as an INDEL"); } bcf_variant_t var2b; bcf_set_variant_type("AA", "A", &var2b); - if ( var2b.type != VCF_INDEL) + if ( var2b.type != (VCF_INDEL|VCF_DEL) ) { error("AA -> A was not detected as a INDEL"); } diff --git a/vcf.c b/vcf.c index aa7c558be..2edf68ddd 100644 --- a/vcf.c +++ b/vcf.c @@ -4196,12 +4196,12 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t { if ( *a==']' || *a=='[' ) { var->type = VCF_BND; return; } // "joined after" breakend while ( *a ) a++; - var->n = (a-alt)-(r-ref); var->type 
= VCF_INDEL; return; + var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_INS; return; } else if ( *r && !*a ) { while ( *r ) r++; - var->n = (a-alt)-(r-ref); var->type = VCF_INDEL; return; + var->n = (a-alt)-(r-ref); var->type = VCF_INDEL | VCF_DEL; return; } else if ( !*r && !*a ) { @@ -4216,13 +4216,13 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t { if ( re==r ) { var->n = 1; var->type = VCF_SNP; return; } var->n = -(re-r); - if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL; return; } + if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_DEL; return; } var->type = VCF_OTHER; return; } else if ( re==r ) { var->n = ae-a; - if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL; return; } + if ( toupper_c(*re)==toupper_c(*ae) ) { var->type = VCF_INDEL | VCF_INS; return; } var->type = VCF_OTHER; return; } @@ -4264,6 +4264,30 @@ int bcf_get_variant_type(bcf1_t *rec, int ith_allele) if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); return rec->d.var[ith_allele].type; } +inline static int _has_variant_type(int type, int bitmask, enum bcf_variant_match mode) +{ + if ( mode==overlap ) return type & bitmask; + + // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may + // ask for say `VCF_INS` or `VCF_INDEL` only + if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL; + else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL); + + if ( mode==subset ) + { + if ( ~bitmask & type ) return 0; + else return bitmask & type; + } + return type==bitmask ? 
type : 0; +} +int bcf_has_variant_type(bcf1_t *rec, int ith_allele, int bitmask, enum bcf_variant_match mode) +{ + return _has_variant_type(bcf_get_variant_type(rec, ith_allele), bitmask, mode); +} +int bcf_has_variant_types(bcf1_t *rec, int bitmask, enum bcf_variant_match mode) +{ + return _has_variant_type(bcf_get_variant_types(rec), bitmask, mode); +} int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) { From 552219e856749acc31900e3b7a0db74e15f98d28 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 6 Jul 2022 13:03:01 +0100 Subject: [PATCH 317/488] Make values returned by bcf_get_variant_type[s] backward compatible When the old bcf_get_variant_type[s] functions are used, the values stored and returned are identical to the old interface. --- vcf.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/vcf.c b/vcf.c index 2edf68ddd..5a1aa2758 100644 --- a/vcf.c +++ b/vcf.c @@ -4257,14 +4257,14 @@ static int bcf_set_variant_types(bcf1_t *b) int bcf_get_variant_types(bcf1_t *rec) { if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); - return rec->d.var_type; + return rec->d.var_type & ~(VCF_INS|VCF_DEL); } int bcf_get_variant_type(bcf1_t *rec, int ith_allele) { if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); - return rec->d.var[ith_allele].type; + return rec->d.var[ith_allele].type & ~(VCF_INS|VCF_DEL); } -inline static int _has_variant_type(int type, int bitmask, enum bcf_variant_match mode) +inline static int has_variant_type(int type, int bitmask, enum bcf_variant_match mode) { if ( mode==overlap ) return type & bitmask; @@ -4282,11 +4282,13 @@ inline static int _has_variant_type(int type, int bitmask, enum bcf_variant_matc } int bcf_has_variant_type(bcf1_t *rec, int ith_allele, int bitmask, enum bcf_variant_match mode) { - return _has_variant_type(bcf_get_variant_type(rec, ith_allele), bitmask, mode); + if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); + return 
has_variant_type(rec->d.var[ith_allele].type, bitmask, mode); } int bcf_has_variant_types(bcf1_t *rec, int bitmask, enum bcf_variant_match mode) { - return _has_variant_type(bcf_get_variant_types(rec), bitmask, mode); + if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); + return has_variant_type(rec->d.var_type, bitmask, mode); } int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) From 8d9193899ef3e34ea8ecc9f3fb72063e74697f69 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 2 Aug 2022 18:19:55 +0100 Subject: [PATCH 318/488] Adjust new variant type interfaces * Remove `mode` from bcf_has_variant_type() interface, and add a special case for `VCF_REF` Individual alleles only have a single variant type, so the only useful mode is the overlap one (bitwise-and). The exception is VCF_REF, which is encoded as 0, so has to be tested for by equality. * Put `bcf_match_` prefix on enumerated values, to avoid name clashes * Make `bitmask` unsigned for more predictable bitwise operations (the return value still has to be signed, though). * Return -1 if bcf_set_variant_types() fails, or if the requested allele is not valid. As callers using the legacy API won't be checking for a -1 return, these unfortunately need to be made to call exit(1) on failure. This is however an improvement on what would have happened under the same conditions before, which would most likely have been a NULL pointer dereference. * Add a bcf_variant_length() function, to more easily access the rec->d.var[].n field. * Be more specific on specifying the mask used to restrict the types the old functions return, in case more are added later. * Improve documentation in the header.
--- htslib/vcf.h | 90 +++++++++++++++++++++++++++++++++++++++++++++++----- vcf.c | 78 ++++++++++++++++++++++++++++++++++----------- 2 files changed, 142 insertions(+), 26 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index b8162d7a2..d3ef6560f 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -753,27 +753,101 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). HTSLIB_EXPORT int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *src_line); + /// Get variant types in a BCF record /** - * bcf_get_variant_type[s]() - returns one of VCF_REF, VCF_SNP, etc. (DEPRECATED) - * bcf_has_variant_type[s]() - the preferred way to query the presence of variant types - * @bitmask: combination of VCF_* variant type above, VCF_INDEL implies VCF_INS|VCF_DEL - * @mode: `exact` for an exact match, `overlap` for at least one matching variant, - * `subset` for the listed variants only + * @param rec BCF/VCF record + * @return Types of variant present + * + * The return value will be a bitwise-or of VCF_SNP, VCF_MNP, + * VCF_INDEL, VCF_OTHER, VCF_BND or VCF_OVERLAP. If will return + * VCF_REF (i.e. 0) if none of the other types is present. + * @deprecated Please use bcf_has_variant_types() instead */ HTSLIB_EXPORT int bcf_get_variant_types(bcf1_t *rec); + /// Get variant type in a BCF record, for a given allele + /** + * @param rec BCF/VCF record + * @param ith_allele Allele to check + * @return Type of variant present + * + * The return value will be one of VCF_REF, VCF_SNP, VCF_MNP, + * VCF_INDEL, VCF_OTHER, VCF_BND or VCF_OVERLAP. 
+ * @deprecated Please use bcf_has_variant_type() instead + */ HTSLIB_EXPORT int bcf_get_variant_type(bcf1_t *rec, int ith_allele); - enum bcf_variant_match { exact, overlap, subset }; + /// Match mode for bcf_has_variant_types() + enum bcf_variant_match { + bcf_match_exact, ///< Types present exactly match tested for + bcf_match_overlap, ///< At least one variant type in common + bcf_match_subset, ///< Test set is a subset of types present + }; + /// Check for presence of variant types in a BCF record + /** + * @param rec BCF/VCF record + * @param bitmask Set of variant types to test for + * @param mode Match mode + * @return >0 if the variant types are present, + * 0 if not present, + * -1 on error + * + * @p bitmask should be the bitwise-or of the variant types (VCF_SNP, + * VCF_MNP, etc.) to test for. + * + * The return value is the bitwise-and of the set of types present + * and @p bitmask. Callers that want to check for the presence of more + * than one type can avoid function call overhead by passing all the + * types to be checked for in a single call to this function, in + * bcf_match_overlap mode, and then check for them individually in the + * returned value. + * + * As VCF_REF is represented by 0 (i.e. the absence of other variants) + * it should be tested for using + * bcf_has_variant_types(rec, VCF_REF, bcf_match_exact) + * which will return 1 if no other variant type is present, otherwise 0. 
+ */ HTSLIB_EXPORT - int bcf_has_variant_types(bcf1_t *rec, int bitmask, enum bcf_variant_match mode); + int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask, enum bcf_variant_match mode); + /// Check for presence of variant types in a BCF record, for a given allele + /** + * @param rec BCF/VCF record + * @param ith_allele Allele to check + * @param bitmask Set of variant types to test for + * @return >0 if one of the variant types is present, + * 0 if not present, + * -1 on error + * + * @p bitmask should be the bitwise-or of the variant types (VCF_SNP, + * VCF_MNP, etc.) to test for, or VCF_REF on its own. + * + * The return value is the bitwise-and of the set of types present + * and @p bitmask. Callers that want to check for the presence of more + * than one type can avoid function call overhead by passing all the + * types to be checked for in a single call to this function, and then + * check for them individually in the returned value. + * + * As a special case, if @p bitmask is VCF_REF (i.e. 0), the function + * tests for an exact match. The return value will be 1 if the + * variant type calculated for the allele is VCF_REF, otherwise if + * any other type is present it will be 0. + */ HTSLIB_EXPORT - int bcf_has_variant_type(bcf1_t *rec, int ith_allele, int bitmask, enum bcf_variant_match mode); + int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask); + /// Return the number of bases affected by a variant, for a given allele + /** + * @param rec BCF/VCF record + * @param ith_allele Allele index + * @return The number of bases affected (negative for deletions), + * or bcf_int32_missing on error. 
+ */ + HTSLIB_EXPORT + int bcf_variant_length(bcf1_t *rec, int ith_allele); HTSLIB_EXPORT int bcf_is_snp(bcf1_t *v); diff --git a/vcf.c b/vcf.c index 5a1aa2758..56af63054 100644 --- a/vcf.c +++ b/vcf.c @@ -4238,7 +4238,10 @@ static int bcf_set_variant_types(bcf1_t *b) bcf_dec_t *d = &b->d; if ( d->n_var < b->n_allele ) { - d->var = (bcf_variant_t *) realloc(d->var, sizeof(bcf_variant_t)*b->n_allele); + bcf_variant_t *new_var = realloc(d->var, sizeof(bcf_variant_t)*b->n_allele); + if (!new_var) + return -1; + d->var = new_var; d->n_var = b->n_allele; } int i; @@ -4254,42 +4257,81 @@ static int bcf_set_variant_types(bcf1_t *b) return 0; } +// bcf_get_variant_type/bcf_get_variant_types should only return the following, +// to be compatible with callers that are not expecting newer values +// like VCF_INS, VCF_DEL. The full set is available from the newer +// vcf_has_variant_type* interfaces. +#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP) int bcf_get_variant_types(bcf1_t *rec) { - if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); - return rec->d.var_type & ~(VCF_INS|VCF_DEL); + if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) { + hts_log_error("Couldn't get variant types: %s", strerror(errno)); + exit(1); // Due to legacy API having no way to report failures + } + } + return rec->d.var_type & ORIG_VAR_TYPES; } + int bcf_get_variant_type(bcf1_t *rec, int ith_allele) { - if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); - return rec->d.var[ith_allele].type & ~(VCF_INS|VCF_DEL); + if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) { + hts_log_error("Couldn't get variant types: %s", strerror(errno)); + exit(1); // Due to legacy API having no way to report failures + } + } + if (ith_allele < 0 || ith_allele >= rec->n_allele) { + hts_log_error("Requested allele outside valid range"); + exit(1); + } + return rec->d.var[ith_allele].type & ORIG_VAR_TYPES; } -inline static int has_variant_type(int 
type, int bitmask, enum bcf_variant_match mode) +#undef ORIG_VAR_TYPES + +int bcf_has_variant_type(bcf1_t *rec, int ith_allele, uint32_t bitmask) { - if ( mode==overlap ) return type & bitmask; + if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) return -1; + } + if (ith_allele < 0 || ith_allele >= rec->n_allele) return -1; + if (bitmask == VCF_REF) { // VCF_REF is 0, so handled as a special case + return rec->d.var[ith_allele].type == VCF_REF; + } + return bitmask & rec->d.var[ith_allele].type; +} + +int bcf_variant_length(bcf1_t *rec, int ith_allele) +{ + if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) return bcf_int32_missing; + } + if (ith_allele < 0 || ith_allele >= rec->n_allele) return bcf_int32_missing; + return rec->d.var[ith_allele].n; +} + +int bcf_has_variant_types(bcf1_t *rec, uint32_t bitmask, + enum bcf_variant_match mode) +{ + if ( rec->d.var_type==-1 ) { + if (bcf_set_variant_types(rec) != 0) return -1; + } + uint32_t type = rec->d.var_type; + if ( mode==bcf_match_overlap ) return bitmask & type; // VCF_INDEL is always set with VCF_INS and VCF_DEL by bcf_set_variant_type[s], but the bitmask may // ask for say `VCF_INS` or `VCF_INDEL` only if ( bitmask&(VCF_INS|VCF_DEL) && !(bitmask&VCF_INDEL) ) type &= ~VCF_INDEL; else if ( bitmask&VCF_INDEL && !(bitmask&(VCF_INS|VCF_DEL)) ) type &= ~(VCF_INS|VCF_DEL); - if ( mode==subset ) + if ( mode==bcf_match_subset ) { if ( ~bitmask & type ) return 0; else return bitmask & type; } + // mode == bcf_match_exact return type==bitmask ? 
type : 0; } -int bcf_has_variant_type(bcf1_t *rec, int ith_allele, int bitmask, enum bcf_variant_match mode) -{ - if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); - return has_variant_type(rec->d.var[ith_allele].type, bitmask, mode); -} -int bcf_has_variant_types(bcf1_t *rec, int bitmask, enum bcf_variant_match mode) -{ - if ( rec->d.var_type==-1 ) bcf_set_variant_types(rec); - return has_variant_type(rec->d.var_type, bitmask, mode); -} int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const void *values, int n, int type) { From 2a646be681f892cebf725a471dcba222db86a43b Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 12 Aug 2022 12:26:20 +0100 Subject: [PATCH 319/488] Add trivial le_to_u8() endianness routine This is even more trivial than le_to_i8() but can be useful for the regular treatment of unsigned types in #define BRANCH type switches. --- htslib/hts_endian.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/htslib/hts_endian.h b/htslib/hts_endian.h index 790d2d5c6..30ad8055d 100644 --- a/htslib/hts_endian.h +++ b/htslib/hts_endian.h @@ -113,6 +113,14 @@ typedef uint64_t uint64_u; # endif #endif +/// Get a uint8_t value from an unsigned byte array +/** @param buf Pointer to source byte, may be unaligned + * @return An 8-bit unsigned integer + */ +static inline uint8_t le_to_u8(const uint8_t *buf) { + return *buf; +} + /// Get a uint16_t value from an unsigned byte array /** @param buf Pointer to source byte, may be unaligned * @return A 16 bit unsigned integer From 8f140eed115bdb18257174420779757546e91c6c Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Fri, 12 Aug 2022 15:13:38 +0100 Subject: [PATCH 320/488] NEWS additions for Summer 2022 Release. 
(PR #1465) Co-authored-by: Rob Davies --- NEWS | 166 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 158 insertions(+), 8 deletions(-) diff --git a/NEWS b/NEWS index 53a0d3c34..fd49755f4 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,156 @@ Noteworthy changes in release a.b ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* Make hfile_s3 refresh AWS credentials on expiry in order to make HTSlib work + better with AWS IAM credentials, which have a limited lifespan. + (PR#1462 and PR#1474, addresses #344) + +* Allow BAM headers between 2GB and 4GB in size once more. This is not + permitted in the BAM specification but was allowed in an earlier version of + HTSlib. There is now a warning at 2GB and a hard failure at 4GB. + (PR#1421, fixes #1420 and samtools#1613. Reported by John Marshall and + R C Mueller) + +* Improve error message when failing to load an index. + (PR#1468, example of the problem samtools#1637) + +* Permit MM (base modification) tags containing "." and "?" suffixes. These + define implicit vs explicit coordinates. See the SAM tags specification for + details. + (PR#1423 and PR#1426, fixes #1418. PR#1469, fixes #1466. Reported + by cjw85) + +* Warn if spaces instead of tabs are detected in a VCF file to prevent + confusion. + (PR#1328, fixes bcftools#1575. Reported by ketkijoshi278) + +* Add an "sclen" filter expression keyword. This is the length of a soft-clip, + both left and right end. It may be combined with qlen (qlen-sclen) to obtain + the number of bases in the query sequence that have been aligned to the genome + ie it provides a way to compare local-alignment vs global-alignment length. + (PR#1441 and PR/samtools#1661, fixes #1436. Requested by Chang Y) + +* Improve error messages for CRAM reference mismatches. If the user specifies + the wrong reference, the CRAM slice header MD5sum checks fail. We now report + the SQ line M5 string too so it is possible to validate against the whole + chr in the ref.fa file. 
The error message has also been improved to report + the reference name instead of #num. Finally, we now hint at the likely cause, + which counters the misleading samtools supplied error of "truncated or + corrupt" file. + (PR#1427, fixes samtools#1640. Reported by Jian-Guo Zhou) + +* Expose more of the CRAM API and add new functionality to extract the reference + from a CRAM file. + (PR#1429 and PR#1442) + +* Improvements to the implementation of embedded references in CRAM where no + external reference is specified. + (PR#1449, addresses some of the issues in #1445) + +* The CRAM writer now allows alignment records with RG:Z: aux tags that + don't have a corresponding @RG ID in the file header. Previously these + tags would have been silently dropped. HTSlib will complain whenever it + has to add one though, as such tags do not conform to recommended practice + for the SAM, BAM and CRAM formats. + (PR#1480, fixes #1479. Reported by Alex Leonard) + +* Set tab delimiter in man page for tabix GFF3 sort. + (PR#1457. Thanks to Colin Diesh) + +* When using libdeflate, the 1...9 scale of BGZF compression levels is + now remapped to the 1...12 range used by libdeflate instead of being + passed directly. In particular, HTSlib levels 8 and 9 now map to + libdeflate levels 10 and 12, so it is possible to select the highest (but + slowest) compression offered by libdeflate. + (PR#1488, fixes #1477. Reported by Gert Hulselmans) + +* The VCF variant API has been extended so that it can return separate flags + for INS and DEL variants as well as the existing INDEL one. These flags + have not been added to the old bcf_get_variant_types() interface as + it could break existing users. To access them, it is necessary to use new + functions bcf_has_variant_type() and bcf_has_variant_types(). + (PR#1467) + +* The missing, but trivial, `le_to_u8()` function has been added to hts_endian. 
+ (PR#1494, Thanks to John Marshall) + +Build changes +------------- + +These are compiler, configuration and makefile based changes. + +* Update htscodecs to version 1.3.0 for new SIMD code + various fixes. + Updates the htscodecs submodule and adds changes necessary to make HTSlib + build the new SIMD codec implementations. + (PR#1438, PR#1489) + +* Fix clang builds under mingw. Under mingw, clang requires dllexport to be + applied to both function declarations and function definitions. + (PR#1435, fixes #1433. Reported by teepean) + +* Fix curl type warning with gcc 12.1 on Windows. + (PR#1443) + +* Detect ARM Neon support and only build appropriate SIMD object files. + (PR#1451, fixes #1450. Thanks to John Marshall) + +* `make print-config` now reports extra CFLAGS that are needed to build the + SIMD parts of htscodecs. These may be of use to third-party build + systems that don't use HTSlib's or htscodecs' build infrastructure. (PR#1485. + Thanks to John Marshall) + +Bug fixes +--------- + +* Fix bug when reading position -1 in BCF (0 in VCF), which is used to indicate + telomeric regions. The BCF reader was incorrectly assuming the value stored + in the file was unsigned, so a VCF->BCF->VCF round-trip would change it + from 0 to 4294967296. + (PR#1476, fixes #1475 and bcftools#1753. Reported by Rodrigo Martin) + +* Various bugs and quirks have been fixed in the filter expression engine, + mostly related to the handling of absent tags, and the is_true flag. + Note that as a result of these fixes, some filter expressions may give + different results: + - Fixed and-expressions including aux tag values which could give an invalid + true result depending on the order of terms. + - The expression `![NM]` is now true if only `NM` does not exist. In + earlier versions it would also report true for tags like `NM:i:0` which + exist but have a value of zero. + - The expression `[X1] != 0` is now false when `X1` does not exist. 
Earlier + versions would return true for this comparison when the tag was missing. + - NULL values due to missing tags now propagate through string, bitwise + and mathematical operations. Logical operations always treat them as + false. + (PR#1463, fixes samtools#1670. Reported by Gert Hulselmans; + PR#1478, fixes samtools#1677. Reported by johnsonzcode) + +* Fix buffer overrun in bam_plp_insertion_mod. Memory now grows to the proper + size needed for base modification data. + (PR#1430, fixes samtools#1652. Reported by hd2326) + +* Remove limit of returned size from fai_retrieve(). + (PR#1446, fixes samtools#1660. Reported by Shane McCarthy) + +* Cap hts_getline() return value at INT_MAX. Prevents hts_getline() from + returning a negative number (a fail) for very long string length values. + (PR#1448. Thanks to John Marshall) + +* Fix breakend detection and test bcf_set_variant_type(). + (PR#1456, fixes #1455. Thanks to Martin Pollard) + +* Prevent arrays of BCF_BT_NULL values found in BCF files from causing + bcf_fmt_array() to call exit() as the type is unsupported. These are + now tested for and caught by bcf_record_check(), which returns an + error code instead. (PR#1486) + +* Improved detection of fasta and fastq files that have very long comments + following identifiers. (PR#1491, thanks to John Marshall. + Fixes samtools/samtools#1689, reported by cjw85) + +* Fixed a SEGV triggered by giving a SAM file to `samtools import`. + (PR#1492) + Noteworthy changes in release 1.15.1 (7th April 2022) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -435,7 +585,7 @@ These are compiler, configuration and makefile based changes. compiler flags. Thanks to John Marshall. (#1187) * Added 'fall through' comments to prevent warnings issued by Clang on - intentional fall through case statements, when building with + intentional fall through case statements, when building with `-Wextra flag`. Thanks to John Marshall. 
(#1163) * Non-configure builds now define _XOPEN_SOURCE=600 to allow them to work @@ -459,7 +609,7 @@ Bug fixes CIGAR segments. Thanks to `@wulj2` for the analysis. (#1202; fixed #1196) * Fixed a tabix bug that prevented setting the correct number of lines to be - skipped in a region file. Thanks to Jim Robinson for reporting it. (#1189; + skipped in a region file. Thanks to Jim Robinson for reporting it. (#1189; fixed #1186) * Made `bam_itr_next` an alias for `sam_itr_next`, to prevent it from crashing @@ -1379,7 +1529,7 @@ Noteworthy changes in release 1.8 (3rd April 2018) Noteworthy changes in release 1.7 (26th January 2018) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -* BAM: HTSlib now supports BAMs which include CIGARs with more than +* BAM: HTSlib now supports BAMs which include CIGARs with more than 65535 operations as per HTS-Specs 18th November (dab57f4 and 2f915a8). * BCF/VCF: @@ -1397,13 +1547,13 @@ Noteworthy changes in release 1.7 (26th January 2018) (#651, #653; reported by Imran Haque and @egafni via pysam). * Multi-region iterator: The new structure takes a list of regions and - iterates over all, deduplicating reads in the process, and producing a - full list of file offset intervals. This is usually much faster than + iterates over all, deduplicating reads in the process, and producing a + full list of file offset intervals. This is usually much faster than repeatedly using the old single-region iterator on a series of regions. * Curl improvements: - Add Bearer token support via HTS_AUTH_LOCATION env (#600). - - Use CURL_CA_BUNDLE environment variable to override the CA (#622; + - Use CURL_CA_BUNDLE environment variable to override the CA (#622; thanks to Garret Kelly & David Alexander). - Speed up (removal of excessive waiting) for both http(s) and ftp. - Avoid repeatedly reconnecting by removal of unnecessary seeks. 
@@ -1412,7 +1562,7 @@ Noteworthy changes in release 1.7 (26th January 2018) * BGZF block caching, if enabled, now performs far better (#629; reported by Ram Yalamanchili). -* Added an hFILE layer for in-memory I/O buffers (#590; thanks to Thomas +* Added an hFILE layer for in-memory I/O buffers (#590; thanks to Thomas Hickman). * Tidied up the drand48 support (intended for systems that do not @@ -1510,7 +1660,7 @@ Release 1.4 (13 March 2017) * HTSlib now links against libbz2 and liblzma by default. To remove these dependencies, run configure with options --disable-bz2 and --disable-lzma, - but note that this may make some CRAM files produced elsewhere unreadable. + but note that this may make some CRAM files produced elsewhere unreadable. * Added a thread pool interface and replaced the bgzf multi-threading code to use this pool. BAM and CRAM decoding is now multi-threaded From b5cc0b76de4511690c938ad6a04d3cbf03bb3ab3 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 15 Aug 2022 16:17:17 +0100 Subject: [PATCH 321/488] Fix clang builds on Windows/mingw. If we use HTSLIB_EXPORT once, it has to be used everywhere for that symbol. Also see #1435 --- htslib/ksort.h | 2 ++ htslib_vars.mk | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/htslib/ksort.h b/htslib/ksort.h index ad19fc47a..fe2933bd5 100644 --- a/htslib/ksort.h +++ b/htslib/ksort.h @@ -64,6 +64,7 @@ #include #include +#include "hts.h" #ifndef klib_unused #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) @@ -81,6 +82,7 @@ extern "C" { // problems on Windows. Don't include htslib/hts_os.h for this as it // may not get on with older attempts to fix this in code that includes // this file. 
+HTSLIB_EXPORT extern double hts_drand48(void); typedef struct { diff --git a/htslib_vars.mk b/htslib_vars.mk index 1f4c0905a..2a7b29450 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -42,7 +42,7 @@ htslib_khash_str2int_h = $(HTSPREFIX)htslib/khash_str2int.h $(htslib_khash_h) htslib_klist_h = $(HTSPREFIX)htslib/klist.h htslib_kroundup_h = $(HTSPREFIX)htslib/kroundup.h htslib_kseq_h = $(HTSPREFIX)htslib/kseq.h -htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h +htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h $(htslib_hts_h) htslib_kstring_h = $(HTSPREFIX)htslib/kstring.h $(htslib_hts_defs_h) $(htslib_kroundup_h) htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h $(htslib_hts_h) htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h) $(htslib_hts_endian_h) From ecc3d0f419aaf9aa7009af60685445878ff1e357 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sat, 13 Aug 2022 22:16:21 +0100 Subject: [PATCH 322/488] Implement endianness conversion in bcf_format_gt() Add a convert() parameter to htslib/vcf.h's BRANCH-style macro, similar to those previously added to all the BRANCH-style macros in vcf.c. Fixes the VCF printing of records with more alleles than fits in BCF_BT_INT8. 
Add a record with GT values >256 to test_bcf2vcf's VCF file, and regenerate the corresponding BCF file via test/test_view -b -l0 -p test/tabix/vcf_file.bcf test/tabix/vcf_file.vcf --- htslib/vcf.h | 20 +++++++++++--------- test/tabix/vcf_file.bcf | Bin 2719 -> 4485 bytes test/tabix/vcf_file.vcf | 1 + 3 files changed, 12 insertions(+), 9 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index d3ef6560f..c94bea589 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -1460,21 +1460,23 @@ static inline int bcf_float_is_vector_end(float f) static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) { uint32_t e = 0; - #define BRANCH(type_t, missing, vector_end) { \ - type_t *ptr = (type_t*) (fmt->p + isample*fmt->size); \ + #define BRANCH(type_t, convert, missing, vector_end) { \ + uint8_t *ptr = fmt->p + isample*fmt->size; \ int i; \ - for (i=0; in && ptr[i]!=vector_end; i++) \ + for (i=0; in; i++, ptr += sizeof(type_t)) \ { \ - if ( i ) e |= kputc("/|"[ptr[i]&1], str) < 0; \ - if ( !(ptr[i]>>1) ) e |= kputc('.', str) < 0; \ - else e |= kputw((ptr[i]>>1) - 1, str) < 0; \ + type_t val = convert(ptr); \ + if ( val == vector_end ) break; \ + if ( i ) e |= kputc("/|"[val&1], str) < 0; \ + if ( !(val>>1) ) e |= kputc('.', str) < 0; \ + else e |= kputw((val>>1) - 1, str) < 0; \ } \ if (i == 0) e |= kputc('.', str) < 0; \ } switch (fmt->type) { - case BCF_BT_INT8: BRANCH(int8_t, bcf_int8_missing, bcf_int8_vector_end); break; - case BCF_BT_INT16: BRANCH(int16_t, bcf_int16_missing, bcf_int16_vector_end); break; - case BCF_BT_INT32: BRANCH(int32_t, bcf_int32_missing, bcf_int32_vector_end); break; + case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, bcf_int8_missing, bcf_int8_vector_end); break; + case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, bcf_int16_missing, bcf_int16_vector_end); break; + case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, bcf_int32_missing, bcf_int32_vector_end); break; case BCF_BT_NULL: e |= kputc('.', str) < 0; break; default: hts_log_error("Unexpected 
type %d", fmt->type); return -2; } diff --git a/test/tabix/vcf_file.bcf b/test/tabix/vcf_file.bcf index 75a64b38c7f62b59c2d8c1009f4cbdff6bda69c7..a4aafec47726dbcc1c335c27fe5810bf13e33923 100644 GIT binary patch delta 1828 zcmXAqF^UyI6h-@g2bGzq1Nmlx1`{iSaoa(Gae-iL;4V6GFfeohuE9WX4~Fi*1-J!0 z=Qz-}tIn%__t$&XJzpQ)J-a;l{P1e=@9+KP^?qJHtfx2Y_dnk*@2`(v_m|~{&^@k? zKb|iq*DWt!pO+V>^Rk>CT=vU$nU`_9Z@2rr-5Kn!_HE;uT!V}GWImZs=9BqgK9~>Y zgZa*UXTCE>cz#pnp$x#hDI2mO8?qtuw&8`yOgK$Ac@BY1^G)+j^KmIM^G)-C z#(pqhXu@eimME}c!eK(5P-G??CM1C(GvP2H`4rjz-zobKxVujTMo^K>Luf*O4k9$6 z28hfgK7+_i$lqJ?ZCg6ik;Z3y#%FxSM|{Lbd>jS#0(*hIKnB#5q!cxkQ&u$t3fnlI z0kxH6z;+t9zWmfzQhqATm=Yh|46y68Kn4_A(hexK;|wTvwE%0rL#M?nF z@CA}*z>#;{25Nz46;caO^Vb42UVippAcI=Kk26s%KsDg}s~OY+-_Hn~-_LrKJSZHE zYrwwZ=4Tu?KZh+ojshC6?E>z=mpzr&0#sfL(0B&5z)ynF3~B)-YrPxrRaVPi3;b|I zcFsSCTH{*a$E2|j;FFrD7NGHVPz(Iz7|oy-Fl>DWwE%3MT7bH~7O-&R{B!dGdY5Xz z{?m8{FiAV81&mv7kNpy*tY-{I<8A=oxcPnkRr8~8-aie{0vhn+dO!o;IkXm_@>+n# xGpGd~bfX#60?+WetT*e`dcB_4tEcDX-TC+F`uz0n=f^keviNWD%ctwd{{Xyx{Wkyr delta 43 wcmZowo-Zmc-_60o00RHn7@VA$7@D{kQ@Cb--6)>I#dhkQgMA9u diff --git a/test/tabix/vcf_file.vcf b/test/tabix/vcf_file.vcf index de0a7c7b6..d3cf30fc8 100644 --- a/test/tabix/vcf_file.vcf +++ b/test/tabix/vcf_file.vcf @@ -35,3 +35,4 @@ 2 3199812 . G GTT,GT 82.7 PASS AN=4;AC=2,2 GT:GQ:DP 1/2:322:26 1/2:322:26 3 3212016 . CTT C,CT 79 PASS AN=4;AC=2,2 GT:GQ:DP 1/2:91:26 1/2:91:26 4 3258448 . TACACACAC T . PASS AN=4;AC=2 GT:GQ:DP 0/1:325:31 0/1:325:31 +4 3258501 . 
C A,T,G,CA,CT,CG,CC,CAA,CAT,CAG,CAC,CTA,CTT,CTG,CTC,CGA,CGT,CGG,CGC,CCA,CCT,CCG,CCC,CAAA,CAAT,CAAG,CAAC,CATA,CATT,CATG,CATC,CAGA,CAGT,CAGG,CAGC,CACA,CACT,CACG,CACC,CTAA,CTAT,CTAG,CTAC,CTTA,CTTT,CTTG,CTTC,CTGA,CTGT,CTGG,CTGC,CTCA,CTCT,CTCG,CTCC,CGAA,CGAT,CGAG,CGAC,CGTA,CGTT,CGTG,CGTC,CGGA,CGGT,CGGG,CGGC,CGCA,CGCT,CGCG,CGCC,CCAA,CCAT,CCAG,CCAC,CCTA,CCTT,CCTG,CCTC,CCGA,CCGT,CCGG,CCGC,CCCA,CCCT,CCCG,CCCC,CAAAA,CAAAT,CAAAG,CAAAC,CAATA,CAATT,CAATG,CAATC,CAAGA,CAAGT,CAAGG,CAAGC,CAACA,CAACT,CAACG,CAACC,CATAA,CATAT,CATAG,CATAC,CATTA,CATTT,CATTG,CATTC,CATGA,CATGT,CATGG,CATGC,CATCA,CATCT,CATCG,CATCC,CAGAA,CAGAT,CAGAG,CAGAC,CAGTA,CAGTT,CAGTG,CAGTC,CAGGA,CAGGT,CAGGG,CAGGC,CAGCA,CAGCT,CAGCG,CAGCC,CACAA,CACAT,CACAG,CACAC,CACTA,CACTT,CACTG,CACTC,CACGA,CACGT,CACGG,CACGC,CACCA,CACCT,CACCG,CACCC,CTAAA,CTAAT,CTAAG,CTAAC,CTATA,CTATT,CTATG,CTATC,CTAGA,CTAGT,CTAGG,CTAGC,CTACA,CTACT,CTACG,CTACC,CTTAA,CTTAT,CTTAG,CTTAC,CTTTA,CTTTT,CTTTG,CTTTC,CTTGA,CTTGT,CTTGG,CTTGC,CTTCA,CTTCT,CTTCG,CTTCC,CTGAA,CTGAT,CTGAG,CTGAC,CTGTA,CTGTT,CTGTG,CTGTC,CTGGA,CTGGT,CTGGG,CTGGC,CTGCA,CTGCT,CTGCG,CTGCC,CTCAA,CTCAT,CTCAG,CTCAC,CTCTA,CTCTT,CTCTG,CTCTC,CTCGA,CTCGT,CTCGG,CTCGC,CTCCA,CTCCT,CTCCG,CTCCC,CGAAA,CGAAT,CGAAG,CGAAC,CGATA,CGATT,CGATG,CGATC,CGAGA,CGAGT,CGAGG,CGAGC,CGACA,CGACT,CGACG,CGACC,CGTAA,CGTAT,CGTAG,CGTAC,CGTTA,CGTTT,CGTTG,CGTTC,CGTGA,CGTGT,CGTGG,CGTGC,CGTCA,CGTCT,CGTCG,CGTCC,CGGAA,CGGAT,CGGAG,CGGAC,CGGTA,CGGTT,CGGTG,CGGTC,CGGGA,CGGGT,CGGGG,CGGGC,CGGCA,CGGCT,CGGCG,CGGCC,CGCAA,CGCAT,CGCAG,CGCAC,CGCTA,CGCTT,CGCTG,CGCTC,CGCGA,CGCGT,CGCGG,CGCGC,CGCCA,CGCCT,CGCCG,CGCCC,CCAAA,CCAAT,CCAAG,CCAAC,CCATA,CCATT,CCATG,CCATC,CCAGA,CCAGT,CCAGG,CCAGC,CCACA,CCACT,CCACG,CCACC,CCTAA,CCTAT,CCTAG,CCTAC,CCTTA,CCTTT,CCTTG,CCTTC,CCTGA,CCTGT 45 PASS AN=4;AC=2 GT 0/300 240/260 From ce7e29837d9a858832d6b5de79bddb5639d22c3f Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 16 Aug 2022 11:42:20 +0100 Subject: [PATCH 323/488] Make ksort.h use hts_defs.h instead of hts.h for HTSLIB_EXPORT Revised implementation of b5cc0b7, 
suggested by John Marshall. See also #1497 --- htslib/ksort.h | 2 +- htslib_vars.mk | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/htslib/ksort.h b/htslib/ksort.h index fe2933bd5..7857d4c77 100644 --- a/htslib/ksort.h +++ b/htslib/ksort.h @@ -64,7 +64,7 @@ #include #include -#include "hts.h" +#include "hts_defs.h" #ifndef klib_unused #if (defined __clang__ && __clang_major__ >= 3) || (defined __GNUC__ && __GNUC__ >= 3) diff --git a/htslib_vars.mk b/htslib_vars.mk index 2a7b29450..6af71863c 100644 --- a/htslib_vars.mk +++ b/htslib_vars.mk @@ -42,7 +42,7 @@ htslib_khash_str2int_h = $(HTSPREFIX)htslib/khash_str2int.h $(htslib_khash_h) htslib_klist_h = $(HTSPREFIX)htslib/klist.h htslib_kroundup_h = $(HTSPREFIX)htslib/kroundup.h htslib_kseq_h = $(HTSPREFIX)htslib/kseq.h -htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h $(htslib_hts_h) +htslib_ksort_h = $(HTSPREFIX)htslib/ksort.h $(htslib_hts_defs_h) htslib_kstring_h = $(HTSPREFIX)htslib/kstring.h $(htslib_hts_defs_h) $(htslib_kroundup_h) htslib_regidx_h = $(HTSPREFIX)htslib/regidx.h $(htslib_hts_h) htslib_sam_h = $(HTSPREFIX)htslib/sam.h $(htslib_hts_h) $(htslib_hts_endian_h) From 136e4a9b13bdcf0e2b8f2245a70f595c8c673a27 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 16 Aug 2022 16:11:38 +0100 Subject: [PATCH 324/488] Fix "make check" on MinGW / MacOS. (PR #1496) Fixes for when "make check" is run without first doing "make". If we haven't previously done a "make" or "make all", the .dll file won't have been created. The test/with-shlib.sh script will then fail as the "cp -p ../../hts-*.dll ." command fails. The analogous non-mingw commands are "ln -s ../../libhts.so.* ." and similar, which don't fail in the $? sense, but do fail in that they produce a bogus target containing a wild-card pattern. On MacOS, building with `--enable-plugins` has no dependency between the plugins and libhts.dylib. That also caused a test failure because test/plugins-dlhts expects to find a dylib file.
- We no longer attempt to execute this test when the external plugins don't exist. - "make plugins" now explicitly builds the shared library. - "make check" and "make test" now has "all" as a dependency. While strictly not everything is a dependency of running the tests, ensuring all the code compiles can be viewed as one of the tests to perform (albeit only executed the first time due to makefile dependency checking rules). --- Makefile | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index bd020bd21..918eff0f7 100644 --- a/Makefile +++ b/Makefile @@ -317,6 +317,9 @@ endif BUILT_PLUGINS = $(PLUGIN_OBJS:.o=$(PLUGIN_EXT)) +ifneq "$(BUILT_PLUGINS)" "" +plugins: lib-shared +endif plugins: $(BUILT_PLUGINS) @@ -563,7 +566,7 @@ SRC = $(srcprefix) # # If using MSYS, avoid poor shell expansion via: # MSYS2_ARG_CONV_EXCL="*" make check -check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) $(HTSCODECS_TEST_TARGETS) +check test: all $(HTSCODECS_TEST_TARGETS) test/hts_endian test/test_expr test/test_kfunc @@ -572,8 +575,12 @@ check test: $(BUILT_PROGRAMS) $(BUILT_TEST_PROGRAMS) $(BUILT_PLUGINS) $(HTSCODEC test/test_time_funcs test/fieldarith test/fieldarith.sam test/hfile - HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -g ./libhts.$(SHLIB_FLAVOUR) - HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -l ./libhts.$(SHLIB_FLAVOUR) + if test "x$(BUILT_PLUGINS)" != "x"; then \ + HTS_PATH=. test/with-shlib.sh test/plugins-dlhts -g ./libhts.$(SHLIB_FLAVOUR); \ + fi + if test "x$(BUILT_PLUGINS)" != "x"; then \ + HTS_PATH=. 
test/with-shlib.sh test/plugins-dlhts -l ./libhts.$(SHLIB_FLAVOUR); \ + fi test/test_bgzf test/bgziptest.txt test/test-parse-reg -t test/colons.bam cd test/sam_filter && ./filter.sh filter.tst From 9fcd248c764c74dc6a4c1e6189a9e0c3204dce5b Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 16 Aug 2022 17:58:51 +0100 Subject: [PATCH 325/488] Improve SIMD detection Change ax_check_compile_flag to use AC_LINK_IFELSE so that it can't cheat by making implicit declarations of the symbols we're testing for. Include _mm256_extract_epi64() in the AVX2 tests. This only exists on X86_64, so trying to build the AVX2 code on i686 didn't work. --- configure.ac | 3 ++- hts_probe_cc.sh | 3 ++- m4/ax_check_compile_flag.m4 | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/configure.ac b/configure.ac index b848dc633..7d40948a6 100644 --- a/configure.ac +++ b/configure.ac @@ -110,7 +110,8 @@ AX_CHECK_COMPILE_FLAG([-mavx2], [ ]],[[ __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); __m256i b = _mm256_add_epi32(a, a); - return *((char *) &b); + long long c = _mm256_extract_epi64(b, 0); + return (int) c; ]])]) dnl Options for rANS32x16 avx512 version diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh index 905279099..37d6bae7e 100755 --- a/hts_probe_cc.sh +++ b/hts_probe_cc.sh @@ -71,7 +71,8 @@ cat - <<'EOF' > conftest.c int main(int argc, char **argv) { __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); __m256i b = _mm256_add_epi32(a, a); - return *((char *) &b); + long long c = _mm256_extract_epi64(b, 0); + return (int) c; } EOF FLAGS="-mavx2" diff --git a/m4/ax_check_compile_flag.m4 b/m4/ax_check_compile_flag.m4 index bd753b34d..16bb46495 100644 --- a/m4/ax_check_compile_flag.m4 +++ b/m4/ax_check_compile_flag.m4 @@ -42,7 +42,7 @@ AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [ ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS 
$4 $1" - AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])], + AC_LINK_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])], [AS_VAR_SET(CACHEVAR,[yes])], [AS_VAR_SET(CACHEVAR,[no])]) _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags]) From 356c4c48a05248f440948d67cf0e792685931296 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 16 Aug 2022 18:05:34 +0100 Subject: [PATCH 326/488] Silence some overflow warnings on i686 Apparently `if (sizeof(time_t) < 8)` isn't enough to keep the compiler from warning about the code that only runs when it's bigger. --- test/test_time_funcs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_time_funcs.c b/test/test_time_funcs.c index 9ca292f7a..0e0512988 100644 --- a/test/test_time_funcs.c +++ b/test/test_time_funcs.c @@ -71,7 +71,8 @@ int main(int argc, char **argv) { if (test_normalised(0, INT_MAX - 1000, 1000) != 0) return EXIT_FAILURE; if (sizeof(time_t) >= 8) { - if (test_normalised(INT_MAX - 1000, (time_t) INT_MAX * 2, 1000) != 0) + if (test_normalised(INT_MAX - 1000, + (time_t)((int64_t) INT_MAX * 2), 1000) != 0) return EXIT_FAILURE; } @@ -116,7 +117,8 @@ int main(int argc, char **argv) { res |= test_specific(2038, 1, 19, 3, 14, 8, (time_t) -1); } else { // 2038-01-19 03:14:08 - res |= test_specific(2038, 1, 19, 3, 14, 8, (time_t) INT_MAX + 1); + res |= test_specific(2038, 1, 19, 3, 14, 8, + (time_t)((int64_t) INT_MAX + 1)); } return res == 0 ? EXIT_SUCCESS : EXIT_FAILURE; From b47469a6d322220d4412798ccaf80e4b5a8f6163 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 16 Aug 2022 18:39:42 +0100 Subject: [PATCH 327/488] Silence a gcc-10 format-truncation warning. It gets upset because in theory the combination of prefix and filename could be longer than the value snprintf() can return. Trying to use a filename that long would be somewhat bizarre. 
--- hts.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hts.c b/hts.c index 19981d83f..8b437f2b9 100644 --- a/hts.c +++ b/hts.c @@ -1354,6 +1354,8 @@ static int hts_crypt4gh_redirect(const char *fn, const char *mode, int ret = -1; if (fn2_len > sizeof(fn_buf)) { + if (fn2_len >= INT_MAX) // Silence gcc format-truncation warning + return -1; fn2 = malloc(fn2_len); if (!fn2) return -1; } From a1013a6e5eca6ba6b1a36b2a2a012952a5cf7df5 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 16 Aug 2022 16:41:02 +0100 Subject: [PATCH 328/488] Fix newer clang warning of unused variable. --- tbx.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tbx.c b/tbx.c index f0310a257..3af2c09fb 100644 --- a/tbx.c +++ b/tbx.c @@ -93,12 +93,11 @@ int tbx_name2id(tbx_t *tbx, const char *ss) int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) { - int i, b = 0, id = 1, ncols = 0; + int i, b = 0, id = 1; char *s; intv->ss = intv->se = 0; intv->beg = intv->end = -1; for (i = 0; i <= len; ++i) { if (line[i] == '\t' || line[i] == 0) { - ++ncols; if (id == conf->sc) { intv->ss = line + b; intv->se = line + i; } else if (id == conf->bc) { From fecbd67cb349896a1f858ee8d4aad320464a34de Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 17 Aug 2022 14:22:32 +0100 Subject: [PATCH 329/488] Add some more NEWS updates. --- NEWS | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/NEWS b/NEWS index fd49755f4..3cc571f52 100644 --- a/NEWS +++ b/NEWS @@ -74,6 +74,9 @@ Noteworthy changes in release a.b * The missing, but trivial, `le_to_u8()` function has been added to hts_endian. (PR#1494, Thanks to John Marshall) +* bcf_format_gt() now works properly on big-endian platforms. + (PR#1495, Thanks to John Marshall) + Build changes ------------- @@ -82,11 +85,11 @@ These are compiler, configuration and makefile based changes. * Update htscodecs to version 1.3.0 for new SIMD code + various fixes. 
Updates the htscodecs submodule and adds changes necessary to make HTSlib build the new SIMD codec implementations. - (PR#1438, PR#1489) + (PR#1438, PR#1489, PR#1500) * Fix clang builds under mingw. Under mingw, clang requires dllexport to be applied to both function declarations and function definitions. - (PR#1435, fixes #1433. Reported by teepean) + (PR#1435, PR#1497, PR#1498 fixes #1433. Reported by teepean) * Fix curl type warning with gcc 12.1 on Windows. (PR#1443) @@ -99,6 +102,11 @@ These are compiler, configuration and makefile based changes. systems that don't use HTSlib's or htscodecs' build infrastructure. (PR#1485. Thanks to John Marshall) +* Fixed some Makefile dependency issues for the "check"/"test" targets + and plugins. In particular, "make check" will now build the "all" target, + if not done already, before running the tests. + (PR#1496) + Bug fixes --------- From 203f5bb7b047c0833dc564b0faf519ff29a098a3 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 19 Aug 2022 09:43:11 +0100 Subject: [PATCH 330/488] Update man SEE ALSO sections from .BR to .IR so the website uses URLs --- bgzip.1 | 5 ++--- htslib-s3-plugin.7 | 4 ++-- tabix.1 | 5 ++--- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/bgzip.1 b/bgzip.1 index 1ada36630..30c2808e5 100644 --- a/bgzip.1 +++ b/bgzip.1 @@ -181,6 +181,5 @@ The BGZF library was originally implemented by Bob Handsaker and modified by Heng Li for remote file access and in-memory caching. .SH SEE ALSO -.PP -.BR gzip (1), -.BR tabix (1) +.IR gzip (1), +.IR tabix (1) diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7 index 9f77cb585..7a8e51a9e 100644 --- a/htslib-s3-plugin.7 +++ b/htslib-s3-plugin.7 @@ -207,8 +207,8 @@ addressing_style or host_bucket. The first two can be set to \fBpath\fR while host_bucket must \fBnot\fR include the \fB%(bucket).s\fR string. 
.SH "SEE ALSO" -.BR htsfile (1) -.BR samtools (1) +.IR htsfile (1) +.IR samtools (1) .PP RFC 3339: .PP diff --git a/tabix.1 b/tabix.1 index 15ac768e7..2d403aaea 100644 --- a/tabix.1 +++ b/tabix.1 @@ -199,6 +199,5 @@ implemented by Bob Handsaker and modified by Heng Li for remote file access and in-memory caching. .SH SEE ALSO -.PP -.BR bgzip (1), -.BR samtools (1) +.IR bgzip (1), +.IR samtools (1) From 503618637124b3bff51ac545f43378b2665d4691 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 22 Aug 2022 18:44:30 +0100 Subject: [PATCH 331/488] Fix n-squared complexity in sample line with many adjacent tabs This could be triggered by a #CHROM line ending in something like: ...\tINFO\t\t\t\t\t\t ... many tabs ... \t\t\tfoo\n Between each pair of tabs, bcf_hdr_add_sample_len() was called with len = 0, as if from bcf_hdr_add_sample(). This made it use strlen(s) instead of 0 as the sample name length, resulting in the addition of a bogus sample name with lots of leading tabs. The sample line parser then moved on to the next tab, and did the same thing again with one fewer leading tab. Fix by making bcf_hdr_add_sample_len() always use the passed-in length, even if 0, allowing the empty sample name trap to do its work. bcf_hdr_add_sample() is updated to call strlen() itself, and to also deal with the backwards-compatibility check where it was permissible to call it with a NULL string.
Credit to OSS-Fuzz Fixes oss-fuzz 47641 --- vcf.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/vcf.c b/vcf.c index 56af63054..4bbf15956 100644 --- a/vcf.c +++ b/vcf.c @@ -93,9 +93,6 @@ static char *find_chrom_header_line(char *s) static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len) { - if ( !s ) return 0; - if (len == 0) len = strlen(s); - const char *ss = s; while ( *ss && isspace_c(*ss) && ss - s < len) ss++; if ( !*ss || ss - s == len) @@ -140,7 +137,12 @@ static int bcf_hdr_add_sample_len(bcf_hdr_t *h, const char *s, size_t len) int bcf_hdr_add_sample(bcf_hdr_t *h, const char *s) { - return bcf_hdr_add_sample_len(h, s, 0); + if (!s) { + // Allowed for backwards-compatibility, calling with s == NULL + // used to trigger bcf_hdr_sync(h); + return 0; + } + return bcf_hdr_add_sample_len(h, s, strlen(s)); } int HTS_RESULT_USED bcf_hdr_parse_sample_line(bcf_hdr_t *hdr, const char *str) From a92615b272f8eb60e0cc5691e2a962f6159f0fdc Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 4 Nov 2021 20:33:17 +0000 Subject: [PATCH 332/488] Add bam_aux_first()/bam_aux_next() tagged aux field iterator API Add new API functions for iterating through a BAM record's aux fields, inline accessor methods for field tag and type (or code can continue to use s-2 and *s), and a variant of bam_aux_del() that returns the (updated) iterator to the following field (for use in iterator-based loops that delete fields). Add test cases for the new API functions. 
--- htslib/sam.h | 53 +++++++++++++++++++++++++++++++++- sam.c | 80 +++++++++++++++++++++++++++++++--------------------- test/sam.c | 39 +++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 33 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index 5f8c0a554..895a14665 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1438,7 +1438,6 @@ int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b, /// Converts a BAM aux tag to SAM format /* - * @param b Pointer to the bam record * @param key Two letter tag key * @param type Single letter type code: ACcSsIifHZB. * @param tag Tag data pointer, in BAM format @@ -1628,6 +1627,29 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, return NULL; } +/// Return a pointer to a BAM record's first aux field +/** @param b Pointer to the BAM record + @return Aux field pointer, or NULL if the record has none + +When NULL is returned, errno will also be set to ENOENT. ("Aux field pointers" +point to the TYPE byte within the auxiliary data for that field; but in general +it is unnecessary for user code to be aware of this.) + */ +HTSLIB_EXPORT +uint8_t *bam_aux_first(const bam1_t *b); + +/// Return a pointer to a BAM record's next aux field +/** @param b Pointer to the BAM record + @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get() + @return Pointer to the next aux field, or NULL if no next field or error + +Whenever NULL is returned, errno will also be set: ENOENT if @p s was the +record's last aux field; otherwise EINVAL, indicating that the BAM record's +aux data is corrupt. 
+ */ +HTSLIB_EXPORT +uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s); + /// Return a pointer to an aux record /** @param b Pointer to the bam record @param tag Desired aux tag @@ -1640,6 +1662,19 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, HTSLIB_EXPORT uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]); +/// Return the aux field's 2-character tag +/** @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get() + @return Pointer to the tag characters, NOT NUL-terminated + */ +static inline +const char *bam_aux_tag(const uint8_t *s) { return (const char *) (s-2); } + +/// Return the aux field's type character +/** @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get() + @return The type character: one of cCsSiI/fd/A/Z/H/B + */ +static inline char bam_aux_type(const uint8_t *s) { return *s; } + /// Return a SAM formatting string containing a BAM tag /** @param b Pointer to the bam record @param tag Desired aux tag @@ -1751,6 +1786,22 @@ int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8 HTSLIB_EXPORT int bam_aux_del(bam1_t *b, uint8_t *s); +/// Delete an aux field from a BAM record +/** @param b The BAM record to update + @param s Pointer to the aux field to delete, as returned by + bam_aux_first()/_next()/_get(); must not be NULL + @return Pointer to the following aux field, or NULL if none or on error + +Identical to @c bam_aux_del() apart from the return value, which is an +aux iterator suitable for use with @c bam_aux_next()/etc. + +Whenever NULL is returned, errno will also be set: ENOENT if the aux field +deleted was the record's last one; otherwise EINVAL, indicating that the +BAM record's aux data is corrupt. 
+ */ +HTSLIB_EXPORT +uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s); + /// Update or add a string-type tag /* @param b The bam record to update @param tag Tag identifier diff --git a/sam.c b/sam.c index c95d1c693..ee88cc23e 100644 --- a/sam.c +++ b/sam.c @@ -4614,31 +4614,42 @@ static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end) } } +uint8_t *bam_aux_first(const bam1_t *b) +{ + uint8_t *s = bam_get_aux(b); + uint8_t *end = b->data + b->l_data; + if (s >= end) { errno = ENOENT; return NULL; } + return s+2; +} + +uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s) +{ + uint8_t *end = b->data + b->l_data; + uint8_t *next = s? skip_aux((uint8_t *) s, end) : end; + if (next == NULL) goto bad_aux; + if (next >= end) { errno = ENOENT; return NULL; } + return next+2; + + bad_aux: + hts_log_error("Corrupted aux data for read %s", bam_get_qname(b)); + errno = EINVAL; + return NULL; +} + uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) { - uint8_t *s, *end, *t = (uint8_t *) tag; - uint16_t y = (uint16_t) t[0]<<8 | t[1]; - s = bam_get_aux(b); - end = b->data + b->l_data; - while (s != NULL && end - s >= 3) { - uint16_t x = (uint16_t) s[0]<<8 | s[1]; - s += 2; - if (x == y) { + uint8_t *s; + for (s = bam_aux_first(b); s; s = bam_aux_next(b, s)) + if (s[-2] == tag[0] && s[-1] == tag[1]) { // Check the tag value is valid and complete - uint8_t *e = skip_aux(s, end); - if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') { - goto bad_aux; // Unterminated string - } - if (e != NULL) { - return s; - } else { - goto bad_aux; - } + uint8_t *e = skip_aux(s, b->data + b->l_data); + if (e == NULL) goto bad_aux; + if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux; + + return s; } - s = skip_aux(s, end); - } - if (s == NULL) goto bad_aux; - errno = ENOENT; + + // errno now as set by bam_aux_first()/bam_aux_next() return NULL; bad_aux: @@ -4647,23 +4658,28 @@ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]) return NULL; } -// s MUST BE returned by 
bam_aux_get() int bam_aux_del(bam1_t *b, uint8_t *s) { - uint8_t *p, *aux; - int l_aux = bam_get_l_aux(b); - aux = bam_get_aux(b); - p = s - 2; - s = skip_aux(s, aux + l_aux); - if (s == NULL) goto bad_aux; - memmove(p, s, l_aux - (s - aux)); - b->l_data -= s - p; - return 0; + s = bam_aux_remove(b, s); + return (s || errno == ENOENT)? 0 : -1; +} + +uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s) +{ + uint8_t *end = b->data + b->l_data; + uint8_t *next = skip_aux(s, end); + if (next == NULL) goto bad_aux; + + b->l_data -= next - (s-2); + if (next >= end) { errno = ENOENT; return NULL; } + + memmove(s-2, next, end - next); + return s; bad_aux: hts_log_error("Corrupted aux data for read %s", bam_get_qname(b)); errno = EINVAL; - return -1; + return NULL; } int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data) diff --git a/test/sam.c b/test/sam.c index 036349f2b..c641b88e6 100644 --- a/test/sam.c +++ b/test/sam.c @@ -87,6 +87,15 @@ uint8_t *check_bam_aux_get(const bam1_t *aln, const char *tag, char type) return NULL; } +static void check_aux_count(const bam1_t *aln, int expected, const char *what) +{ + const uint8_t *itr; + int n = 0; + for (itr = bam_aux_first(aln); itr; itr = bam_aux_next(aln, itr)) n++; + if (n != expected) + fail("%s has %d aux fields, expected %d", what, n, expected); +} + static void check_int_B_array(bam1_t *aln, char *tag, uint32_t nvals, int64_t *vals) { uint8_t *p; @@ -285,10 +294,30 @@ static int aux_fields1(void) if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k') fail("XA field is '%c', expected 'k'", bam_aux2A(p)); + check_aux_count(aln, 24, "Original record"); + bam_aux_del(aln,p); if (bam_aux_get(aln,"XA")) fail("XA field was not deleted"); + check_aux_count(aln, 23, "Record post-XA-deletion"); + + p = bam_aux_get(aln, "Y2"); + if (p == NULL || strncmp(bam_aux_tag(p), "Y2", 2) != 0 || bam_aux_type(p) != 'i') + fail("bam_aux_get() missed Y2 field"); + + p = bam_aux_next(aln, p); + if (p == NULL || 
strncmp(bam_aux_tag(p), "Y3", 2) != 0 || bam_aux_type(p) != 'c') + fail("bam_aux_next() missed Y3 field"); + + p = bam_aux_get(aln, "Y8"); + if (p == NULL || strncmp(bam_aux_tag(p), "Y8", 2) != 0 || bam_aux_type(p) != 'I') + fail("bam_aux_get() missed Y8 field"); + + p = bam_aux_next(aln, p); + if (p != NULL || errno != ENOENT) + fail("bam_aux_next missed the end of fields"); + if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37) fail("Xi field is %"PRId64", expected 37", bam_aux2i(p)); @@ -492,6 +521,16 @@ static int aux_fields1(void) if (strcmp(ks.s, r1) != 0) fail("record formatted incorrectly: \"%s\"", ks.s); + + // Test field removal APIs -- after the strcmp(..., r1) check so that + // can also check the formatting of the to-be-removed fields. + + p = bam_aux_remove(aln, check_bam_aux_get(aln, "XH", 'H')); + if (bam_aux_get(aln, "XH")) + fail("XH field was not removed"); + check_aux_count(aln, 31, "Record post-XH-removal"); + if (strncmp(bam_aux_tag(p), "XB", 2) != 0 || bam_aux_type(p) != 'B') + fail("bam_aux_remove() missed XB field"); } else fail("can't read record"); From 2ff03d349d3ae23e80fc6ef75fec06854188922b Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 24 Aug 2022 20:11:42 +0100 Subject: [PATCH 333/488] Document that bam_aux_del()'s `s` parameter must be non-NULL (& reformat) --- htslib/sam.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index 895a14665..4beaea376 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1777,11 +1777,13 @@ HTSLIB_EXPORT int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8_t *data); /// Delete tag data from a bam record -/* @param b The bam record to update - @param s Pointer to the tag to delete, as returned by bam_aux_get(). 
- @return 0 on success; -1 on failure - If the bam record's aux data is corrupt, errno is set to EINVAL and this - function returns -1; +/** @param b The BAM record to update + @param s Pointer to the aux field to delete, as returned by bam_aux_get() + Must not be NULL + @return 0 on success; -1 on failure + +If the BAM record's aux data is corrupt, errno is set to EINVAL and this +function returns -1. */ HTSLIB_EXPORT int bam_aux_del(bam1_t *b, uint8_t *s); From e4957188734d8d977f3b494983e28156af9a258e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Sun, 2 Jan 2022 01:49:20 -1000 Subject: [PATCH 334/488] bgzip text compression mode Co-authored-by: Mike Lin Compressing text now promotes alignment of BGZF blocks with the uncompressed text lines. BGZF blocks start at the beginning of an input line and end after some subsequent newline (except when the block's first line overflows the BGZF block size). This ensures it's possible to specify byte ranges of a BGZF file that decompress into complete text records -- useful for parallel processing and "slicing" from remote servers. To disable this feature and provide a way to produce identical output with 1.15 and earlier, the --binary option forces text data to be processed as if it were binary. The idea and initial implementation was Mike Lin's, with the current revised implementation by James Bonfield. 
--- Makefile | 2 +- bgzip.1 | 11 +++++ bgzip.c | 130 +++++++++++++++++++++++++++++++++++++++++++-------- test/test.pl | 24 ++++++++++ 4 files changed, 147 insertions(+), 20 deletions(-) diff --git a/Makefile b/Makefile index 374141898..827d93db7 100644 --- a/Makefile +++ b/Makefile @@ -494,7 +494,7 @@ htsfile: htsfile.o libhts.a tabix: tabix.o libhts.a $(CC) $(LDFLAGS) -o $@ tabix.o libhts.a $(LIBS) -lpthread -bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) +bgzip.o: bgzip.c config.h $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_hfile_h) htsfile.o: htsfile.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) tabix.o: tabix.c config.h $(htslib_tbx_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(htslib_hts_h) $(htslib_regidx_h) $(htslib_hts_defs_h) $(htslib_hts_log_h) diff --git a/bgzip.1 b/bgzip.1 index 30c2808e5..07f13fa41 100644 --- a/bgzip.1 +++ b/bgzip.1 @@ -74,6 +74,17 @@ after decompression completes the input file will be removed. .SH OPTIONS .TP 10 +.B "--binary" +Bgzip will attempt to ensure BGZF blocks end on a newline when the +input is a text file. The exception to this is where a single line is +larger than a BGZF block (64Kb). This can aid tools that use the +index to perform random access on the compressed stream, as the start +of a block is likely to also be the start of a text record. + +This option processes text files as if they were binary content, +ignoring the location of newlines. This also restores the behaviour +for text files to bgzip version 1.15 and earlier. +.TP .BI "-b, --offset " INT Decompress to standard output from virtual file position (0-based uncompressed offset). 
diff --git a/bgzip.c b/bgzip.c index bd0374811..4516eadec 100644 --- a/bgzip.c +++ b/bgzip.c @@ -36,13 +36,14 @@ #include #include "htslib/bgzf.h" #include "htslib/hts.h" +#include "htslib/hfile.h" #ifdef _WIN32 # define WIN32_LEAN_AND_MEAN # include #endif -static const int WINDOW_SIZE = 64 * 1024; +static const int WINDOW_SIZE = BGZF_BLOCK_SIZE; static void error(const char *format, ...) { @@ -121,15 +122,16 @@ static int bgzip_main_usage(FILE *fp, int status) fprintf(fp, " -r, --reindex (re)index compressed file\n"); fprintf(fp, " -s, --size INT decompress INT bytes (uncompressed size)\n"); fprintf(fp, " -t, --test test integrity of compressed file\n"); + fprintf(fp, " --binary Don't align blocks with text lines\n"); fprintf(fp, " -@, --threads INT number of compression threads to use [1]\n"); return status; } int main(int argc, char **argv) { - int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0, keep; + int c, compress, compress_level = -1, pstdout, is_forced, test, index = 0, rebgzip = 0, reindex = 0, keep, binary; BGZF *fp; - void *buffer; + char *buffer; long start, end, size; char *index_fname = NULL; int threads = 1; @@ -151,10 +153,11 @@ int main(int argc, char **argv) {"test", no_argument, NULL, 't'}, {"version", no_argument, NULL, 1}, {"keep", no_argument, NULL, 'k'}, + {"binary", no_argument, NULL, 2}, {NULL, 0, NULL, 0} }; - compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; keep = 0; + compress = 1; pstdout = 0; start = 0; size = -1; end = -1; is_forced = 0; test = 0; keep = 0; binary = 0; while((c = getopt_long(argc, argv, "cdh?fb:@:s:iI:l:grtk",loptions,NULL)) >= 0){ switch(c){ case 'd': compress = 0; break; @@ -175,6 +178,7 @@ int main(int argc, char **argv) "bgzip (htslib) %s\n" "Copyright (C) 2022 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; + case 2: binary = 1; break; case 'h': return bgzip_main_usage(stdout, EXIT_SUCCESS); case '?': return 
bgzip_main_usage(stderr, EXIT_FAILURE); } @@ -185,7 +189,7 @@ int main(int argc, char **argv) return 1; } if (compress == 1) { - int f_src = fileno(stdin); + hFILE* f_src = NULL; char out_mode[3] = "w\0"; char out_mode_exclusive[4] = "wx\0"; @@ -198,13 +202,13 @@ int main(int argc, char **argv) out_mode_exclusive[2] = compress_level + '0'; } + if (!(f_src = hopen(argc > optind ? argv[optind] : "-", "r"))) { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); + return 1; + } + if ( argc>optind ) { - if ((f_src = open(argv[optind], O_RDONLY)) < 0) { - fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); - return 1; - } - if (pstdout) fp = bgzf_open("-", out_mode); else @@ -250,18 +254,103 @@ int main(int argc, char **argv) bgzf_mt(fp, threads, 256); buffer = malloc(WINDOW_SIZE); -#ifdef _WIN32 - _setmode(f_src, O_BINARY); -#endif + if (!buffer) + return 1; if (rebgzip){ if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) error("Could not load index: %s.gzi\n", argv[optind]); - while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) + while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0) if (bgzf_block_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); } else { - while ((c = read(f_src, buffer, WINDOW_SIZE)) > 0) - if (bgzf_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); + htsFormat fmt; + int textual = 0; + if (!binary + && hts_detect_format(f_src, &fmt) == 0 + && fmt.compression == no_compression) { + switch(fmt.format) { + case text_format: + case sam: + case vcf: + case bed: + case fasta_format: + case fastq_format: + case fai_format: + case fqi_format: + textual = 1; + break; + default: break; // silence clang warnings + } + } + + if (binary || !textual) { + // Binary data, either detected or explicit + while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0) + if (bgzf_write(fp, buffer, c) < 0) + error("Could not write %d bytes: Error %d\n", + c, fp->errcode); + } 
else { + /* Text mode, try a flush after a newline */ + int in_header = 1, n = 0, long_line = 0; + while ((c = hread(f_src, buffer+n, WINDOW_SIZE-n)) > 0) { + int c2 = c+n; + int flush = 0; + if (in_header && + (long_line || buffer[0] == '@' || buffer[0] == '#')) { + // Scan forward to find the last header line. + int last_start = 0; + n = 0; + while (n < c2) { + if (buffer[n++] != '\n') + continue; + + last_start = n; + if (n < c2 && + !(buffer[n] == '@' || buffer[n] == '#')) { + in_header = 0; + break; + } + } + if (!last_start) { + n = c2; + long_line = 1; + } else { + n = last_start; + flush = 1; + long_line = 0; + } + } else { + // Scan backwards to find the last newline. + n += c; // c read plus previous n overflow + while (--n >= 0 && ((char *)buffer)[n] != '\n') + ; + + if (n >= 0) { + flush = 1; + n++; + } else { + n = c2; + } + } + + // Pos n is either at the end of the buffer with flush==0, + // or the first byte after a newline and a flush point. + if (bgzf_write(fp, buffer, n) < 0) + error("Could not write %d bytes: Error %d\n", + n, fp->errcode); + if (flush) + if (bgzf_flush_try(fp, 65536) < 0) // force + return -1; + + memmove(buffer, buffer+n, c2-n); + n = c2-n; + } + + // Trailing data. 
+ if (bgzf_write(fp, buffer, n) < 0) + error("Could not write %d bytes: Error %d\n", + n, fp->errcode); + } } if ( index ) { @@ -270,13 +359,16 @@ int main(int argc, char **argv) error("Could not write index to '%s'\n", index_fname); } else { if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0) - error("Could not write index to '%s.gz.gzi'", argv[optind]); + error("Could not write index to '%s.gz.gzi'\n", + argv[optind]); } } - if (bgzf_close(fp) < 0) error("Close failed: Error %d", fp->errcode); + if (bgzf_close(fp) < 0) + error("Output close failed: Error %d\n", fp->errcode); + if (hclose(f_src) < 0) + error("Input close failed\n"); if (argc > optind && !pstdout && !keep) unlink(argv[optind]); free(buffer); - close(f_src); return 0; } else if ( reindex ) diff --git a/test/test.pl b/test/test.pl index d6c01786a..a529cfd1a 100755 --- a/test/test.pl +++ b/test/test.pl @@ -398,6 +398,30 @@ sub test_bgzip { } passed($opts,$test); + # Round-trip test of text in binary mode + my $test = sprintf('%s %2s threads', 'bgzip text mode round-trip', + $threads ? $threads : 'no'); + print "$test: "; + my $c = "$$opts{bin}/bgzip $at --binary -i -I '$index' < '$data' > '$compressed'"; + my ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + $c = "$$opts{bin}/bgzip $at -d < '$compressed' > '$uncompressed'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + $c = "cmp '$data' '$uncompressed'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, $out ? $out : "'$data' '$uncompressed' differ"); + return; + } + passed($opts,$test); + # Extract from an offset $test = sprintf('%s %2s threads', 'bgzip -b', $threads ? $threads : 'no'); From 191ddc022a73cc03aa7702ca9e3425e919ab6396 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 11 Aug 2022 15:00:47 +0100 Subject: [PATCH 335/488] Make the bgzip -g option less opaque. 
--- bgzip.1 | 3 ++- bgzip.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bgzip.1 b/bgzip.1 index 07f13fa41..228542647 100644 --- a/bgzip.1 +++ b/bgzip.1 @@ -103,7 +103,8 @@ Use \fB--force\fR twice to do both without asking. .TP .B "-g, --rebgzip" Try to use an existing index to create a compressed file with matching -block offsets. +block offsets. The index must be specified using the \fB-I +\fIfile.gzi\fR option. Note that this assumes that the same compression library and level are in use as when making the original file. Don't use it unless you know what you're doing. diff --git a/bgzip.c b/bgzip.c index 4516eadec..67396f983 100644 --- a/bgzip.c +++ b/bgzip.c @@ -245,7 +245,7 @@ int main(int argc, char **argv) if ( rebgzip && !index_fname ) { - fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); + fprintf(stderr, "[bgzip] Index file name expected when writing to stdout. See -I option.\n"); return 1; } From 3a374083c11884d22f05c6a095a6b0cc3cc8cdc0 Mon Sep 17 00:00:00 2001 From: Lilian Janin Date: Fri, 2 Sep 2022 10:14:19 +0100 Subject: [PATCH 336/488] Fix error code 0 returned by bcftools after error Without this, bcftools returns error code 0 even when interrupted by these errors visible in stderr: - [E::hts_itr_next] Failed to seek to offset 70116896333: Illegal seek - [E::bgzf_read_block] Failed to read BGZF block data at offset 29255442 expected 12238 bytes; hread returned -1 --- synced_bcf_reader.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index c980e3b45..1dae43ae2 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -641,7 +641,9 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) } else if ( reader->tbx_idx ) { - if ( (ret=tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps)) < 0 ) break; // no more lines + ret = tbx_itr_next(reader->file, reader->tbx_idx, reader->itr, &files->tmps); + if ( ret 
< -1 ) files->errnum = bcf_read_error; + if ( ret < 0 ) break; // no more lines or an error ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); if ( ret<0 ) { files->errnum = vcf_parse_error; break; } } From 76d46189ce42540af9dca1fa0252820eb76ccab0 Mon Sep 17 00:00:00 2001 From: Lilian Janin Date: Fri, 2 Sep 2022 12:02:32 +0100 Subject: [PATCH 337/488] Make bcftools return an error code != 0 after [E::bgzf_read_block] Invalid BGZF header at offset xxx --- synced_bcf_reader.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 1dae43ae2..23e0ecaef 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -623,7 +623,9 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) { if ( reader->file->format.format==vcf ) { - if ( (ret=hts_getline(reader->file, KS_SEP_LINE, &files->tmps)) < 0 ) break; // no more lines + ret = hts_getline(reader->file, KS_SEP_LINE, &files->tmps); + if ( ret < -1 ) files->errnum = bcf_read_error; + if ( ret < 0 ) break; // no more lines or an error ret = vcf_parse1(&files->tmps, reader->header, reader->buffer[reader->nbuffer+1]); if ( ret<0 ) { files->errnum = vcf_parse_error; break; } } From 6366029324e2d01902c4699a11a1b966dcfa3c8f Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 9 Sep 2022 12:04:06 +0100 Subject: [PATCH 338/488] Make tabix support CSI indices with large positions. This already worked for SAM and VCF where the SQ and Contig lines indicate the maximum length of a reference sequence. However for BED files this was left as zero, which had the effect of fighting against the user by decreasing n_lvls as we increase min_shift. When unknown, max_ref_len is now an arbitrary large size (100G), but this may produce more levels than are strictly necessary, although this doesn't appear to have negative consequences. Also fixed the misleading error message about CSI being unable to index data. 
This was perhaps intended to be for mis-specified VCF data where a contig was listed as small but the records were at larger offsets, however it simply led me up the garden path by categorically stating CSI cannot store such large values. --- hts.c | 6 +++--- tabix.1 | 2 +- tbx.c | 5 ++++- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/hts.c b/hts.c index 8b437f2b9..c79d92d99 100644 --- a/hts.c +++ b/hts.c @@ -2354,9 +2354,9 @@ int hts_idx_check_range(hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t end) return 0; if (idx->fmt == HTS_FMT_CSI) { - hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos - " cannot be stored in a csi index. " - "Please check headers match the data", + hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos" " + "cannot be stored in a csi index with these parameters. " + "Please use a larger min_shift or depth", beg, end); } else { hts_log_error("Region %"PRIhts_pos"..%"PRIhts_pos diff --git a/tabix.1 b/tabix.1 index 2d403aaea..559ec6e69 100644 --- a/tabix.1 +++ b/tabix.1 @@ -101,7 +101,7 @@ start column. [5] Force to overwrite the index file if it is present. .TP .BI "-m, --min-shift " INT -set minimal interval size for CSI indices to 2^INT [14] +Set minimal interval size for CSI indices to 2^INT [14] .TP .BI "-p, --preset " STR Input format for indexing. Valid values are: gff, bed, sam, vcf. 
diff --git a/tbx.c b/tbx.c index 3af2c09fb..61d2ccd65 100644 --- a/tbx.c +++ b/tbx.c @@ -321,8 +321,11 @@ tbx_t *tbx_index(BGZF *fp, int min_shift, const tbx_conf_t *conf) continue; } if (first == 0) { - if (fmt == HTS_FMT_CSI) + if (fmt == HTS_FMT_CSI) { + if (!max_ref_len) + max_ref_len = (int64_t)100*1024*1024*1024; // 100G default n_lvls = adjust_n_lvls(min_shift, n_lvls, max_ref_len); + } tbx->idx = hts_idx_init(0, fmt, last_off, min_shift, n_lvls); if (!tbx->idx) goto fail; first = 1; From a1dec95bf18c24a1ea38ef736088411d7aa55a08 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 15 Sep 2022 09:02:17 +0100 Subject: [PATCH 339/488] Prevent crash when only FASTA entry has no sequence. Eg the minimal file of: >foo Credit to OSS-Fuzz Fixes oss-fuzz 51378 --- sam.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sam.c b/sam.c index ee88cc23e..61c84877b 100644 --- a/sam.c +++ b/sam.c @@ -3853,7 +3853,6 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) { } // Name - if (*x->name.s != x->nprefix) return -2; @@ -3893,8 +3892,8 @@ static int fastq_parse1(htsFile *fp, bam1_t *b) { if ((ret = hts_getline(fp, KS_SEP_LINE, &fp->line)) < 0) if (fp->format.format == fastq_format || ret < -1) return -2; - if (*fp->line.s == (fp->format.format == fastq_format ? '+' : '>') - || ret == -1) + if (ret == -1 || + *fp->line.s == (fp->format.format == fastq_format ? 
'+' : '>')) break; if (kputsn(fp->line.s, fp->line.l, &x->seq) < 0) return -2; From 4604554d424406c6764af8da17b370c1b525ae1a Mon Sep 17 00:00:00 2001 From: David Seifert Date: Mon, 19 Sep 2022 22:07:46 +0200 Subject: [PATCH 340/488] Use POSIX `grep` * GNU grep 3.8 considers `egrep` and `fgrep` obsolescent and throws warnings: https://lists.gnu.org/archive/html/info-gnu/2022-09/msg00001.html --- test/sam_filter/filter.tst | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst index 129516b24..b81a19857 100644 --- a/test/sam_filter/filter.tst +++ b/test/sam_filter/filter.tst @@ -42,17 +42,17 @@ P string6.out $tv -i 'filter=library=="x"' ../xx#rg.sam P string7.out $tv -i 'filter=library!="x"' ../xx#rg.sam # Integer ops -P int1.out $tv -i 'filter=pos % 23 == 11' ../ce#1000.sam |egrep -cv '^@' -P int2.out $tv -i 'filter=qlen/(flag*mapq+pos)>5' ../ce#1000.sam |egrep -cv '^@' +P int1.out $tv -i 'filter=pos % 23 == 11' ../ce#1000.sam | grep -E -cv '^@' +P int2.out $tv -i 'filter=qlen/(flag*mapq+pos)>5' ../ce#1000.sam | grep -E -cv '^@' # Aux tags -P int3.out $tv -i 'filter=[NM]>=10 || [MD]=~"A.*A.*A"' -t4 ../ce#1000.sam |egrep -cv '^@' +P int3.out $tv -i 'filter=[NM]>=10 || [MD]=~"A.*A.*A"' -t4 ../ce#1000.sam | grep -E -cv '^@' # Functions. 
-P func1.out $tv -i 'filter=length(seq) != qlen' ../ce#5b.sam | egrep -cv '^@' -P func2.out $tv -i 'filter=min(qual) >= 20' ../ce#1000.sam | egrep -cv '^@' -P func3.out $tv -i 'filter=max(qual) <= 20' ../ce#1000.sam | egrep -cv '^@' -P func4.out $tv -i 'filter=avg(qual) >= 20 && avg(qual) <= 30' ../ce#1000.sam | egrep -cv '^@' -P func5.out $tv -i 'filter=sclen>=20' ../realn02.sam | egrep -v '^@' -P func6.out $tv -i 'filter=rlen<50' ../realn02.sam | egrep -v '^@' -P func7.out $tv -i 'filter=qlen>100' ../realn02.sam | egrep -v '^@' +P func1.out $tv -i 'filter=length(seq) != qlen' ../ce#5b.sam | grep -E -cv '^@' +P func2.out $tv -i 'filter=min(qual) >= 20' ../ce#1000.sam | grep -E -cv '^@' +P func3.out $tv -i 'filter=max(qual) <= 20' ../ce#1000.sam | grep -E -cv '^@' +P func4.out $tv -i 'filter=avg(qual) >= 20 && avg(qual) <= 30' ../ce#1000.sam | grep -E -cv '^@' +P func5.out $tv -i 'filter=sclen>=20' ../realn02.sam | grep -E -v '^@' +P func6.out $tv -i 'filter=rlen<50' ../realn02.sam | grep -E -v '^@' +P func7.out $tv -i 'filter=qlen>100' ../realn02.sam | grep -E -v '^@' From 853b801794e680ca88d3a5d03c1ab2c0d1e68b53 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 11 Oct 2022 13:46:12 +0100 Subject: [PATCH 341/488] Switch to building libdeflate with cmake Following removal of the Makefile in libdeflate commit 03fba38bd --- .cirrus.yml | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 3a7b910a5..720ffb1b3 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -15,7 +15,8 @@ libdeflate_template: &LIBDEFLATE pushd "$HOME" git clone --depth 1 https://github.com/ebiggers/libdeflate.git pushd libdeflate - make -j 4 CFLAGS='-fPIC -O3' libdeflate.a + cmake -B build -DLIBDEFLATE_BUILD_SHARED_LIB=OFF -DLIBDEFLATE_BUILD_GZIP=OFF -DCMAKE_C_FLAGS='-g -O3 -fPIC' + cmake --build build --verbose popd popd fi @@ -27,7 +28,7 @@ compile_template: &COMPILE compile_script: | git submodule update --init --recursive if test 
"x$USE_LIBDEFLATE" = "xyes"; then - CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate" --with-libdeflate' + CONFIG_OPTS='CPPFLAGS="-I$HOME/libdeflate" LDFLAGS="$LDFLAGS -L$HOME/libdeflate/build" --with-libdeflate' else CONFIG_OPTS='--without-libdeflate' fi @@ -75,6 +76,13 @@ gcc_task: CFLAGS: -std=c99 -pedantic -Wformat=2 USE_LIBDEFLATE: yes + install_script: | + apt-get update + apt-get install -y --no-install-suggests --no-install-recommends \ + ca-certificates libc-dev make git autoconf automake \ + zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev libssl-dev \ + cmake + << : *LIBDEFLATE << : *COMPILE << : *TEST @@ -187,8 +195,9 @@ macosx_task: USE_CONFIG: yes USE_LIBDEFLATE: yes - package_install_script: - - HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz git + package_install_script: | + HOMEBREW_NO_AUTO_UPDATE=1 brew install autoconf automake libtool xz git \ + cmake << : *LIBDEFLATE << : *COMPILE From 32de287eafdafc45dde0a22244b72697294f161d Mon Sep 17 00:00:00 2001 From: kojix2 <2xijok@gmail.com> Date: Fri, 14 Oct 2022 13:14:51 +0900 Subject: [PATCH 342/488] Fix a typo in sam.h documentation --- htslib/sam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/sam.h b/htslib/sam.h index 4beaea376..514a6be04 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -189,7 +189,7 @@ extern const int8_t bam_cigar_table[256]; * Mate position and insert size also need to be 64-bit, but * we won't accept more than 32-bit for tid. * - * The bam_core_t structure is the *in memory* layout and not + * The bam1_core_t structure is the *in memory* layout and not * the same as the on-disk format. 
64-bit changes here permit * SAM to work with very long chromosomes and permit BAM and CRAM * to seamlessly update in the future without further API/ABI From eab0ee16d0bf0168459dc82354e3672b74267150 Mon Sep 17 00:00:00 2001 From: vasudeva8 <113358286+vasudeva8@users.noreply.github.com> Date: Tue, 18 Oct 2022 17:21:04 +0100 Subject: [PATCH 343/488] Adds bcf_strerror method (PR #1510) --- htslib/vcf.h | 16 +++++++++++ vcf.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 93 insertions(+), 4 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index c94bea589..04784d168 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -205,6 +205,22 @@ typedef struct bcf_dec_t { #define BCF_ERR_CTG_INVALID 32 #define BCF_ERR_TAG_INVALID 64 +/// Get error description for bcf error code +/** @param errorcode The error code which is to be described + @param buffer The buffer in which description to be added + @param maxbuffer The size of buffer passed + @return NULL on invalid buffer; buffer on other cases + +The buffer will be an empty string when @p errorcode is 0. +Description of errors present in code will be appended to @p buffer with ',' separation. +The buffer has to be at least 4 characters long. NULL will be returned if it is smaller or when buffer is NULL. + +'...' will be appended if the description doesn't fit in the given buffer. + */ + +HTSLIB_EXPORT +const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer); + /* The bcf1_t structure corresponds to one VCF/BCF line. Reading from VCF file is slower because the string is first to be parsed, packed into BCF line diff --git a/vcf.c b/vcf.c index 4bbf15956..764370af1 100644 --- a/vcf.c +++ b/vcf.c @@ -1976,7 +1976,8 @@ int bcf_write(htsFile *hfp, bcf_hdr_t *h, bcf1_t *v) // header. At this point, the header must have been printed, // proceeding would lead to a broken BCF file. Errors must be checked // and cleared by the caller before we can proceed. 
- hts_log_error("Unchecked error (%d) at %s:%"PRIhts_pos, v->errcode, bcf_seqname_safe(h,v), v->pos+1); + char errdescription[1024] = ""; + hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos, v->errcode, bcf_strerror(v->errcode, errdescription, sizeof(errdescription)), bcf_seqname_safe(h,v), v->pos+1); return -1; } bcf1_sync(v); // check if the BCF record was modified @@ -2487,7 +2488,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); k = kh_get(vdict, d, t); - v->errcode = BCF_ERR_TAG_UNDEF; + v->errcode |= BCF_ERR_TAG_UNDEF; if (res || k == kh_end(d)) { hts_log_error("Could not add dummy header for FORMAT '%s' at %s:%"PRIhts_pos, t, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_TAG_INVALID; @@ -2917,7 +2918,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p if (res < 0) bcf_hrec_destroy(hrec); if (res > 0) res = bcf_hdr_sync((bcf_hdr_t*)h); k = kh_get(vdict, d, key); - v->errcode = BCF_ERR_TAG_UNDEF; + v->errcode |= BCF_ERR_TAG_UNDEF; if (res || k == kh_end(d)) { hts_log_error("Could not add dummy header for INFO '%s' at %s:%"PRIhts_pos, key, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_TAG_INVALID; @@ -3817,7 +3818,8 @@ int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line) int i; if ( line->errcode ) { - hts_log_error("Unchecked error (%d) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_seqname_safe(src_hdr,line), line->pos+1); + char errordescription[1024] = ""; + hts_log_error("Unchecked error (%d %s) at %s:%"PRIhts_pos", exiting", line->errcode, bcf_strerror(line->errcode, errordescription, sizeof(errordescription)), bcf_seqname_safe(src_hdr,line), line->pos+1); exit(1); } if ( src_hdr->ntransl==-1 ) return 0; // no need to translate, all tags have the same id @@ -5049,3 +5051,74 @@ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, v #undef BRANCH return 
nsmpl*fmt->n; } + +//error description structure definition +typedef struct err_desc { + int errorcode; + const char *description; +}err_desc; + +// error descriptions +static const err_desc errdesc_bcf[] = { + { BCF_ERR_CTG_UNDEF, "Contig not defined in header"}, + { BCF_ERR_TAG_UNDEF, "Tag not defined in header" }, + { BCF_ERR_NCOLS, "Incorrect number of columns" }, + { BCF_ERR_LIMITS, "Limits reached" }, + { BCF_ERR_CHAR, "Invalid character" }, + { BCF_ERR_CTG_INVALID, "Invalid contig" }, + { BCF_ERR_TAG_INVALID, "Invalid tag" }, +}; + +/// append given description to buffer based on available size and add ... when not enough space + /** @param buffer buffer to which description to be appended + @param offset offset at which to be appended + @param maxbuffer maximum size of the buffer + @param description the description to be appended +on failure returns -1 - when buffer is not big enough; returns -1 on invalid params and on too small buffer which are improbable due to validation at caller site +on success returns 0 + */ +static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, const char *description) { + + if (!description || !buffer || !offset || (maxbuffer < 4)) + return -1; + + size_t rembuffer = maxbuffer - *offset; + if (rembuffer > (strlen(description) + (rembuffer == maxbuffer ? 0 : 1))) { //add description with optionally required ',' + *offset += snprintf(buffer + *offset, rembuffer, "%s%s", (rembuffer == maxbuffer)? "": ",", description); + } else { //not enough space for description, put ... + size_t tmppos = (rembuffer <= 4) ? maxbuffer - 4 : *offset; + snprintf(buffer + tmppos, 4, "..."); //ignore offset update + return -1; + } + return 0; +} + +//get description for given error code. 
return NULL on error +const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) { + size_t usedup = 0; + int ret = 0; + + if (!buffer || maxbuffer < 4) + return NULL; //invalid / insufficient buffer + + if (!errorcode) { + buffer[0] = '\0'; //no error, set null + return buffer; + } + + for (int idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) { + if (errorcode & errdesc_bcf[idx].errorcode) { //error is set, add description + ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description); + if (ret < 0) + break; //not enough space, ... added, no need to continue + + errorcode &= ~errdesc_bcf[idx].errorcode; //reset the error + } + } + + if (errorcode && (ret >= 0)) { //undescribed error is present in error code and had enough buffer, try to add unkonwn error as well§ + add_desc_to_buffer(buffer, &usedup, maxbuffer, "Unknown error"); + } + return buffer; +} + From 3c6f83f11e6e99cdcbed3541ae7d8d5f7786578d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 20 Oct 2022 12:05:44 +0100 Subject: [PATCH 344/488] Add an fai_line_length function. This exposes the input data's line wrap length (minus white-space / newlines). Also fixed a SEGV in fai_retrieve given incorrect or malicious .fai files with a zero length "blen" figure.
--- faidx.c | 22 ++++++++++++++++++++++ htslib/faidx.h | 9 +++++++++ 2 files changed, 31 insertions(+) diff --git a/faidx.c b/faidx.c index f3be5e57c..f0be8f0e4 100644 --- a/faidx.c +++ b/faidx.c @@ -702,6 +702,12 @@ static char *fai_retrieve(const faidx_t *fai, const faidx1_t *val, return NULL; } + if (val->line_blen <= 0) { + hts_log_error("Invalid line length in index: %d", val->line_blen); + *len = -1; + return NULL; + } + ret = bgzf_useek(fai->bgzf, offset + beg / val->line_blen * val->line_len @@ -766,6 +772,22 @@ static int fai_get_val(const faidx_t *fai, const char *str, return 0; } +/* + * The internal still has line_blen as uint32_t, but our references + * can be longer, so for future proofing we use hts_pos_t. We also needed + * a signed value so we can return negatives as an error. + */ +hts_pos_t fai_line_length(const faidx_t *fai, const char *str) +{ + faidx1_t val; + int64_t beg, end; + hts_pos_t len; + + if (fai_get_val(fai, str, &len, &val, &beg, &end)) + return -1; + else + return val.line_blen; +} char *fai_fetch64(const faidx_t *fai, const char *str, hts_pos_t *len) { diff --git a/htslib/faidx.h b/htslib/faidx.h index 149cebd2e..92da40ac9 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -188,6 +188,15 @@ char *fai_fetch(const faidx_t *fai, const char *reg, int *len); HTSLIB_EXPORT char *fai_fetch64(const faidx_t *fai, const char *reg, hts_pos_t *len); +/// Query the line-wrap length for a chromosome specified as part of a region +/** @param fai Pointer to the faidx_t struct + @param reg Region in the format "chr2:20,000-30,000" + @return The line length (excluding newline), + negative on error. 
+*/ +HTSLIB_EXPORT +hts_pos_t fai_line_length(const faidx_t *fai, const char *reg); + /// Fetch the quality string for a region for FASTQ files /** @param fai Pointer to the faidx_t struct @param reg Region in the format "chr2:20,000-30,000" From 2ff4b13cd9aaa78ca3992b8ac0b55f32b1a5a2c6 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 25 Oct 2022 12:36:47 +0100 Subject: [PATCH 345/488] Check for invalid BC tags in fastq output. I don't know how best to deal with such situations, so rather than a hard error we treat the barcode component the same as if no BC tag was stored - with "0". It also whinges, and you'll probably get this message many times, but take that as "encouragement" to fix the input data! Also fixed the barcode to be uppercase, in case of e.g. "ac+gt". I'm not sure if it's required, but we may as well. Fixes samtools/samtools#1728 --- sam.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sam.c b/sam.c index 61c84877b..59a7e5952 100644 --- a/sam.c +++ b/sam.c @@ -4285,13 +4285,24 @@ int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str) bc ? (char *)bc+1 : "0") < 0) return -1; + if (bc && (*bc != 'Z' || (!isupper_c(bc[1]) && !islower_c(bc[1])))) { + hts_log_warning("BC tag starts with non-sequence base; using '0'"); + str->l -= strlen((char *)bc)-2; // limit to 1 char + str->s[str->l-1] = '0'; + str->s[str->l] = 0; + bc = NULL; + } + // Replace any non-alpha with '+'. Ie seq-seq to seq+seq if (bc) { int l = strlen((char *)bc+1); char *c = (char *)str->s + str->l - l; - for (i = 0; i < l; i++) + for (i = 0; i < l; i++) { if (!isalpha_c(c[i])) c[i] = '+'; + else if (islower_c(c[i])) + c[i] = toupper_c(c[i]); + } } } From c91804c5d94bfb8c36d82bb4a8a262874746c6cf Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 20 Oct 2022 14:37:07 +0100 Subject: [PATCH 346/488] Warn if ref file is given but it doesn't contain the refs we need.
Eg "samtools view -T human.fa human.cram" but we have a "1" vs "chr1" mix up. It's not a hard error, as we may have a partial reference we wish to use and there are further fallback measures to continue. Similarly we cannot check this at header loading time, as possibly we have a reference file for a single chromosome, a CRAM aligned against that single chromosome, but all SQ lines present in the header. Improves the diagnostics for #1515 --- cram/cram_io.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cram/cram_io.c b/cram/cram_io.c index 5d01e1318..3fe62f196 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -3422,6 +3422,9 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) { */ pthread_mutex_lock(&fd->refs->lock); if (r->length == 0) { + if (fd->refs->fn) + hts_log_warning("Reference file given, but ref '%s' not present", + r->name); if (cram_populate_ref(fd, id, r) == -1) { hts_log_error("Failed to populate reference for id %d", id); pthread_mutex_unlock(&fd->refs->lock); From d7f6579bc269b70bd9c07fb3653b9cf2d0a5ae3c Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 3 Nov 2022 11:55:06 +0000 Subject: [PATCH 347/488] Fix buffer read-overrun in bam_plp_insertion_mod. We already validate CIGAR length vs sequence, but if we have SEQ '*' then it passes those checks. Unfortunately this means we read beyond the end of sequence later on. --- sam.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sam.c b/sam.c index 59a7e5952..580e32d40 100644 --- a/sam.c +++ b/sam.c @@ -5352,8 +5352,10 @@ int bam_plp_insertion_mod(const bam_pileup1_t *p, break; case BAM_CINS: for (l = 0; l < (cigar[k]>>BAM_CIGAR_SHIFT); l++, j++) { - c = seq_nt16_str[bam_seqi(bam_get_seq(p->b), - p->qpos + j - p->is_del)]; + c = p->qpos + j - p->is_del < p->b->core.l_qseq + ? 
seq_nt16_str[bam_seqi(bam_get_seq(p->b), + p->qpos + j - p->is_del)] + : 'N'; ins->s[indel++] = c; int nm; hts_base_mod mod[256]; From 3f67e7e7435b24abb16b54ace8b2d3fa21b57b89 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 3 Nov 2022 16:48:28 +0000 Subject: [PATCH 348/488] Fix ref fix from c91804c We have too many similarly sounding reference filenames. While the previous fix worked when MMAP was in use, when it's not we end up setting fd->refs->fn in cram_populate_ref, which causes samtools test failures on Windows. With hindsight, the fix was wrong as fd->ref_fn is the actual filename we specified with view -T, while fd->refs->fn is the current filename loaded (which is the same thing *unless* we're using a local cache and no mmap in which case it's also updated to point to the filename associated with the open file descriptor). Definitely an "I wouldn't start from here" problem. (Sorry!) --- cram/cram_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 3fe62f196..7bb098b13 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -3422,7 +3422,7 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) { */ pthread_mutex_lock(&fd->refs->lock); if (r->length == 0) { - if (fd->refs->fn) + if (fd->ref_fn) hts_log_warning("Reference file given, but ref '%s' not present", r->name); if (cram_populate_ref(fd, id, r) == -1) { From 7d0bf4d959b140c7750d9d21502adc5c97564237 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 8 Jun 2022 12:32:14 +0100 Subject: [PATCH 349/488] Add faidx_seq_len64() and fai_adjust_region() interfaces Add faidx_seq_len64() as a replacement for faidx_seq_len() that can return the correct length of sequences longer than INT_MAX. Make faidx_seq_len() clamp its output at INT_MAX, which is probably slightly less bad than overflowing. Adds a fai_adjust_region() function that can be used to ensure that a given range does not go beyond the end of the requested sequence.
The interface is designed so that the output of fai_parse_region() can be passed to it. This essentially exposes the internal faidx_adjust_position() function which is currently used to enforce the same limits in the faidx_fetch_seq64() and faidx_fetch_qual64() interfaces. The new function allows callers to get a better idea of what will be retrieved by applying the limits in advance. --- faidx.c | 50 ++++++++++++++++++++++++++++++++++++++++++-------- htslib/faidx.h | 38 +++++++++++++++++++++++++++++++++++++- 2 files changed, 79 insertions(+), 9 deletions(-) diff --git a/faidx.c b/faidx.c index f0be8f0e4..c1f9523ea 100644 --- a/faidx.c +++ b/faidx.c @@ -844,26 +844,40 @@ const char *faidx_iseq(const faidx_t *fai, int i) return fai->name[i]; } -int faidx_seq_len(const faidx_t *fai, const char *seq) +hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq) { khint_t k = kh_get(s, fai->hash, seq); if ( k == kh_end(fai->hash) ) return -1; return kh_val(fai->hash, k).len; } -static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char *c_name, hts_pos_t *p_beg_i, hts_pos_t *p_end_i, hts_pos_t *len) { +int faidx_seq_len(const faidx_t *fai, const char *seq) +{ + hts_pos_t len = faidx_seq_len64(fai, seq); + return len < INT_MAX ? 
len : INT_MAX; +} + +static int faidx_adjust_position(const faidx_t *fai, int end_adjust, + faidx1_t *val_out, const char *c_name, + hts_pos_t *p_beg_i, hts_pos_t *p_end_i, + hts_pos_t *len) { khiter_t iter; + faidx1_t *val; // Adjust position iter = kh_get(s, fai->hash, c_name); if (iter == kh_end(fai->hash)) { - *len = -2; + if (len) + *len = -2; hts_log_error("The sequence \"%s\" was not found", c_name); return 1; } - *val = kh_value(fai->hash, iter); + val = &kh_value(fai->hash, iter); + + if (val_out) + *val_out = *val; if(*p_end_i < *p_beg_i) *p_beg_i = *p_end_i; @@ -871,22 +885,42 @@ static int faidx_adjust_position(const faidx_t *fai, faidx1_t *val, const char * if(*p_beg_i < 0) *p_beg_i = 0; else if(val->len <= *p_beg_i) - *p_beg_i = val->len - 1; + *p_beg_i = val->len; if(*p_end_i < 0) *p_end_i = 0; else if(val->len <= *p_end_i) - *p_end_i = val->len - 1; + *p_end_i = val->len - end_adjust; return 0; } +int fai_adjust_region(const faidx_t *fai, int tid, + hts_pos_t *beg, hts_pos_t *end) +{ + hts_pos_t orig_beg, orig_end; + + if (!fai || !beg || !end || tid < 0 || tid >= fai->n) + return -1; + + orig_beg = *beg; + orig_end = *end; + if (faidx_adjust_position(fai, 0, NULL, fai->name[tid], beg, end, NULL) != 0) { + hts_log_error("Inconsistent faidx internal state - couldn't find \"%s\"", + fai->name[tid]); + return -1; + } + + return ((orig_beg != *beg ? 1 : 0) | + (orig_end != *end && orig_end < HTS_POS_MAX ? 
2 : 0)); +} + char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) { faidx1_t val; // Adjust position - if (faidx_adjust_position(fai, &val, c_name, &p_beg_i, &p_end_i, len)) { + if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) { return NULL; } @@ -907,7 +941,7 @@ char *faidx_fetch_qual64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg faidx1_t val; // Adjust position - if (faidx_adjust_position(fai, &val, c_name, &p_beg_i, &p_end_i, len)) { + if (faidx_adjust_position(fai, 1, &val, c_name, &p_beg_i, &p_end_i, len)) { return NULL; } diff --git a/htslib/faidx.h b/htslib/faidx.h index 92da40ac9..b237c1cd6 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -292,7 +292,22 @@ int faidx_nseq(const faidx_t *fai); HTSLIB_EXPORT const char *faidx_iseq(const faidx_t *fai, int i); -/// Return sequence length, -1 if not present +/// Return sequence length +/** @param fai Pointer to the faidx_t struct + @param seq Name of the sequence + @return Sequence length, or -1 if not present +*/ +HTSLIB_EXPORT +hts_pos_t faidx_seq_len64(const faidx_t *fai, const char *seq); + +/// Return sequence length +/** @param fai Pointer to the faidx_t struct + @param seq Name of the sequence + @return Sequence length, or -1 if not present + + @deprecated This funtion cannot handle very long sequences. + Use faidx_seq_len64() instead. 
+*/ HTSLIB_EXPORT int faidx_seq_len(const faidx_t *fai, const char *seq); @@ -314,6 +329,27 @@ const char *fai_parse_region(const faidx_t *fai, const char *s, int *tid, hts_pos_t *beg, hts_pos_t *end, int flags); +/// Adjust region to the actual sequence length +/** @param fai Pointer to the faidx_t struct + @param tid Sequence index, as returned by fai_parse_region() + @param beg[in,out] The start of the region (0 based) + @param end[in,out] One past end of the region (0 based) + @return 1, 2, or 3 if @p beg, @p end, or both are adjusted, + 0 if @p beg and @p end are unchanged + -1 on error + + Looks up the length of @p tid, and then adjusts the values of @p beg + and @p end if they fall outside the boundaries of the sequence. + + If @p beg > @p end, it will be set to @p end. + + The return value indicates which, if any, of the inputs have been + adjusted. -1 will be returned if @p tid is not a valid sequence index. +*/ +HTSLIB_EXPORT +int fai_adjust_region(const faidx_t *fai, int tid, + hts_pos_t *beg, hts_pos_t *end); + /// Sets the cache size of the underlying BGZF compressed file /** @param fai Pointer to the faidx_t struct * @param cache_size Selected cache size in bytes From 8e43fb0650fabb8ca34775510e1e33ce6e48fc2a Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 2 Nov 2022 15:57:56 +0000 Subject: [PATCH 350/488] Rework / add new faidx tests The faidx interfaces were only being tested as side-effects of other tests. This adds dedicated tests for them. 
--- .gitattributes | 4 + .gitignore | 3 + Makefile | 12 +- test/faidx/ce.1.expected.fa | 8 + test/faidx/faidx.1.expected.fa | 6 + test/{ => faidx}/faidx.fa | 0 test/faidx/faidx.fa.expected.fai | 6 + test/faidx/faidx.tst | 74 +++++ test/faidx/fastqs.1.expected.fq | 16 + test/faidx/fastqs.2.expected.fa | 8 + test/{ => faidx}/fastqs.fq | 0 test/faidx/fastqs.fq.expected.fai | 105 ++++++ test/faidx/test-faidx.sh | 35 ++ test/sam.c | 2 +- test/test_faidx.c | 516 ++++++++++++++++++++++++++++++ 15 files changed, 792 insertions(+), 3 deletions(-) create mode 100644 test/faidx/ce.1.expected.fa create mode 100644 test/faidx/faidx.1.expected.fa rename test/{ => faidx}/faidx.fa (100%) create mode 100644 test/faidx/faidx.fa.expected.fai create mode 100644 test/faidx/faidx.tst create mode 100644 test/faidx/fastqs.1.expected.fq create mode 100644 test/faidx/fastqs.2.expected.fa rename test/{ => faidx}/fastqs.fq (100%) create mode 100644 test/faidx/fastqs.fq.expected.fai create mode 100755 test/faidx/test-faidx.sh create mode 100644 test/test_faidx.c diff --git a/.gitattributes b/.gitattributes index a14bb82b1..5d9850bc7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -20,3 +20,7 @@ README.md export-ignore # Remove the text attribute from index_dos.sam, so that the line separators # for the test file don't get converted into Unix format. 
test/index_dos.sam -text + +# Remove the text attribute from various faidx test files +test/faidx/faidx*.fa* -text +test/faidx/fastqs*.fq* -text diff --git a/.gitignore b/.gitignore index 6b58e8439..1dafc3615 100644 --- a/.gitignore +++ b/.gitignore @@ -44,6 +44,8 @@ shlib-exports-*.txt /bgzip /htsfile /tabix +/test/faidx/*.tmp* +/test/faidx/FAIL* /test/fieldarith /test/hfile /test/hts_endian @@ -59,6 +61,7 @@ shlib-exports-*.txt /test/test-bcf_set_variant_type /test/test_bgzf /test/test_expr +/test/test_faidx /test/test_index /test/test_introspection /test/test_kfunc diff --git a/Makefile b/Makefile index 827d93db7..bb84a25f5 100644 --- a/Makefile +++ b/Makefile @@ -80,6 +80,7 @@ BUILT_TEST_PROGRAMS = \ test/sam \ test/test_bgzf \ test/test_expr \ + test/test_faidx \ test/test_kfunc \ test/test_kstring \ test/test_mod \ @@ -583,12 +584,13 @@ check test: all $(HTSCODECS_TEST_TARGETS) fi test/test_bgzf test/bgziptest.txt test/test-parse-reg -t test/colons.bam + cd test/faidx && ./test-faidx.sh faidx.tst cd test/sam_filter && ./filter.sh filter.tst cd test/tabix && ./test-tabix.sh tabix.tst cd test/mpileup && ./test-pileup.sh mpileup.tst cd test/fastq && ./test-fastq.sh cd test/base_mods && ./base-mods.sh base-mods.tst - REF_PATH=: test/sam test/ce.fa test/faidx.fa test/fastqs.fq + REF_PATH=: test/sam test/ce.fa test/faidx/faidx.fa test/faidx/fastqs.fq test/test-regidx cd test && REF_PATH=: ./test.pl $${TEST_OPTS:-} @@ -622,6 +624,9 @@ test/test_bgzf: test/test_bgzf.o libhts.a test/test_expr: test/test_expr.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_expr.o libhts.a -lz $(LIBS) -lpthread +test/test_faidx: test/test_faidx.o libhts.a + $(CC) $(LDFLAGS) -o $@ test/test_faidx.o libhts.a -lz $(LIBS) -lpthread + test/test_kfunc: test/test_kfunc.o libhts.a $(CC) $(LDFLAGS) -o $@ test/test_kfunc.o libhts.a -lz $(LIBS) -lpthread @@ -739,6 +744,7 @@ test/test-regidx.o: test/test-regidx.c config.h $(htslib_kstring_h) $(htslib_reg test/test_str2int.o: test/test_str2int.c 
config.h $(textutils_internal_h) test/test_time_funcs.o: test/test_time_funcs.c config.h $(hts_time_funcs_h) test/test_view.o: test/test_view.c config.h $(cram_h) $(htslib_sam_h) $(htslib_vcf_h) $(htslib_hts_log_h) +test/test_faidx.o: test/test_faidx.c config.h $(htslib_faidx_h) test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h) test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h) test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h) @@ -845,7 +851,9 @@ htslib-uninstalled.pc: htslib.pc.tmp testclean: - -rm -f test/*.tmp test/*.tmp.* test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt + -rm -f test/*.tmp test/*.tmp.* test/faidx/*.tmp* test/faidx/FAIL* \ + test/longrefs/*.tmp.* test/tabix/*.tmp.* test/tabix/FAIL* \ + header-exports.txt shlib-exports-$(SHLIB_FLAVOUR).txt -rm -rf htscodecs/tests/test.out # Only remove this in git checkouts diff --git a/test/faidx/ce.1.expected.fa b/test/faidx/ce.1.expected.fa new file mode 100644 index 000000000..d606105c4 --- /dev/null +++ b/test/faidx/ce.1.expected.fa @@ -0,0 +1,8 @@ +>CHROMOSOME_I:5001-5125 length: 125 +AACTGGTTCAAAAACAAAAATTTTTTAAACTGTACAAACTGTCCAAAAAT +TCGTCGTAAATCGACACACCCTTCTCATTTTTTCAAAATTTTAATTGTTT +TCGAATGTTTTTTTTGCAGAATAAT +>CHROMOSOME_X:101-225 length: 125 +GCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGC +CTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCTAAGCCT +AAGCCTAAGCCTAAGCCTAAGCCTA diff --git a/test/faidx/faidx.1.expected.fa b/test/faidx/faidx.1.expected.fa new file mode 100644 index 000000000..d14656e9f --- /dev/null +++ b/test/faidx/faidx.1.expected.fa @@ -0,0 +1,6 @@ +>trailingblank2:28-33 length: 6 +GGGCCC +>trailingblank3:4-5 length: 2 +TA +>bar:4-5 length: 2 +TA diff --git a/test/faidx.fa b/test/faidx/faidx.fa similarity index 100% rename from test/faidx.fa rename to test/faidx/faidx.fa diff --git 
a/test/faidx/faidx.fa.expected.fai b/test/faidx/faidx.fa.expected.fai new file mode 100644 index 000000000..b4d1aff26 --- /dev/null +++ b/test/faidx/faidx.fa.expected.fai @@ -0,0 +1,6 @@ + 4 2 4 5 +trailingblank1 33 23 12 13 +trailingblank2 72 111 24 25 +trailingblank3 5 234 4 6 +foo 8 252 6 7 +bar 8 280 8 9 diff --git a/test/faidx/faidx.tst b/test/faidx/faidx.tst new file mode 100644 index 000000000..b6bd7cac1 --- /dev/null +++ b/test/faidx/faidx.tst @@ -0,0 +1,74 @@ +# Copyright (C) 2022 Genome Research Ltd. +# +# Author: Robert Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# First field: +# INIT = initialisation, not counted in testing +# P = expected to pass (zero return; expected output matches, if present) +# N = expected to return non-zero +# F = expected to fail +# +# Second field (P/N/F only): +# Filename of expected output. If '.', output is not checked +# +# Rest: +# Command to execute. 
$bgzip and $test_faidx are replaced with the path to +# bgzip and test_faidx. + +# Index fasta +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -e faidx.fa.expected.fai + +# Test various functions on the fasta index +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t fai_line_length -e 24 trailingblank2 +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_has_seq -e 1 foo +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_has_seq -e 0 absent +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_iseq -e trailingblank3 3 +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_seq_len -e 33 trailingblank1 +P . $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_seq_len64 -e 72 trailingblank2 + +# Index fastq +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -e fastqs.fq.expected.fai + +# Test various functions on the fastq index +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t fai_line_length -e 63 FAKE0005_3 +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t fai_line_length -e 144 SRR014849.203935_3 +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_has_seq -e 1 SRR014849.203935_3 +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_has_seq -e 0 absent +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_iseq -e FAKE0005_1 0 +P . $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_seq_len -e 453 FSRRS4401CM938_1 +P . 
$test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -t faidx_seq_len64 -e 309 FSRRS4401AOV6A_4 + +# Fasta retrieval tests +P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai trailingblank2:28-33 trailingblank3:4-5 bar:4-5 +P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t fai_fetch trailingblank2:28-33 trailingblank3:4-5 bar:4-5 +P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t faidx_fetch_seq64 trailingblank2:28-33 trailingblank3:4-5 bar:4-5 +P faidx.1.expected.fa $test_faidx -i faidx.fa -f faidx.fa.tmp.fai -t fai_adjust_region trailingblank2:28-33 trailingblank3:4-5 bar:4-5 + +# Fastq retrieval tests +P fastqs.1.expected.fq $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90 +P fastqs.1.expected.fq $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t fai_fetch FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90 +P fastqs.1.expected.fq $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai -Q -t faidx_fetch_seq64 FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90 +P fastqs.2.expected.fa $test_faidx -i fastqs.fq -f fastqs.fq.tmp.fai FAKE0006_1:4-12 FSRRS4401BE7HA_1:81-120 FAKE0010_2 SRR014849.50939_3:71-90 + +# Indexing and retrieval on bgzip compressed fasta +INIT $bgzip -c < ../ce.fa > ce.fa.tmp.gz +P . 
$test_faidx -i ce.fa.tmp.gz -f ce.fa.tmp.gz.fai -g ce.fa.tmp.gz.gzi -e ../ce.fa.fai +P ce.1.expected.fa $test_faidx -i ce.fa.tmp.gz -f ce.fa.tmp.gz.fai -g ce.fa.tmp.gz.gzi CHROMOSOME_I:5001-5125 CHROMOSOME_X:101-225 diff --git a/test/faidx/fastqs.1.expected.fq b/test/faidx/fastqs.1.expected.fq new file mode 100644 index 000000000..729393837 --- /dev/null +++ b/test/faidx/fastqs.1.expected.fq @@ -0,0 +1,16 @@ +@FAKE0006_1:4-12 length: 9 +TGCATGCAT ++ +{zyxwvuts +@FSRRS4401BE7HA_1:81-120 length: 40 +GCCCGTTTGTCGATATTTGtatttaaagtaatccgtcaca ++ +c^^^YRPOSNVU\YTMMMSMRKKKRUUNNNNS[`aa```\ +@FAKE0010_2 length: 30 +gatcrywsmkhbvdnGATCRYWSMKHBVDN ++ +I?5+I?5+I?5+I?5+I?5+I?5+I?5+I? +@SRR014849.50939_3:71-90 length: 20 +CAATAAATCAATACATAAAA ++ +\aZ\d`OY[aY[[\[[e`WP diff --git a/test/faidx/fastqs.2.expected.fa b/test/faidx/fastqs.2.expected.fa new file mode 100644 index 000000000..9b67d15e7 --- /dev/null +++ b/test/faidx/fastqs.2.expected.fa @@ -0,0 +1,8 @@ +>FAKE0006_1:4-12 length: 9 +TGCATGCAT +>FSRRS4401BE7HA_1:81-120 length: 40 +GCCCGTTTGTCGATATTTGtatttaaagtaatccgtcaca +>FAKE0010_2 length: 30 +gatcrywsmkhbvdnGATCRYWSMKHBVDN +>SRR014849.50939_3:71-90 length: 20 +CAATAAATCAATACATAAAA diff --git a/test/fastqs.fq b/test/faidx/fastqs.fq similarity index 100% rename from test/fastqs.fq rename to test/faidx/fastqs.fq diff --git a/test/faidx/fastqs.fq.expected.fai b/test/faidx/fastqs.fq.expected.fai new file mode 100644 index 000000000..77ba04a5d --- /dev/null +++ b/test/faidx/fastqs.fq.expected.fai @@ -0,0 +1,105 @@ +FAKE0005_1 63 85 63 64 151 +FAKE0006_1 63 300 63 64 366 +FAKE0005_2 63 515 63 64 581 +FAKE0006_2 63 730 63 64 796 +FAKE0005_3 63 945 63 64 1011 +FAKE0006_3 63 1160 63 64 1226 +FAKE0005_4 63 1375 63 64 1441 +FAKE0006_4 63 1590 63 64 1656 +FSRRS4401BE7HA_1 395 1823 395 396 2221 +FSRRS4401BRRTC_1 145 2720 145 146 2868 +FSRRS4401B64ST_1 382 3118 382 383 3503 +FSRRS4401EJ0YH_1 381 3990 381 382 4374 +FSRRS4401BK0IB_1 507 4860 507 508 5370 +FSRRS4401ARCCB_1 258 5982 258 259 
6243 +FSRRS4401CM938_1 453 6606 453 454 7062 +FSRRS4401EQLIK_1 411 7620 411 412 8034 +FSRRS4401AOV6A_1 309 8550 309 310 8862 +FSRRS4401EG0ZW_1 424 9276 424 425 9703 +FSRRS4401BE7HA_2 395 10231 395 396 10629 +FSRRS4401BRRTC_2 145 11128 145 146 11276 +FSRRS4401B64ST_2 382 11526 382 383 11911 +FSRRS4401EJ0YH_2 381 12398 381 382 12782 +FSRRS4401BK0IB_2 507 13268 507 508 13778 +FSRRS4401ARCCB_2 258 14390 258 259 14651 +FSRRS4401CM938_2 453 15014 453 454 15470 +FSRRS4401EQLIK_2 411 16028 411 412 16442 +FSRRS4401AOV6A_2 309 16958 309 310 17270 +FSRRS4401EG0ZW_2 424 17684 424 425 18111 +FSRRS4401BE7HA_3 395 18639 395 396 19037 +FSRRS4401BRRTC_3 145 19536 145 146 19684 +FSRRS4401B64ST_3 382 19934 382 383 20319 +FSRRS4401EJ0YH_3 381 20806 381 382 21190 +FSRRS4401BK0IB_3 507 21676 507 508 22186 +FSRRS4401ARCCB_3 258 22798 258 259 23059 +FSRRS4401CM938_3 453 23422 453 454 23878 +FSRRS4401EQLIK_3 411 24436 411 412 24850 +FSRRS4401AOV6A_3 309 25366 309 310 25678 +FSRRS4401EG0ZW_3 424 26092 424 425 26519 +FSRRS4401BE7HA_4 395 27047 80 81 27449 +FSRRS4401BRRTC_4 145 27952 80 81 28101 +FSRRS4401B64ST_4 382 28352 80 81 28741 +FSRRS4401EJ0YH_4 381 29232 80 81 29620 +FSRRS4401BK0IB_4 507 30110 80 81 30626 +FSRRS4401ARCCB_4 258 31244 80 81 31508 +FSRRS4401CM938_4 453 31874 80 81 32335 +FSRRS4401EQLIK_4 411 32898 80 81 33317 +FSRRS4401AOV6A_4 309 33838 80 81 34153 +FSRRS4401EG0ZW_4 424 34570 80 81 35002 +FAKE0007_1 41 35549 41 42 35593 +FAKE0008_1 41 35752 41 42 35796 +FAKE0009_1 41 35955 41 42 35999 +FAKE0010_1 30 36143 30 31 36176 +FAKE0007_2 41 36324 41 42 36368 +FAKE0008_2 41 36527 41 42 36571 +FAKE0009_2 41 36730 41 42 36774 +FAKE0010_2 30 36918 30 31 36951 +FAKE0007_3 41 37099 41 42 37143 +FAKE0008_3 41 37302 41 42 37346 +FAKE0009_3 41 37505 41 42 37549 +FAKE0010_3 30 37693 30 31 37726 +FAKE0007_4 41 37874 41 42 37918 +FAKE0008_4 41 38077 41 42 38121 +FAKE0009_4 41 38280 41 42 38324 +FAKE0010_4 30 38468 30 31 38501 +FAKE0011_1 41 38649 41 42 38693 +FAKE0012_1 41 38852 41 42 38896 
+FAKE0013_1 41 39055 41 42 39099 +FAKE0014_1 30 39250 30 31 39283 +FAKE0011_2 41 39431 41 42 39475 +FAKE0012_2 41 39634 41 42 39678 +FAKE0013_2 41 39837 41 42 39881 +FAKE0014_2 30 40032 30 31 40065 +FAKE0011_3 41 40213 41 42 40257 +FAKE0012_3 41 40416 41 42 40460 +FAKE0013_3 41 40619 41 42 40663 +FAKE0014_3 30 40814 30 31 40847 +FAKE0011_4 41 40995 41 42 41039 +FAKE0012_4 41 41198 41 42 41242 +FAKE0013_4 41 41401 41 42 41445 +FAKE0014_4 30 41596 30 31 41629 +FAKE0001_1 94 41745 94 95 41842 +FAKE0002_1 94 42022 94 95 42119 +FAKE0001_2 94 42299 94 95 42396 +FAKE0002_2 94 42576 94 95 42673 +FAKE0001_3 94 42853 94 95 42950 +FAKE0002_3 94 43130 94 95 43227 +FAKE0001_4 94 43407 94 95 43504 +FAKE0002_4 94 43684 94 95 43781 +FAKE0003_1 68 43963 68 69 44034 +FAKE0004_1 68 44190 68 69 44261 +FAKE0003_2 68 44417 68 69 44488 +FAKE0004_2 68 44644 68 69 44715 +FAKE0003_3 68 44871 68 69 44942 +FAKE0004_3 68 45098 68 69 45169 +FAKE0003_4 68 45325 68 69 45396 +FAKE0004_4 68 45552 68 69 45623 +SRR014849.50939_1 135 45737 135 136 45875 +SRR014849.110027_1 131 46057 131 132 46191 +SRR014849.203935_1 144 46369 144 145 46516 +SRR014849.50939_2 135 46706 135 136 46844 +SRR014849.110027_2 131 47026 131 132 47160 +SRR014849.203935_2 144 47338 144 145 47485 +SRR014849.50939_3 135 47675 135 136 47813 +SRR014849.110027_3 131 47995 131 132 48129 +SRR014849.203935_3 144 48307 144 145 48454 diff --git a/test/faidx/test-faidx.sh b/test/faidx/test-faidx.sh new file mode 100755 index 000000000..ae501e086 --- /dev/null +++ b/test/faidx/test-faidx.sh @@ -0,0 +1,35 @@ +#!/bin/sh +# +# Copyright (C) 2022 Genome Research Ltd. 
+# +# Author: Robert Davies +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in +# all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +# DEALINGS IN THE SOFTWARE. + +# Load in the test driver +. ../simple_test_driver.sh + +echo "Testing faidx..." + +bgzip="../../bgzip" +test_faidx="../test_faidx" + +test_driver $@ + +exit $? diff --git a/test/sam.c b/test/sam.c index c641b88e6..28ca1bc5f 100644 --- a/test/sam.c +++ b/test/sam.c @@ -2263,7 +2263,7 @@ int main(int argc, char **argv) test_text_file("test/emptyfile", 0); test_text_file("test/xx#pair.sam", 7); test_text_file("test/xx.fa", 7); - test_text_file("test/fastqs.fq", 500); + test_text_file("test/faidx/fastqs.fq", 500); check_enum1(); check_cigar_tab(); check_big_ref(0); diff --git a/test/test_faidx.c b/test/test_faidx.c new file mode 100644 index 000000000..566149071 --- /dev/null +++ b/test/test_faidx.c @@ -0,0 +1,516 @@ +/* test/test_faidx.c -- Test faidx interfaces + + Copyright (C) 2022 Genome Research Ltd. 
+ + Author: Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. 
*/ + +#include + +#include +#include +#include + +#include "../htslib/faidx.h" + +int file_compare(const char *file1, const char *file2) { + FILE *f1 = NULL; + FILE *f2 = NULL; + unsigned int lno = 1; + size_t got1, got2, i; + char buf1[1024], buf2[1024]; + int ret = -1; + + f1 = fopen(file1, "rb"); + if (!f1) { + perror(file1); + goto out; + } + f2 = fopen(file2, "rb"); + if (!f2) { + perror(file2); + goto out; + } + + do { + got1 = fread(buf1, 1, sizeof(buf1), f1); + got2 = fread(buf2, 1, sizeof(buf2), f2); + + for (i = 0; i < got1 && i < got2 && buf1[i] == buf2[i]; i++) + lno += (buf1[i] == '\n'); + if (i < got1 || i < got2) { + fprintf(stderr, "%s and %s differ at line %u\n", + file1, file2, lno); + goto out; + } + } while (got1 > 0 && got2 > 0); + + if (ferror(f1)) { + perror(file1); + goto out; + } + if (ferror(f2)) { + perror(file2); + goto out; + } + + if (got1 > 0 || got2 > 0) { + fprintf(stderr, "EOF on %s at line %u\n", + got1 ? file2 : file1, lno); + goto out; + } + + ret = 0; + out: + if (f1) fclose(f1); + if (f2) fclose(f2); + return ret; +} + +faidx_t * load_index(const char *fn, const char *fnfai, const char *fngzi, + int flags, enum fai_format_options format) { + faidx_t *fai = fai_load3_format(fn, fnfai, fngzi, flags, format); + if (!fai) { + fprintf(stderr, "Failed: fai_load3(%s, %s, %s, %d, %d)\n", + fn, fnfai ? fnfai : "NULL", fngzi ? 
fngzi : "NULL", flags, + (int) format); + return NULL; + } + return fai; +} + +int do_retrieval(const char *fn, const char *fnfai, const char *fngzi, + int flags, enum fai_format_options format, const char *fnout, + const char *interface, int nreg, char **regions) { + int i, use_64bit = 1, use_parse_reg = 0, use_adjust_reg = 0; + faidx_t *fai = NULL; + FILE *out = stdout; + + if (interface) { + if (strcmp(interface, "fai_fetch") == 0) { + use_64bit = 0; + } else if (strcmp(interface, "faidx_fetch_seq") == 0) { + use_64bit = 0; + use_parse_reg = 1; + } else if (strcmp(interface, "faidx_fetch_seq64") == 0 + || strcmp(interface, "fai_parse_region") == 0) { + use_parse_reg = 1; + } else if (strcmp(interface, "fai_adjust_region") == 0) { + use_parse_reg = 1; + use_adjust_reg = 1; + } + } + + if (fnout) { + out = fopen(fnout, "wb"); + if (!out) { + perror(fnout); + return -1; + } + } + + fai = load_index(fn, fnfai, fngzi, flags, format); + if (!fai) + goto fail; + + for (i = 0; i < nreg; i++) { + hts_pos_t len = 0, pos, beg = 0, end = 0; + int tid = 0; + char *seq = NULL; + size_t l; + + if (use_parse_reg) { + const char *e = fai_parse_region(fai, regions[i], + &tid, &beg, &end, 0); + if (e == NULL) { + fprintf(stderr, "Failed: " + "fai_parse_region(fai, %s, &tid, &beg, &end, 0)\n", + regions[i]); + goto fail; + } + if (use_adjust_reg) { + hts_pos_t orig_beg = beg, orig_end = end; + int r = fai_adjust_region(fai, tid, &beg, &end); + if (r < 0 + || (((r & 1) != 0) ^ (beg != orig_beg)) + || (((r & 2) != 0) ^ (end != orig_end))) { + fprintf(stderr, "Failed: fai_adjust_region(fai, %d, " + "%"PRIhts_pos", %"PRIhts_pos") returned %d\n" + "After: beg = %"PRIhts_pos" end = %"PRIhts_pos"\n", + tid, orig_beg, orig_end, r, beg, end); + goto fail; + } + } + if (use_64bit) { + seq = faidx_fetch_seq64(fai, faidx_iseq(fai, tid), + beg, end - 1, &len); + } else { + int ilen = 0; + seq = faidx_fetch_seq(fai, faidx_iseq(fai, tid), + beg, end - 1, &ilen); + len = ilen; + } + if (!seq) { + 
fprintf(stderr, "Failed: faidx_fetch_seq%s(fai, %s, " + "%"PRIhts_pos", %"PRIhts_pos", &len)\n", + use_64bit ? "64" : "", faidx_iseq(fai, tid), beg, end); + goto fail; + } + } else { + if (use_64bit) { + seq = fai_fetch64(fai, regions[i], &len); + } else { + int ilen = 0; + seq = fai_fetch(fai, regions[i], &ilen); + len = ilen; + } + if (!seq) { + fprintf(stderr, "Failed: fai_fetch%s(fai, %s, &len)\n", + use_64bit ? "64" : "", regions[i]); + goto fail; + } + } + + l = strlen(seq); + fprintf(out, "%c%s length: %"PRIhts_pos"\n", + format == FAI_FASTQ ? '@' : '>', regions[i], len); + for (pos = 0; pos < l; pos += 50) { + fprintf(out, "%.*s\n", 50, seq + pos); + } + free(seq); + if (format == FAI_FASTQ) { + hts_pos_t qual_len = 0; + char *qual; + if (use_parse_reg) { + if (use_64bit) { + qual = faidx_fetch_qual64(fai, faidx_iseq(fai, tid), + beg, end - 1, &qual_len); + } else { + int ilen = 0; + qual = faidx_fetch_qual(fai, faidx_iseq(fai, tid), + beg, end - 1, &ilen); + qual_len = ilen; + } + } else { + if (use_64bit) { + qual = fai_fetchqual64(fai, regions[i], &qual_len); + } else { + int ilen = 0; + qual = fai_fetchqual(fai, regions[i], &ilen); + qual_len = ilen; + } + if (!qual) { + fprintf(stderr, "Failed: fai_fetchqual64(fai, %s, &len)\n", + regions[i]); + goto fail; + } + } + if (qual_len != len) { + fprintf(stderr, + "Sequence and quality lengths differ for %s %s\n", + fn, regions[i]); + free(qual); + goto fail; + } + fprintf(out, "+\n"); + l = strlen(qual); + for (pos = 0; pos < l; pos+=50) { + fprintf(out, "%.*s\n", 50, qual + pos); + } + free(qual); + } + } + + fai_destroy(fai); + + if (fnout) { + if (fclose(out) != 0) { + perror(fnout); + return -1; + } + } + return 0; + + fail: + if (fai) + fai_destroy(fai); + if (fnout) + fclose(out); + + return -1; +} + +int test_fai_line_length(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *reg) { + hts_pos_t found_len; + faidx_t *fai = NULL; + 
+ fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + found_len = fai_line_length(fai, reg); + fai_destroy(fai); + if (expected) { + long long exp_len = strtoll(expected, NULL, 10); + if (found_len != exp_len) { + fprintf(stderr, "Unexpected result %"PRIhts_pos" from " + "fai_line_length, expected %s\n", found_len, expected); + return -1; + } + } else { + printf("%"PRIhts_pos"\n", found_len); + } + return 0; +} + +int test_faidx_has_seq(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *seq) { + int res; + faidx_t *fai = NULL; + + fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + res = faidx_has_seq(fai, seq); + fai_destroy(fai); + if (expected) { + long exp_res = strtol(expected, NULL, 10); + if (res != exp_res) { + fprintf(stderr, "Unexpected result %d from faidx_has_seq(%s) " + "expected %s\n", res, seq, expected); + return -1; + } + } else { + printf("%d\n", res); + } + return 0; +} + +int test_faidx_iseq(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *index) { + const char *found_name = NULL; + int idx = atoi(index); + faidx_t *fai = NULL; + + fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + found_name = faidx_iseq(fai, idx); + + if (expected) { + if (!found_name || strcmp(found_name, expected) != 0) { + fprintf(stderr, "Unexpected result %s from faidx_iseq(fai, %d), " + "expected %s\n", found_name ? found_name : "(null)", + idx, expected); + fai_destroy(fai); + return -1; + } + } else { + printf("%s\n", found_name ? 
found_name : "(null)"); + } + + fai_destroy(fai); + return 0; +} + +int test_faidx_seq_len(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *seq) { + int found_len; + faidx_t *fai = NULL; + + fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + found_len = faidx_seq_len(fai, seq); + fai_destroy(fai); + + if (expected) { + int exp_len = atoi(expected); + if (found_len != exp_len) { + fprintf(stderr, "Unexpected result %d from faidx_seq_len(fai, %s) " + "expected %s\n", found_len, seq, expected); + return -1; + } + } else { + printf("%d\n", found_len); + } + + return 0; +} + +int test_faidx_seq_len64(const char *fn, const char *fnfai, const char *fngzi, + enum fai_format_options format, const char *expected, + const char *seq) { + hts_pos_t found_len; + faidx_t *fai = NULL; + + fai = load_index(fn, fnfai, fngzi, 0, format); + if (!fai) + return -1; + + found_len = faidx_seq_len(fai, seq); + fai_destroy(fai); + + if (expected) { + long long exp_len = strtoll(expected, NULL, 10); + if (found_len != exp_len) { + fprintf(stderr, "Unexpected result %"PRIhts_pos + " from fai_seq_len64(fai, %s) expected %s\n", + found_len, seq, expected); + return -1; + } + } else { + printf("%"PRIhts_pos"\n", found_len); + } + + return 0; +} + +void usage(FILE *out, const char *arg0) { + fprintf(out, + "Usage: %s [-c] -i fasta/q [-f fai_file] [-g gzi_file] [-e expected_fai]\n" + " %s [-cQ] -i fasta/q [-f fai_file] [-g gzi_file] [region]\n" + " %s -t FUNC -i fasta/q [-f fai_file] [-g gzi_file] [-e expected] \n" + " %s -h\n", + arg0, arg0, arg0, arg0); +} + +void help(FILE *out, const char *arg0) { + usage(out, arg0); + fprintf(out, + "Options:\n" + " -i FILE Input file\n" + " -f FILE Fasta/q index file name\n" + " -g FILE Bgzip index file name\n" + " -o FILE Output file name\n" + " -e FILE|STR Expected output\n" + " -c Set FAI_CREATE flag\n" + " -Q Output fastq format\n" + " -t FUNC Test 
function\n" + " -h Print this help\n" + "\n" + "Expected output is compared to the FAI file in indexing mode;" + " the output file\n" + "in retrieval mode; " + "expected output for various -t function tests.\n" + "\n" + "Unit tests (-t option):\n" + " fai_line_length, faidx_has_seq, faidx_iseq, faidx_seq_len, faidx_seq_len64\n" + "In retrieval mode, -t can change the functions used to fetch data:\n" + " fai_fetch, fai_fetch64, faidx_fetch_seq, faidx_fetch_seq64,\n" + " fai_parse_region, fai_adjust_region\n" + "\n"); +} + +int main(int argc, char **argv) { + int opt; + const char *fn = NULL; + const char *fnout = NULL; + const char *fnfai = NULL; + const char *fngzi = NULL; + const char *expected = NULL; + const char *func = ""; + int flags = 0; + enum fai_format_options format = FAI_FASTA; + int res; + + while ((opt = getopt(argc, argv, "i:f:g:o:e:t:cQh")) > 0) { + switch (opt) { + case 'i': + fn = optarg; + break; + case 'f': + fnfai = optarg; + break; + case 'g': + fngzi = optarg; + break; + case 'o': + fnout = optarg; + break; + case 'e': + expected = optarg; + break; + case 'c': + flags |= FAI_CREATE; + break; + case 'Q': + format = FAI_FASTQ; + break; + case 't': + func = optarg; + break; + case 'h': + help(stdout, argv[0]); + return EXIT_SUCCESS; + default: + usage(stderr, argv[0]); + return EXIT_FAILURE; + } + } + + if (!fn) { + usage(stderr, argv[0]); + return EXIT_FAILURE; + } + + if (optind == argc) { + // Index building mode + res = fai_build3(fn, fnfai, fngzi); + if (res) { + fprintf(stderr, "Failed: fai_build3(%s, %s, %s)\n", + fn, fnfai ? fnfai : "NULL", fngzi ? 
fngzi : "NULL"); + } else if (expected) { + res = file_compare(fnfai, expected); + } + } else { + if (strcmp(func, "fai_line_length") == 0) { + res = test_fai_line_length(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else if (strcmp(func, "faidx_has_seq") == 0) { + res = test_faidx_has_seq(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else if (strcmp(func, "faidx_iseq") == 0) { + res = test_faidx_iseq(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else if (strcmp(func, "faidx_seq_len") == 0) { + res = test_faidx_seq_len(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else if (strcmp(func, "faidx_seq_len64") == 0) { + res = test_faidx_seq_len64(fn, fnfai, fngzi, format, expected, + argv[optind]); + } else { + res = do_retrieval(fn, fnfai, fngzi, flags, format, fnout, + func, argc - optind, &argv[optind]); + if (res == 0 && fnout && expected) { + res = file_compare(fnout, expected); + } + } + } + return res == 0 ? EXIT_SUCCESS : EXIT_FAILURE; +} From 58abf07c40936ade9c1c595eb5add64dbb0318f3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 16 Nov 2022 20:05:54 +0000 Subject: [PATCH 351/488] Fix build on ancient versions of gcc Default on gcc 4.6 was 'gnu90', which allows some c99 features but not 'for (int n = 0, ...)' Adds a -std=gnu90 build to the tests so we don't get caught by this again. --- .cirrus.yml | 1 + vcf.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 720ffb1b3..9ac3c645f 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -137,6 +137,7 @@ rocky_task: LC_ALL: C CIRRUS_CLONE_DEPTH: 1 USE_CONFIG: yes + CFLAGS: -std=gnu90 # NB: we could consider building a docker image with these # preinstalled and specifying that instead, to speed up testing. 
diff --git a/vcf.c b/vcf.c index 764370af1..e91b24775 100644 --- a/vcf.c +++ b/vcf.c @@ -5097,6 +5097,7 @@ static int add_desc_to_buffer(char *buffer, size_t *offset, size_t maxbuffer, co const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) { size_t usedup = 0; int ret = 0; + int idx; if (!buffer || maxbuffer < 4) return NULL; //invalid / insufficient buffer @@ -5106,7 +5107,7 @@ const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) { return buffer; } - for (int idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) { + for (idx = 0; idx < sizeof(errdesc_bcf) / sizeof(err_desc); ++idx) { if (errorcode & errdesc_bcf[idx].errorcode) { //error is set, add description ret = add_desc_to_buffer(buffer, &usedup, maxbuffer, errdesc_bcf[idx].description); if (ret < 0) From e88e3431e963216de2085816a67b37b08bb15069 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 23 Nov 2022 16:17:19 +0000 Subject: [PATCH 352/488] Ensure strings in config_vars.h are escaped correctly This is more tricky than it first appears as the strings in question have to make it through make expansion, shell string expansion, interpretation by the command writing the file, and end up making sense to the C compiler. The values are escaped by converting \ to \\, " to \" and ' to \x27 (as it's impossible to quote a ' in a single-quoted string). Writing the file is now done by printf, which won't try to interpret any of the strings being printed as an option. Its arguments are all single-quoted to prevent the shell from trying to expand them in an unwanted manner. 
--- Makefile | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index bb84a25f5..8961d0292 100644 --- a/Makefile +++ b/Makefile @@ -161,12 +161,20 @@ show-version: @echo PACKAGE_VERSION = $(PACKAGE_VERSION) @echo NUMERIC_VERSION = $(NUMERIC_VERSION) +config_vars.h: override escape=$(subst ',\x27,$(subst ",\",$(subst \,\\,$(1)))) +config_vars.h: override hts_cc_escaped=$(call escape,$(CC)) +config_vars.h: override hts_cppflags_escaped=$(call escape,$(CPPFLAGS)) +config_vars.h: override hts_cflags_escaped=$(call escape,$(CFLAGS)) +config_vars.h: override hts_ldflags_escaped=$(call escape,$(LDFLAGS)) +config_vars.h: override hts_libs_escaped=$(call escape,$(LIBS)) + config_vars.h: - echo '#define HTS_CC "$(CC)"' > $@ - echo '#define HTS_CPPFLAGS "$(CPPFLAGS)"' >> $@ - echo '#define HTS_CFLAGS "$(CFLAGS)"' >> $@ - echo '#define HTS_LDFLAGS "$(LDFLAGS)"' >> $@ - echo '#define HTS_LIBS "$(LIBS)"' >> $@ + printf '#define HTS_CC "%s"\n#define HTS_CPPFLAGS "%s"\n#define HTS_CFLAGS "%s"\n#define HTS_LDFLAGS "%s"\n#define HTS_LIBS "%s"\n' \ + '$(hts_cc_escaped)' \ + '$(hts_cppflags_escaped)' \ + '$(hts_cflags_escaped)' \ + '$(hts_ldflags_escaped)' \ + '$(hts_libs_escaped)' > $@ .SUFFIXES: .bundle .c .cygdll .dll .o .pico .so From 59f727ac8d2a49e97fa8857ef9a2ccf195a11b0a Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 5 Dec 2022 11:29:53 +0000 Subject: [PATCH 353/488] Make it easier to modify shared library permissions during install The new LIB_PERM Makefile variable defaults to 644 (as before), but can now be amended to e.g. 755. Updated the INSTALL file with a section for HP-UX explaining how to do this. 
Fixes #1525 --- INSTALL | 11 +++++++++++ Makefile | 3 ++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/INSTALL b/INSTALL index 2f6deacb2..dd2c3ec90 100644 --- a/INSTALL +++ b/INSTALL @@ -110,6 +110,7 @@ The 'make install' command installs the libraries, library header files, utilities, several manual pages, and a pkgconfig file to /usr/local. The installation location can be changed by configuring with --prefix=DIR or via 'make prefix=DIR install' (see Installation Locations below). +Shared library permissions can be set via e.g. 'make install LIB_PERM=755'. Configuration @@ -291,3 +292,13 @@ mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-autotools mingw-w64-x86_64-tools-git (The last is only needed for building libraries compatible with MSVC.) + +HP-UX +----- + +HP-UX requires that shared libraries have execute permission. The +default for HTSlib is to install with permission 644 (read-write for +owner and read-only for group / other). This can be overridden by +setting the LIB_PERM variable at install time with: + + make install LIB_PERM=755 diff --git a/Makefile b/Makefile index 8961d0292..fb5f7a0c7 100644 --- a/Makefile +++ b/Makefile @@ -58,7 +58,8 @@ MKDIR_P = mkdir -p INSTALL = install -p INSTALL_DATA = $(INSTALL) -m 644 INSTALL_DIR = $(MKDIR_P) -m 755 -INSTALL_LIB = $(INSTALL_DATA) +LIB_PERM = 644 +INSTALL_LIB = $(INSTALL) -m $(LIB_PERM) INSTALL_MAN = $(INSTALL_DATA) INSTALL_PROGRAM = $(INSTALL) From a9421b71c654118f52e822164374307f0d593204 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 8 Dec 2022 17:16:00 +0000 Subject: [PATCH 354/488] Switch MacOS CI tests to an ARM-based image Before x86-64 is phased out at the end of the year. 
Uses cirrus-ci recommended container, see: https://cirrus-ci.org/guide/macOS/ --- .cirrus.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 9ac3c645f..dc93b071d 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -181,8 +181,8 @@ arm_ubuntu_task: macosx_task: name: macosx + clang - osx_instance: - image: monterey-base + macos_instance: + image: ghcr.io/cirruslabs/macos-ventura-base:latest environment: CC: clang From 6ed4e1caf4907594e3e348d6e59d73f1898effda Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 7 Dec 2022 11:01:23 +0000 Subject: [PATCH 355/488] Remove variable redeclaration warnings from perl test script --- test/test.pl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test.pl b/test/test.pl index a529cfd1a..a1fa6a7e6 100755 --- a/test/test.pl +++ b/test/test.pl @@ -399,11 +399,11 @@ sub test_bgzip { passed($opts,$test); # Round-trip test of text in binary mode - my $test = sprintf('%s %2s threads', 'bgzip text mode round-trip', + $test = sprintf('%s %2s threads', 'bgzip text mode round-trip', $threads ? $threads : 'no'); print "$test: "; - my $c = "$$opts{bin}/bgzip $at --binary -i -I '$index' < '$data' > '$compressed'"; - my ($ret, $out) = _cmd($c); + $c = "$$opts{bin}/bgzip $at --binary -i -I '$index' < '$data' > '$compressed'"; + ($ret, $out) = _cmd($c); if ($ret) { failed($opts, $test, "non-zero exit from $c"); return; From c58203d03b2c81de3d84c68985c59ce032a5abac Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 8 Dec 2022 09:15:44 +0000 Subject: [PATCH 356/488] Make bcf_hdr_seqnames() work with gapped chromosome ids The bcf_hdr_remove() call can create gaps in tid blocks which fail assertion in bcf_hdr_seqnames(). 
This problem was encountered in #1533, but is only a partial fix of the problem --- htslib/vcf.h | 6 +++++- vcf.c | 36 ++++++++++++++++++++++++++++++------ 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 04784d168..8ab78c079 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -636,7 +636,11 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). HTSLIB_EXPORT bcf_hdr_t *bcf_hdr_subset(const bcf_hdr_t *h0, int n, char *const* samples, int *imap); - /** Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names) */ + /** + * Creates a list of sequence names. It is up to the caller to free the list (but not the sequence names). + * NB: sequence name indexes returned by bcf_hdr_seqnames() may not correspond to bcf1_t.rid, use + * bcf_hdr_id2name() or bcf_seqname() instead. + */ HTSLIB_EXPORT const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *nseqs); diff --git a/vcf.c b/vcf.c index e91b24775..fc96aa24a 100644 --- a/vcf.c +++ b/vcf.c @@ -2214,20 +2214,44 @@ char *bcf_hdr_fmt_text(const bcf_hdr_t *hdr, int is_bcf, int *len) const char **bcf_hdr_seqnames(const bcf_hdr_t *h, int *n) { vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; - int tid, m = kh_size(d); + int i, tid, m = kh_size(d); const char **names = (const char**) calloc(m,sizeof(const char*)); + if ( !names ) + { + hts_log_error("Failed to allocate memory"); + *n = 0; + return NULL; + } khint_t k; for (k=kh_begin(d); k= m ) + { + // This can happen after a contig has been removed from BCF header via bcf_hdr_remove() + if ( hts_resize(const char*, tid + 1, &m, &names, HTS_RESIZE_CLEAR)<0 ) + { + hts_log_error("Failed to allocate memory"); + *n = 0; + free(names); + return NULL; + } + m = tid + 1; + } names[tid] = kh_key(d,k); } - // sanity check: there should be no gaps - for (tid=0; tid Date: Wed, 14 Dec 2022 17:31:55 +0000 Subject: [PATCH 357/488] Cut down the number of embed_ref=2 tests that get run 
The test only runs on the ce#1000.sam file, so it shouldn't be inside the loop that runs on all sam files in the test directory. --- test/test.pl | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/test/test.pl b/test/test.pl index a1fa6a7e6..fc6036f60 100755 --- a/test/test.pl +++ b/test/test.pl @@ -661,14 +661,6 @@ sub test_view testv $opts, "./compare_sam.pl -Baux $md $sam $jsam"; } - # embed_ref=2 mode - my $ersam = "ce#1000.sam"; - my $ercram = "ce#1000_er.tmp.cram"; - my $ersam2 = "${ercram}.sam"; - testv $opts, "./test_view $tv_args -C -p $ercram $ersam"; - testv $opts, "./test_view $tv_args -p $ersam2 $ercram"; - testv $opts, "./compare_sam.pl $ersam $ersam2"; - if ($test_view_failures == 0) { passed($opts, "$sam conversions"); @@ -679,6 +671,21 @@ sub test_view } } + # embed_ref=2 mode + print "test_view testing embed_ref=2:\n"; + $test_view_failures = 0; + my $ersam = "ce#1000.sam"; + my $ercram = "ce#1000_er.tmp.cram"; + my $ersam2 = "${ercram}.sam"; + testv $opts, "./test_view $tv_args -C -p $ercram $ersam"; + testv $opts, "./test_view $tv_args -p $ersam2 $ercram"; + testv $opts, "./compare_sam.pl $ersam $ersam2"; + if ($test_view_failures == 0) { + passed($opts, "embed_ref=2 tests"); + } else { + failed($opts, "embed_ref=2 tests", "$test_view_failures subtests failed"); + } + # BAM and CRAM range queries on prebuilt BAM and CRAM # The cram file has @SQ UR: set to point to an invalid location to # force the reference to be reloaded from the one given on the From 3fd25cd11bf4346bf63ca137d57897bf779dd2fb Mon Sep 17 00:00:00 2001 From: Bergur Ragnarsson Date: Wed, 23 Nov 2022 23:55:33 +0000 Subject: [PATCH 358/488] draft fix --- bgzf.c | 13 ++++--- test/test_bgzf.c | 91 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 4 deletions(-) diff --git a/bgzf.c b/bgzf.c index a969b1567..7bcca554b 100644 --- a/bgzf.c +++ b/bgzf.c @@ -989,6 +989,8 @@ int bgzf_read_block(BGZF *fp) { 
hts_tpool_result *r; + if (fp->errcode) return -1; + if (fp->mt) { again: if (fp->mt->hit_eof) { @@ -1479,6 +1481,8 @@ int bgzf_mt_read_block(BGZF *fp, bgzf_job *j) int64_t block_address; block_address = htell(fp->fp); + j->block_address = block_address; // in case we exit with j->errcode + if (fp->cache_size && load_block_from_cache(fp, block_address)) return 0; count = hpeek(fp->fp, header, sizeof(header)); if (count == 0) // no data read @@ -1667,10 +1671,7 @@ static void *bgzf_mt_reader(void *vp) { hts_tpool_process_destroy(mt->out_queue); return NULL; } - if (j->errcode != 0) { - hts_tpool_process_destroy(mt->out_queue); - return &j->errcode; - } + // TODO: handle j->errcode correctly when SEEK and HAS_EOF arrive // We hit EOF so can stop reading, but we may get a subsequent // seek request. In this case we need to restart the reader. @@ -1706,6 +1707,9 @@ static void *bgzf_mt_reader(void *vp) { pthread_cond_signal(&mt->command_c); pthread_mutex_unlock(&mt->command_m); hts_tpool_process_destroy(mt->out_queue); + if (j->errcode != 0) { + return &j->errcode; + } return NULL; } } @@ -2285,6 +2289,7 @@ int bgzf_getline(BGZF *fp, int delim, kstring_t *str) fp->block_length = 0; } } while (state == 0); + if (state < -1) return state; if (str->l == 0 && state < 0) return state; fp->uncompressed_address += str->l + 1; if ( delim=='\n' && str->l>0 && str->s[str->l-1]=='\r' ) str->l--; diff --git a/test/test_bgzf.c b/test/test_bgzf.c index 90ec167ac..7981e1d26 100644 --- a/test/test_bgzf.c +++ b/test/test_bgzf.c @@ -890,6 +890,8 @@ static int test_bgzf_getline(Files *f, const char *mode, int nthreads) { bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); if (!bgz) goto fail; + if (nthreads > 0 && try_bgzf_mt(bgz, nthreads, __func__) != 0) goto fail; + for (pos = 0; pos < f->ltext; ) { const char *end = strchr(text + pos, '\n'); size_t l = end ? 
end - (text + pos) : f->ltext - pos; @@ -909,6 +911,7 @@ static int test_bgzf_getline(Files *f, const char *mode, int nthreads) { "Got : %.*s\n", __func__, f->tmp_bgzf, (int) l, (char *) f->text + pos, (int) str.l, str.s); + goto fail; } pos += l + 1; @@ -924,6 +927,90 @@ static int test_bgzf_getline(Files *f, const char *mode, int nthreads) { return -1; } +static int test_bgzf_getline_on_truncated_file(Files *f, const char *mode, int nthreads) { + BGZF* bgz = NULL; + ssize_t bg_put; + size_t pos; + kstring_t str = { 0, 0, NULL }; + const char *text = (const char *) f->text; + + bgz = try_bgzf_open(f->tmp_bgzf, mode, __func__); + if (!bgz) goto fail; + + if (nthreads > 0 && try_bgzf_mt(bgz, nthreads, __func__) != 0) goto fail; + + const char *text_line2 = strchr(text, '\n') + 1; + bg_put = try_bgzf_write(bgz, text, text_line2 - text, f->tmp_bgzf, __func__); + if (bg_put < 0) goto fail; + if (bgzf_flush(bgz) < 0) goto fail; + int64_t block2_start = bgz->block_address; + + const char *text_line3 = strchr(text_line2, '\n') + 1; + bg_put = try_bgzf_write(bgz, text_line2, text_line3 - text_line2, f->tmp_bgzf, __func__); + if (bg_put < 0) goto fail; + if (bgzf_flush(bgz) < 0) goto fail; + int64_t block3_start = bgz->block_address; + + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + + for(int64_t newsize = block3_start - 1; newsize > block2_start; newsize--) { + fprintf(stderr, "test truncated %" PRId64 " with threads %d\n", newsize, nthreads); + + if (truncate(f->tmp_bgzf, newsize) != 0) goto fail; + + bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); + if (!bgz) goto fail; + + if (nthreads > 0 && try_bgzf_mt(bgz, nthreads, __func__) != 0) goto fail; + + for (pos = 0; pos < f->ltext; ) { + const char *end = strchr(text + pos, '\n'); + size_t l = end ? 
end - (text + pos) : f->ltext - pos; + + int res = bgzf_getline(bgz, '\n', &str); + if (res < -1) { + // ok, we expect error from truncated file + break; + } else if (res == -1) { + // truncated file should never return EOF since we do not truncate at block boundary + fprintf(stderr, "%s : %s from bgzf_getline on %s\n", + __func__, "Unexpected EOF", + f->tmp_bgzf); + goto fail; + } + + if (str.l != l || memcmp(text + pos, str.s, l) != 0) { + fprintf(stderr, + "%s : Unexpected data from bgzf_getline on %s\n" + "Expected : %.*s\n" + "Got : %.*s\n", + __func__, f->tmp_bgzf, (int) l, (char *) f->text + pos, + (int) str.l, str.s); + goto fail; + } + + pos += l + 1; + } + + // verify we still get error and don't hang if we try again: + int res = bgzf_getline(bgz, '\n', &str); + if (res > -2) { + fprintf(stderr, "%s : unexpected bgzf_getline result %d\n", __func__, res); + goto fail; + } + + // closing a stream with error returns error + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) == 0) goto fail; + } + free(ks_release(&str)); + return 0; + + fail: + if (bgz) bgzf_close(bgz); + free(ks_release(&str)); + return -1; +} + int main(int argc, char **argv) { Files f = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, 0 }; int retval = EXIT_FAILURE; @@ -1000,6 +1087,10 @@ int main(int argc, char **argv) { if (test_bgzf_getline(&f, "w", 1) != 0) goto out; if (test_bgzf_getline(&f, "w", 2) != 0) goto out; + if (test_bgzf_getline_on_truncated_file(&f, "w", 0) != 0) goto out; + if (test_bgzf_getline_on_truncated_file(&f, "w", 1) != 0) goto out; + if (test_bgzf_getline_on_truncated_file(&f, "w", 2) != 0) goto out; + retval = EXIT_SUCCESS; out: From dd63fb089cf7ab4b17f13e9d50aa809a5cb5dafc Mon Sep 17 00:00:00 2001 From: Bergur Ragnarsson Date: Sat, 10 Dec 2022 18:43:45 +0000 Subject: [PATCH 359/488] fix memory leak --- bgzf.c | 1 + test/test_bgzf.c | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bgzf.c b/bgzf.c index 7bcca554b..737c90bc9 100644 --- 
a/bgzf.c +++ b/bgzf.c @@ -1020,6 +1020,7 @@ int bgzf_read_block(BGZF *fp) hts_log_error("BGZF decode jobs returned error %d " "for block offset %"PRId64, j->errcode, j->block_address); + hts_tpool_delete_result(r, 0); return -1; } diff --git a/test/test_bgzf.c b/test/test_bgzf.c index 7981e1d26..de7423078 100644 --- a/test/test_bgzf.c +++ b/test/test_bgzf.c @@ -954,7 +954,7 @@ static int test_bgzf_getline_on_truncated_file(Files *f, const char *mode, int n if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; for(int64_t newsize = block3_start - 1; newsize > block2_start; newsize--) { - fprintf(stderr, "test truncated %" PRId64 " with threads %d\n", newsize, nthreads); + //fprintf(stderr, "test truncated %" PRId64 " with threads %d\n", newsize, nthreads); if (truncate(f->tmp_bgzf, newsize) != 0) goto fail; From d7737aa69484191bb3ce4e0c20dec10d79231820 Mon Sep 17 00:00:00 2001 From: Bergur Ragnarsson Date: Wed, 14 Dec 2022 23:14:17 +0000 Subject: [PATCH 360/488] exit early on error --- bgzf.c | 15 +++++++++------ test/test_bgzf.c | 20 +++++++++++--------- 2 files changed, 20 insertions(+), 15 deletions(-) diff --git a/bgzf.c b/bgzf.c index 737c90bc9..1576683b8 100644 --- a/bgzf.c +++ b/bgzf.c @@ -1007,11 +1007,14 @@ int bgzf_read_block(BGZF *fp) if (fp->uncompressed_block == NULL) return -1; fp->compressed_block = (char *)fp->uncompressed_block + BGZF_MAX_BLOCK_SIZE; } // else it's already allocated with malloc, maybe even in-use. 
- if (mt_destroy(fp->mt) < 0) + if (mt_destroy(fp->mt) < 0) { fp->errcode = BGZF_ERR_IO; + } fp->mt = NULL; hts_tpool_delete_result(r, 0); - + if (fp->errcode) { + return -1; + } goto single_threaded; } @@ -1672,7 +1675,10 @@ static void *bgzf_mt_reader(void *vp) { hts_tpool_process_destroy(mt->out_queue); return NULL; } - // TODO: handle j->errcode correctly when SEEK and HAS_EOF arrive + if (j->errcode != 0) { + hts_tpool_process_destroy(mt->out_queue); + return &j->errcode; + } // We hit EOF so can stop reading, but we may get a subsequent // seek request. In this case we need to restart the reader. @@ -1708,9 +1714,6 @@ static void *bgzf_mt_reader(void *vp) { pthread_cond_signal(&mt->command_c); pthread_mutex_unlock(&mt->command_m); hts_tpool_process_destroy(mt->out_queue); - if (j->errcode != 0) { - return &j->errcode; - } return NULL; } } diff --git a/test/test_bgzf.c b/test/test_bgzf.c index de7423078..5b2192986 100644 --- a/test/test_bgzf.c +++ b/test/test_bgzf.c @@ -953,8 +953,9 @@ static int test_bgzf_getline_on_truncated_file(Files *f, const char *mode, int n if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; - for(int64_t newsize = block3_start - 1; newsize > block2_start; newsize--) { - //fprintf(stderr, "test truncated %" PRId64 " with threads %d\n", newsize, nthreads); + int64_t newsize; + for(newsize = block3_start - 1; newsize > block2_start; newsize--) { + //fprintf(stderr, "test_bgzf_getline_on_truncated_file : size truncated to %" PRId64 " with threads %d\n", newsize, nthreads); if (truncate(f->tmp_bgzf, newsize) != 0) goto fail; @@ -988,17 +989,18 @@ static int test_bgzf_getline_on_truncated_file(Files *f, const char *mode, int n (int) str.l, str.s); goto fail; } - pos += l + 1; } - // verify we still get error and don't hang if we try again: - int res = bgzf_getline(bgz, '\n', &str); - if (res > -2) { - fprintf(stderr, "%s : unexpected bgzf_getline result %d\n", __func__, res); - goto fail; + // verify error is persistent + int 
k; + for(k = 0; k < 3; k++) { + int res = bgzf_getline(bgz, '\n', &str); + if (res > -2) { + fprintf(stderr, "%s : unexpected bgzf_getline result %d\n", __func__, res); + goto fail; + } } - // closing a stream with error returns error if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) == 0) goto fail; } From f56be61b91bfc57983d826c5b4b926999a115676 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Fri, 6 Jan 2023 16:21:20 +0000 Subject: [PATCH 361/488] Make bcf_hdr_idinfo_exists() more robust Do not rely on int_id being return value of bcf_hdr_id2int() Resolves #1538 --- htslib/vcf.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 8ab78c079..03ec57f7f 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -1230,7 +1230,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). #define bcf_hdr_id2number(hdr,type,int_id) ((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>12) #define bcf_hdr_id2type(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type]>>4 & 0xf) #define bcf_hdr_id2coltype(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf) - #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id)>=0 && bcf_hdr_id2coltype((hdr),(type),(int_id))!=0xf) + #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id)>=0 && (int_id)<(hdr)->n[BCF_DT_ID] && (hdr)->id[BCF_DT_ID][int_id].val && bcf_hdr_id2coltype((hdr),(type),(int_id))!=0xf) #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id) ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)]) /// Convert BCF FORMAT data to string form /** From 4ec92c177f4eaf8ad73f592c32544d73b35795ad Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 13 Jan 2023 15:44:56 +0000 Subject: [PATCH 362/488] Add CRAM SQ/M5 header checking when specifying a fasta file. (PR #1522) * Add CRAM SQ/M5 header checking when specifying a fasta file. 
Given the possibility of creating a CRAM which cannot be decoded again due to specifying a fasta file that differs from the M5 tags specified in the header, we're now more militant in enforcing that these match. The references are checked on-the-fly as they're used. For name sorted data this may have a significant initial cost (the same that we see when converting from BAM to CRAM without M5 tags in the BAM header), but for position streaming the cost is only for chromosomes currently in use. This means writing a small CRAM file covering a region from just one chromosome will only incur the CPU cost of validating that one chromosome, as we're not trying to check the header tags are valid (we take them at their word) and are instead trying to ensure the fasta sequence we've been given matches the supplied headers. * Correct CRAM M5 header creation bug with refs out of order. Note this doesn't happen on the command line, so it's something specific to the way test/sam.c is written that triggers this. test/sam.c calls cram_load_reference before the SAM header is known. That in turn means refs2id hasn't been called and fd->refs array isn't in sync with the file header (as we have a subset of refs). Then cram_write_SAM_hdr md5sums the wrong reference when creating SQ M5 tags. There was already code in the cram_write_SAM_hdr to re-call refs2id to get things in sync, but unfortunately it was after the creation of M5 tags. What's odd is how this ever worked in test/sam. Maybe it just didn't and we never noticed? It was only spotted here due to the new validation code. 
--- cram/cram_encode.c | 58 ++++++++++++++++++++++++++++++++++++++++++++- cram/cram_io.c | 14 +++++++---- cram/cram_structs.h | 1 + 3 files changed, 67 insertions(+), 6 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 1ba1988f4..6371e5e39 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1688,6 +1688,54 @@ static int cram_generate_reference(cram_container *c, cram_slice *s, int r1) { return -1; } +// Check if the SQ M5 tag matches the reference we've loaded. +static int validate_md5(cram_fd *fd, int ref_id) { + if (fd->ignore_md5 || ref_id < 0 || ref_id >= fd->refs->nref) + return 0; + + // Have we already checked this ref? + if (fd->refs->ref_id[ref_id]->validated_md5) + return 0; + + // Check if we have the MD5 known. + // We should, but maybe we're using embedded references? + sam_hrecs_t *hrecs = fd->header->hrecs; + sam_hrec_type_t *ty = sam_hrecs_find_type_id(hrecs, "SQ", "SN", + hrecs->ref[ref_id].name); + if (!ty) + return 0; + + sam_hrec_tag_t *m5tag = sam_hrecs_find_key(ty, "M5", NULL); + if (!m5tag) + return 0; + + // It's known, so compute md5 on the loaded reference sequence. + char *ref = fd->refs->ref_id[ref_id]->seq; + int64_t len = fd->refs->ref_id[ref_id]->length; + hts_md5_context *md5; + char unsigned buf[16]; + char buf2[33]; + + if (!(md5 = hts_md5_init())) + return -1; + hts_md5_update(md5, ref, len); + hts_md5_final(buf, md5); + hts_md5_destroy(md5); + hts_md5_hex(buf2, buf); + + // Compare it to header @SQ M5 tag + if (strcmp(m5tag->str+3, buf2)) { + hts_log_error("SQ header M5 tag discrepancy for reference '%s'", + hrecs->ref[ref_id].name); + hts_log_error("Please use the correct reference, or " + "consider using embed_ref=2"); + return -1; + } + fd->refs->ref_id[ref_id]->validated_md5 = 1; + + return 0; +} + /* * Encodes all slices in a container into blocks. 
* Returns 0 on success @@ -1715,8 +1763,11 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (!fd->no_ref && c->refs_used) { for (i = 0; i < nref; i++) { - if (c->refs_used[i]) + if (c->refs_used[i]) { cram_get_ref(fd, i, 1, 0); + if (validate_md5(fd, i) < 0) + goto_err; + } } } @@ -1744,6 +1795,9 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { embed_ref = fd->embed_ref = 2; pthread_mutex_unlock(&fd->ref_lock); goto auto_ref; + } else if (ref) { + if (validate_md5(fd, c->ref_seq_id) < 0) + goto_err; } if ((c->ref_id = bam_ref(b)) >= 0) { c->ref_seq_id = c->ref_id; @@ -1813,6 +1867,8 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { free(MD.s); return -1; } + if (validate_md5(fd, bam_ref(b)) < 0) + return -1; c->ref_seq_id = bam_ref(b); // overwritten later by -2 if (!fd->refs->ref_id[c->ref_seq_id]->seq) diff --git a/cram/cram_io.c b/cram/cram_io.c index 7bb098b13..d04cc282a 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -2586,6 +2586,7 @@ static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) { e->seq = NULL; e->mf = NULL; e->is_md5 = 0; + e->validated_md5 = 0; k = kh_put(refs, r->h_meta, e->name, &n); if (-1 == n) { @@ -3022,6 +3023,7 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { fd->refs->fp = fp; fd->refs->fn = r->fn; r->is_md5 = 1; + r->validated_md5 = 1; // Fall back to cram_get_ref() where it'll do the actual // reading of the file. 
@@ -3043,6 +3045,7 @@ static int cram_populate_ref(cram_fd *fd, int id, ref_entry *r) { } r->length = sz; r->is_md5 = 1; + r->validated_md5 = 1; } else { refs_t *refs; const char *fn; @@ -4826,6 +4829,11 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { return -1; } + if (-1 == refs_from_header(fd)) + return -1; + if (-1 == refs2id(fd->refs, fd->header)) + return -1; + /* Fix M5 strings */ if (fd->refs && !fd->no_ref && fd->embed_ref <= 1) { int i; @@ -4873,6 +4881,7 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { cram_ref_decr(fd->refs, i); hts_md5_hex(buf2, buf); + fd->refs->ref_id[i]->validated_md5 = 1; if (sam_hdr_update_line(hdr, "SQ", "SN", hdr->hrecs->ref[i].name, "M5", buf2, NULL)) return -1; } @@ -5000,11 +5009,6 @@ int cram_write_SAM_hdr(cram_fd *fd, sam_hdr_t *hdr) { cram_free_container(c); } - if (-1 == refs_from_header(fd)) - return -1; - if (-1 == refs2id(fd->refs, fd->header)) - return -1; - if (0 != hflush(fd->fp)) return -1; diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 16739c2c6..cbb226b70 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -675,6 +675,7 @@ typedef struct ref_entry { char *seq; mFILE *mf; int is_md5; // Reference comes from a raw seq found by MD5 + int validated_md5; } ref_entry; KHASH_MAP_INIT_STR(refs, ref_entry*) From fd90307716f4ec11f008b8b85839d6f04d61b5cc Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 4 Jan 2023 12:32:20 +0000 Subject: [PATCH 363/488] Cap bgzf_getline return value to INT_MAX Prevents overflow on long lines, which could result in a negative number being returned even if the read was otherwise successful. 
--- bgzf.c | 2 +- htslib/bgzf.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bgzf.c b/bgzf.c index 1576683b8..468289106 100644 --- a/bgzf.c +++ b/bgzf.c @@ -2298,7 +2298,7 @@ int bgzf_getline(BGZF *fp, int delim, kstring_t *str) fp->uncompressed_address += str->l + 1; if ( delim=='\n' && str->l>0 && str->s[str->l-1]=='\r' ) str->l--; str->s[str->l] = 0; - return str->l; + return str->l <= INT_MAX ? (int) str->l : INT_MAX; } void bgzf_index_destroy(BGZF *fp) diff --git a/htslib/bgzf.h b/htslib/bgzf.h index c4ba85679..9e4ab3794 100644 --- a/htslib/bgzf.h +++ b/htslib/bgzf.h @@ -303,7 +303,8 @@ typedef struct BGZF BGZF; * @param fp BGZF file handler * @param delim delimiter * @param str string to write to; must be initialized - * @return length of the string; -1 on end-of-file; <= -2 on error + * @return length of the string (capped at INT_MAX); + * -1 on end-of-file; <= -2 on error */ HTSLIB_EXPORT int bgzf_getline(BGZF *fp, int delim, struct kstring_t *str); From 2060c498267fc9d24375d3c1852ec9b3524879f7 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 4 Jan 2023 12:35:34 +0000 Subject: [PATCH 364/488] Make tbx_parse1 work for lines longer than 2Gbytes Change the type of len, i and b so they can index positions over 2Gbytes into the input string. Naturally this is only useful on 64 bit platforms as the entire input needs to be in memory. Luckily this function isn't exported, so changing the type of len does not affect the ABI. 
--- tbx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tbx.c b/tbx.c index 61d2ccd65..084a82c8d 100644 --- a/tbx.c +++ b/tbx.c @@ -91,9 +91,10 @@ int tbx_name2id(tbx_t *tbx, const char *ss) return get_tid(tbx, ss, 0); } -int tbx_parse1(const tbx_conf_t *conf, int len, char *line, tbx_intv_t *intv) +int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv) { - int i, b = 0, id = 1; + size_t i, b = 0; + int id = 1; char *s; intv->ss = intv->se = 0; intv->beg = intv->end = -1; for (i = 0; i <= len; ++i) { From 9b6f7e1d083ddd2796734acee15db92841afa8a4 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 4 Jan 2023 15:40:08 +0000 Subject: [PATCH 365/488] Use correct type for ret in vcf_write() It needs to be ssize_t to prevent possible overflow when writing very long records on 64 bit platforms. --- vcf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vcf.c b/vcf.c index fc96aa24a..851d32e88 100644 --- a/vcf.c +++ b/vcf.c @@ -3437,7 +3437,7 @@ int vcf_write_line(htsFile *fp, kstring_t *line) int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) { - int ret; + ssize_t ret; fp->line.l = 0; if (vcf_format1(h, v, &fp->line) != 0) return -1; From 46cfe856729544e2383003c223adb8db6bad1780 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 6 Jan 2023 16:58:05 +0000 Subject: [PATCH 366/488] Don't error when making an iterator on a tid not in the index Instead return one that instantly finishes. Fixes an edge case (issue #1534) where an index did not include an entry for a chromosome that was mentioned in the file header but had no data records. Normally these would be present but empty, but it was possible to use the IDX= key in a VCF file to make an index where the chromosome simply did not appear. In this case, rather than an error, we want to return the equivalent of HTS_IDX_NONE so the iterator produces no data. 
Another scenario where this is useful is if you build an index, and then try to use it immediately without first saving and reading it back in again. Such an index will have NULL entries in bidx[] for any chromosomes with no data. Again we want to return an HTS_IDX_NONE iterator if one of those chromosomes is queried. (This issue didn't usually occur because most programs are loading in an existing index, and idx_read_core() makes bidx[] entries for everything even if there's nothing in the index for the chromosome.) Note that this changes vcf_loop() in test_view.c so that it now treats bcf_itr_querys() failures as an error. The new behaviour matches sam_loop() and is needed to detect the problem being fixed here. All the other tests still work after this change no nothing was relying on the old behaviour of ignoring the errors. --- hts.c | 6 ++---- test/modhdr.expected.vcf | 4 ++++ test/modhdr.vcf.gz | Bin 0 -> 156 bytes test/modhdr.vcf.gz.csi | Bin 0 -> 86 bytes test/test.pl | 5 +++++ test/test_view.c | 3 ++- 6 files changed, 13 insertions(+), 5 deletions(-) create mode 100644 test/modhdr.expected.vcf create mode 100644 test/modhdr.vcf.gz create mode 100644 test/modhdr.vcf.gz.csi diff --git a/hts.c b/hts.c index c79d92d99..66b8643e8 100644 --- a/hts.c +++ b/hts.c @@ -3077,16 +3077,14 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t free(iter); iter = NULL; } + } else if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) { + iter->finished = 1; } else { if (beg < 0) beg = 0; if (end < beg) { free(iter); return NULL; } - if (tid >= idx->n || (bidx = idx->bidx[tid]) == NULL) { - free(iter); - return NULL; - } k = kh_get(bin, bidx, META_BIN(idx)); if (k != kh_end(bidx)) diff --git a/test/modhdr.expected.vcf b/test/modhdr.expected.vcf new file mode 100644 index 000000000..bad663c7e --- /dev/null +++ b/test/modhdr.expected.vcf @@ -0,0 +1,4 @@ +##fileformat=VCFv4.3 +##FILTER= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO diff 
--git a/test/modhdr.vcf.gz b/test/modhdr.vcf.gz new file mode 100644 index 0000000000000000000000000000000000000000..f97e06ab3dee21b1d51002f83957a02a8a90f6f4 GIT binary patch literal 156 zcmb2|=3rp}f&Xj_PR>jW^$e^Go?OiaA8dXei&=W*>lMuo!xDwute6j4kJ$7S4u0o; zv}4WTiaF8GSDd^eYU|W^LwN+jWmJG#w-%_3=CnO{Q@q|Pmodm=!mw15S6weJthUe$~XIcO? O$fN0$W?%*z1R? "$$opts{bin}/htsfile -c $$opts{path}/formatmissing.vcf"); test_cmd($opts, %args, out => "vcf_meta_meta.vcf", cmd => "$$opts{bin}/htsfile -c $$opts{path}/vcf_meta_meta.vcf"); + + # VCF file with contig IDX=1, simulating an edited BCF file + # See htslib issue 1534 + test_cmd($opts, %args, out => "modhdr.expected.vcf", + cmd => "$$opts{path}/test_view $$opts{path}/modhdr.vcf.gz chr22:1-2"); } sub write_multiblock_bgzf { diff --git a/test/test_view.c b/test/test_view.c index 416ffe98d..f33c1cdf0 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -224,7 +224,8 @@ int vcf_loop(int argc, char **argv, int optind, struct opts *opts, htsFile *in, hts_itr_t *iter; if ((iter = bcf_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); - continue; + exit_code = 1; + break; } while ((r = bcf_itr_next(in, iter, b)) >= 0) { if (!opts->benchmark && bcf_write1(out, h, b) < 0) { From 53f980829d3c906ff1d0506212a3e2c8c83b2cb3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 12 Jan 2023 16:11:58 +0000 Subject: [PATCH 367/488] Happy New Year --- LICENSE | 4 ++-- bgzip.c | 2 +- htsfile.c | 2 +- tabix.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/LICENSE b/LICENSE index cf2e9f82a..925d47b40 100644 --- a/LICENSE +++ b/LICENSE @@ -3,7 +3,7 @@ according to the terms of the following MIT/Expat license.] The MIT/Expat License -Copyright (C) 2012-2022 Genome Research Ltd. +Copyright (C) 2012-2023 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -29,7 +29,7 @@ according to the terms of the following Modified 3-Clause BSD license.] The Modified-BSD License -Copyright (C) 2012-2022 Genome Research Ltd. +Copyright (C) 2012-2023 Genome Research Ltd. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: diff --git a/bgzip.c b/bgzip.c index 67396f983..6961bbfa4 100644 --- a/bgzip.c +++ b/bgzip.c @@ -176,7 +176,7 @@ int main(int argc, char **argv) case 1: printf( "bgzip (htslib) %s\n" -"Copyright (C) 2022 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2023 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 2: binary = 1; break; case 'h': return bgzip_main_usage(stdout, EXIT_SUCCESS); diff --git a/htsfile.c b/htsfile.c index 0391a98fa..9f7bf4531 100644 --- a/htsfile.c +++ b/htsfile.c @@ -258,7 +258,7 @@ int main(int argc, char **argv) case 1: printf( "htsfile (htslib) %s\n" -"Copyright (C) 2022 Genome Research Ltd.\n", +"Copyright (C) 2023 Genome Research Ltd.\n", hts_version()); exit(EXIT_SUCCESS); break; diff --git a/tabix.c b/tabix.c index a79a7b968..0798b279f 100644 --- a/tabix.c +++ b/tabix.c @@ -581,7 +581,7 @@ int main(int argc, char *argv[]) case 1: printf( "tabix (htslib) %s\n" -"Copyright (C) 2022 Genome Research Ltd.\n", hts_version()); +"Copyright (C) 2023 Genome Research Ltd.\n", hts_version()); return EXIT_SUCCESS; case 2: return usage(stdout, EXIT_SUCCESS); From 114f5ebdd560d64ec3b256c822a9eb127321aa3d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 17 Jan 2023 11:01:32 +0000 Subject: [PATCH 368/488] Speed up load_ref_portion. This function is about 7x faster than before, which speeds up low-depth CRAM decoding by around 10% of so. 
Obviously the time spent in this function is constant regardless of depth, so the deeper the data the less important the speed of this becomes. The two main improvements are: 1. Drop toupper_c and replace it with "c & ~0x20". This works for ASCII, and we already have far too many places with char lookup tables e.g. converting ACGT to 0123 that we're not going to work on mythical EBCDIC systems anyway. 2. Remove the continuous white-space check. We exploit the knowledge that the FASTA format must have white-space only at the end of lines. The fai index can't work if this isn't true and I've already tested that samtools faidx fails to query correctly if we have whitespace elsewhere. Some benchmarks are below. I can't explain why mmap is being slow on this system (seq4c). It's not what I've observed before, where mmap is normally the fastest way to load the reference. 10 million records 9827_1#49 at a mean depth of ~1.75x Cram_io.c built with clang10 (although rest was probably system gcc). Reference via mmap is: ./test/test_view -B /tmp/_.cram 58752578630 cycles # 2.984 GHz 115846154170 instructions # 1.97 insn per cycle 20.41% 16536 test_view [.] rans_uncompress_O1 > 10.55% 8560 [kernel] [k] _etext 8.06% 6532 test_view [.] cram_decode_slice 7.85% 6356 test_view [.] RansDecRenorm2 5.98% 4895 test_view [.] body 5.70% 4622 test_view [.] cram_decode_seq.isra.11 4.71% 3860 libz.so.1.2.11 [.] crc32_z 4.54% 3744 libz.so.1.2.11 [.] inflateBackEnd 4.08% 3289 test_view [.] bam_set1 Not sure what "_etext" is, but it's a significant CPU portion. Develop branch: ./test/test_view -i reference=$HREF -B /tmp/_.cram 58397677857 cycles # 2.982 GHz 122583155370 instructions # 2.10 insn per cycle 20.37% 16483 test_view [.] rans_uncompress_O1 > 8.03% 6482 test_view [.] load_ref_portion 7.99% 6489 test_view [.] cram_decode_slice 7.66% 6201 test_view [.] RansDecRenorm2 5.77% 4715 test_view [.] body 5.71% 4638 test_view [.] cram_decode_seq.isra.11 5.40% 4381 libz.so.1.2.11 [.] 
crc32_z 4.28% 3454 test_view [.] bam_set1 3.62% 2943 test_view [.] cram_external_decode_block 3.57% 2901 libz.so.1.2.11 [.] inflateBackEnd 3.24% 2634 test_view [.] cram_byte_array_stop_decode_block 3.21% 2606 libc-2.27.so [.] __memmove_sse2_unaligned_erms 2.78% 2257 test_view [.] cram_external_decode_int 2.48% 2016 test_view [.] cram_byte_array_len_decode 2.10% 1705 test_view [.] safe_itf8_get 2.00% 1618 test_view [.] rans_uncompress_O0 > 1.72% 1405 [kernel] [k] _etext "_etext" plummets, so it's something related to the mmap, but it's been replaced by a heavy load_ref_portion instead. Old dev loop, but using &~0x20 instead of toupper_c. ./test/test_view -i reference=$HREF -B /tmp/_.cram 54701579549 cycles # 2.983 GHz 119136523274 instructions # 2.18 insn per cycle 21.35% 16066 test_view [.] rans_uncompress_O1 8.83% 6669 test_view [.] cram_decode_slice 8.50% 6395 test_view [.] RansDecRenorm2 6.23% 4749 test_view [.] body 6.23% 4708 test_view [.] cram_decode_seq.isra.11 5.82% 4391 libz.so.1.2.11 [.] crc32_z 4.44% 3329 test_view [.] bam_set1 3.86% 2917 test_view [.] cram_external_decode_block 3.83% 2892 libz.so.1.2.11 [.] inflateBackEnd > 3.59% 2699 test_view [.] load_ref_portion 3.34% 2528 test_view [.] cram_byte_array_stop_decode_block 3.33% 2511 libc-2.27.so [.] __memmove_sse2_unaligned_erms 2.98% 2250 test_view [.] cram_external_decode_int 2.67% 2015 test_view [.] cram_byte_array_len_decode 2.30% 1736 test_view [.] safe_itf8_get 2.16% 1620 test_view [.] rans_uncompress_O0 > 1.87% 1432 [kernel] [k] _etext load_ref_portion dropped from 6482 to 2699. New loop construction (this PR): ./test/test_view -i reference=$HREF -B /tmp/_.cram 53294682517 cycles # 2.982 GHz 114450133099 instructions # 2.15 insn per cycle 22.21% 16428 test_view [.] rans_uncompress_O1 8.66% 6432 test_view [.] cram_decode_slice 8.45% 6248 test_view [.] RansDecRenorm2 6.54% 4862 test_view [.] cram_decode_seq.isra.11 6.34% 4750 test_view [.] body 5.95% 4412 libz.so.1.2.11 [.] 
crc32_z 4.53% 3343 test_view [.] bam_set1 3.99% 2968 test_view [.] cram_external_decode_block 3.95% 2928 libz.so.1.2.11 [.] inflateBackEnd 3.54% 2635 test_view [.] cram_byte_array_stop_decode_block 3.33% 2465 libc-2.27.so [.] __memmove_sse2_unaligned_erms 2.96% 2201 test_view [.] cram_external_decode_int 2.80% 2079 test_view [.] cram_byte_array_len_decode 2.40% 1786 test_view [.] safe_itf8_get 2.22% 1643 test_view [.] rans_uncompress_O0 1.84% 1363 libc-2.27.so [.] __memset_sse2_unaligned_erms > 1.77% 1328 [kernel] [k] _etext > 1.29% 951 test_view [.] load_ref_portion load_ref_portion dropped again from 2699 to 951. --- cram/cram_io.c | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index d04cc282a..420d4bc91 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -3219,6 +3219,10 @@ static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) { /* * Compute locations in file. This is trivial for the MD5 files, but * is still necessary for the fasta variants. + * + * Note the offset here, as with faidx, has the assumption that white- + * space (the diff between line_length and bases_per_line) only occurs + * at the end of a line of text. */ offset = e->line_length ? e->offset + (start-1)/e->bases_per_line * e->line_length + @@ -3247,14 +3251,34 @@ static char *load_ref_portion(BGZF *fp, ref_entry *e, int start, int end) { /* Strip white-space if required. */ if (len != end-start+1) { - int i, j; + hts_pos_t i, j; char *cp = seq; char *cp_to; + // Copy up to the first white-space, and then repeatedly just copy + // bases_per_line verbatim, and use the slow method to end again. + // + // This may seem excessive, but this code can be a significant + // portion of total CRAM decode CPU time for shallow data sets. for (i = j = 0; i < len; i++) { - if (cp[i] >= '!' 
&& cp[i] <= '~') - cp[j++] = toupper_c(cp[i]); + if (!isspace_c(cp[i])) + cp[j++] = cp[i] & ~0x20; + else + break; } + while (i < len && isspace_c(cp[i])) + i++; + while (i < len - e->line_length) { + hts_pos_t j_end = j + e->bases_per_line; + while (j < j_end) + cp[j++] = cp[i++] & ~0x20; // toupper equiv + i += e->line_length - e->bases_per_line; + } + for (; i < len; i++) { + if (!isspace_c(cp[i])) + cp[j++] = cp[i] & ~0x20; + } + cp_to = cp+j; if (cp_to - seq != end-start+1) { From 826ceea8c4cc59a67e34f724c596cba2c7e4889c Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 20 Jan 2023 10:05:41 +0000 Subject: [PATCH 369/488] Expand CRAM API a bit to cope with new samtools cram_size command. (PR #1546) - cram_block_get_content_id is updated to return -1 for CORE block, removing the need to add a content_type API and share that enum. - New cram_block_get_method() method - New cram_expand_method() method to provide extended information about the block compression used. - New cram_codec_get_content_ids, which externalises the existing cram_codec_to_id (with a hopefully better API and name). - Make cram_decode_compression_header and cram_free_compression_header external. The struct itself is opaque. - Added cram_update_cid2ds_map, cram_cid2ds_query and cram_cid2ds_free to produce, query and free a block Content-ID to Data-Series mapping from the meta-data held within a CRAM Compression Header. - Added a new cram_codec iterator to sequentially step through all cram_codec structures produced by decoding a compression header. This new code is entirely internal to htslib and is solely used by the new cram_update_cid2ds_map function. - Add a codec->describe method. 
This produces a text description of codec parameters, such as EXTERNAL(id=1) HUFFMAN(codes={5,1,3},lengths={1,2,2}) BYTE_ARRAY_LEN(len_codec={EXTERNAL(id=42)},val_codec={EXTERNAL(id=37)} Adds the external API: int cram_codec_describe(cram_codec *c, kstring_t *ks) TODO: Some of the proposed CRAM 4.0 codecs don't yet have descriptions, but these aren't official codecs anyway. - Add a function to describe all encodings in a compression header block: int cram_describe_encodings(cram_block_compression_hdr *hdr, kstring_t *ks); - Add cram_container_get_num_records and cram_container_get_num_bases functions. These are two new external APIs for querying sequence meta-data from containers, which in theory could be used for a (non-filtering) "view -c" cmd. --- cram/cram_codecs.c | 96 +++++++++- cram/cram_codecs.h | 7 +- cram/cram_external.c | 428 ++++++++++++++++++++++++++++++++++++++++++- htslib/cram.h | 146 ++++++++++++++- 4 files changed, 669 insertions(+), 8 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 33e1b5bf8..21240c141 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2021 Genome Research Ltd. +Copyright (c) 2012-2021,2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -447,6 +447,11 @@ cram_block *cram_external_get_block(cram_slice *slice, cram_codec *c) { return cram_get_block_by_id(slice, c->u.external.content_id); } +int cram_external_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "EXTERNAL(id=%d)", + c->u.external.content_id) < 0 ? 
-1 : 0; +} + cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -494,6 +499,7 @@ cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr, c->free = cram_external_decode_free; c->size = cram_external_decode_size; c->get_block = cram_external_get_block; + c->describe = cram_external_describe; c->u.external.content_id = vv->varint_get32(&cp, data+size, NULL); @@ -739,6 +745,14 @@ cram_block *cram_varint_get_block(cram_slice *slice, cram_codec *c) { return cram_get_block_by_id(slice, c->u.varint.content_id); } +int cram_varint_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "VARINT(id=%d,offset=%"PRId64",type=%d)", + c->u.varint.content_id, + c->u.varint.offset, + c->u.varint.type) + < 0 ? -1 : 0; +} + cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -774,6 +788,7 @@ cram_codec *cram_varint_decode_init(cram_block_compression_hdr *hdr, c->free = cram_varint_decode_free; c->size = cram_varint_decode_size; c->get_block = cram_varint_get_block; + c->describe = cram_varint_describe; c->u.varint.content_id = vv->varint_get32 (&cp, cp_end, NULL); c->u.varint.offset = vv->varint_get64s(&cp, cp_end, NULL); @@ -942,6 +957,11 @@ int cram_const_decode_size(cram_slice *slice, cram_codec *c) { return 0; } +int cram_const_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "CONST(val=%"PRId64")", + c->u.xconst.val) < 0 ? 
-1 : 0; +} + cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -963,6 +983,7 @@ cram_codec *cram_const_decode_init(cram_block_compression_hdr *hdr, c->free = cram_const_decode_free; c->size = cram_const_decode_size; c->get_block = NULL; + c->describe = cram_const_describe; c->u.xconst.val = vv->varint_get64s(&cp, data+size, NULL); @@ -1091,6 +1112,12 @@ void cram_beta_decode_free(cram_codec *c) { free(c); } +int cram_beta_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "BETA(offset=%d, nbits=%d)", + c->u.beta.offset, c->u.beta.nbits) + < 0 ? -1 : 0; +} + cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -1115,6 +1142,7 @@ cram_codec *cram_beta_decode_init(cram_block_compression_hdr *hdr, return NULL; } c->free = cram_beta_decode_free; + c->describe = cram_beta_describe; c->u.beta.nbits = -1; c->u.beta.offset = vv->varint_get32(&cp, data + size, NULL); @@ -1405,6 +1433,7 @@ cram_codec *cram_xpack_decode_init(cram_block_compression_hdr *hdr, c->free = cram_xpack_decode_free; c->size = cram_xpack_decode_size; c->get_block = cram_xpack_get_block; + c->describe = NULL; c->u.xpack.nbits = vv->varint_get32(&cp, endp, NULL); c->u.xpack.nval = vv->varint_get32(&cp, endp, NULL); @@ -1735,6 +1764,7 @@ cram_codec *cram_xdelta_decode_init(cram_block_compression_hdr *hdr, c->free = cram_xdelta_decode_free; c->size = cram_xdelta_decode_size; c->get_block = cram_xdelta_get_block; + c->describe = NULL; c->u.xdelta.word_size = vv->varint_get32(&cp, endp, NULL); c->u.xdelta.last = 0; @@ -2135,6 +2165,7 @@ cram_codec *cram_xrle_decode_init(cram_block_compression_hdr *hdr, c->free = cram_xrle_decode_free; c->size = cram_xrle_decode_size; c->get_block = cram_xrle_get_block; + c->describe = NULL; c->u.xrle.cur_len = 0; c->u.xrle.cur_lit = -1; @@ -2423,6 +2454,13 @@ void cram_subexp_decode_free(cram_codec *c) { free(c); } +int 
cram_subexp_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "SUBEXP(offset=%d,k=%d)", + c->u.subexp.offset, + c->u.subexp.k) + < 0 ? -1 : 0; +} + cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -2442,6 +2480,7 @@ cram_codec *cram_subexp_decode_init(cram_block_compression_hdr *hdr, c->codec = E_SUBEXP; c->decode = cram_subexp_decode; c->free = cram_subexp_decode_free; + c->describe = cram_subexp_describe; c->u.subexp.k = -1; c->u.subexp.offset = vv->varint_get32(&cp, data + size, NULL); @@ -2489,6 +2528,11 @@ void cram_gamma_decode_free(cram_codec *c) { free(c); } +int cram_gamma_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "GAMMA(offset=%d)", c->u.subexp.offset) + < 0 ? -1 : 0; +} + cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -2511,6 +2555,7 @@ cram_codec *cram_gamma_decode_init(cram_block_compression_hdr *hdr, c->codec = E_GAMMA; c->decode = cram_gamma_decode; c->free = cram_gamma_decode_free; + c->describe = cram_gamma_describe; c->u.gamma.offset = vv->varint_get32(&cp, data+size, NULL); @@ -2703,6 +2748,22 @@ int cram_huffman_decode_long(cram_slice *slice, cram_codec *c, return 0; } +int cram_huffman_describe(cram_codec *c, kstring_t *ks) { + int r = 0, n; + r |= ksprintf(ks, "HUFFMAN(codes={") < 0; + for (n = 0; n < c->u.huffman.ncodes; n++) { + r |= ksprintf(ks, "%s%"PRId64, n?",":"", + c->u.huffman.codes[n].symbol); + } + r |= ksprintf(ks, "},lengths={") < 0; + for (n = 0; n < c->u.huffman.ncodes; n++) { + r |= ksprintf(ks, "%s%d", n?",":"", + c->u.huffman.codes[n].len); + } + r |= ksprintf(ks, "})") < 0; + return r; +} + /* * Initialises a huffman decoder from an encoding data stream. 
*/ @@ -2865,6 +2926,7 @@ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, } else { return NULL; } + h->describe = cram_huffman_describe; return (cram_codec *)h; @@ -3293,6 +3355,22 @@ void cram_byte_array_len_decode_free(cram_codec *c) { free(c); } +int cram_byte_array_len_describe(cram_codec *c, kstring_t *ks) { + int r = 0; + r |= ksprintf(ks, "BYTE_ARRAY_LEN(len_codec={") < 0; + cram_byte_array_len_decoder *l = &c->u.byte_array_len; + r |= l->len_codec->describe + ? l->len_codec->describe(l->len_codec, ks) + : (ksprintf(ks, "?")<0); + r |= ksprintf(ks, "},val_codec={") < 0; + r |= l->val_codec->describe + ? l->val_codec->describe(l->val_codec, ks) + : (ksprintf(ks, "?")<0); + r |= ksprintf(ks, "}") < 0; + + return r; +} + cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -3308,6 +3386,7 @@ cram_codec *cram_byte_array_len_decode_init(cram_block_compression_hdr *hdr, c->codec = E_BYTE_ARRAY_LEN; c->decode = cram_byte_array_len_decode; c->free = cram_byte_array_len_decode_free; + c->describe = cram_byte_array_len_describe; c->u.byte_array_len.len_codec = NULL; c->u.byte_array_len.val_codec = NULL; @@ -3532,6 +3611,13 @@ void cram_byte_array_stop_decode_free(cram_codec *c) { free(c); } +int cram_byte_array_stop_describe(cram_codec *c, kstring_t *ks) { + return ksprintf(ks, "BYTE_ARRAY_STOP(stop=%d,id=%d)", + c->u.byte_array_stop.stop, + c->u.byte_array_stop.content_id) + < 0 ? 
-1 : 0; +} + cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr, char *data, int size, enum cram_encoding codec, @@ -3561,6 +3647,7 @@ cram_codec *cram_byte_array_stop_decode_init(cram_block_compression_hdr *hdr, return NULL; } c->free = cram_byte_array_stop_decode_free; + c->describe = cram_byte_array_stop_describe; c->u.byte_array_stop.stop = *cp++; if (CRAM_MAJOR_VERS(version) == 1) { @@ -4032,3 +4119,10 @@ int cram_codec_decoder2encoder(cram_fd *fd, cram_codec *c) { return 0; } + +int cram_codec_describe(cram_codec *c, kstring_t *ks) { + if (c && c->describe) + return c->describe(c, ks); + else + return ksprintf(ks, "?"); +} diff --git a/cram/cram_codecs.h b/cram/cram_codecs.h index 56b065255..d93d9955c 100644 --- a/cram/cram_codecs.h +++ b/cram/cram_codecs.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2015, 2018, 2020 Genome Research Ltd. +Copyright (c) 2012-2015, 2018, 2020, 2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -160,7 +160,7 @@ typedef struct { /* * A generic codec structure. */ -typedef struct cram_codec { +struct cram_codec { enum cram_encoding codec; cram_block *out; varint_vec *vv; @@ -175,6 +175,7 @@ typedef struct cram_codec { int (*size)(cram_slice *slice, struct cram_codec *codec); int (*flush)(struct cram_codec *codec); cram_block *(*get_block)(cram_slice *slice, struct cram_codec *codec); + int (*describe)(struct cram_codec *codec, kstring_t *ks); union { cram_huffman_decoder huffman; @@ -201,7 +202,7 @@ typedef struct cram_codec { cram_const_codec e_xconst; cram_varint_decoder e_varint; } u; -} cram_codec; +}; const char *cram_encoding2str(enum cram_encoding t); diff --git a/cram/cram_external.c b/cram/cram_external.c index e88ff838b..26ef3d7d3 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2015, 2018-2020, 2022 Genome Research Ltd. +Copyright (c) 2015, 2018-2020, 2022-2023 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without @@ -40,6 +40,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h #include +#include + +#if defined(HAVE_EXTERNAL_LIBHTSCODECS) +#include +#else +#include "../htscodecs/htscodecs/rANS_static4x16.h" +#endif #include "../htslib/hfile.h" #include "cram.h" @@ -82,6 +89,14 @@ void cram_container_set_num_blocks(cram_container *c, int32_t num_blocks) { c->num_blocks = num_blocks; } +int32_t cram_container_get_num_records(cram_container *c) { + return c->num_records; +} + +int64_t cram_container_get_num_bases(cram_container *c) { + return c->num_bases; +} + /* Returns the landmarks[] array and the number of elements * in num_landmarks. @@ -180,6 +195,294 @@ int cram_block_compression_hdr_decoder2encoder(cram_fd *fd, return 0; } +typedef struct { + cram_block_compression_hdr *hdr; + cram_map *curr_map; + int idx; + int is_tag; // phase 2 using tag_encoding_map +} cram_codec_iter; + +static void cram_codec_iter_init(cram_block_compression_hdr *hdr, + cram_codec_iter *iter) { + iter->hdr = hdr; + iter->curr_map = NULL; + iter->idx = 0; + iter->is_tag = 0; +} + +// See enum cram_DS_ID in cram/cram_structs +static int cram_ds_to_key(enum cram_DS_ID ds) { + switch(ds) { + case DS_RN: return 256*'R'+'N'; + case DS_QS: return 256*'Q'+'S'; + case DS_IN: return 256*'I'+'N'; + case DS_SC: return 256*'S'+'C'; + case DS_BF: return 256*'B'+'F'; + case DS_CF: return 256*'C'+'F'; + case DS_AP: return 256*'A'+'P'; + case DS_RG: return 256*'R'+'G'; + case DS_MQ: return 256*'M'+'Q'; + case DS_NS: return 256*'N'+'S'; + case DS_MF: return 256*'M'+'F'; + case DS_TS: return 256*'T'+'S'; + case DS_NP: return 256*'N'+'P'; + case DS_NF: return 256*'N'+'F'; + case DS_RL: return 256*'R'+'L'; + case DS_FN: return 256*'F'+'N'; + case DS_FC: return 256*'F'+'C'; + case DS_FP: return 256*'F'+'P'; + case DS_DL: return 
256*'D'+'L'; + case DS_BA: return 256*'B'+'A'; + case DS_BS: return 256*'B'+'S'; + case DS_TL: return 256*'T'+'L'; + case DS_RI: return 256*'R'+'I'; + case DS_RS: return 256*'R'+'S'; + case DS_PD: return 256*'P'+'D'; + case DS_HC: return 256*'H'+'C'; + case DS_BB: return 256*'B'+'B'; + case DS_QQ: return 256*'Q'+'Q'; + case DS_TN: return 256*'T'+'N'; + case DS_TC: return 256*'T'+'C'; + case DS_TM: return 256*'T'+'M'; + case DS_TV: return 256*'T'+'V'; + default: break; + } + + return -1; // unknown +} + +static cram_codec *cram_codec_iter_next(cram_codec_iter *iter, + int *key) { + cram_codec *cc = NULL; + cram_block_compression_hdr *hdr = iter->hdr; + + if (!iter->is_tag) { + // 1: Iterating through main data-series + do { + cc = hdr->codecs[iter->idx++]; + } while(!cc && iter->idx < DS_END); + if (cc) { + *key = cram_ds_to_key(iter->idx-1); + return cc; + } + + // Reset index for phase 2 + iter->idx = 0; + iter->is_tag = 1; + } + + do { + if (!iter->curr_map) + iter->curr_map = hdr->tag_encoding_map[iter->idx++]; + + cc = iter->curr_map ? iter->curr_map->codec : NULL; + if (cc) { + *key = iter->curr_map->key; + iter->curr_map = iter->curr_map->next; + return cc; + } + } while (iter->idx <= CRAM_MAP_HASH); + + // End of codecs + return NULL; +} + +/* + * A list of data-series, used to create a linked list threaded through + * a single array. + */ +typedef struct ds_list { + int data_series; + int next; +} ds_list; + +KHASH_MAP_INIT_INT(cid, int64_t) + +// Opaque struct for the CRAM block content-id -> data-series map. +struct cram_cid2ds_t { + ds_list *ds; // array of data-series with linked lists threading through it + int ds_size; + int ds_idx; + khash_t(cid) *hash; // key=content_id, value=index to ds array + int *ds_a; // serialised array of data-series returned by queries. 
+}; + +void cram_cid2ds_free(cram_cid2ds_t *cid2ds) { + if (cid2ds) { + if (cid2ds->hash) + kh_destroy(cid, cid2ds->hash); + free(cid2ds->ds); + free(cid2ds->ds_a); + free(cid2ds); + } +} + +/* + * Map cram block numbers to data-series. It's normally a 1:1 mapping, + * but in rare cases it can be 1:many (or even many:many). + * The key is the block number and the value is an index into the data-series + * array, which we iterate over until reaching a negative value. + * + * Provide cid2ds as NULL to allocate a new map or pass in an existing one + * to append to this map. The new (or existing) map is returned. + * + * Returns the cid2ds (newly allocated or as provided) on success, + * NULL on failure. + */ +cram_cid2ds_t *cram_update_cid2ds_map(cram_block_compression_hdr *hdr, + cram_cid2ds_t *cid2ds) { + cram_cid2ds_t *c2d = cid2ds; + if (!c2d) { + c2d = calloc(1, sizeof(*c2d)); + if (!c2d) + return NULL; + + c2d->hash = kh_init(cid); + if (!c2d->hash) + goto err; + } + + // Iterate through codecs. Initially primary two-left ones in + // rec_encoding_map, and then the three letter in tag_encoding_map. + cram_codec_iter citer; + cram_codec_iter_init(hdr, &citer); + cram_codec *codec; + int key; + + while ((codec = cram_codec_iter_next(&citer, &key))) { + // Having got a codec, we can then use cram_codec_to_id to get + // the block IDs utilised by that codec. This is then our + // map for allocating data blocks to data series, but for shared + // blocks we can't separate out how much is used by each DS. 
+ int bnum[2]; + cram_codec_get_content_ids(codec, bnum); + + khiter_t k; + int ret, i; + for (i = 0; i < 2; i++) { + if (bnum[i] > -2) { + k = kh_put(cid, c2d->hash, bnum[i], &ret); + if (ret < 0) + goto err; + + if (c2d->ds_idx >= c2d->ds_size) { + c2d->ds_size += 100; + c2d->ds_size *= 2; + ds_list *ds_new = realloc(c2d->ds, + c2d->ds_size * sizeof(*ds_new)); + if (!ds_new) + goto err; + c2d->ds = ds_new; + } + + if (ret == 0) { + // Shared content_id, so add to list of DS + + // Maybe data-series should be part of the hash key? + // + // So top-32 bit is content-id, bot-32 bit is key. + // Sort hash by key and then can group all the data-series + // known together. ?? + // + // Brute force for now, scan to see if recorded. + // Typically this is minimal effort as we almost always + // have 1 data-series per block content-id, so the list to + // search is of size 1. + int dsi = kh_value(c2d->hash, k); + while (dsi >= 0) { + if (c2d->ds[dsi].data_series == key) + break; + dsi = c2d->ds[dsi].next; + } + + if (dsi == -1) { + // Block content_id seen before, but not with this DS + c2d->ds[c2d->ds_idx].data_series = key; + c2d->ds[c2d->ds_idx].next = kh_value(c2d->hash, k); + kh_value(c2d->hash, k) = c2d->ds_idx; + c2d->ds_idx++; + } + } else { + // First time this content id has been used + c2d->ds[c2d->ds_idx].data_series = key; + c2d->ds[c2d->ds_idx].next = -1; + kh_value(c2d->hash, k) = c2d->ds_idx; + c2d->ds_idx++; + } + } + } + } + + return c2d; + + err: + if (c2d != cid2ds) + cram_cid2ds_free(c2d); + return NULL; +} + +/* + * Return a list of data series observed as belonging to a block with + * the specified content_id. *n is the number of data series + * returned, or 0 if block is unused. + * Block content_id of -1 is used to indicate the CORE block. + * + * The pointer returned is owned by the cram_cid2ds state and should + * not be freed by the caller. 
+ */ +int *cram_cid2ds_query(cram_cid2ds_t *c2d, int content_id, int *n) { + *n = 0; + if (!c2d || !c2d->hash) + return NULL; + + khiter_t k = kh_get(cid, c2d->hash, content_id); + if (k == kh_end(c2d->hash)) + return NULL; + + if (!c2d->ds_a) { + c2d->ds_a = malloc(c2d->ds_idx * sizeof(int)); + if (!c2d->ds_a) + return NULL; + } + + int dsi = kh_value(c2d->hash, k); // initial ds array index from hash + int idx = 0; + while (dsi >= 0) { + c2d->ds_a[idx++] = c2d->ds[dsi].data_series; + dsi = c2d->ds[dsi].next; // iterate over list within ds array + } + + *n = idx; + return c2d->ds_a; +} + +/* + * Produces a description of the record and tag encodings held within + * a compression header and appends to 'ks'. + * + * Returns 0 on success, + * <0 on failure. + */ +int cram_describe_encodings(cram_block_compression_hdr *hdr, kstring_t *ks) { + cram_codec_iter citer; + cram_codec_iter_init(hdr, &citer); + cram_codec *codec; + int key, r = 0; + + while ((codec = cram_codec_iter_next(&citer, &key))) { + char key_s[4] = {0}; + int key_i = 0; + if (key>>16) key_s[key_i++] = key>>16; + key_s[key_i++] = (key>>8)&0xff; + key_s[key_i++] = key&0xff; + r |= ksprintf(ks, "\t%s\t", key_s) < 0; + r |= cram_codec_describe(codec, ks) < 0; + r |= kputc('\n', ks) < 0; + } + + return r ? -1 : 0; +} + /* *----------------------------------------------------------------------------- * cram_slice @@ -206,12 +509,17 @@ void cram_slice_hdr_get_coords(cram_block_slice_hdr *h, *----------------------------------------------------------------------------- * cram_block */ -int32_t cram_block_get_content_id(cram_block *b) { return b->content_id; } +int32_t cram_block_get_content_id(cram_block *b) { + return b->content_type == CORE ? 
-1 : b->content_id; +} int32_t cram_block_get_comp_size(cram_block *b) { return b->comp_size; } int32_t cram_block_get_uncomp_size(cram_block *b) { return b->uncomp_size; } int32_t cram_block_get_crc32(cram_block *b) { return b->crc32; } void * cram_block_get_data(cram_block *b) { return BLOCK_DATA(b); } int32_t cram_block_get_size(cram_block *b) { return BLOCK_SIZE(b); } +enum cram_block_method cram_block_get_method(cram_block *b) { + return (enum cram_block_method)b->orig_method; +} enum cram_content_type cram_block_get_content_type(cram_block *b) { return b->content_type; } @@ -236,6 +544,122 @@ void cram_block_update_size(cram_block *b) { BLOCK_UPLEN(b); } size_t cram_block_get_offset(cram_block *b) { return BLOCK_SIZE(b); } void cram_block_set_offset(cram_block *b, size_t offset) { BLOCK_SIZE(b) = offset; } +/* + * Given a compressed block of data in a specified compression method, + * fill out the 'cm' field with meta-data gleaned from the compressed + * block. + * + * If comp is CRAM_COMP_UNKNOWN, we attempt to auto-detect the compression + * format, but this doesn't work for all methods. + * + * Retuns the detected or specified comp method, and fills out *cm + * if non-NULL. + */ +cram_method_details *cram_expand_method(uint8_t *data, int32_t size, + enum cram_block_method comp) { + cram_method_details *cm = calloc(1, sizeof(*cm)); + if (!cm) + return NULL; + + const char *xz_header = "\xFD""7zXZ"; // including nul + + if (comp == CRAM_COMP_UNKNOWN) { + // Auto-detect + if (size > 1 && data[0] == 0x1f && data[1] == 0x8b) + comp = CRAM_COMP_GZIP; + else if (size > 3 && data[1] == 'B' && data[2] == 'Z' + && data[3] == 'h') + comp = CRAM_COMP_BZIP2; + else if (size > 6 && memcmp(xz_header, data, 6) == 0) + comp = CRAM_COMP_LZMA; + else + comp = CRAM_COMP_UNKNOWN; + } + cm->method = comp; + + // Interrogate the compressed data stream to fill out additional fields. 
+ switch (comp) { + case CRAM_COMP_GZIP: + if (size > 8) { + if (data[8] == 4) + cm->level = 1; + else if (data[8] == 2) + cm->level = 9; + else + cm->level = 5; + } + break; + + case CRAM_COMP_BZIP2: + if (size > 3 && data[3] >= '1' && data[3] <= '9') + cm->level = data[3]-'0'; + break; + + case CRAM_COMP_RANS4x8: + cm->Nway = 4; + if (size > 0 && data[0] == 1) + cm->order = 1; + else + cm->order = 0; + break; + + case CRAM_COMP_RANSNx16: + if (size > 0) { + cm->order = data[0] & 1; + cm->Nway = data[0] & RANS_ORDER_X32 ? 32 : 4; + cm->rle = data[0] & RANS_ORDER_RLE ? 1 : 0; + cm->pack = data[0] & RANS_ORDER_PACK ? 1 : 0; + cm->cat = data[0] & RANS_ORDER_CAT ? 1 : 0; + cm->stripe = data[0] & RANS_ORDER_STRIPE ? 1 : 0; + cm->nosz = data[0] & RANS_ORDER_NOSZ ? 1 : 0; + } + break; + + case CRAM_COMP_ARITH: + if (size > 0) { + // Not in a public header, but the same transforms as rANSNx16 + cm->order = data[0] & 3; + cm->rle = data[0] & RANS_ORDER_RLE ? 1 : 0; + cm->pack = data[0] & RANS_ORDER_PACK ? 1 : 0; + cm->cat = data[0] & RANS_ORDER_CAT ? 1 : 0; + cm->stripe = data[0] & RANS_ORDER_STRIPE ? 1 : 0; + cm->nosz = data[0] & RANS_ORDER_NOSZ ? 1 : 0; + cm->ext = data[0] & 4 /*external*/ ? 1 : 0; + } + break; + + case CRAM_COMP_TOK3: + if (size > 8) { + if (data[8] == 1) + cm->level = 11; + else if (data[8] == 0) + cm->level = 1; + } + break; + + default: + break; + } + + return cm; +} + +/* + *----------------------------------------------------------------------------- + * cram_codecs + */ + +// -2 is unused. 
+// -1 is CORE +// >= 0 is the block with that Content ID +void cram_codec_get_content_ids(cram_codec *c, int ids[2]) { + ids[0] = cram_codec_to_id(c, &ids[1]); +} + +/* + *----------------------------------------------------------------------------- + * Utility functions + */ /* * Copies the blocks representing the next num_slice slices from a diff --git a/htslib/cram.h b/htslib/cram.h index 8dc6fe1b3..e0b51839c 100644 --- a/htslib/cram.h +++ b/htslib/cram.h @@ -1,7 +1,7 @@ /// @file htslib/cram.h /// CRAM format-specific API functions. /* - Copyright (C) 2015, 2016, 2018-2020, 2022 Genome Research Ltd. + Copyright (C) 2015, 2016, 2018-2020, 2022-2023 Genome Research Ltd. Author: James Bonfield @@ -76,8 +76,59 @@ enum cram_block_method { RANS1 = 10, GZIP_RLE = 11, }; +#else + +// Values as defined in the CRAM specifications. +// See cram/cram_structs.h cram_block_method_int for an expanded version of +// this with local specialisations assigned to codes. +enum cram_block_method { + CRAM_COMP_UNKNOWN = -1, + + // CRAM 2.x and 3.0 + CRAM_COMP_RAW = 0, + CRAM_COMP_GZIP = 1, + CRAM_COMP_BZIP2 = 2, + + // CRAM 3.0 + CRAM_COMP_LZMA = 3, + CRAM_COMP_RANS4x8 = 4, // 4-way interleaving, 8-bit renormalisation + + // CRAM 3.1 + CRAM_COMP_RANSNx16 = 5, // both 4x16 and 32x16 variants, plus transforms + CRAM_COMP_ARITH = 6, // aka Range coding + CRAM_COMP_FQZ = 7, // FQZComp + CRAM_COMP_TOK3 = 8, // Name tokeniser +}; #endif +/* NOTE this structure may be expanded in future releases by appending + * additional fields. + * + * Do not assume the size is fixed and avoid using arrays of this struct. + */ +typedef struct { + enum cram_block_method method; + + // Generic compression level if known (0 if not). + // 1 or 9 for gzip min/max flag (else 5). 1-9 for bzip2 + // 1 or 11 for for tok3 (rans/arith encoder). 
+ int level; + + // For rans* and arith codecs + int order; + + // ransNx16/arith specific + int rle; + int pack; + int stripe; + int cat; + int nosz; + int Nway; + + // Arithmetic coder only + int ext; // external: use gz, xz or bzip2 +} cram_method_details; + enum cram_content_type { CT_ERROR = -1, FILE_HEADER = 0, @@ -97,6 +148,7 @@ typedef struct cram_slice cram_slice; typedef struct cram_metrics cram_metrics; typedef struct cram_block_slice_hdr cram_block_slice_hdr; typedef struct cram_block_compression_hdr cram_block_compression_hdr; +typedef struct cram_codec cram_codec; typedef struct refs_t refs_t; struct hFILE; @@ -147,6 +199,10 @@ int32_t *cram_container_get_landmarks(cram_container *c, int32_t *num_landmarks) HTSLIB_EXPORT void cram_container_set_landmarks(cram_container *c, int32_t num_landmarks, int32_t *landmarks); +HTSLIB_EXPORT +int32_t cram_container_get_num_records(cram_container *c); +HTSLIB_EXPORT +int64_t cram_container_get_num_bases(cram_container *c); /* Returns true if the container is empty (EOF marker) */ HTSLIB_EXPORT @@ -167,9 +223,14 @@ HTSLIB_EXPORT int32_t cram_block_get_crc32(cram_block *b); HTSLIB_EXPORT void * cram_block_get_data(cram_block *b); - HTSLIB_EXPORT enum cram_content_type cram_block_get_content_type(cram_block *b); +HTSLIB_EXPORT +enum cram_block_method cram_block_get_method(cram_block *b); + +HTSLIB_EXPORT +cram_method_details *cram_expand_method(uint8_t *data, int32_t size, + enum cram_block_method comp); HTSLIB_EXPORT void cram_block_set_content_id(cram_block *b, int32_t id); @@ -200,6 +261,27 @@ void cram_block_set_offset(cram_block *b, size_t offset); HTSLIB_EXPORT uint32_t cram_block_size(cram_block *b); +/* + * Returns the Block Content ID values referred to by a cram_codec in + * ids[2]. + * + * -2 is unused. 
+ * -1 is CORE + * >= 0 is the block with that Content ID + */ +HTSLIB_EXPORT +void cram_codec_get_content_ids(cram_codec *c, int ids[2]); + +/* + * Produces a human readable description of the codec parameters. + * This is appended to an existing kstring 'ks'. + * + * Returns 0 on succes, + * <0 on failure + */ +HTSLIB_EXPORT +int cram_codec_describe(cram_codec *c, kstring_t *ks); + /* * Renumbers RG numbers in a cram compression header. * @@ -247,6 +329,66 @@ int cram_transcode_rg(cram_fd *in, cram_fd *out, HTSLIB_EXPORT int cram_copy_slice(cram_fd *in, cram_fd *out, int32_t num_slice); +/* + * Decodes a CRAM block compression header. + * Returns header ptr on success + * NULL on failure + */ +HTSLIB_EXPORT +cram_block_compression_hdr *cram_decode_compression_header(cram_fd *fd, + cram_block *b); +/* + * Frees a cram_block_compression_hdr structure. + */ +HTSLIB_EXPORT +void cram_free_compression_header(cram_block_compression_hdr *hdr); + +typedef struct cram_cid2ds_t cram_cid2ds_t; + +/* + * Map cram block numbers to data-series. It's normally a 1:1 mapping, + * but in rare cases it can be 1:many (or even many:many). + * The key is the block number and the value is an index into the data-series + * array, which we iterate over until reaching a negative value. + * + * Provide cid2ds as NULL to allocate a new map or pass in an existing one + * to append to this map. The new (or existing) map is returned. + * + * Returns the cid2ds (newly allocated or as provided) on success, + * NULL on failure. + */ +HTSLIB_EXPORT +cram_cid2ds_t *cram_update_cid2ds_map(cram_block_compression_hdr *hdr, + cram_cid2ds_t *cid2ds); + +/* + * Return a list of data series observed as belonging to a block with + * the specified content_id. *n is the number of data series + * returned, or 0 if block is unused. + * Block content_id of -1 is used to indicate the CORE block. + * + * The pointer returned is owned by the cram_cid2ds state and should + * not be freed by the caller. 
+ */ +HTSLIB_EXPORT +int *cram_cid2ds_query(cram_cid2ds_t *c2d, int content_id, int *n); + +/* + * Frees a cram_cid2ds_t allocated by cram_update_cid2ds_map + */ +HTSLIB_EXPORT +void cram_cid2ds_free(cram_cid2ds_t *cid2ds); + +/* + * Produces a description of the record and tag encodings held within + * a compression header and appends to 'ks'. + * + * Returns 0 on success, + * <0 on failure. + */ +HTSLIB_EXPORT +int cram_describe_encodings(cram_block_compression_hdr *hdr, kstring_t *ks); + /* *----------------------------------------------------------------------------- * cram slice interrogation From a59bc5256ee4f539ae47edd575dc1340356d922a Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Mon, 16 Jan 2023 11:49:06 +0000 Subject: [PATCH 370/488] Uodate copyright for winter release. --- bgzip.1 | 2 +- bgzip.c | 2 +- cram/cram_encode.c | 2 +- cram/cram_io.c | 2 +- faidx.c | 2 +- htslib/faidx.h | 2 +- htslib/vcf.h | 2 +- tabix.1 | 2 +- tbx.c | 2 +- test/sam_filter/filter.tst | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/bgzip.1 b/bgzip.1 index 228542647..235c72a48 100644 --- a/bgzip.1 +++ b/bgzip.1 @@ -4,7 +4,7 @@ bgzip \- Block compression/decompression utility .\" .\" Copyright (C) 2009-2011 Broad Institute. -.\" Copyright (C) 2018, 2021 Genome Research Limited. +.\" Copyright (C) 2018, 2021-2022 Genome Research Limited. .\" .\" Author: Heng Li .\" diff --git a/bgzip.c b/bgzip.c index 6961bbfa4..589f79f66 100644 --- a/bgzip.c +++ b/bgzip.c @@ -1,7 +1,7 @@ /* bgzip.c -- Block compression/decompression utility. Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology - Copyright (C) 2010, 2013-2019, 2021 Genome Research Ltd. + Copyright (C) 2010, 2013-2019, 2021-2022 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 6371e5e39..3ce4f846f 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020, 2022 Genome Research Ltd. +Copyright (c) 2012-2020, 2022-2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/cram/cram_io.c b/cram/cram_io.c index 420d4bc91..e7b239c3f 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2022 Genome Research Ltd. +Copyright (c) 2012-2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/faidx.c b/faidx.c index c1f9523ea..2eb0f3edc 100644 --- a/faidx.c +++ b/faidx.c @@ -1,6 +1,6 @@ /* faidx.c -- FASTA and FASTQ random access. - Copyright (C) 2008, 2009, 2013-2020 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013-2020, 2022 Genome Research Ltd. Portions copyright (C) 2011 Broad Institute. Author: Heng Li diff --git a/htslib/faidx.h b/htslib/faidx.h index b237c1cd6..c1b3090a5 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -1,7 +1,7 @@ /// @file htslib/faidx.h /// FASTA random access. /* - Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2020 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2020, 2022 Genome Research Ltd. Author: Heng Li diff --git a/htslib/vcf.h b/htslib/vcf.h index 03ec57f7f..0d9f812ce 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -2,7 +2,7 @@ /// High-level VCF/BCF variant calling file operations. /* Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2020 Genome Research Ltd. + Copyright (C) 2012-2020, 2022 Genome Research Ltd. 
Author: Heng Li diff --git a/tabix.1 b/tabix.1 index 559ec6e69..cfbe30e28 100644 --- a/tabix.1 +++ b/tabix.1 @@ -4,7 +4,7 @@ tabix \- Generic indexer for TAB-delimited genome position files .\" .\" Copyright (C) 2009-2011 Broad Institute. -.\" Copyright (C) 2014, 2016, 2018, 2020 Genome Research Ltd. +.\" Copyright (C) 2014, 2016, 2018, 2020, 2022 Genome Research Ltd. .\" .\" Author: Heng Li .\" diff --git a/tbx.c b/tbx.c index 084a82c8d..d897a21f1 100644 --- a/tbx.c +++ b/tbx.c @@ -1,6 +1,6 @@ /* tbx.c -- tabix API functions. - Copyright (C) 2009, 2010, 2012-2015, 2017-2020 Genome Research Ltd. + Copyright (C) 2009, 2010, 2012-2015, 2017-2020, 2022 Genome Research Ltd. Copyright (C) 2010-2012 Broad Institute. Author: Heng Li diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst index b81a19857..13d2c340e 100644 --- a/test/sam_filter/filter.tst +++ b/test/sam_filter/filter.tst @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Genome Research Ltd. +# Copyright (C) 2020, 2022 Genome Research Ltd. # # Author: James Bonfield # From 1a222ef4483b3b714bba0dad74ff33a53cd323c6 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 24 Jan 2023 13:30:31 +0000 Subject: [PATCH 371/488] Merges neighbouring I and D ops into one op within pileup. (PR #1552) This means 4M1D1D1D3M is reported as 4M3D3M instead, and importantly "p->indel=-3" for the first 1D and the 2nd 1D has "p->indel=0" (with p->is_del=1, the same as it would for the 2nd base in a 3D cigar op). Previously samtools mpileup would produce incorrect looking output for the 1D1D scenario. Fixing this in sam.c means not only is samtools mpileup now looking better, but any tool using the mpileup API will be getting consistent results. Note that samtools mpileup already resolved the ...1I1I1I... case, but it did this within the samools bam_plcmd.c code itself. Hence while the pileup API works, it left p->indel=1 instead of p->indel=3 for this situation. So we also resolve that in a similar fashion. 
Note 2P1I1I is reported as p->indel=2 (a 2bp indel) even though bam_plp_insertion would return e.g. +4**AC, as we're reporting the number of bases inserted in this sequence rather than the padded alignment size. Fixes samtools/samtools#139, or at least the remaining part of the puzzle. Most had previously been fixed already back in 2014. --- sam.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/sam.c b/sam.c index 580e32d40..e2e539b2d 100644 --- a/sam.c +++ b/sam.c @@ -1,6 +1,6 @@ /* sam.c -- SAM and BAM file I/O and manipulation. - Copyright (C) 2008-2010, 2012-2022 Genome Research Ltd. + Copyright (C) 2008-2010, 2012-2023 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. Author: Heng Li @@ -5264,9 +5264,24 @@ static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) if (s->x + l - 1 == pos && s->k + 1 < c->n_cigar) { // peek the next operation int op2 = _cop(cigar[s->k+1]); int l2 = _cln(cigar[s->k+1]); - if (op2 == BAM_CDEL) p->indel = -(int)l2; - else if (op2 == BAM_CINS) p->indel = l2; - else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { // no working for adjacent padding + if (op2 == BAM_CDEL && op != BAM_CDEL) { + // At start of a new deletion, merge e.g. 1D2D to 3D. + // Within a deletion (the 2D in 1D2D) we keep p->indel=0 + // and rely on is_del=1 as we would for 3D. 
+ p->indel = -(int)l2; + for (k = s->k+2; k < c->n_cigar; ++k) { + op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); + if (op2 == BAM_CDEL) p->indel -= l2; + else break; + } + } else if (op2 == BAM_CINS) { + p->indel = l2; + for (k = s->k+2; k < c->n_cigar; ++k) { + op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); + if (op2 == BAM_CINS) p->indel += l2; + else if (op2 != BAM_CPAD) break; + } + } else if (op2 == BAM_CPAD && s->k + 2 < c->n_cigar) { int l3 = 0; for (k = s->k + 2; k < c->n_cigar; ++k) { op2 = _cop(cigar[k]); l2 = _cln(cigar[k]); From 818b449fe8e67c8f0375e5992581fb8f58970c9a Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 24 Jan 2023 15:37:34 +0000 Subject: [PATCH 372/488] Improve API docs for bgzf_mt Note that this now works on reading as well as writing. Removed the comment about only when compiled with -DBGZF_MT as this was misleading implying the user has to adjust CFLAGS. It's on by default and there's nothing the user can do to turn it off without manually editing bgzf.c, in which case they're on their own. Also document that n_sub_blks now does nothing. Fixes #1553 --- htslib/bgzf.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/htslib/bgzf.h b/htslib/bgzf.h index 9e4ab3794..cb789ad53 100644 --- a/htslib/bgzf.h +++ b/htslib/bgzf.h @@ -316,23 +316,22 @@ typedef struct BGZF BGZF; int bgzf_read_block(BGZF *fp) HTS_RESULT_USED; /** - * Enable multi-threading (when compiled with -DBGZF_MT) via a shared - * thread pool. This means both encoder and decoder can balance - * usage across a single pool of worker jobs. + * Enable multi-threading via a shared thread pool. This means + * both encoder and decoder can balance usage across a single pool + * of worker jobs. 
* - * @param fp BGZF file handler; must be opened for writing + * @param fp BGZF file handler * @param pool The thread pool (see hts_create_threads) */ HTSLIB_EXPORT int bgzf_thread_pool(BGZF *fp, struct hts_tpool *pool, int qsize); /** - * Enable multi-threading (only effective when the library was compiled - * with -DBGZF_MT) + * Enable multi-threading * - * @param fp BGZF file handler; must be opened for writing - * @param n_threads #threads used for writing - * @param n_sub_blks #blocks processed by each thread; a value 64-256 is recommended + * @param fp BGZF file handler + * @param n_threads #threads used for reading / writing + * @param n_sub_blks Unused (was #blocks processed by each thread) */ HTSLIB_EXPORT int bgzf_mt(BGZF *fp, int n_threads, int n_sub_blks); From 463830bf7de8c4ab731c4d67c49ddc446f498f50 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 4 Jan 2023 16:21:32 +0000 Subject: [PATCH 373/488] Catch errors from bgzf_getline() in hts_readlist, hts_readlines Fixes a couple of places where file read errors could be silently ignored. --- hts.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/hts.c b/hts.c index 66b8643e8..cead9d537 100644 --- a/hts.c +++ b/hts.c @@ -1,6 +1,6 @@ /* hts.c -- format-neutral I/O, indexing, and iterator API functions. - Copyright (C) 2008, 2009, 2012-2022 Genome Research Ltd. + Copyright (C) 2008, 2009, 2012-2023 Genome Research Ltd. Copyright (C) 2012, 2013 Broad Institute. 
Author: Heng Li @@ -1934,8 +1934,9 @@ char **hts_readlist(const char *string, int is_file, int *_n) if ( !fp ) return NULL; kstring_t str; + int ret; str.s = 0; str.l = str.m = 0; - while (bgzf_getline(fp, '\n', &str) >= 0) + while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { if (str.l == 0) continue; if (hts_resize(char*, n + 1, &m, &s, 0) < 0) @@ -1945,6 +1946,8 @@ char **hts_readlist(const char *string, int is_file, int *_n) goto err; n++; } + if (ret < -1) // Read error + goto err; bgzf_close(fp); free(str.s); } @@ -1991,8 +1994,9 @@ char **hts_readlines(const char *fn, int *_n) BGZF *fp = bgzf_open(fn, "r"); if ( fp ) { // read from file kstring_t str; + int ret; str.s = 0; str.l = str.m = 0; - while (bgzf_getline(fp, '\n', &str) >= 0) { + while ((ret = bgzf_getline(fp, '\n', &str)) >= 0) { if (str.l == 0) continue; if (hts_resize(char *, n + 1, &m, &s, 0) < 0) goto err; @@ -2001,6 +2005,8 @@ char **hts_readlines(const char *fn, int *_n) goto err; n++; } + if (ret < -1) // Read error + goto err; bgzf_close(fp); free(str.s); } else if (*fn == ':') { // read from string From d40bcf98a4756270a9c2b13cdb366ee31a648d8e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 3 Feb 2023 15:23:37 +0000 Subject: [PATCH 374/488] Fix a bug in the codec learning algorithm for TOKA The name tokeniser has a rANS vs Arithmetic coder choice as a parameter (in the "strat" variable). We lacked this distinction when learning which method works best, so in the choice of toka (tok3+arith) vs bzip2 vs gzip etc we selected tok3 and switched back to strat 0, disabling the arithmetic coder. This only affects archive mode, or where a user explicitly used eg "samtools view -O cram,version=3.1,use_arith".
--- cram/cram_io.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cram/cram_io.c b/cram/cram_io.c index e7b239c3f..eaf14d416 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -2183,6 +2183,8 @@ int cram_compress_block2(cram_fd *fd, cram_slice *s, case FQZ_b: strat = CRAM_MAJOR_VERS(fd->version)+256; break; case FQZ_c: strat = CRAM_MAJOR_VERS(fd->version)+2*256; break; case FQZ_d: strat = CRAM_MAJOR_VERS(fd->version)+3*256; break; + case TOK3: strat = 0; break; + case TOKA: strat = 1; break; default: strat = 0; } metrics->strat = strat; From 3fe2a59034d1d23fd740a66deb8ffab2bdd1f0b2 Mon Sep 17 00:00:00 2001 From: pd3 Date: Fri, 13 Jan 2023 15:58:49 +0100 Subject: [PATCH 375/488] Remove a bottleneck in VCF header processing Rather than looping over all bcf_hrec_t records when adding or querying a new record, keep a hash also for generic and structured records. This is done by hijacking the bcf_hdr_t.dict void pointers to store additional information in the new bcf_hdr_aux_t structure, internally accessible via get_hdr_aux(). Note that - some of the existing code is not using this new speed up yet, see bcf_hdr_combine() - modification of existing bcf_hrec_t records is supported only internally and only to the extent currently used by htslib, see bcf_hdr_update_hrec() Aimed to resolve #1543 --- vcf.c | 164 +++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 145 insertions(+), 19 deletions(-) diff --git a/vcf.c b/vcf.c index 851d32e88..59d433c19 100644 --- a/vcf.c +++ b/vcf.c @@ -1,7 +1,7 @@ /* vcf.c -- VCF/BCF API functions. Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2022 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. Portions copyright (C) 2014 Intel Corporation. Author: Heng Li @@ -51,6 +51,10 @@ DEALINGS IN THE SOFTWARE.
*/ KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) typedef khash_t(vdict) vdict_t; +KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*) +typedef khash_t(hdict) hdict_t; + + #include "htslib/kseq.h" HTSLIB_EXPORT uint32_t bcf_float_missing = 0x7F800001; @@ -79,6 +83,22 @@ static bcf_idinfo_t bcf_idinfo_def = { .info = { 15, 15, 15 }, .hrec = { NULL, N #define BCF_IS_64BIT (1<<30) +// Opaque structure with auxiliary data which allows to extend bcf_hdr_t without breaking ABI. +// Note that this preserving API and ABI requires that the first element is vdict_t struct +// rather than a pointer, as user programs may (and in some cases do) access the dictionary +// directly as (vdict_t*)hdr->dict. +typedef struct +{ + vdict_t dict; // bcf_hdr_t.dict[0] vdict_t dictionary which keeps bcf_idinfo_t for BCF_HL_FLT,BCF_HL_INFO,BCF_HL_FMT + hdict_t *gen; // hdict_t dictionary which keeps bcf_hrec_t* pointers for generic and structured fields +} +bcf_hdr_aux_t; + +static inline bcf_hdr_aux_t *get_hdr_aux(const bcf_hdr_t *hdr) +{ + return (bcf_hdr_aux_t *)hdr->dict[0]; +} + static char *find_chrom_header_line(char *s) { char *nl; @@ -866,8 +886,45 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) return 1; } +int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp) +{ + // currently only for bcf_hdr_set_version + assert( hrec->type==BCF_HL_GEN ); + int ret; + khint_t k; + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++) + { + if ( !kh_exist(aux->gen,k) ) continue; + if ( hrec!=(bcf_hrec_t*)kh_val(aux->gen,k) ) continue; + break; + } + assert( k<kh_end(aux->gen) ); // something went wrong, should never happen + free((char*)kh_key(aux->gen,k)); + kh_del(hdict,aux->gen,k); + kstring_t str = {0,0,0}; + if ( ksprintf(&str, "##%s=%s", tmp->key,tmp->value) < 0 ) + { + free(str.s); + return -1; + } + k = kh_put(hdict, aux->gen, str.s, &ret); + if ( ret<0 ) + { + free(str.s); + return -1; + } + free(hrec->value); + hrec->value = strdup(tmp->value);
+ if ( !hrec->value ) return -1; + return 0; +} + int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) { + kstring_t str = {0,0,0}; + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + int res; if ( !hrec ) return 0; @@ -885,19 +942,49 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) } // Is one of the generic fields and already present? - int i; - for (i=0; i<hdr->nhrec; i++) + if ( ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0 ) + { + free(str.s); + return -1; + } + khint_t k = kh_get(hdict, aux->gen, str.s); + if ( k != kh_end(aux->gen) ) { - if ( hdr->hrec[i]->type!=BCF_HL_GEN ) continue; - if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hrec->key,"fileformat") ) break; - if ( !strcmp(hdr->hrec[i]->key,hrec->key) && !strcmp(hdr->hrec[i]->value,hrec->value) ) break; + // duplicate record + bcf_hrec_destroy(hrec); + free(str.s); + return 0; } - if ( i<hdr->nhrec ) + } + + int i; + if ( hrec->type==BCF_HL_STR && (i=bcf_hrec_find_key(hrec,"ID"))>=0 ) + { + if ( ksprintf(&str, "##%s=<ID=%s>", hrec->key,hrec->vals[i]) < 0 ) { + free(str.s); + return -1; + } + khint_t k = kh_get(hdict, aux->gen, str.s); + if ( k != kh_end(aux->gen) ) + { + // duplicate record bcf_hrec_destroy(hrec); + free(str.s); return 0; } } + if ( str.s ) + { + khint_t k = kh_put(hdict, aux->gen, str.s, &res); + if ( res<0 ) + { + bcf_hrec_destroy(hrec); + free(str.s); + return -1; + } + kh_val(aux->gen,k) = hrec; + } // New record, needs to be added int n = hdr->nhrec + 1; @@ -911,29 +998,47 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) return hrec->type==BCF_HL_GEN ? 0 : 1; } -/* - * Note that while querying of FLT,INFO,FMT,CTG lines is fast (the keys are hashed), - * the STR,GEN lines are searched for linearly in a linked list of all header lines. - * This may become a problem for VCFs with huge headers, we might need to build a - * dictionary for these lines as well.
- */ bcf_hrec_t *bcf_hdr_get_hrec(const bcf_hdr_t *hdr, int type, const char *key, const char *value, const char *str_class) { int i; if ( type==BCF_HL_GEN ) { + // e.g. ##fileformat=VCFv4.2 + // ##source=GenomicsDBImport + // ##bcftools_viewVersion=1.16-80-gdfdb0923+htslib-1.16-34-g215d364 + if ( value ) + { + kstring_t str = {0,0,0}; + ksprintf(&str, "##%s=%s", key,value); + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + khint_t k = kh_get(hdict, aux->gen, str.s); + free(str.s); + if ( k == kh_end(aux->gen) ) return NULL; + return kh_val(aux->gen, k); + } for (i=0; i<hdr->nhrec; i++) { if ( hdr->hrec[i]->type!=type ) continue; if ( strcmp(hdr->hrec[i]->key,key) ) continue; - if ( !value || !strcmp(hdr->hrec[i]->value,value) ) return hdr->hrec[i]; + return hdr->hrec[i]; } return NULL; } else if ( type==BCF_HL_STR ) { - if (!str_class) - return NULL; + // e.g. ##GATKCommandLine=<ID=...> + // ##ALT=<ID=...> + if (!str_class) return NULL; + if ( !strcmp("ID",key) ) + { + kstring_t str = {0,0,0}; + ksprintf(&str, "##%s=<%s=%s>",str_class,key,value); + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + khint_t k = kh_get(hdict, aux->gen, str.s); + free(str.s); + if ( k == kh_end(aux->gen) ) return NULL; + return kh_val(aux->gen, k); + } for (i=0; i<hdr->nhrec; i++) { if ( hdr->hrec[i]->type!=type ) continue; @@ -1070,6 +1175,7 @@ void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) bcf_hrec_t *hrec; if ( !key ) { + // no key, remove all entries of this type while ( i<hdr->nhrec ) { if ( hdr->hrec[i]->type!=type ) { i++; continue; } @@ -1185,14 +1291,19 @@ int bcf_hdr_set_version(bcf_hdr_t *hdr, const char *version) { int len; kstring_t str = {0,0,0}; - ksprintf(&str,"##fileformat=%s", version); + if ( ksprintf(&str,"##fileformat=%s", version) < 0 ) return -1; hrec = bcf_hdr_parse_line(hdr, str.s, &len); free(str.s); } else { - free(hrec->value); - hrec->value = strdup(version); + bcf_hrec_t *tmp = bcf_hrec_dup(hrec); + if ( !tmp ) return -1; + free(tmp->value); + tmp->value = strdup(version); + if (
!tmp->value ) return -1; + bcf_hdr_update_hrec(hdr, hrec, tmp); + bcf_hrec_destroy(tmp); } hdr->dirty = 1; return 0; // FIXME: check for errs in this function (return < 0 if so) @@ -1206,6 +1317,14 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) if (!h) return NULL; for (i = 0; i < 3; ++i) if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail; + + bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t)); + if ( !aux ) goto fail; + if ( (aux->gen = kh_init(hdict))==NULL ) { free(aux); goto fail; } + aux->dict = *((vdict_t*)h->dict[0]); + free(h->dict[0]); + h->dict[0] = aux; + if ( strchr(mode,'w') ) { bcf_hdr_append(h, "##fileformat=VCFv4.2"); @@ -1231,6 +1350,13 @@ void bcf_hdr_destroy(bcf_hdr_t *h) if (d == 0) continue; for (k = kh_begin(d); k != kh_end(d); ++k) if (kh_exist(d, k)) free((char*)kh_key(d, k)); + if ( i==0 ) + { + bcf_hdr_aux_t *aux = get_hdr_aux(h); + for (k=kh_begin(aux->gen); k<kh_end(aux->gen); k++) + if ( kh_exist(aux->gen,k) ) free((char*)kh_key(aux->gen,k)); + kh_destroy(hdict, aux->gen); + } kh_destroy(vdict, d); free(h->id[i]); } From 19dc7e843388ceeb902185baff87e8efe69d2470 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 27 Jan 2023 16:16:14 +0000 Subject: [PATCH 376/488] Fix a bug with multi-threading and embed_ref=2 on name sorted data Specifically, this is a multi-threading bug where the cram_encode_container fails leading to access of freed memory. This bug always existed, but we now have a guaranteed way of making the container encode fail. Embed_ref=2 computes a consensus and uses that as the reference. By design, it cannot work on non-position sorted data, hence a guaranteed failure for encoding. (Step 2 will be to do something more sensible than just fail.)
--- cram/cram_encode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 3ce4f846f..56e1daf6a 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -3829,7 +3829,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { if (NULL == (c = cram_next_container(fd, b))) { if (fd->ctr) { // prevent cram_close attempting to flush - cram_free_container(fd->ctr); + fd->ctr_mt = fd->ctr; // delay free when threading fd->ctr = NULL; } return -1; From 654364e9c89cdc5bd8dfd3adc84ec459450d3a18 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 27 Jan 2023 16:27:31 +0000 Subject: [PATCH 377/488] Use non-ref mode when all else fails for CRAM encoding If we're attempting to use embed_ref=2 because we have alignments in BAM and no known reference (no M5 tags), and the data is also name sorted so we can't auto-generate a reference from the consensus and/or MD:Z tags, then we now just switch to no_ref mode instead of bailing out. This isn't ideal, but there is little else that can be done unless the user modifies their command line or reheaders the input file. Extra complexities exist as the setting of embed_ref occurs while encoding a container, which also means it can be adjusted within threads and synchronously with other containers being encoded. Hence we now cache more variables from cram_fd into cram_container. 
--- cram/cram_decode.c | 5 +- cram/cram_encode.c | 152 ++++++++++++++++++++++++++++++++------------ cram/cram_io.c | 6 +- cram/cram_structs.h | 14 ++-- 4 files changed, 128 insertions(+), 49 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 1f8d60f12..73f567106 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -2438,8 +2438,9 @@ int cram_decode_slice(cram_fd *fd, cram_container *c, cram_slice *s, } } - if ((!s->ref && s->hdr->ref_base_id < 0) - || memcmp(digest, s->hdr->md5, 16) != 0) { + if (!c->comp_hdr->no_ref && + ((!s->ref && s->hdr->ref_base_id < 0) + || memcmp(digest, s->hdr->md5, 16) != 0)) { char M[33]; const char *rname = sam_hdr_tid2name(sh, ref_id); if (!rname) rname="?"; // cannot happen normally diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 56e1daf6a..63ee8a111 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -61,7 +61,7 @@ KHASH_MAP_INIT_STR(m_s2u64, uint64_t) static int process_one_read(cram_fd *fd, cram_container *c, cram_slice *s, cram_record *cr, bam_seq_t *b, int rnum, kstring_t *MD, - int embed_ref); + int embed_ref, int no_ref); /* * Returns index of val into key. 
@@ -87,6 +87,8 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, cram_block *map = cram_new_block(COMPRESSION_HEADER, 0); int i, mc, r = 0; + int no_ref = c->no_ref; + if (!cb || !map) return NULL; @@ -162,7 +164,7 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, kh_val(h->preservation_map, k).i = h->qs_seq_orient; } - if (fd->no_ref || embed_ref>0) { + if (no_ref || embed_ref>0) { // Reference Required == No k = kh_put(map, h->preservation_map, "RR", &r); if (-1 == r) return NULL; @@ -1746,7 +1748,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { cram_block_compression_hdr *h = c->comp_hdr; cram_block *c_hdr; int multi_ref = 0; - int r1, r2, sn, nref, embed_ref; + int r1, r2, sn, nref, embed_ref, no_ref; spare_bams *spares; if (CRAM_MAJOR_VERS(fd->version) == 1) @@ -1755,25 +1757,17 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { //#define goto_err {fprintf(stderr, "ERR at %s:%d\n", __FILE__, __LINE__);goto err;} #define goto_err goto err + restart: /* Cache references up-front if we have unsorted access patterns */ pthread_mutex_lock(&fd->ref_lock); nref = fd->refs->nref; - embed_ref = fd->embed_ref; pthread_mutex_unlock(&fd->ref_lock); - - if (!fd->no_ref && c->refs_used) { - for (i = 0; i < nref; i++) { - if (c->refs_used[i]) { - cram_get_ref(fd, i, 1, 0); - if (validate_md5(fd, i) < 0) - goto_err; - } - } - } + embed_ref = c->embed_ref; + no_ref = c->no_ref; /* To create M5 strings */ /* Fetch reference sequence */ - if (!fd->no_ref) { + if (!no_ref) { if (!c->bams || !c->bams[0]) goto_err; bam_seq_t *b = c->bams[0]; @@ -1781,7 +1775,20 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { if (embed_ref <= 1) { char *ref = cram_get_ref(fd, bam_ref(b), 1, 0); if (!ref && bam_ref(b) >= 0) { - if (c->multi_seq || embed_ref == 0 || !c->pos_sorted) { + if (!c->pos_sorted) { + // TODO: maybe also check fd->no_ref? 
+ hts_log_warning("Failed to load reference #%d", + bam_ref(b)); + hts_log_warning("Switching to non-ref mode"); + + pthread_mutex_lock(&fd->ref_lock); + c->embed_ref = fd->embed_ref = 0; + c->no_ref = fd->no_ref = 1; + pthread_mutex_unlock(&fd->ref_lock); + goto restart; + } + + if (c->multi_seq || embed_ref == 0) { hts_log_error("Failed to load reference #%d", bam_ref(b)); return -1; } @@ -1792,7 +1799,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { hts_log_warning("NOTE: the CRAM file will be bigger than" " using an external reference"); pthread_mutex_lock(&fd->ref_lock); - embed_ref = fd->embed_ref = 2; + embed_ref = c->embed_ref = fd->embed_ref = 2; pthread_mutex_unlock(&fd->ref_lock); goto auto_ref; } else if (ref) { @@ -1822,6 +1829,21 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { c->ref_seq_id = c->ref_id; } + if (!no_ref && c->refs_used) { + for (i = 0; i < nref; i++) { + if (c->refs_used[i]) { + if (cram_get_ref(fd, i, 1, 0)) { + if (validate_md5(fd, i) < 0) + goto_err; + } else { + hts_log_warning("Failed to find reference, " + "switching to non-ref mode"); + no_ref = c->no_ref = 1; + } + } + } + } + /* Turn bams into cram_records and gather basic stats */ for (r1 = sn = 0; r1 < c->curr_c_rec; sn++) { cram_slice *s = c->slices[sn]; @@ -1845,8 +1867,27 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { // Embed consensus / MD-generated ref if (embed_ref == 2) { if (cram_generate_reference(c, s, r1) < 0) { - hts_log_error("Failed to build reference"); - return -1; + // Should this be a permanent thing via fd->no_ref? + // Doing so means we cannot easily switch back again should + // things fix themselves later on. This is likely not a + // concern though as failure to generate a reference implies + // unsorted data which is rarely recovered from. + + // Only if sn == 0. We're hosed if we're on the 2nd slice and + // the first worked, as no-ref is a container global param. 
+ if (sn > 0) { + hts_log_error("Failed to build reference, " + "switching to non-ref mode"); + return -1; + } else { + hts_log_warning("Failed to build reference, " + "switching to non-ref mode"); + } + pthread_mutex_lock(&fd->ref_lock); + c->embed_ref = fd->embed_ref = 0; + c->no_ref = fd->no_ref = 1; + pthread_mutex_unlock(&fd->ref_lock); + goto restart; } } @@ -1857,7 +1898,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { bam_seq_t *b = c->bams[r1]; /* If multi-ref we need to cope with changing reference per seq */ - if (c->multi_seq && !fd->no_ref) { + if (c->multi_seq && !no_ref) { if (bam_ref(b) != c->ref_seq_id && bam_ref(b) >= 0) { if (c->ref_seq_id >= 0) cram_ref_decr(fd->refs, c->ref_seq_id); @@ -1879,7 +1920,8 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } } - if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref) != 0) { + if (process_one_read(fd, c, s, cr, b, r2, &MD, embed_ref, + no_ref) != 0) { free(MD.s); return -1; } @@ -1946,7 +1988,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } } - if (c->multi_seq && !fd->no_ref) { + if (c->multi_seq && !no_ref) { if (c->ref_seq_id >= 0) cram_ref_decr(fd->refs, c->ref_seq_id); } @@ -1978,12 +2020,14 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { /* Compute MD5s */ + no_ref = c->no_ref; int is_v4 = CRAM_MAJOR_VERS(fd->version) >= 4 ? 
1 : 0; + for (i = 0; i < c->curr_slice; i++) { cram_slice *s = c->slices[i]; if (CRAM_MAJOR_VERS(fd->version) != 1) { - if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !fd->no_ref) { + if (s->hdr->ref_seq_id >= 0 && c->multi_seq == 0 && !no_ref) { hts_md5_context *md5 = hts_md5_init(); if (!md5) return -1; @@ -2360,7 +2404,7 @@ int cram_encode_container(cram_fd *fd, cram_container *c) { } /* Cache references up-front if we have unsorted access patterns */ - if (!fd->no_ref && c->refs_used) { + if (!no_ref && c->refs_used) { for (i = 0; i < fd->refs->nref; i++) { if (c->refs_used[i]) cram_ref_decr(fd->refs, i); @@ -2598,7 +2642,7 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b, cram_slice *s, cram_record *cr, int verbatim_NM, int verbatim_MD, int NM, kstring_t *MD, int cf_tag, - int *err) { + int no_ref, int *err) { char *aux, *orig; sam_hrec_rg_t *brg = NULL; int aux_size = bam_get_l_aux(b); @@ -2650,7 +2694,7 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b, // MD:Z if (aux[0] == 'M' && aux[1] == 'D' && aux[2] == 'Z') { - if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) { + if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_MD) { if (MD && MD->s && strncasecmp(MD->s, aux+3, orig + aux_size - (aux+3)) == 0) { while (*aux++); if (CRAM_MAJOR_VERS(fd->version) >= 4) @@ -2662,7 +2706,7 @@ static sam_hrec_rg_t *cram_encode_aux(cram_fd *fd, bam_seq_t *b, // NM:i if (aux[0] == 'N' && aux[1] == 'M') { - if (cr->len && !fd->no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) { + if (cr->len && !no_ref && !(cr->flags & BAM_FUNMAP) && !verbatim_NM) { int NM_ = bam_aux2i((uint8_t *)aux+2); if (NM_ == NM) { switch(aux[2]) { @@ -3096,7 +3140,12 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { fd->slices_per_container); if (!c) return NULL; + + pthread_mutex_lock(&fd->ref_lock); + c->no_ref = fd->no_ref; + c->embed_ref = fd->embed_ref; c->record_counter = fd->record_counter; 
+ pthread_mutex_unlock(&fd->ref_lock); c->curr_ref = bam_ref(b); } @@ -3143,7 +3192,7 @@ static cram_container *cram_next_container(cram_fd *fd, bam_seq_t *b) { static int process_one_read(cram_fd *fd, cram_container *c, cram_slice *s, cram_record *cr, bam_seq_t *b, int rnum, kstring_t *MD, - int embed_ref) { + int embed_ref, int no_ref) { int i, fake_qual = -1, NM = 0; char *cp; char *ref, *seq, *qual; @@ -3184,7 +3233,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, // Non reference based encoding means storing the bases verbatim as features, which in // turn means every base also has a quality already stored. - if (!fd->no_ref || CRAM_MAJOR_VERS(fd->version) >= 3) + if (!no_ref || CRAM_MAJOR_VERS(fd->version) >= 3) cr->cram_flags |= CRAM_FLAG_PRESERVE_QUAL_SCORES; if (cr->len <= 0 && CRAM_MAJOR_VERS(fd->version) >= 3) @@ -3262,7 +3311,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, //fprintf(stderr, "\nBAM_CMATCH\nR: %.*s\nS: %.*s\n", // cig_len, &ref[apos], cig_len, &seq[spos]); l = 0; - if (!fd->no_ref && cr->len) { + if (!no_ref && cr->len) { int end = cig_len+apos < c->ref_end ? cig_len : c->ref_end - apos; char *sp = &seq[spos]; @@ -3357,7 +3406,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, } if (l < cig_len && cr->len) { - if (fd->no_ref) { + if (no_ref) { if (CRAM_MAJOR_VERS(fd->version) == 3) { if (cram_add_bases(fd, c, s, cr, spos, cig_len-l, &seq[spos])) @@ -3416,7 +3465,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, if (cram_add_insertion(c, s, cr, spos, cig_len, cr->len ? 
&seq[spos] : NULL)) return -1; - if (fd->no_ref && cr->len) { + if (no_ref && cr->len) { for (l = 0; l < cig_len; l++, spos++) { cram_add_quality(fd, c, s, cr, spos, qual[spos]); } @@ -3432,7 +3481,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, fd->version)) return -1; - if (fd->no_ref && + if (no_ref && !(cr->cram_flags & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { if (cr->len) { for (l = 0; l < cig_len; l++, spos++) { @@ -3468,7 +3517,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, return -1; } fake_qual = spos; - cr->aend = fd->no_ref ? apos : MIN(apos, c->ref_end); + cr->aend = no_ref ? apos : MIN(apos, c->ref_end); if (cram_stats_add(c->stats[DS_FN], cr->nfeature) < 0) goto block_err; @@ -3491,7 +3540,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, int err = 0; sam_hrec_rg_t *brg = cram_encode_aux(fd, b, c, s, cr, verbatim_NM, verbatim_MD, NM, MD, - cf_tag, &err); + cf_tag, no_ref, &err); if (err) goto block_err; @@ -3784,9 +3833,16 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { if (!fd->ctr) return -1; fd->ctr->record_counter = fd->record_counter; + + pthread_mutex_lock(&fd->ref_lock); + fd->ctr->no_ref = fd->no_ref; + fd->ctr->embed_ref = fd->embed_ref; + pthread_mutex_unlock(&fd->ref_lock); } c = fd->ctr; + int embed_ref = c->embed_ref; + if (!c->slice || c->curr_rec == c->max_rec || (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) || (c->s_num_bases >= fd->bases_per_slice)) { @@ -3802,9 +3858,6 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { * The multi_seq var here refers to our intention for the next slice. * This slice has already been encoded so we output as-is. 
*/ - pthread_mutex_lock(&fd->ref_lock); - int embed_ref = fd->embed_ref; - pthread_mutex_unlock(&fd->ref_lock); if (fd->multi_seq == -1 && c->curr_rec < c->max_rec/4+10 && fd->last_slice && fd->last_slice < c->max_rec/4+10 && embed_ref<=0) { @@ -3852,6 +3905,27 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { c->multi_seq = 1; c->pos_sorted = 0; + // Cram_next_container may end up flushing an existing one and + // triggering fd->embed_ref=2 if no reference is found. + // Embedded refs are incompatible with multi-seq, so we bail + // out and switch to no_ref in this scenario. We do this + // within the container only, as multi_seq may be temporary + // and we switch back away from it again. + pthread_mutex_lock(&fd->ref_lock); + if (fd->embed_ref > 0 && c->curr_rec == 0 && c->curr_slice == 0) { + hts_log_warning("Changing from embed_ref to no_ref mode"); + // Should we update fd->embed_ref and no_ref here too? + // Doing so means if we go into multi-seq and back out + // again, eg due a cluster of tiny refs in the middle of + // much larger ones, then we bake in no-ref mode. + // + // However for unsorted data we're realistically not + // going to switch back. + c->embed_ref = fd->embed_ref = 0; // or -1 for auto? + c->no_ref = fd->no_ref = 1; + } + pthread_mutex_unlock(&fd->ref_lock); + if (!c->refs_used) { pthread_mutex_lock(&fd->ref_lock); c->refs_used = calloc(fd->refs->nref, sizeof(int)); @@ -3877,8 +3951,8 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { } else if (c->refs_used && c->refs_used[bam_ref(b)]) { pthread_mutex_lock(&fd->ref_lock); fd->unsorted = 1; - pthread_mutex_unlock(&fd->ref_lock); fd->multi_seq = 1; + pthread_mutex_unlock(&fd->ref_lock); } } diff --git a/cram/cram_io.c b/cram/cram_io.c index eaf14d416..d3c39e47a 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -3203,7 +3203,7 @@ void cram_ref_decr(refs_t *r, int id) { } /* - * Used by cram_ref_load and cram_ref_get. 
The file handle will have + * Used by cram_ref_load and cram_get_ref. The file handle will have * already been opened, so we can catch it. The ref_entry *e informs us * of whether this is a multi-line fasta file or a raw MD5 style file. * Either way we create a single contiguous sequence. @@ -3455,7 +3455,7 @@ char *cram_get_ref(cram_fd *fd, int id, int start, int end) { hts_log_warning("Reference file given, but ref '%s' not present", r->name); if (cram_populate_ref(fd, id, r) == -1) { - hts_log_error("Failed to populate reference for id %d", id); + hts_log_warning("Failed to populate reference for id %d", id); pthread_mutex_unlock(&fd->refs->lock); pthread_mutex_unlock(&fd->ref_lock); return NULL; @@ -3652,6 +3652,8 @@ cram_container *cram_new_container(int nrec, int nslice) { c->max_apos = 0; c->multi_seq = 0; c->qs_seq_orient = 1; + c->no_ref = 0; + c->embed_ref = -1; // automatic selection c->bams = NULL; diff --git a/cram/cram_structs.h b/cram/cram_structs.h index cbb226b70..724f6cb78 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -457,6 +457,8 @@ struct cram_container { /* Copied from fd before encoding, to allow multi-threading */ int ref_start, first_base, last_base, ref_id, ref_end; char *ref; + int embed_ref; // 1 if embedding ref, 2 if embedding cons + int no_ref; // true if referenceless //struct ref_entry *ref; /* For multi-threading */ @@ -793,14 +795,14 @@ struct cram_fd { cram_container *ctr_mt; // positions for encoding or decoding - int first_base, last_base; + int first_base, last_base; // copied to container // cached reference portion refs_t *refs; // ref meta-data structure char *ref, *ref_free; // current portion held in memory - int ref_id; - int ref_start; - int ref_end; + int ref_id; // copied to container + int ref_start; // copied to container + int ref_end; // copied to container char *ref_fn; // reference fasta filename // compression level and metrics @@ -813,8 +815,8 @@ struct cram_fd { int seqs_per_slice; int 
bases_per_slice; int slices_per_container; - int embed_ref; - int no_ref; + int embed_ref; // copied to container + int no_ref; // copied to container int ignore_md5; int use_bz2; int use_rans; From 80558d98bdb58a21dd4e55c122bc971fd7d2a9dd Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 3 Feb 2023 09:52:34 +0000 Subject: [PATCH 378/488] Add some documentation on cram encoder code structure This is far from complete, but it's a guide on the basic function hierarchy and which bits are used from a single thread per cram_fd and which can be concurrent. --- cram/README | 214 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 214 insertions(+) create mode 100644 cram/README diff --git a/cram/README b/cram/README new file mode 100644 index 000000000..135438227 --- /dev/null +++ b/cram/README @@ -0,0 +1,214 @@ +CRAM encoding internals +======================= + +A quick summary of functions involved. + +The encoder works by accumulating a bunch of BAM records (via the +cram_put_bam_seq function), and at a certain point (eg counter of +records, or switching reference) the array of BAM records is turned +into a container, which in turn creates slices, holding CRAM +data-series in blocks. The function that turns an array of BAM +objects into the container is below.
+ +cram_encode_container func: + Validate references MD5 against header, unless no_ref mode + If embed_ref <= 1, fetch ref + Switch to embed_ref=2 if failed + + Foreach slice: + If embed_ref == 2 + call cram_generate_reference + if failed switch to no_ref mode + Foreach sequence + call process_one_read to append BAM onto each data series (DS) + call cram_stats_add for each DS to gather metrics + call cram_encode_aux + + # We now have cram DS, per slice + call cram_encoder_init, per DS (based on cram_stats_add data) + + Foreach slice: + call cram_encode_slice to turn DS to blocks + call cram_compress_slice + + call cram_encode_compression_header + +Threading +--------- + +CRAM can be multi-threaded, but this brings complications. + +The above function is the main CPU user, so it is this bit which can +be executed in parallel from multiple threads. To understand this we +need to now look at how the primary loop works when writing a CRAM: + +Encoding main thread: + repeatedly calls cram_put_bam_seq + calls cram_new_container on first time through to initialise + calls cram_next_container when current is full or we need to flush + calls cram_flush_container_mt to flush last container + pushes BAM object onto current container + +If non-threaded, cram_flush_container_mt does: + call cram_flush_container + call cram_encode_container to go from BAM to CRAM data-series + call cram_flush_container2 (writes it out) + +If threaded, cram_flush_container_mt does: + Main: Dispatch cram_flush_thread job + Thread: call cram_encode_container to go from BAM to CRAM data-series + Main: Call cram_flush_result to drain queue of encoded containers + Main: Call cram_flush_container2 (writes it out); + + + +Decisions on when to create new containers, detection of sorted vs unsorted, +switching to multi-seq mode, etc occur at the main thread in +cram_put_bam_seq. + +We can change our mind on container parameters at any point up until +the cram_encode_container call.
At that point these parameters get +baked into a container compression header and all data-series +generated need to be in sync with the parameters. + +It is possible that some parameter changes can get detected while +encoding the container, as it is there where we fetch references. Eg +the need to enable embedded reference or switch to non-ref mode. + +While encoding a container, we can change the parameters for *this* +container, and we can also set the default parameter for subsequent +new containers via the global cram fd to avoid spamming attempts to +load a reference which doesn't exist, but we cannot change other +containers that are being processed in parallel. They'll fend for +themselves. + +References +---------- + +To avoid spamming the reference servers, there is a shared cache of +references being currently used by all the worker threads (leading to +confusing terminology of reference-counting of references). So each +container fetches its section of reference, but the memory for that is +handled via its own layer. + +The shared references and ref meta-data is held in cram_fd -> refs (a +refs_t pointer): + + // References structure. + struct refs_t { + string_alloc_t *pool; // String pool for holding filenames and SN vals + + khash_t(refs) *h_meta; // ref_entry*, index by name + ref_entry **ref_id; // ref_entry*, index by ID + int nref; // number of ref_entry + + char *fn; // current file opened + BGZF *fp; // and the hFILE* to go with it.
+ + int count; // how many cram_fd sharing this refs struct + + pthread_mutex_t lock; // Mutex for multi-threaded updating + ref_entry *last; // Last queried sequence + int last_id; // Used in cram_ref_decr_locked to delay free + }; + +Within this, ref_entry is the per-reference information: + + typedef struct ref_entry { + char *name; + char *fn; + int64_t length; + int64_t offset; + int bases_per_line; + int line_length; + int64_t count; // for shared references so we know to dealloc seq + char *seq; + mFILE *mf; + int is_md5; // Reference comes from a raw seq found by MD5 + int validated_md5; + } ref_entry; + +Sharing of references to track use between threads is via +cram_ref_incr* and cram_ref_decr* (with locked and unlocked +variants). We free a reference when the usage count hits zero. To +avoid spamming discard and reload in single-thread creation of a +pos-sorted CRAM, we keep track of the last reference in cram_fd and +delay discard by one loop iteration. + +There are complexities here around whether the references come from a +single ref.fa file, are from a local MD5sum cache with one file per +reference (mmapped), or whether they're fetched from some remote +REF_PATH query such as the EBI. (This latter case typically downloads +to a local md5 based ref-cache first and mmaps from there.) + +The refs struct starts off by being populated from the SAM header. We +have M5 tag and name known, maybe a filename, but length is 0 and seq +is NULL. This is done by cram_load_reference: + +cram_load_reference (cram_fd, filename): + if filename non-NULL + call refs_load_fai + Populates ref_entry with filename, name, length, line-len, etc + sanitise_SQ_lines + If no refs loaded + call refs_from_header + populates ref_entry with name. + Sets length=0 as marker for not-yet-loaded + +The main interface used from the code is cram_get_ref(). It takes a +reference ID, start and end coordinate and returns a pointer to the +relevant sub-sequence.
+ +cram_get_ref: + r = fd->refs->ref_id[id]; // current ref + call cram_populate_ref if stored length is 0 (ie ref.fa set) + search REF_PATH / REF_CACHE + call bgzf_open if local_path + call open_path_mfile otherwise + copy to local REF_CACHE if required (eg remote fetch) + + If start = 1 and end = ref-length + If ref seq unknown + call cram_ref_load to load entire ref and use that + + If ref seq now known, return it + + // Otherwise known via .fai or we've errored by now. + call load_ref_portion to return a sub-seq from index fasta + +The encoder asks for the entire reference rather than a small portion +of it as we're usually encoding a large amount. The decoder may be +dealing with small range queries, so it only asks for the relevant +sub-section of reference as specified in the cram slice headers. + + +TODO +==== + +- Multi-ref mode is enabled when we have too many small containers in + a row. + + Instead of firing off new containers when we switch reference, we + could always make a new container after N records, separating off + M <= N to make the container such that all M are the same reference, + and shuffling any remaining N-M down as the start of the next. + + This means we can detect how many new containers we would create, + and enable multi-ref mode straight away rather than keeping a recent + history of how many small containers we've emitted. + +- The cache of references currently being used is a better place to + track the global embed-ref and non-ref logic. Better than cram_fd. + Cram_fd is a one-way change, as once we enable non-ref we'll stick + with it. + + However if it was per-ref in the ref-cache then we'd probe and try + each reference once, and then all new containers for that ref would + honour the per-ref parameters. So a single missing reference in the + middle of a large file wouldn't change behaviour for all subsequent + references.
+ + Optionally we could still do meta-analysis on how many references + are failing, and switch the global cram_fd params to avoid repeated + testing of reference availability if it's becoming obvious that none + of them are known. From c58ae63cf77e1c1e198f5169232066d09df0ff27 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sat, 17 Sep 2022 07:05:00 +0000 Subject: [PATCH 379/488] Add symbol versioning to the ELF shared-object file To ease dependency tracking. In particular, Linux distributions will be able to automatically check dependencies at package build time. --- Makefile | 2 +- htslib.map | 610 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 611 insertions(+), 1 deletion(-) create mode 100644 htslib.map diff --git a/Makefile b/Makefile index fb5f7a0c7..33d905115 100644 --- a/Makefile +++ b/Makefile @@ -354,7 +354,7 @@ print-config: # file used at runtime (when $LD_LIBRARY_PATH includes the build directory). libhts.so: $(LIBHTS_OBJS:.o=.pico) - $(CC) -shared -Wl,-soname,libhts.so.$(LIBHTS_SOVERSION) $(LDFLAGS) -o $@ $(LIBHTS_OBJS:.o=.pico) $(LIBS) -lpthread + $(CC) -shared -Wl,-soname,libhts.so.$(LIBHTS_SOVERSION) -Wl,-version-script,$(srcprefix)htslib.map $(LDFLAGS) -o $@ $(LIBHTS_OBJS:.o=.pico) $(LIBS) -lpthread ln -sf $@ libhts.so.$(LIBHTS_SOVERSION) # Similarly this also creates libhts.NN.dylib as a byproduct, so that programs diff --git a/htslib.map b/htslib.map new file mode 100644 index 000000000..bf25fd5d6 --- /dev/null +++ b/htslib.map @@ -0,0 +1,610 @@ +HTSLIB_1.0 { + bam_aux2A; + bam_aux2Z; + bam_aux2f; + bam_aux2i; + bam_aux_append; + bam_aux_del; + bam_aux_get; + bam_cigar2qlen; + bam_cigar2rlen; + bam_copy1; + bam_destroy1; + bam_dup1; + bam_endpos; + bam_flag2str; + bam_hdr_read; + bam_hdr_write; + bam_init1; + bam_mplp_auto; + bam_mplp_destroy; + bam_mplp_init; + bam_mplp_init_overlaps; + bam_mplp_set_maxcnt; + bam_plp_auto; + bam_plp_destroy; + bam_plp_init; + bam_plp_next; + bam_plp_push; + bam_plp_reset; + 
bam_plp_set_maxcnt; + bam_read1; + bam_str2flag; + bam_write1; + bcf_add_filter; + bcf_calc_ac; + bcf_clear; + bcf_destroy; + bcf_dup; + bcf_enc_vchar; + bcf_enc_vfloat; + bcf_enc_vint; + bcf_float_missing; + bcf_float_vector_end; + bcf_fmt_array; + bcf_fmt_sized_array; + bcf_get_fmt; + bcf_get_format_string; + bcf_get_format_values; + bcf_get_info; + bcf_get_info_values; + bcf_get_variant_type; + bcf_get_variant_types; + bcf_gt_type; + bcf_has_filter; + bcf_hdr_add_hrec; + bcf_hdr_add_sample; + bcf_hdr_append; + bcf_hdr_combine; + bcf_hdr_destroy; + bcf_hdr_dup; + bcf_hdr_fmt_text; + bcf_hdr_get_hrec; + bcf_hdr_get_version; + bcf_hdr_id2int; + bcf_hdr_init; + bcf_hdr_parse; + bcf_hdr_parse_line; + bcf_hdr_printf; + bcf_hdr_read; + bcf_hdr_remove; + bcf_hdr_seqnames; + bcf_hdr_set; + bcf_hdr_set_samples; + bcf_hdr_set_version; + bcf_hdr_subset; + bcf_hdr_sync; + bcf_hdr_write; + bcf_hrec_add_key; + bcf_hrec_destroy; + bcf_hrec_dup; + bcf_hrec_find_key; + bcf_hrec_format; + bcf_hrec_set_val; + bcf_index_build; + bcf_init; + bcf_is_snp; + bcf_read; + bcf_readrec; + bcf_remove_alleles; + bcf_remove_filter; + bcf_sr_add_reader; + bcf_sr_destroy; + bcf_sr_init; + bcf_sr_next_line; + bcf_sr_regions_destroy; + bcf_sr_regions_flush; + bcf_sr_regions_init; + bcf_sr_regions_next; + bcf_sr_regions_overlap; + bcf_sr_regions_seek; + bcf_sr_remove_reader; + bcf_sr_seek; + bcf_sr_set_regions; + bcf_sr_set_samples; + bcf_sr_set_targets; + bcf_subset; + bcf_subset_format; + bcf_sweep_bwd; + bcf_sweep_destroy; + bcf_sweep_fwd; + bcf_sweep_hdr; + bcf_sweep_init; + bcf_translate; + bcf_trim_alleles; + bcf_type_shift; + bcf_unpack; + bcf_update_alleles; + bcf_update_alleles_str; + bcf_update_filter; + bcf_update_format; + bcf_update_format_string; + bcf_update_id; + bcf_update_info; + bcf_write; + bgzf_check_EOF; + bgzf_close; + bgzf_dopen; + bgzf_flush; + bgzf_flush_try; + bgzf_getc; + bgzf_getline; + bgzf_hopen; + bgzf_index_build_init; + bgzf_index_dump; + bgzf_index_load; + 
bgzf_is_bgzf; + bgzf_mt; + bgzf_open; + bgzf_raw_read; + bgzf_raw_write; + bgzf_read; + bgzf_read_block; + bgzf_seek; + bgzf_set_cache_size; + bgzf_useek; + bgzf_utell; + bgzf_write; + cram_close; + cram_compress_block; + cram_dopen; + cram_eof; + cram_flush; + cram_free_block; + cram_free_container; + cram_new_block; + cram_new_container; + cram_open; + cram_read_block; + cram_read_container; + cram_seek; + cram_set_header; + cram_set_option; + cram_set_voption; + cram_uncompress_block; + cram_write_block; + cram_write_container; + fai_build; + fai_destroy; + fai_fetch; + fai_load; + faidx_fetch_nseq; + faidx_fetch_seq; + faidx_has_seq; + hclose; + hclose_abruptly; + hdopen; + hfile_destroy; + hfile_init; + hfile_oflags; + hflush; + hgetc2; + hopen; + hpeek; + hputc2; + hputs2; + hread2; + hrec_add_idx; + hseek; + hts_close; + hts_file_type; + hts_get_bgzfp; + hts_getline; + hts_idx_destroy; + hts_idx_finish; + hts_idx_get_meta; + hts_idx_get_n_no_coor; + hts_idx_get_stat; + hts_idx_init; + hts_idx_load; + hts_idx_push; + hts_idx_save; + hts_idx_seqnames; + hts_idx_set_meta; + hts_itr_destroy; + hts_itr_next; + hts_itr_query; + hts_itr_querys; + hts_open; + hts_parse_reg; + hts_readlines; + hts_readlist; + hts_set_fai_filename; + hts_set_threads; + hts_verbose; + hts_version; + hwrite2; + kf_betai; + kf_erfc; + kf_gammap; + kf_gammaq; + kf_lgamma; + kmemmem; + knet_close; + knet_dopen; + knet_open; + knet_read; + knet_seek; + ksplit_core; + ksprintf; + kstrnstr; + kstrstr; + kstrtok; + kt_fisher_exact; + kvsprintf; + sam_format1; + sam_hdr_add_lines; + sam_hdr_dup; + sam_hdr_incr_ref; + sam_hdr_length; + sam_hdr_parse; + sam_hdr_read; + sam_hdr_str; + sam_hdr_write; + sam_index_load; + sam_itr_queryi; + sam_itr_querys; + sam_open_mode; + sam_parse1; + sam_read1; + sam_write1; + seq_nt16_str; + seq_nt16_table; + stringify_argv; + tbx_conf_bed; + tbx_conf_gff; + tbx_conf_psltbl; + tbx_conf_sam; + tbx_conf_vcf; + tbx_destroy; + tbx_index; + tbx_index_build; + 
tbx_index_load; + tbx_name2id; + tbx_readrec; + tbx_seqnames; + vcf_format; + vcf_hdr_read; + vcf_hdr_write; + vcf_parse; + vcf_read; + vcf_write; + vcf_write_line; +}; + +HTSLIB_1.1 { + bcf_get_fmt_id; + bcf_get_info_id; + faidx_iseq; + faidx_nseq; + faidx_seq_len; +} HTSLIB_1.0; + + +HTSLIB_1.2.1 { + bcf_copy; + bcf_sr_strerror; + hisremote; + hts_detect_format; + hts_format_description; + hts_get_format; + hts_hopen; + hts_set_opt; + regidx_destroy; + regidx_init; + regidx_insert; + regidx_nregs; + regidx_overlap; + regidx_parse_bed; + regidx_parse_tab; + regidx_seq_names; + regidx_seq_nregs; + seq_nt16_int; +} HTSLIB_1.1; + +HTSLIB_1.3 { + bcf_add_id; + bcf_empty; + bcf_hdr_merge; + bcf_index_build2; + bcf_index_load2; + bcf_remove_allele_set; + bgzf_compress; + cram_block_append; + cram_block_get_comp_size; + cram_block_get_content_id; + cram_block_get_content_type; + cram_block_get_crc32; + cram_block_get_data; + cram_block_get_offset; + cram_block_get_uncomp_size; + cram_block_set_comp_size; + cram_block_set_content_id; + cram_block_set_crc32; + cram_block_set_data; + cram_block_set_offset; + cram_block_set_uncomp_size; + cram_block_size; + cram_block_update_size; + cram_container_get_landmarks; + cram_container_get_length; + cram_container_get_num_blocks; + cram_container_is_empty; + cram_container_set_landmarks; + cram_container_set_length; + cram_container_set_num_blocks; + cram_container_size; + cram_copy_slice; + cram_fd_get_fp; + cram_fd_get_header; + cram_fd_get_version; + cram_fd_set_fp; + cram_fd_set_header; + cram_fd_set_version; + cram_major_vers; + cram_minor_vers; + cram_store_container; + cram_transcode_rg; + hfile_add_scheme_handler; + hfile_always_local; + hfile_always_remote; + hts_format_file_extension; + hts_idx_load2; + hts_idx_save_as; + hts_md5_destroy; + hts_md5_final; + hts_md5_hex; + hts_md5_init; + hts_md5_reset; + hts_md5_update; + hts_open_format; + hts_opt_add; + hts_opt_apply; + hts_opt_free; + hts_parse_decimal; + 
hts_parse_format; + hts_parse_opt_list; + int32_put_blk; + kgetline; + sam_index_build; + sam_index_build2; + sam_index_load2; + sam_open_mode_opts; + tbx_index_build2; + tbx_index_load2; +} HTSLIB_1.2.1; + +HTSLIB_1.4 { + bam_auxB2f; + bam_auxB2i; + bam_auxB_len; + bam_aux_update_str; + bam_mplp_constructor; + bam_mplp_destructor; + bam_mplp_reset; + bam_plp_constructor; + bam_plp_destructor; + bcf_hdr_format; + bcf_index_build3; + bcf_sr_destroy_threads; + bcf_sr_set_opt; + bcf_sr_set_threads; + bgzf_block_write; + bgzf_compression; + bgzf_index_dump_hfile; + bgzf_index_load_hfile; + bgzf_thread_pool; + cram_check_EOF; + cram_get_refs; + errmod_cal; + errmod_destroy; + errmod_init; + fai_build3; + fai_load3; + hgetdelim; + hgets; + hts_check_EOF; + hts_json_fnext; + hts_json_fskip_value; + hts_json_snext; + hts_json_sskip_value; + hts_realloc_or_die; + hts_set_cache_size; + hts_set_thread_pool; + hts_tpool_delete_result; + hts_tpool_destroy; + hts_tpool_dispatch; + hts_tpool_dispatch2; + hts_tpool_init; + hts_tpool_kill; + hts_tpool_next_result; + hts_tpool_next_result_wait; + hts_tpool_process_attach; + hts_tpool_process_destroy; + hts_tpool_process_detach; + hts_tpool_process_empty; + hts_tpool_process_flush; + hts_tpool_process_init; + hts_tpool_process_len; + hts_tpool_process_qsize; + hts_tpool_process_ref_decr; + hts_tpool_process_ref_incr; + hts_tpool_process_reset; + hts_tpool_process_shutdown; + hts_tpool_process_sz; + hts_tpool_result_data; + hts_tpool_size; + hts_tpool_wake_dispatch; + kputd; + probaln_glocal; + sam_cap_mapq; + sam_index_build3; + sam_prob_realn; + tbx_index_build3; +} HTSLIB_1.3; + +HTSLIB_1.5 { + hfile_set_blksize; + hts_get_log_level; + hts_log; + hts_set_log_level; +} HTSLIB_1.4; + +HTSLIB_1.6 { + hts_drand48; + hts_erand48; + hts_lrand48; + hts_srand48; +} HTSLIB_1.5; + +HTSLIB_1.7 { + hfile_mem_get_buffer; + hfile_mem_steal_buffer; + hts_itr_multi_bam; + hts_itr_multi_cram; + hts_itr_multi_next; + hts_itr_regions; + 
hts_json_alloc_token; + hts_json_free_token; + hts_json_token_str; + hts_json_token_type; + hts_reglist_free; + sam_hdr_change_HD; + sam_itr_regions; +} HTSLIB_1.6; + +HTSLIB_1.9 { + bam_aux_update_array; + bam_aux_update_float; + bam_aux_update_int; + fai_fetchqual; + fai_load3_format; + fai_load_format; + faidx_fetch_qual; +} HTSLIB_1.7; + +HTSLIB_1.10 { + bam_cigar_table; + bam_mplp64_auto; + bam_plp64_auto; + bam_plp64_next; + bam_plp_insertion; + bam_set_qname; + bcf_idx_init; + bcf_idx_save; + bcf_index_load3; + bgzf_peek; + fai_fetch64; + fai_fetchqual64; + fai_parse_region; + fai_set_cache_size; + faidx_fetch_qual64; + faidx_fetch_seq64; + haddextension; + hts_free; + hts_idx_fmt; + hts_idx_load3; + hts_idx_tbi_name; + hts_parse_reg64; + hts_parse_region; + hts_reglist_create; + hts_resize_array_; + hts_tpool_dispatch3; + kgetline2; + regidx_init_string; + regidx_insert_list; + regidx_parse_reg; + regidx_parse_vcf; + regidx_push; + regitr_copy; + regitr_destroy; + regitr_init; + regitr_loop; + regitr_overlap; + regitr_reset; + sam_hdr_add_line; + sam_hdr_add_pg; + sam_hdr_count_lines; + sam_hdr_destroy; + sam_hdr_find_line_id; + sam_hdr_find_line_pos; + sam_hdr_find_tag_id; + sam_hdr_find_tag_pos; + sam_hdr_init; + sam_hdr_line_index; + sam_hdr_line_name; + sam_hdr_name2tid; + sam_hdr_nref; + sam_hdr_pg_id; + sam_hdr_remove_except; + sam_hdr_remove_line_id; + sam_hdr_remove_line_pos; + sam_hdr_remove_lines; + sam_hdr_remove_tag_id; + sam_hdr_tid2len; + sam_hdr_tid2name; + sam_hdr_update_line; + sam_idx_init; + sam_idx_save; + sam_index_load3; + sam_itr_regarray; + sam_parse_region; + tbx_index_load3; +} HTSLIB_1.9; + +HTSLIB_1.11 { + fai_path; + hts_lib_shutdown; + hts_tpool_process_is_shutdown; + vcf_open_mode; +} HTSLIB_1.10; + +HTSLIB_1.12 { + bam_parse_cigar; + bam_set1; + hfile_has_plugin; + hfile_list_plugins; + hfile_list_schemes; + hts_feature_string; + hts_features; + hts_filter_eval; + hts_filter_free; + hts_filter_init; + 
hts_set_filter_expression; + hts_test_feature; + sam_parse_cigar; + sam_passes_filter; +} HTSLIB_1.11; + +HTSLIB_1.13 { + hts_idx_nseq; +} HTSLIB_1.12; + +HTSLIB_1.14 { + bam_mods_at_next_pos; + bam_mods_at_qpos; + bam_next_basemod; + bam_parse_basemod; + bam_plp_insertion_mod; + hts_base_mod_state_alloc; + hts_base_mod_state_free; + hts_flush; +} HTSLIB_1.13; + +HTSLIB_1.15 { + hts_detect_format2; +} HTSLIB_1.14; + +HTSLIB_1.16 { + bam_mods_query_type; + bam_mods_recorded; + bcf_has_variant_type; + bcf_has_variant_types; + bcf_variant_length; + cram_decode_slice_header; + cram_free_slice_header; + cram_slice_hdr_get_coords; + cram_slice_hdr_get_embed_ref_id; + cram_slice_hdr_get_num_blocks; + hts_filter_eval2; +} HTSLIB_1.15; From fbcb1612a5f030af89b0f0620e41c9f392698edb Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 3 Feb 2023 11:31:29 +0000 Subject: [PATCH 380/488] Add configure (enable|disable)-versioned-symbols options and tests Default is to try to enable versioned symbols in libhts.so. This can be overridden using --disable-versioned-symbols. A simple test is made to check that the linker option is recognised. The test uses the htslib.map file in the source - it seems to work even though the symbols in the map aren't referenced in the test program being linked. The Makefile and config.mk.in are adjusted so that the configure result overrides the Makefile default. --- Makefile | 5 +++-- config.mk.in | 3 ++- configure.ac | 25 ++++++++++++++++++++++++- 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 33d905115..13624c8f3 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Makefile for htslib, a C library for high-throughput sequencing data formats. # -# Copyright (C) 2013-2022 Genome Research Ltd. +# Copyright (C) 2013-2023 Genome Research Ltd. 
# # Author: John Marshall # @@ -39,6 +39,7 @@ CFLAGS = -g -Wall -O2 -fvisibility=hidden EXTRA_CFLAGS_PIC = -fpic TARGET_CFLAGS = LDFLAGS = -fvisibility=hidden +VERSION_SCRIPT_LDFLAGS = -Wl,-version-script,$(srcprefix)htslib.map LIBS = $(htslib_default_libs) prefix = /usr/local @@ -354,7 +355,7 @@ print-config: # file used at runtime (when $LD_LIBRARY_PATH includes the build directory). libhts.so: $(LIBHTS_OBJS:.o=.pico) - $(CC) -shared -Wl,-soname,libhts.so.$(LIBHTS_SOVERSION) -Wl,-version-script,$(srcprefix)htslib.map $(LDFLAGS) -o $@ $(LIBHTS_OBJS:.o=.pico) $(LIBS) -lpthread + $(CC) -shared -Wl,-soname,libhts.so.$(LIBHTS_SOVERSION) $(VERSION_SCRIPT_LDFLAGS) $(LDFLAGS) -o $@ $(LIBHTS_OBJS:.o=.pico) $(LIBS) -lpthread ln -sf $@ libhts.so.$(LIBHTS_SOVERSION) # Similarly this also creates libhts.NN.dylib as a byproduct, so that programs diff --git a/config.mk.in b/config.mk.in index 82af49850..7341a170d 100644 --- a/config.mk.in +++ b/config.mk.in @@ -1,6 +1,6 @@ # Optional configure Makefile overrides for htslib. # -# Copyright (C) 2015-2017, 2019 Genome Research Ltd. +# Copyright (C) 2015-2017, 2019, 2023 Genome Research Ltd. # # Author: John Marshall # @@ -43,6 +43,7 @@ RANLIB = @RANLIB@ CPPFLAGS = @CPPFLAGS@ CFLAGS = @CFLAGS@ LDFLAGS = @LDFLAGS@ +VERSION_SCRIPT_LDFLAGS = @VERSION_SCRIPT_LDFLAGS@ LIBS = @LIBS@ PLATFORM = @PLATFORM@ diff --git a/configure.ac b/configure.ac index 7d40948a6..98b0a44c7 100644 --- a/configure.ac +++ b/configure.ac @@ -1,6 +1,6 @@ # Configure script for htslib, a C library for high-throughput sequencing data. # -# Copyright (C) 2015-2022 Genome Research Ltd. +# Copyright (C) 2015-2023 Genome Research Ltd. 
# # Author: John Marshall # @@ -155,6 +155,11 @@ static_LDFLAGS=$LDFLAGS static_LIBS='-lpthread -lz -lm' private_LIBS=$LDFLAGS +AC_ARG_ENABLE([versioned-symbols], + [AS_HELP_STRING([--disable-versioned-symbols], + [disable versioned symbols in shared library])], + [], [enable_versioned_symbols=yes]) + AC_ARG_ENABLE([bz2], [AS_HELP_STRING([--disable-bz2], [omit support for BZ2-compressed CRAM files])], @@ -249,6 +254,24 @@ esac AC_MSG_RESULT([$host_result]) AC_SUBST([PLATFORM]) +dnl Check for versioned symbol support +dnl Only try for .so shared libraries as other types won't work +AS_IF([test x"$PLATFORM" = xdefault && test x"$enable_versioned_symbols" = xyes], + [AC_CACHE_CHECK([whether the linker supports versioned symbols], + [hts_cv_have_versioned_symbols], [ + save_LDFLAGS=$LDFLAGS + LDFLAGS="-Wl,-version-script,$srcdir/htslib.map $LDFLAGS" + AC_LINK_IFELSE([AC_LANG_PROGRAM()], + [hts_cv_have_versioned_symbols=yes], + [hts_cv_have_versioned_symbols=no]) + LDFLAGS=$save_LDFLAGS + ]) + AS_IF([test "x$hts_cv_have_versioned_symbols" = xyes],[ + VERSION_SCRIPT_LDFLAGS='-Wl,-version-script,$(srcprefix)htslib.map' + AC_SUBST([VERSION_SCRIPT_LDFLAGS]) + ]) +]) + dnl Try to get more control over which symbols are exported in the shared dnl library. HTS_HIDE_DYNAMIC_SYMBOLS From 44814f43179c379fb566bee36d28e16940a25e9d Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 1 Feb 2023 18:08:29 +0000 Subject: [PATCH 381/488] Add Makefile rule to update the symbol version file Adds a very basic script to do this. It expects an htslib.map file to be already present, assumes that all entries are in order and that it has a suitable format for the part that extracts the previous version number. It extracts any new symbols from the .so file and appends them to the map, but only if the HTSlib version number (extracted from PACKAGE_VERSION) has changed. Apart from for testing, this rule should only be used when building a release. 
Between releases new symbols should be left unversioned so that binaries linked against a libhts.so will continue to work if the library is upgraded from one build against develop to the next release. After running this rule, the shared library should be rebuilt so that it picks up the new version names, which makes for a circular dependency between libhts.so and htslib.map. This rule does not attempt to solve that, so the shared library will currently have to be rebuilt manually. --- Makefile | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/Makefile b/Makefile index 13624c8f3..16142dd7d 100644 --- a/Makefile +++ b/Makefile @@ -813,6 +813,31 @@ shlib-exports-dylib.txt: libhts.dylib shlib-exports-dll.txt: hts.dll.a nm -g hts.dll.a | awk '$$2 == "T" { print $$3 }' | sort -u -o $@ +$(srcprefix)htslib.map: libhts.so + LC_ALL=C ; export LC_ALL; \ + curr_vers=`expr 'X$(PACKAGE_VERSION)' : 'X\([0-9]*\.[0-9.]*\)'` ; \ + last_vers=`awk '/^HTSLIB_[0-9](\.[0-9]+)+/ { lv = $$1 } END { print lv }' htslib.map` ; \ + if test "x$$curr_vers" = 'x' || test "x$$last_vers" = 'x' ; then \ + echo "Version check failed : $$curr_vers / $$las_vers" 1>&2 ; \ + exit 1 ; \ + fi && \ + if test "HTSLIB_$$curr_vers" = "$$last_vers" ; then \ + echo "Refusing to update $@ - HTSlib version not changed" 1>&2 ; \ + exit 1 ; \ + fi && \ + nm --with-symbol-versions -D libhts.so | awk '$$2 ~ /^[DGRT]$$/ && $$3 ~ /@@Base$$/ && $$3 !~ /^(_init|_fini|_edata)@@/ { sub(/@@Base$$/, ";", $$3); print " " $$3 }' > $@.tmp && \ + if [ -s $@.tmp ] ; then \ + cat $@ > $@.new.tmp && \ + printf '\n%s {\n' "HTSLIB_$$curr_vers" >> $@.new.tmp && \ + cat $@.tmp >> $@.new.tmp && \ + printf '} %s;\n' "$$last_vers" >> $@.new.tmp && \ + rm -f $@.tmp && \ + mv $@.new.tmp $@ ; \ + fi ; \ + else \ + rm -f $@.tmp ; \ + fi + install: libhts.a $(BUILT_PROGRAMS) $(BUILT_PLUGINS) installdirs install-$(SHLIB_FLAVOUR) install-pkgconfig $(INSTALL_PROGRAM) $(BUILT_PROGRAMS) $(DESTDIR)$(bindir) if test -n 
"$(BUILT_PLUGINS)"; then $(INSTALL_PROGRAM) $(BUILT_PLUGINS) $(DESTDIR)$(plugindir); fi From f68c84ff46a63fde05ae2bdf3ebdec568c9c4613 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 3 Feb 2023 17:04:16 +0000 Subject: [PATCH 382/488] Strip out symbol versions from shlib-exports-so.txt --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 16142dd7d..995262e71 100644 --- a/Makefile +++ b/Makefile @@ -805,7 +805,7 @@ header-exports.txt: test/header_syms.pl htslib/*.h test/header_syms.pl htslib/*.h | sort -u -o $@ shlib-exports-so.txt: libhts.so - nm -D -g libhts.so | awk '$$2 == "T" { print $$3 }' | sort -u -o $@ + nm -D -g libhts.so | awk '$$2 == "T" { sub("@.*", "", $$3); print $$3 }' | sort -u -o $@ shlib-exports-dylib.txt: libhts.dylib nm -Ug libhts.dylib | awk '$$2 == "T" { sub("^_", "", $$3); print $$3 }' | sort -u -o $@ @@ -825,7 +825,7 @@ $(srcprefix)htslib.map: libhts.so echo "Refusing to update $@ - HTSlib version not changed" 1>&2 ; \ exit 1 ; \ fi && \ - nm --with-symbol-versions -D libhts.so | awk '$$2 ~ /^[DGRT]$$/ && $$3 ~ /@@Base$$/ && $$3 !~ /^(_init|_fini|_edata)@@/ { sub(/@@Base$$/, ";", $$3); print " " $$3 }' > $@.tmp && \ + nm --with-symbol-versions -D -g libhts.so | awk '$$2 ~ /^[DGRT]$$/ && $$3 ~ /@@Base$$/ && $$3 !~ /^(_init|_fini|_edata)@@/ { sub(/@@Base$$/, ";", $$3); print " " $$3 }' > $@.tmp && \ if [ -s $@.tmp ] ; then \ cat $@ > $@.new.tmp && \ printf '\n%s {\n' "HTSLIB_$$curr_vers" >> $@.new.tmp && \ From f2e845c18531cf9582f4e71a3f146dce075ab4a9 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 10 Feb 2023 15:20:36 +0000 Subject: [PATCH 383/488] Fix Cram compression container substitution matrix generation. The matrix is meant to turn ref + code into seq. Eg ref C and BS code 1 may mean seq is T. Instead of writing the codes for the non-ref bases in order ACGTN, we wrote the Nth base number in numerical order of the codes. 
For ref C + BS code we have 4 alternatives A,G,T and N (C->C is absent as it's not a substitution). So e.g. we may have C: 0=G 1=T 2=A 3=N. We were writing GTAN as 01 10 00 11, from A(c)GTN. We should have been writing the code numbers in A(c)GTN order hence 10 00 01 11. However, we don't actually change or optimise this in htslib, so it's hard coded in cram_structs.h. #define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT" Reformatting it's: A: CGTN C:A GTN G:AC TN T:ACG N N:ACGT That basically boils down to 0123 (00 01 10 11 or 0x3b) for all rows. The incorrect order of writing the table made no difference as every row is sorted by both code 0,1,2,3 and nucleotide A,C,G,T,N. --- cram/cram_encode.c | 49 ++++++++++++++++++++++++++-------------------- 1 file changed, 28 insertions(+), 21 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 63ee8a111..5b56aedd5 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -70,7 +70,7 @@ static int process_one_read(cram_fd *fd, cram_container *c, static int sub_idx(char *key, char val) { int i; - for (i = 0; *key && *key++ != val; i++); + for (i = 0; i < 4 && *key++ != val; i++); return i; } @@ -205,31 +205,38 @@ cram_block *cram_encode_compression_header(cram_fd *fd, cram_container *c, case CRAM_KEY('S','M'): { char smat[5], *mp = smat; + // Output format is for order ACGTN (minus ref base) + // to store the code value 0-3 for each symbol. + // + // Note this is different to storing the symbols in order + // that the codes occur from 0-3, which is what we used to + // do. (It didn't matter as we always had a fixed table in + // the order.) 
*mp++ = - (sub_idx("CGTN", h->substitution_matrix[0][0]) << 6) | - (sub_idx("CGTN", h->substitution_matrix[0][1]) << 4) | - (sub_idx("CGTN", h->substitution_matrix[0][2]) << 2) | - (sub_idx("CGTN", h->substitution_matrix[0][3]) << 0); + (sub_idx(h->substitution_matrix[0], 'C') << 6) | + (sub_idx(h->substitution_matrix[0], 'G') << 4) | + (sub_idx(h->substitution_matrix[0], 'T') << 2) | + (sub_idx(h->substitution_matrix[0], 'N') << 0); *mp++ = - (sub_idx("AGTN", h->substitution_matrix[1][0]) << 6) | - (sub_idx("AGTN", h->substitution_matrix[1][1]) << 4) | - (sub_idx("AGTN", h->substitution_matrix[1][2]) << 2) | - (sub_idx("AGTN", h->substitution_matrix[1][3]) << 0); + (sub_idx(h->substitution_matrix[1], 'A') << 6) | + (sub_idx(h->substitution_matrix[1], 'G') << 4) | + (sub_idx(h->substitution_matrix[1], 'T') << 2) | + (sub_idx(h->substitution_matrix[1], 'N') << 0); *mp++ = - (sub_idx("ACTN", h->substitution_matrix[2][0]) << 6) | - (sub_idx("ACTN", h->substitution_matrix[2][1]) << 4) | - (sub_idx("ACTN", h->substitution_matrix[2][2]) << 2) | - (sub_idx("ACTN", h->substitution_matrix[2][3]) << 0); + (sub_idx(h->substitution_matrix[2], 'A') << 6) | + (sub_idx(h->substitution_matrix[2], 'C') << 4) | + (sub_idx(h->substitution_matrix[2], 'T') << 2) | + (sub_idx(h->substitution_matrix[2], 'N') << 0); *mp++ = - (sub_idx("ACGN", h->substitution_matrix[3][0]) << 6) | - (sub_idx("ACGN", h->substitution_matrix[3][1]) << 4) | - (sub_idx("ACGN", h->substitution_matrix[3][2]) << 2) | - (sub_idx("ACGN", h->substitution_matrix[3][3]) << 0); + (sub_idx(h->substitution_matrix[3], 'A') << 6) | + (sub_idx(h->substitution_matrix[3], 'C') << 4) | + (sub_idx(h->substitution_matrix[3], 'G') << 2) | + (sub_idx(h->substitution_matrix[3], 'N') << 0); *mp++ = - (sub_idx("ACGT", h->substitution_matrix[4][0]) << 6) | - (sub_idx("ACGT", h->substitution_matrix[4][1]) << 4) | - (sub_idx("ACGT", h->substitution_matrix[4][2]) << 2) | - (sub_idx("ACGT", h->substitution_matrix[4][3]) << 0); + 
(sub_idx(h->substitution_matrix[4], 'A') << 6) | + (sub_idx(h->substitution_matrix[4], 'C') << 4) | + (sub_idx(h->substitution_matrix[4], 'G') << 2) | + (sub_idx(h->substitution_matrix[4], 'T') << 0); BLOCK_APPEND(map, smat, 5); break; } From d609508c3833170ba27760969ff99fcbe04fa771 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 13 Feb 2023 17:07:39 +0000 Subject: [PATCH 384/488] Tweak the CRAM_SUBST_MATRIX table. The old table equates to: 0 1 2 3 A : C G T N C : A G T N G : A C T N T : A C G N N : A C G T The new one is: 0 1 2 3 A : T C G N C : A G T N G : T C A N T : A G C N N : A C G T This affects the generation of BS codes for Ref/Seq combinations. The idea is we want common substitutions to be sharing the same code value so compression improves. Mostly this is a (tiny) win for compression, across a multitude of technologies and organisms. There are a few exceptions (one of the Streptococcus samples grew, and AVITI had a marginal growth), but generally it's an irrelevance on the platforms that don't have aggressive quality quantisation as the files become dominated elsewhere. Even with this on Illumina, it's generally of the order of 0.1% of total file size. However it's completely free and has no real CPU impact either. --- cram/cram_structs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 724f6cb78..1ee4b9e85 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -88,7 +88,7 @@ struct hFILE; #define BASES_PER_SLICE (SEQS_PER_SLICE*500) #define SLICE_PER_CNT 1 -#define CRAM_SUBST_MATRIX "CGTNAGTNACTNACGNACGT" +#define CRAM_SUBST_MATRIX "CGTNGTANCATNGCANACGT" #define MAX_STAT_VAL 1024 //#define MAX_STAT_VAL 16 From cb8f462b570706064cc3134e52990a08a12e5cdf Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 15 Feb 2023 16:16:49 +0000 Subject: [PATCH 385/488] Update to htscodecs v1.4.0 * Fix newer clang warning of unused variable. * Improvements for intel -m32 builds. 
* Check pthread_once() return value and print diagnostic * Add error checking to hist8() and hist1_4() * Print FAIL for xyz_compress() failures * Detect Neon capability at runtime via operating system APIs * Improve checks for pthread calls on FreeBSD * Update hts_pack to operate in line with the spec. * Switch MacOS CI tests to an ARM-based image * Remove overly aggressive buffer overflow checking in rans O1 decoder * Add arith+bzip2 (EXT flag) test file * Remove defunct libz from autoconf and make lack of bz2 fail * Add arith+bzip2 support to name tokeniser --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 3ef17f6fb..cd0737fff 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 3ef17f6fb5b8b6b0ad2d4c1c562165664f0703f8 +Subproject commit cd0737fff5893b0842b047da5aa3209e5f65442c From b898b8ae4e2b0e41d921897d62a1391d5d1f7199 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Tue, 24 Jan 2023 17:11:07 +0000 Subject: [PATCH 386/488] January 2023 NEWS update. --- NEWS | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/NEWS b/NEWS index 5e04d71b0..90563526f 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,114 @@ Noteworthy changes in release a.b ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +* A new API for iterating through a BAM record's aux field. + (PR#1354, addresses #1319. Thanks to John Marshall) + +* Text mode for bgzip. Allows bgzip to compress lines of text with block breaks + at newlines. + (PR#1493, thanks to Mike Lin for the initial version PR#1369) + +* Make tabix support CSI indices with large positions. Unlike SAM and VCF + files, BED files do not set a maximum reference length which hindered CSI + support. This change sets an arbitrary large size of 100G to enable it to + work. + (PR#1506) + +* Add a fai_line_length function. Exposes the internal line-wrap length. + (PR#1516) + +* Check for invalid barcode tags in fastq output. 
+ (PR#1518, fixes samtools#1728. Reported by Poshi) + +* Warn if reference found in a CRAM file is not contained in the specified + reference file. + (PR#1517 and PR#1521, adds diagnostics for #1515. Reported by Wei WeiDeng) + +* Add a faidx_seq_len64 function that can return sequence lengths longer than + INT_MAX. At the same time limit faidx_seq_len to INT_MAX output. Also add a + fai_adjust_region to ensure given ranges do not go beyond the end of the + requested sequence. + (PR#1519) + +* Add a bcf_strerror function to give text descriptions of BCF errors. + (PR#1510) + +* Add CRAM SQ/M5 header checking when specifying a fasta file. This is to + prevent creating a CRAM that cannot be decoded again. + (PR#1522. In response to samtools#1748 though not a direct fix) + +* Improve support for very long input lines (> 2Gbyte). This is mostly useful + for tabix which does not do much interpretation of its input. + (PR#1542, a partial fix for #1539) + +* Speed up load_ref_portion. This function has been sped up by about 7x, which + speeds up low-depth CRAM decoding by about 10%. + (PR#1551) + +* Expand CRAM API to cope with new samtools cram_size command. + (PR#1546) + +* Merges neighbouring I and D ops into one op within pileup. This means + 4M1D1D1D3M is reported as 4M3D3M. Fixing this in sam.c means not only is + samtools mpileup now looking better, but any tool using the mpileup API will + be getting consistent results. + (PR#1552, fixes the last remaining part of samtools#139) + + +Build changes +------------- + +* Use POSIX grep in testing as egrep and fgrep are considered obsolete. + (PR#1509, thanks to David Seifert) + +* Switch to building libdeflate with cmake for Cirrus CI. + (PR#1511) + +* Ensure strings in config_vars.h are escaped correctly. + (PR#1530, fixes #1527. Reported by Lucas Czech) + +* Easier modification of shared library permissions during install. + (PR#1532, fixes #1525. Reported by StephDC) + +* Fix build on ancient compilers. 
Added -std=gnu90 to build tests so older + C compilers will still be happy. + (PR#1524, fixes #1523. Reported by Martin Jakt) + +* Switch MacOS CI tests to an ARM-based image. + (PR#1536) + +* Cut down the number of embed_ref=2 tests that get run. + (PR#1537) + +Bug fixes +--------- + +* VCF. Fix n-squared complexity in sample line with many adjacent tabs [fuzz]. + (PR#1503) + +* Fix error code returned by bcftools. Error code was returning 0 (success) + even on a fail. Replaced by -1. + (PR#1504, thanks to Lilian Janin) + +* Prevent crash when the only FASTA entry has no sequence [fuzz]. + (PR#1507) + +* Fixed typo in sam.h documentation. + (PR#1512, thanks to kojix2) + +* Fix buffer read-overrun in bam_plp_insertion mod. + (PR#1520) + +* Fix hash keys being left behind by bcf_hdr_remove. + (PR#1535, fixes #1533. Reported by Giulio Genovese in #842) + +* Added return on an error to bgzf_read_block. + (PR#1529, fixes #1528. Thanks to Bergur Ragnarsson) + +* Make bcf_hdr_idinfo_exists more robust by checking id value exists. + (PR#1544, fixes #1538. Reported by Giulio Genovese) + + Noteworthy changes in release 1.16 (18th August 2022) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 8737f5c217d1cc873b868dab79efb61b96187b64 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Mon, 6 Feb 2023 14:57:35 +0000 Subject: [PATCH 387/488] Keeping the NEWS file up-to-date. --- NEWS | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/NEWS b/NEWS index 90563526f..7af21f553 100644 --- a/NEWS +++ b/NEWS @@ -53,6 +53,10 @@ Noteworthy changes in release a.b samtools mpileup now looking better, but any tool using the mpileup API will be getting consistent results. (PR#1552, fixes the last remaining part of samtools#139) + +* Update the API documentation for bgzf_mt as it refered to a previous + iteration. + (PR#1556, fixes #1553. 
Reported by Raghavendra Padmanabhan) Build changes @@ -108,6 +112,18 @@ Bug fixes * Make bcf_hdr_idinfo_exists more robust by checking id value exists. (PR#1544, fixes #1538. Reported by Giulio Genovese) +* Catch errors from bgzf_getline in hts_readlist and hts_readlines. There were + a couple of places where the returned error was ignored. + (PR#1554) + +* Fix a bug in the codec learning algorithm for TOKA. The name tokeniser has a + rANS vs Arithmetic coder choice as a parameter (in the "strat" variable). We + lacked this distinction when learning which method works best, so in the + choice of toka (tok3+arith) vs bzip2 vs gzip etc we selected tok3 and switched + back to strat 0, disabling the arithmetic coder.his only affects archive mode, + or where requested eg "samtools view -O cram,version=3.1,use_arith". + (PR#1559) + Noteworthy changes in release 1.16 (18th August 2022) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From f3960a536ee67d81c5dfb800661253dbcff345cd Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Tue, 14 Feb 2023 17:10:47 +0000 Subject: [PATCH 388/488] More additions and improvements. --- NEWS | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) diff --git a/NEWS b/NEWS index 7af21f553..dc9a24182 100644 --- a/NEWS +++ b/NEWS @@ -53,7 +53,7 @@ Noteworthy changes in release a.b samtools mpileup now looking better, but any tool using the mpileup API will be getting consistent results. (PR#1552, fixes the last remaining part of samtools#139) - + * Update the API documentation for bgzf_mt as it refered to a previous iteration. (PR#1556, fixes #1553. Reported by Raghavendra Padmanabhan) @@ -84,15 +84,18 @@ Build changes * Cut down the number of embed_ref=2 tests that get run. (PR#1537) +* Add symbol versions to libhts.so. This is to aid package developers. + (PR#1560 addresses #1505, thanks to John Marshall. Reported by Stefan Bruens) + Bug fixes --------- * VCF. 
Fix n-squared complexity in sample line with many adjacent tabs [fuzz]. (PR#1503) -* Fix error code returned by bcftools. Error code was returning 0 (success) - even on a fail. Replaced by -1. - (PR#1504, thanks to Lilian Janin) +* Improved bcftools detection and reporting of bgzf decode errors. + (PR#1504, thanks to Lilian Janin. PR#1529 thanks to Bergur Ragnarsson, fixes + #1528. PR#1554) * Prevent crash when the only FASTA entry has no sequence [fuzz]. (PR#1507) @@ -106,24 +109,13 @@ Bug fixes * Fix hash keys being left behind by bcf_hdr_remove. (PR#1535, fixes #1533. Reported by Giulio Genovese in #842) -* Added return on an error to bgzf_read_block. - (PR#1529, fixes #1528. Thanks to Bergur Ragnarsson) - * Make bcf_hdr_idinfo_exists more robust by checking id value exists. (PR#1544, fixes #1538. Reported by Giulio Genovese) -* Catch errors from bgzf_getline in hts_readlist and hts_readlines. There were - a couple of places where the returned error was ignored. - (PR#1554) - -* Fix a bug in the codec learning algorithm for TOKA. The name tokeniser has a - rANS vs Arithmetic coder choice as a parameter (in the "strat" variable). We - lacked this distinction when learning which method works best, so in the - choice of toka (tok3+arith) vs bzip2 vs gzip etc we selected tok3 and switched - back to strat 0, disabling the arithmetic coder.his only affects archive mode, - or where requested eg "samtools view -O cram,version=3.1,use_arith". - (PR#1559) - +* CRAM improvements. Fixed crash with multi-threaded CRAM.Fixed a bug in the + codec parameter learning for CRAM 3.1 name tokeniser. 
Fixed Cram compression + container substitution matrix generation, + (PR#1558, PR#1559 and PR#1562) Noteworthy changes in release 1.16 (18th August 2022) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 0144d51c324b46886ef7412a0c80b0b8f53120f6 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Thu, 16 Feb 2023 11:09:54 +0000 Subject: [PATCH 389/488] Added htscodecs update to v1.4.0 --- NEWS | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NEWS b/NEWS index dc9a24182..8388e9b40 100644 --- a/NEWS +++ b/NEWS @@ -87,6 +87,9 @@ Build changes * Add symbol versions to libhts.so. This is to aid package developers. (PR#1560 addresses #1505, thanks to John Marshall. Reported by Stefan Bruens) +* htscodecs now updated to v1.4.0. + (PR#1563) + Bug fixes --------- From 6652c86ab34aed5aaf3e1211ba75cea982ad269c Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Thu, 16 Feb 2023 11:31:42 +0000 Subject: [PATCH 390/488] More space. --- NEWS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS b/NEWS index 8388e9b40..405e0fd88 100644 --- a/NEWS +++ b/NEWS @@ -115,7 +115,7 @@ Bug fixes * Make bcf_hdr_idinfo_exists more robust by checking id value exists. (PR#1544, fixes #1538. Reported by Giulio Genovese) -* CRAM improvements. Fixed crash with multi-threaded CRAM.Fixed a bug in the +* CRAM improvements. Fixed crash with multi-threaded CRAM. Fixed a bug in the codec parameter learning for CRAM 3.1 name tokeniser. Fixed Cram compression container substitution matrix generation, (PR#1558, PR#1559 and PR#1562) From 0d0eca1a99dc2ddb9b798ad3b0794083e2741dd6 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 17 Feb 2023 10:14:42 +0000 Subject: [PATCH 391/488] Prevent spurious and random system errors from test_bgzf.c The test script is expected to fail as it's testing both working and failing conditions, but the test harness prints up strerror(errno), which is usually some unrelated event and the text also differs per platform. 
Also, as per 1.16, changed it so the things that are expected to fail no longer complain. Given some of the new diagnostics added to bgzf.c, this requires manipulation of the global htslib log level. --- test/test_bgzf.c | 56 ++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 21 deletions(-) diff --git a/test/test_bgzf.c b/test/test_bgzf.c index 5b2192986..a5084e6c6 100644 --- a/test/test_bgzf.c +++ b/test/test_bgzf.c @@ -37,6 +37,7 @@ DEALINGS IN THE SOFTWARE. #include "../htslib/bgzf.h" #include "../htslib/hfile.h" +#include "../htslib/hts_log.h" #include "../hfile_internal.h" const char *bgzf_suffix = ".gz"; @@ -159,13 +160,19 @@ static BGZF * try_bgzf_hopen(const char *name, const char *mode, return bgz; } -static int try_bgzf_close(BGZF **bgz, const char *name, const char *func) { +static int try_bgzf_close(BGZF **bgz, const char *name, const char *func, int expected_fail) { BGZF *to_close = *bgz; *bgz = NULL; if (bgzf_close(to_close) != 0) { - fprintf(stderr, "%s : bgzf_close failed on %s : %s\n", - func, name, strerror(errno)); + if (!expected_fail) + fprintf(stderr, "%s : bgzf_close failed on %s%s%s\n", + func, name, + errno ? " : " : "", + errno ? 
strerror(errno) : ""); return -1; + } else if (expected_fail) { + fprintf(stderr, "%s : bgzf_close worked on %s, but expected failure\n", + func, name); } return 0; } @@ -398,6 +405,7 @@ static int test_read(Files *f) { ssize_t bg_got, f_got; unsigned char bg_buf[BUFSZ], f_buf[BUFSZ]; + errno = 0; bgz = try_bgzf_open(f->src_bgzf, "r", __func__); if (!bgz) return -1; @@ -414,7 +422,7 @@ static int test_read(Files *f) { } } while (bg_got > 0 && f_got > 0); - if (try_bgzf_close(&bgz, f->src_bgzf, __func__) != 0) return -1; + if (try_bgzf_close(&bgz, f->src_bgzf, __func__, 0) != 0) return -1; if (try_fseek_start(f->f_plain, f->src_plain, __func__) != 0) return -1; return 0; @@ -449,7 +457,7 @@ static int test_write_read(Files *f, const char *mode, Open_method method, bg_put = try_bgzf_write(bgz, f->text, f->ltext, f->tmp_bgzf, __func__); if (bg_put < 0) goto fail; - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; switch (method) { case USE_BGZF_DOPEN: @@ -491,7 +499,7 @@ static int test_write_read(Files *f, const char *mode, Open_method method, goto fail; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; return 0; @@ -521,7 +529,7 @@ static int test_embed_eof(Files *f, const char *mode, int nthreads) { bg_put = try_bgzf_write(bgz, f->text, half, f->tmp_bgzf, __func__); if (bg_put < 0) goto fail; - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; // Write second half. 
Append mode, so an EOF block should be in the @@ -535,7 +543,7 @@ static int test_embed_eof(Files *f, const char *mode, int nthreads) { __func__); if (bg_put < 0) goto fail; - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; // Try reading pos = 0; @@ -564,7 +572,7 @@ static int test_embed_eof(Files *f, const char *mode, int nthreads) { goto fail; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; return 0; @@ -601,7 +609,7 @@ static int test_index_load_dump(Files *f) { } while (got_src > 0 && got_dest > 0); if (try_fclose(&fdest, f->tmp_idx, __func__) != 0) goto fail; - if (try_bgzf_close(&bgz, f->src_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->src_bgzf, __func__, 0) != 0) goto fail; return 0; @@ -624,7 +632,7 @@ static int test_check_EOF(char *name, int expected) { return -1; } - return try_bgzf_close(&bgz, name, __func__); + return try_bgzf_close(&bgz, name, __func__, 0); } static int test_index_useek_getc(Files *f, const char *mode, @@ -651,7 +659,7 @@ static int test_index_useek_getc(Files *f, const char *mode, } } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); if (!bgz) goto fail; @@ -710,7 +718,7 @@ static int test_index_useek_getc(Files *f, const char *mode, } } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; return 0; @@ -741,7 +749,7 @@ static int test_tell_seek_getc(Files *f, const char *mode, if (bg_put < 0) goto fail; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); if (!bgz) goto fail; @@ -811,7 +819,7 
@@ static int test_tell_seek_getc(Files *f, const char *mode, } } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; return 0; @@ -841,7 +849,7 @@ static int test_tell_read(Files *f, const char *mode) { if (bg_put < 0) goto fail; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); if (!bgz) goto fail; @@ -859,7 +867,7 @@ static int test_tell_read(Files *f, const char *mode) { } } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; free(bg_buf); return 0; @@ -885,7 +893,7 @@ static int test_bgzf_getline(Files *f, const char *mode, int nthreads) { bg_put = try_bgzf_write(bgz, f->text, f->ltext, f->tmp_bgzf, __func__); if (bg_put < 0) goto fail; - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; bgz = try_bgzf_open(f->tmp_bgzf, "r", __func__); if (!bgz) goto fail; @@ -917,7 +925,7 @@ static int test_bgzf_getline(Files *f, const char *mode, int nthreads) { pos += l + 1; } - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; free(ks_release(&str)); return 0; @@ -934,6 +942,10 @@ static int test_bgzf_getline_on_truncated_file(Files *f, const char *mode, int n kstring_t str = { 0, 0, NULL }; const char *text = (const char *) f->text; + // Turn off bgzf errors as they're expected. 
+ enum htsLogLevel lvl = hts_get_log_level(); + hts_set_log_level(HTS_LOG_OFF); + bgz = try_bgzf_open(f->tmp_bgzf, mode, __func__); if (!bgz) goto fail; @@ -951,7 +963,7 @@ static int test_bgzf_getline_on_truncated_file(Files *f, const char *mode, int n if (bgzf_flush(bgz) < 0) goto fail; int64_t block3_start = bgz->block_address; - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) != 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 0) != 0) goto fail; int64_t newsize; for(newsize = block3_start - 1; newsize > block2_start; newsize--) { @@ -1002,12 +1014,14 @@ static int test_bgzf_getline_on_truncated_file(Files *f, const char *mode, int n } } // closing a stream with error returns error - if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__) == 0) goto fail; + if (try_bgzf_close(&bgz, f->tmp_bgzf, __func__, 1) == 0) goto fail; } free(ks_release(&str)); + hts_set_log_level(lvl); return 0; fail: + hts_set_log_level(lvl); if (bgz) bgzf_close(bgz); free(ks_release(&str)); return -1; From 0ad23b0abcaf50bd6c465157870e3b0877d654c2 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 20 Feb 2023 10:24:09 +0000 Subject: [PATCH 392/488] Minor NEWS adjustment and additional item --- NEWS | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/NEWS b/NEWS index 405e0fd88..adaba256a 100644 --- a/NEWS +++ b/NEWS @@ -90,6 +90,9 @@ Build changes * htscodecs now updated to v1.4.0. (PR#1563) +* Cleaned up misleading system error reports in test_bgzf. + (PR#1565) + Bug fixes --------- @@ -106,7 +109,7 @@ Bug fixes * Fixed typo in sam.h documentation. (PR#1512, thanks to kojix2) -* Fix buffer read-overrun in bam_plp_insertion mod. +* Fix buffer read-overrun in bam_plp_insertion_mod. (PR#1520) * Fix hash keys being left behind by bcf_hdr_remove. 
From c37268ca2cc2464ec6f739b8c42eb5ff630e2b58 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 28 Feb 2023 16:21:53 +0000 Subject: [PATCH 393/488] Switch to CURLINFO_CONTENT_LENGTH_DOWNLOAD_T for newer libcurl The older CURLINFO_CONTENT_LENGTH_DOWNLOAD has been deprecated, and is now causing warnings to be printed on distributions that have moved on to the latest libcurl versions. CURLINFO_CONTENT_LENGTH_DOWNLOAD_T was officially added in curl 7.55.0. --- hfile_libcurl.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/hfile_libcurl.c b/hfile_libcurl.c index 2de7ccbd9..1e4a4486f 100644 --- a/hfile_libcurl.c +++ b/hfile_libcurl.c @@ -1330,13 +1330,20 @@ libcurl_open(const char *url, const char *modes, http_headers *headers) } if (mode == 'r') { +#if LIBCURL_VERSION_NUM >= 0x073700 // 7.55.0 + curl_off_t offset; + + if (curl_easy_getinfo(fp->easy, CURLINFO_CONTENT_LENGTH_DOWNLOAD_T, + &offset) == CURLE_OK && offset > 0) + fp->file_size = (off_t) offset; +#else double dval; if (curl_easy_getinfo(fp->easy, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &dval) == CURLE_OK && dval >= 0.0) fp->file_size = (off_t) (dval + 0.1); +#endif } - fp->base.backend = &libcurl_backend; return &fp->base; From 05ef3b73ee1fa2efc2e61a71b3c7f91e75af08f8 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 1 Mar 2023 14:56:04 +0000 Subject: [PATCH 394/488] Check if VCF POS column could be fully parsed and throw an error if not. The position too large error report is also fixed so that it reports the incorrect value instead of an empty string. 
Resolves #1570 Co-authored-by: Rob Davies --- vcf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/vcf.c b/vcf.c index 59d433c19..d9b0826b1 100644 --- a/vcf.c +++ b/vcf.c @@ -3232,9 +3232,13 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) v->rid = kh_val(d, k).id; } else if (i == 1) { // POS overflow = 0; + char *tmp = p; v->pos = hts_str2uint(p, &p, 63, &overflow); if (overflow) { - hts_log_error("Position value '%s' is too large", p); + hts_log_error("Position value '%s' is too large", tmp); + goto err; + } else if ( *p ) { + hts_log_error("Could not parse the position '%s'", tmp); goto err; } else { v->pos -= 1; From fda1e0375d26723983a264e4f12bcecdec5aa4a6 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 28 Feb 2023 13:53:29 +0000 Subject: [PATCH 395/488] Fix cram_index_query_last function The cram_index_query_last used by sam_itr_regarray had problems when dealing with slices whose region is contained within another region. Fundamentally this is due to the cram_index arrays being contiguous in memory until a containment is found, at which point the pointers will be to an entirely different array. This breaks naive pointer comparisons. The cram_index struct already had a "next" field holding the file offset of the next container. This has been replaced by e_next pointing to the next cram_entry struct in file ordering, and e_next->offset is equivalent to the old "next". This allows consumption of the index either as the original nested containment list or as a traditional linked list. Also fixed cram_index_query with from != NULL, which similarly was incorrect before. We never used this function and it's not public, but we now use it within the rewrite of cram_index_query_last. 
Fixes #1569 --- cram/cram_index.c | 115 +++++++++++++++++++++++++++----------------- cram/cram_structs.h | 5 +- hts.c | 6 +-- 3 files changed, 76 insertions(+), 50 deletions(-) diff --git a/cram/cram_index.c b/cram/cram_index.c index 45d420df2..601852d87 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -72,7 +72,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. static void dump_index_(cram_index *e, int level) { int i, n; n = printf("%*s%d / %d .. %d, ", level*4, "", e->refid, e->start, e->end); - printf("%*soffset %"PRId64"\n", MAX(0,50-n), "", e->offset); + printf("%*soffset %"PRId64" %p %p\n", MAX(0,50-n), "", e->offset, e, e->e_next); for (i = 0; i < e->nslice; i++) { dump_index_(&e->e[i], level+1); } @@ -86,6 +86,37 @@ static void dump_index(cram_fd *fd) { } #endif +// Thread a linked list through the nested containment list. +// This makes navigating it and finding the "next" index entry +// trivial. +static cram_index *link_index_(cram_index *e, cram_index *e_last) { + int i; + if (e_last) + e_last->e_next = e; + + // We don't want to link in the top-level cram_index with + // offset=0 and start/end = INT_MIN/INT_MAX. + if (e->offset) + e_last = e; + + for (i = 0; i < e->nslice; i++) + e_last = link_index_(&e->e[i], e_last); + + return e_last; +} + +static void link_index(cram_fd *fd) { + int i; + cram_index *e_last = NULL; + + for (i = 0; i < fd->index_sz; i++) { + e_last = link_index_(&fd->index[i], e_last); + } + + if (e_last) + e_last->e_next = NULL; +} + static int kget_int32(kstring_t *k, size_t *pos, int32_t *val_p) { int sign = 1; int32_t val = 0; @@ -313,7 +344,10 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { free(kstr.s); free(tfn_idx); - // dump_index(fd); + // Convert NCList to linear linked list + link_index(fd); + + //dump_index(fd); return 0; @@ -356,7 +390,7 @@ void cram_index_free(cram_fd *fd) { * entries, but we require at least one per reference.) 
* * If the index finds multiple slices overlapping this position we - * return the first one only. Subsequent calls should specifying + * return the first one only. Subsequent calls should specify * "from" as the last slice we checked to find the next one. Otherwise * set "from" to be NULL to find the first one. * @@ -371,6 +405,17 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos, int i, j, k; cram_index *e; + if (from) { + // Continue from a previous search. + // We switch to just scanning the linked list, as the nested + // lists are typically short. + e = from->e_next; + if (e && e->refid == refid && e->start <= pos) + return e; + else + return NULL; + } + switch(refid) { case HTS_IDX_NONE: case HTS_IDX_REST: @@ -400,8 +445,7 @@ cram_index *cram_index_query(cram_fd *fd, int refid, hts_pos_t pos, return NULL; } - if (!from) - from = &fd->index[refid+1]; + from = &fd->index[refid+1]; // Ref with nothing aligned against it. if (!from->e) @@ -469,52 +513,33 @@ cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) { return &from->e[slice]; } +/* + * Find the last container overlapping pos 'end', and the file offset of + * its end (equivalent to the start offset of the container following it). + */ cram_index *cram_index_query_last(cram_fd *fd, int refid, hts_pos_t end) { - cram_index *first = cram_index_query(fd, refid, end, NULL); - cram_index *last = cram_index_last(fd, refid, NULL); - if (!first || !last) - return NULL; - - while (first < last && (first+1)->start <= end) - first++; + cram_index *e = NULL, *prev_e; + do { + prev_e = e; + e = cram_index_query(fd, refid, end, prev_e); + } while (e); - while (first->e) { - int count = 0; - int nslices = first->nslice; - first = first->e; - while (++count < nslices && (first+1)->start <= end) - first++; - } + if (!prev_e) + return NULL; + e = prev_e; - // Compute the start location of next container. 
- // - // This is useful for stitching containers together in the multi-region - // iterator. Sadly we can't compute this from the single index line. + // Note: offset of e and e->e_next may be the same if we're using a + // multi-ref container where a single container generates multiple + // index entries. // - // Note we can have neighbouring index entries at the same location - // for when we have multi-reference mode and/or multiple slices per - // container. - cram_index *next = first; + // We need to keep iterating until offset differs in order to find + // the genuine file offset for the end of container. do { - if (next >= last) { - // Next non-empty reference - while (++refid+1 < fd->index_sz) - if (fd->index[refid+1].nslice) - break; - if (refid+1 >= fd->index_sz) { - next = NULL; - } else { - next = fd->index[refid+1].e; - last = fd->index[refid+1].e + fd->index[refid+1].nslice; - } - } else { - next++; - } - } while (next && next->offset == first->offset); - - first->next = next ? next->offset : 0; + prev_e = e; + e = e->e_next; + } while (e && e->offset == prev_e->offset); - return first; + return prev_e; } /* diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 1ee4b9e85..0a66d51b9 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -725,7 +725,10 @@ typedef struct cram_index { int slice; // 1.0 landmark index, 1.1 landmark value int len; // 1.1 - size of slice in bytes int64_t offset; // 1.0 1.1 - int64_t next; // derived: offset of next container. + + // Linked list of cram_index entries. Used to convert recursive + // NCList back to a linear list. + struct cram_index *e_next; } cram_index; typedef struct { diff --git a/hts.c b/hts.c index cead9d537..c122ce23a 100644 --- a/hts.c +++ b/hts.c @@ -3408,14 +3408,12 @@ int hts_itr_multi_cram(const hts_idx_t *idx, hts_itr_t *iter) } if (e) { - off[n_off++].v = e->next - ? e->next + off[n_off++].v = e->e_next + ? 
e->e_next->offset : e->offset + e->slice + e->len; } else { hts_log_warning("Could not set offset end for region %d:%"PRIhts_pos"-%"PRIhts_pos". Skipping", tid, beg, end); } - } else { - hts_log_warning("No index entry for region %d:%"PRIhts_pos"-%"PRIhts_pos"", tid, beg, end); } } } else { From f24ad2cd62b7a083056266e4bff580891ea61fcd Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 6 Mar 2023 10:08:38 +0000 Subject: [PATCH 396/488] Avoid deeply nested containment list on old CRAM indices. Unmapped data on ancient .crai files contained an end offset that wasn't 0. This escaped the deep recursion avoidance for unmapped data when building the NC list. --- cram/cram_index.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cram/cram_index.c b/cram/cram_index.c index 601852d87..846dc709d 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -306,7 +306,8 @@ int cram_index_load(cram_fd *fd, const char *fn, const char *fn_idx) { idx_stack[(idx_stack_ptr = 0)] = idx; } - while (!(e.start >= idx->start && e.end <= idx->end) || idx->end == 0) { + while (!(e.start >= idx->start && e.end <= idx->end) || + (idx->start == 0 && idx->refid == -1)) { idx = idx_stack[--idx_stack_ptr]; } From be6633ae347043097902898cf529fff7144949af Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 1 Mar 2023 16:34:03 +0000 Subject: [PATCH 397/488] Fix crypt4gh redirection hopen() uses a ':' in `mode` to indicate that there are extra parameters. hts_crypt4gh_redirect() needs to add this so that the "parent" parameter is picked up by the crypt4gh plug-in. Failing to do this caused it to re-open the file instead of reusing the existing file handle - which worked for regular files but not for things like pipes or htsget. 
--- hts.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/hts.c b/hts.c index c122ce23a..86b5bb877 100644 --- a/hts.c +++ b/hts.c @@ -1349,6 +1349,7 @@ static int hts_crypt4gh_redirect(const char *fn, const char *mode, hFILE *hfile1 = *hfile_ptr; hFILE *hfile2 = NULL; char fn_buf[512], *fn2 = fn_buf; + char mode2[102]; // Size set by sizeof(simple_mode) in hts_hopen() const char *prefix = "crypt4gh:"; size_t fn2_len = strlen(prefix) + strlen(fn) + 1; int ret = -1; @@ -1362,7 +1363,8 @@ static int hts_crypt4gh_redirect(const char *fn, const char *mode, // Reopen fn using the crypt4gh plug-in (if available) snprintf(fn2, fn2_len, "%s%s", prefix, fn); - hfile2 = hopen(fn2, mode, "parent", hfile1, NULL); + snprintf(mode2, sizeof(mode2), "%s%s", mode, strchr(mode, ':') ? "" : ":"); + hfile2 = hopen(fn2, mode2, "parent", hfile1, NULL); if (hfile2) { // Replace original hfile with the new one. The original is now // enclosed within hfile2 From dcd20d9fabfac4398af6c49944dc1c99e0abb6e0 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 2 Mar 2023 10:14:30 +0000 Subject: [PATCH 398/488] Permit fastq output to create empty FASTQ records for seq "*". This is rather questionable, but htslib can now output empty SAM records as empty fastq records. Eg: name 4 * 0 0 * * 0 0 * * becomes @name + Htslib is happy to read this back in and produces the original SAM once more. Bwa mem and minimap2 can both read these fastq entries too, although the SAM output is bugged as they output an empty field instead of "*" for SEQ. Potential reasons for accepting this: - When dealing with paired data, we don't want to output a differing number of records from samtools fastq if read1 has seq and read2 has "*". Note as this is filtered at the htslib layer, it's not considered as a singleton so fastq -s won't rescue this. - At least some aligners apparently support this format. Although inevitably they just produce unmapped data. 
- Arguably this is a case of silly input => silly output! - Users can manually elect to "samtools view -e 'length(seq) > 0'" before using samtools fastq, which then fixes the not-a-singleton problem. - It converts samtools fastq output back to how it was in pre 1.13 era, where we rewrote it to use htslib's interfaces. Potential reason to reject: - It may yield output which trips up some poorly written tools. Fixes samtools/samtools#1799 --- sam.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/sam.c b/sam.c index e2e539b2d..3430a5895 100644 --- a/sam.c +++ b/sam.c @@ -4257,8 +4257,6 @@ int fastq_format1(fastq_state *x, const bam1_t *b, kstring_t *str) str->l = 0; - if (len == 0) return 0; - // Name if (kputc(x->nprefix, str) == EOF || kputs(bam_get_qname(b), str) == EOF) return -1; From 839a2e94647e5a4f7d59967235e42f157ff038d2 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 7 Mar 2023 15:35:15 +0000 Subject: [PATCH 399/488] Fix a couple small VCF auto-indexing bugs. 1. sam_idx_save wasn't validating the file is BGZF. It's invalid usage to try calling this function on uncompressed data, but we should double check. Note this is triggered by a bcftools bug where -o foo.vcf.gz##idx##foo.vcf.gz.csi writes VCF rather than VCF.gz as the "filename" doesn't end in .gz. 2. Add the hts_idx_amend_last calls to vcf_write as we did previously for SAM/BAM. This isn't technically a requirement, as all it's doing is changing virtual offsets to an alternate form that gives the same file offset (see comments above hts_idx_amend_last), but doing so means the auto-build indices match those produced by a standalone index command. This fix isn't complete as it hasn't been worked on for BCF yet. However it comes under the "nicety" category and isn't really fixing a bug so we can try to figure out how to tidy up BCF later (plus VCF.gz is basically the universal format). 
--- sam.c | 2 +- vcf.c | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/sam.c b/sam.c index 3430a5895..a93664d87 100644 --- a/sam.c +++ b/sam.c @@ -1067,7 +1067,7 @@ int sam_idx_save(htsFile *fp) { errno = -ret; return -1; } - if (bgzf_flush(fp->fp.bgzf) < 0) + if (!fp->is_bgzf || bgzf_flush(fp->fp.bgzf) < 0) return -1; hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf)); diff --git a/vcf.c b/vcf.c index d9b0826b1..392d9c932 100644 --- a/vcf.c +++ b/vcf.c @@ -3574,6 +3574,8 @@ int vcf_write(htsFile *fp, const bcf_hdr_t *h, bcf1_t *v) if ( fp->format.compression!=no_compression ) { if (bgzf_flush_try(fp->fp.bgzf, fp->line.l) < 0) return -1; + if (fp->idx) + hts_idx_amend_last(fp->idx, bgzf_tell(fp->fp.bgzf)); ret = bgzf_write(fp->fp.bgzf, fp->line.s, fp->line.l); } else { ret = hwrite(fp->fp.hfile, fp->line.s, fp->line.l); From 19cd41cb9e39b0bbfec759a530849136a8801adc Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 8 Mar 2023 09:30:17 +0000 Subject: [PATCH 400/488] Backport attractivechaos/klib#78 to htslib. Original PR by Pall Melsted, with only manual merging and one trivial bug fix by myself. Co-authored-by: Pall Melsted --- htslib/kseq.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/htslib/kseq.h b/htslib/kseq.h index 3e2404568..5d573d3d9 100644 --- a/htslib/kseq.h +++ b/htslib/kseq.h @@ -24,8 +24,6 @@ SOFTWARE. 
*/ -/* Last Modified: 05MAR2012 */ - #ifndef AC_KSEQ_H #define AC_KSEQ_H @@ -57,6 +55,7 @@ unsigned char *buf; \ } kstream_t; +#define ks_err(ks) ((ks)->end == -1) #define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) #define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) @@ -78,11 +77,13 @@ #define __KS_INLINED(__read) \ static inline klib_unused int ks_getc(kstream_t *ks) \ { \ + if (ks_err(ks)) return -3; \ if (ks->is_eof && ks->begin >= ks->end) return -1; \ if (ks->begin >= ks->end) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, ks->bufsize); \ if (ks->end == 0) { ks->is_eof = 1; return -1; } \ + if (ks->end == -1) { ks->is_eof = 1; return -3; } \ } \ ks->seek_pos++; \ return (int)ks->buf[ks->begin++]; \ @@ -99,11 +100,13 @@ uint64_t seek_pos = str->l; \ for (;;) { \ int i; \ + if (ks_err(ks)) return -3; \ if (ks->begin >= ks->end) { \ if (!ks->is_eof) { \ ks->begin = 0; \ ks->end = __read(ks->f, ks->buf, ks->bufsize); \ if (ks->end == 0) { ks->is_eof = 1; break; } \ + if (ks->end == -1) { ks->is_eof = 1; return -3; } \ } else break; \ } \ if (delimiter == KS_SEP_LINE) { \ @@ -180,25 +183,27 @@ >=0 length of the sequence (normal) -1 end-of-file -2 truncated quality string + -3 error reading stream + -4 overflow error */ #define __KSEQ_READ(SCOPE) \ SCOPE int kseq_read(kseq_t *seq) \ { \ - int c; \ + int c,r; \ kstream_t *ks = seq->f; \ if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \ + if (c < 0) return c; /* end of file or error */ \ seq->last_char = c; \ } /* else: the first header char has been read in the previous call */ \ seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ + if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal 
exit: EOF or error */ \ if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ seq->seq.m = 256; \ seq->seq.s = (char*)malloc(seq->seq.m); \ } \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ + while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \ if (c == '\n') continue; /* skip empty lines */ \ seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ @@ -207,7 +212,7 @@ if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ seq->seq.m = seq->seq.l + 2; \ kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ - if (seq->seq.l + 1 >= seq->seq.m) return -3; /* error: adjusting m overflowed */ \ + if (seq->seq.l + 1 >= seq->seq.m) return -4; /* error: adjusting m overflowed */ \ seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ } \ seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ @@ -216,9 +221,10 @@ seq->qual.m = seq->seq.m; \ seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ } \ - while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ + while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \ if (c == -1) return -2; /* error: no quality string */ \ - while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ + while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1)) >= 0 && seq->qual.l < seq->seq.l); \ + if (c == -3) return -3; /* stream error */ \ seq->last_char = 0; /* we have not come to the next header line */ \ if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ return seq->seq.l; \ From 46bcc366152ce20b899572433b55a085f9a54ad2 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 15 
Mar 2023 14:42:26 +0000 Subject: [PATCH 401/488] Slightly speed up various cram decoding functions (#1580) None of this is huge, but it all adds up. - bam_set1 has been refactored so -O3 is more likely to do unrolling and vectorisation. // Old time inst cyc // gcc -O2 12.36 78936832183 36853852204 // gcc -O3 12.37 78713347525 36867027825 // clang13 -O2 12.43 77451926728 37012866717 // clang13 -O3 12.32 77627221907 36691623424 // gcc12 -O2 12.43 78895089091 37081260172 // gcc12 -O3 12.36 78505904437 36829216967 // New // gcc -O2 12.47 78832021505 37200597109 + // gcc -O3 12.14 76499369401 36390334338 -- // clang13 -O2 12.38 76678460761 36920111561 ~ // clang13 -O3 12.26 76678023071 36548488492 ~ // gcc12 -O2 12.38 78581694397 36880034181 - // gcc12 -O3 12.15 76356625541 36293921439 -- - Improve the MD/NM generation in CRAM decoding. With decode_md=1 (default) by decode changed from 12.91s to 12.57s With decode_md=0 it's 11.92, so that's 1/3rd of the overhead removed. - Changed the block_resize to resize in slightly smaller chunks and to use integer maths. - Reduce excessive pointer redirection in cram_decode_seq. Unsure if this speeds things up much (sometimes it seems to), but it provides tidier code too. Comparisons with Dev(/D) and this commit (/4) on Revio (re/) and NovaSeq (nv/) with a variety of compilers and optimisations. 
Figures are cycle counts from perf stat Xeon E5-2660 Xeon Gold 6142 re/D gcc12-O2 85699982958 74752510144 re/4 gcc12-O2 82265084038 71947558666 -3.7/3.7 re/D gcc12-O3 85837077212 74392223354 re/4 gcc12-O3 82024293685 71861154116 -4.4/3.4 re/D clang12-3 85608876213 73934329619 re/4 clang12-3 84390364926 73961392095 -1.4/0 re/D clang12-2 86861787827 74255338533 re/4 clang12-2 83186843797 72421845542 -4.2/2.5; better than O3 nv/D gcc12-O2 36694089398 31444641828 nv/4 gcc12-O2 34949122875 30061074125 -4.8/-4.4 nv/D gcc12-O3 36528573980 30792932748 nv/4 gcc12-O3 35069572111 30066058127 -4.0/2.4 nv/D clang12-3 37906764004 32459168883 nv/4 clang12-3 36344679534 30786987972 -4.1/-5.2 nv/D clang12-2 38443827308 32304948037 nv/4 clang12-2 36361384580 31022553379 -5.4/-4.0 Benchmarks on 10 million NovaSeq records, showing billions of cycles as more robust than CPU time. EPYC 7543 before after gcc(7) -O2 28.6 28.3 -1.0 gcc12 -O2 28.2 28.3 +0.4 clang7 -O2 30.2 28.2 -6.6 clang13 -O2 29.9 28.2 -5.7 gcc(7) -O3 28.7 28.2 -1.7 gcc12 -O3 28.0 27.2 -2.9 clang7 -O3 30.1 28.3 -6.0 clang13 -O3 29.7 28.3 -4.7 Xeon Gold 6142 before after gcc(7) -O2 32.8 30.5 -7.0 gcc12 -O2 31.8 30.1 -5.3 clang7 -O2 33.1 29.9 -9.7 clang13 -O2 34.1 30.8 -9.7 gcc(7) -O3 32.7 30.2 -7.6 gcc12 -O3 31.6 29.1 -7.9 clang7 -O3 34.3 30.0 -12.5 clang13 -O3 33.3 30.9 -7.2 --- cram/cram_decode.c | 245 ++++++++++++++++++++++----------------------- cram/cram_io.h | 2 +- sam.c | 14 ++- 3 files changed, 134 insertions(+), 127 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 73f567106..39869cbdd 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1118,6 +1118,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, uint32_t ds = s->data_series; sam_hrecs_t *bfd = sh->hrecs; + cram_codec **codecs = c->comp_hdr->codecs; + if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { memset(qual, 255, cr->len); } @@ -1132,9 +1134,9 @@ static int cram_decode_seq(cram_fd 
*fd, cram_container *c, cram_slice *s, } if (ds & CRAM_FN) { - if (!c->comp_hdr->codecs[DS_FN]) return -1; - r |= c->comp_hdr->codecs[DS_FN]->decode(s,c->comp_hdr->codecs[DS_FN], - blk, (char *)&fn, &out_sz); + if (!codecs[DS_FN]) return -1; + r |= codecs[DS_FN]->decode(s,codecs[DS_FN], + blk, (char *)&fn, &out_sz); if (r) return r; } else { fn = 0; @@ -1146,6 +1148,13 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (!(ds & (CRAM_FC | CRAM_FP))) goto skip_cigar; + if (fn) { + if ((ds & CRAM_FC) && !codecs[DS_FC]) + return -1; + if ((ds & CRAM_FP) && !codecs[DS_FP]) + return -1; + } + for (f = 0; f < fn; f++) { int32_t pos = 0; char op; @@ -1158,22 +1167,20 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (ds & CRAM_FC) { - if (!c->comp_hdr->codecs[DS_FC]) return -1; - r |= c->comp_hdr->codecs[DS_FC]->decode(s, - c->comp_hdr->codecs[DS_FC], - blk, - &op, &out_sz); + r |= codecs[DS_FC]->decode(s, + codecs[DS_FC], + blk, + &op, &out_sz); if (r) return r; } if (!(ds & CRAM_FP)) continue; - if (!c->comp_hdr->codecs[DS_FP]) return -1; - r |= c->comp_hdr->codecs[DS_FP]->decode(s, - c->comp_hdr->codecs[DS_FP], - blk, - (char *)&pos, &out_sz); + r |= codecs[DS_FP]->decode(s, + codecs[DS_FP], + blk, + (char *)&pos, &out_sz); if (r) return r; pos += prev_pos; @@ -1214,26 +1221,33 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, // 'N' in both ref and seq is also mismatch for NM/MD if (ref_pos + pos-seq_pos > s->ref_end) goto beyond_slice; + + const char *refp = s->ref + ref_pos - s->ref_start + 1; + const int frag_len = pos - seq_pos; + int do_cpy = 1; if (decode_md || decode_nm) { - int i; - for (i = 0; i < pos - seq_pos; i++) { - // FIXME: not N, but nt16 lookup == 15? 
- char base = s->ref[ref_pos - s->ref_start + 1 + i]; - if (base == 'N') { - if (add_md_char(s, decode_md, - s->ref[ref_pos - s->ref_start + 1 + i], - &md_dist) < 0) - return -1; - nm++; - } else { - md_dist++; + char *N = memchr(refp, 'N', frag_len); + if (N) { + int i; + for (i = 0; i < frag_len; i++) { + char base = refp[i]; + if (base == 'N') { + if (add_md_char(s, decode_md, + 'N', &md_dist) < 0) + return -1; + nm++; + } else { + md_dist++; + } + seq[seq_pos-1+i] = base; } - seq[seq_pos-1+i] = base; + do_cpy = 0; + } else { + md_dist += frag_len; } - } else { - memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1], - pos - seq_pos); } + if (do_cpy) + memcpy(&seq[seq_pos-1], refp, frag_len); } } #ifdef USE_X @@ -1271,12 +1285,11 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, switch (CRAM_MAJOR_VERS(fd->version)) { case 1: if (ds & CRAM_IN) { - r |= c->comp_hdr->codecs[DS_IN] - ? c->comp_hdr->codecs[DS_IN] - ->decode(s, c->comp_hdr->codecs[DS_IN], - blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz2) + r |= codecs[DS_IN] + ? codecs[DS_IN]->decode(s, codecs[DS_IN], + blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz2) : (seq[pos-1] = 'N', out_sz2 = 1, 0); have_sc = 1; } @@ -1284,22 +1297,20 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 2: default: if (ds & CRAM_SC) { - r |= c->comp_hdr->codecs[DS_SC] - ? c->comp_hdr->codecs[DS_SC] - ->decode(s, c->comp_hdr->codecs[DS_SC], - blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz2) + r |= codecs[DS_SC] + ? codecs[DS_SC]->decode(s, codecs[DS_SC], + blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz2) : (seq[pos-1] = 'N', out_sz2 = 1, 0); have_sc = 1; } break; //default: - // r |= c->comp_hdr->codecs[DS_BB] - // ? c->comp_hdr->codecs[DS_BB] - // ->decode(s, c->comp_hdr->codecs[DS_BB], - // blk, &seq[pos-1], &out_sz2) + // r |= codecs[DS_BB] + // ? 
codecs[DS_BB]->decode(s, codecs[DS_BB], + // blk, &seq[pos-1], &out_sz2) // : (seq[pos-1] = 'N', out_sz2 = 1, 0); } if (have_sc) { @@ -1319,10 +1330,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_BS) { - if (!c->comp_hdr->codecs[DS_BS]) return -1; - r |= c->comp_hdr->codecs[DS_BS] - ->decode(s, c->comp_hdr->codecs[DS_BS], blk, - (char *)&base, &out_sz); + if (!codecs[DS_BS]) return -1; + r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk, + (char *)&base, &out_sz); if (pos-1 < cr->len) seq[pos-1] = 'N'; // FIXME look up BS=base value } @@ -1334,10 +1344,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_BS) { - if (!c->comp_hdr->codecs[DS_BS]) return -1; - r |= c->comp_hdr->codecs[DS_BS] - ->decode(s, c->comp_hdr->codecs[DS_BS], blk, - (char *)&base, &out_sz); + if (!codecs[DS_BS]) return -1; + r |= codecs[DS_BS]->decode(s, codecs[DS_BS], blk, + (char *)&base, &out_sz); if (r) return -1; if (cr->ref_id < 0 || ref_pos >= bfd->ref[cr->ref_id].len || !s->ref) { if (pos-1 < cr->len) @@ -1376,10 +1385,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_DL) { - if (!c->comp_hdr->codecs[DS_DL]) return -1; - r |= c->comp_hdr->codecs[DS_DL] - ->decode(s, c->comp_hdr->codecs[DS_DL], blk, - (char *)&i32, &out_sz); + if (!codecs[DS_DL]) return -1; + r |= codecs[DS_DL]->decode(s, codecs[DS_DL], blk, + (char *)&i32, &out_sz); if (r) return r; if (decode_md || decode_nm) { if (ref_pos + i32 > s->ref_end) @@ -1431,11 +1439,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (ds & CRAM_IN) { - if (!c->comp_hdr->codecs[DS_IN]) return -1; - r |= c->comp_hdr->codecs[DS_IN] - ->decode(s, c->comp_hdr->codecs[DS_IN], blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz2); + if (!codecs[DS_IN]) return -1; + r |= codecs[DS_IN]->decode(s, codecs[DS_IN], blk, + cr->len ? 
&seq[pos-1] : NULL, + &out_sz2); if (r) return r; cig_op = BAM_CINS; cig_len += out_sz2; @@ -1452,11 +1459,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_BA) { - if (!c->comp_hdr->codecs[DS_BA]) return -1; - r |= c->comp_hdr->codecs[DS_BA] - ->decode(s, c->comp_hdr->codecs[DS_BA], blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz); + if (!codecs[DS_BA]) return -1; + r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk, + cr->len ? &seq[pos-1] : NULL, + &out_sz); if (r) return r; } cig_op = BAM_CINS; @@ -1475,11 +1481,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (ds & CRAM_BB) { - if (!c->comp_hdr->codecs[DS_BB]) return -1; - r |= c->comp_hdr->codecs[DS_BB] - ->decode(s, c->comp_hdr->codecs[DS_BB], blk, - cr->len ? &seq[pos-1] : NULL, - &len); + if (!codecs[DS_BB]) return -1; + r |= codecs[DS_BB]->decode(s, codecs[DS_BB], blk, + cr->len ? &seq[pos-1] : NULL, + &len); if (r) return r; if (decode_md || decode_nm) { @@ -1526,13 +1531,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (ds & CRAM_QQ) { - if (!c->comp_hdr->codecs[DS_QQ]) return -1; + if (!codecs[DS_QQ]) return -1; if ((ds & CRAM_QS) && !(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ? - r |= c->comp_hdr->codecs[DS_QQ] - ->decode(s, c->comp_hdr->codecs[DS_QQ], blk, - (char *)&qual[pos-1], &len); + r |= codecs[DS_QQ]->decode(s, codecs[DS_QQ], blk, + (char *)&qual[pos-1], &len); if (r) return r; } @@ -1555,11 +1559,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } #endif if (ds & CRAM_BA) { - if (!c->comp_hdr->codecs[DS_BA]) return -1; - r |= c->comp_hdr->codecs[DS_BA] - ->decode(s, c->comp_hdr->codecs[DS_BA], blk, - cr->len ? &seq[pos-1] : NULL, - &out_sz); + if (!codecs[DS_BA]) return -1; + r |= codecs[DS_BA]->decode(s, codecs[DS_BA], blk, + cr->len ? 
&seq[pos-1] : NULL, + &out_sz); if (decode_md || decode_nm) { if (md_dist >= 0 && decode_md) @@ -1579,13 +1582,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } } if (ds & CRAM_QS) { - if (!c->comp_hdr->codecs[DS_QS]) return -1; + if (!codecs[DS_QS]) return -1; if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ASCII ?. Same as htsjdk - r |= c->comp_hdr->codecs[DS_QS] - ->decode(s, c->comp_hdr->codecs[DS_QS], blk, - (char *)&qual[pos-1], &out_sz); + r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, + (char *)&qual[pos-1], &out_sz); } #ifdef USE_X cig_op = BAM_CBASE_MISMATCH; @@ -1601,13 +1603,12 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'Q': { // Quality score; QS if (ds & CRAM_QS) { - if (!c->comp_hdr->codecs[DS_QS]) return -1; + if (!codecs[DS_QS]) return -1; if (!(cf & CRAM_FLAG_PRESERVE_QUAL_SCORES) && (unsigned char)*qual == 255) memset(qual, 30, cr->len); // ? 
- r |= c->comp_hdr->codecs[DS_QS] - ->decode(s, c->comp_hdr->codecs[DS_QS], blk, - (char *)&qual[pos-1], &out_sz); + r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, + (char *)&qual[pos-1], &out_sz); //printf(" %d: QS = %d (ret %d)\n", f, qc, r); } break; @@ -1619,10 +1620,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_HC) { - if (!c->comp_hdr->codecs[DS_HC]) return -1; - r |= c->comp_hdr->codecs[DS_HC] - ->decode(s, c->comp_hdr->codecs[DS_HC], blk, - (char *)&i32, &out_sz); + if (!codecs[DS_HC]) return -1; + r |= codecs[DS_HC]->decode(s, codecs[DS_HC], blk, + (char *)&i32, &out_sz); if (r) return r; cig_op = BAM_CHARD_CLIP; cig_len += i32; @@ -1636,10 +1636,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_PD) { - if (!c->comp_hdr->codecs[DS_PD]) return -1; - r |= c->comp_hdr->codecs[DS_PD] - ->decode(s, c->comp_hdr->codecs[DS_PD], blk, - (char *)&i32, &out_sz); + if (!codecs[DS_PD]) return -1; + r |= codecs[DS_PD]->decode(s, codecs[DS_PD], blk, + (char *)&i32, &out_sz); if (r) return r; cig_op = BAM_CPAD; cig_len += i32; @@ -1653,10 +1652,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, cig_len = 0; } if (ds & CRAM_RS) { - if (!c->comp_hdr->codecs[DS_RS]) return -1; - r |= c->comp_hdr->codecs[DS_RS] - ->decode(s, c->comp_hdr->codecs[DS_RS], blk, - (char *)&i32, &out_sz); + if (!codecs[DS_RS]) return -1; + r |= codecs[DS_RS]->decode(s, codecs[DS_RS], blk, + (char *)&i32, &out_sz); if (r) return r; cig_op = BAM_CREF_SKIP; cig_len += i32; @@ -1703,31 +1701,32 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if (cr->len - seq_pos + 1 > 0) { if (ref_pos + cr->len-seq_pos +1 > s->ref_end) goto beyond_slice; + int remainder = cr->len - (seq_pos-1); + int j = ref_pos - s->ref_start + 1; if (decode_md || decode_nm) { - int i, j = ref_pos - s->ref_start + 1; - // FIXME: Update this to match spec 
once we're also - // ready to update samtools calmd. (N vs any ambig) - if (memchr(&s->ref[j], 'N', cr->len - (seq_pos-1))) { - for (i = seq_pos-1, j -= i; i < cr->len; i++) { - char base = s->ref[j+i]; + int i; + char *N = memchr(&s->ref[j], 'N', remainder); + if (!N) { + // short cut the common case + md_dist += cr->len - (seq_pos-1); + } else { + char *refp = &s->ref[j-(seq_pos-1)]; + md_dist += N-&s->ref[j]; + int i_start = seq_pos-1 + (N - &s->ref[j]); + for (i = i_start; i < cr->len; i++) { + char base = refp[i]; if (base == 'N') { - if (add_md_char(s, decode_md, 'N', &md_dist) < 0) + if (add_md_char(s, decode_md, 'N', + &md_dist) < 0) return -1; nm++; } else { md_dist++; } - seq[i] = base; } - } else { - // faster than above code - memcpy(&seq[seq_pos-1], &s->ref[j], cr->len - (seq_pos-1)); - md_dist += cr->len - (seq_pos-1); } - } else { - memcpy(&seq[seq_pos-1], &s->ref[ref_pos - s->ref_start +1], - cr->len - (seq_pos-1)); } + memcpy(&seq[seq_pos-1], &s->ref[j], remainder); } ref_pos += cr->len - seq_pos + 1; } @@ -1782,10 +1781,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, //printf("2: %.*s %d .. 
%d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos); if (ds & CRAM_MQ) { - if (!c->comp_hdr->codecs[DS_MQ]) return -1; - r |= c->comp_hdr->codecs[DS_MQ] - ->decode(s, c->comp_hdr->codecs[DS_MQ], blk, - (char *)&cr->mqual, &out_sz); + if (!codecs[DS_MQ]) return -1; + r |= codecs[DS_MQ]->decode(s, codecs[DS_MQ], blk, + (char *)&cr->mqual, &out_sz); } else { cr->mqual = 40; } @@ -1793,10 +1791,9 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, if ((ds & CRAM_QS) && (cf & CRAM_FLAG_PRESERVE_QUAL_SCORES)) { int32_t out_sz2 = cr->len; - if (!c->comp_hdr->codecs[DS_QS]) return -1; - r |= c->comp_hdr->codecs[DS_QS] - ->decode(s, c->comp_hdr->codecs[DS_QS], blk, - qual, &out_sz2); + if (!codecs[DS_QS]) return -1; + r |= codecs[DS_QS]->decode(s, codecs[DS_QS], blk, + qual, &out_sz2); } s->cigar = cigar; diff --git a/cram/cram_io.h b/cram/cram_io.h index 8cc59be51..53ae30f59 100644 --- a/cram/cram_io.h +++ b/cram/cram_io.h @@ -229,7 +229,7 @@ static inline int block_resize(cram_block *b, size_t len) { size_t alloc = b->alloc; while (alloc <= len) - alloc = alloc ? alloc*1.5 : 1024; + alloc = alloc ? 
alloc + (alloc>>2) : 1024; return block_resize_exact(b, alloc); } diff --git a/sam.c b/sam.c index a93664d87..9f415dbc6 100644 --- a/sam.c +++ b/sam.c @@ -598,9 +598,19 @@ int bam_set1(bam1_t *bam, } cp += n_cigar * 4; - for (i = 0; i + 1 < l_seq; i += 2) { - *cp++ = (seq_nt16_table[(unsigned char)seq[i]] << 4) | seq_nt16_table[(unsigned char)seq[i + 1]]; +#define NN 16 + const uint8_t *useq = (uint8_t *)seq; + for (i = 0; i + NN < l_seq; i += NN) { + int j; + const uint8_t *u2 = useq+i; + for (j = 0; j < NN/2; j++) + cp[j] = (seq_nt16_table[u2[j*2]]<<4) | seq_nt16_table[u2[j*2+1]]; + cp += NN/2; } + for (; i + 1 < l_seq; i += 2) { + *cp++ = (seq_nt16_table[useq[i]] << 4) | seq_nt16_table[useq[i + 1]]; + } + for (; i < l_seq; i++) { *cp++ = seq_nt16_table[(unsigned char)seq[i]] << 4; } From c1634e743aab4822e05fbb7dc41fd6ab21ec6982 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 15 Mar 2023 17:30:41 +0000 Subject: [PATCH 402/488] Remove CRAM 3.1 warning. The code would now trigger on 3.2 as well as 4.x, although 3.2 doesn't exist. It's easier to check this way though as a future-proof. --- cram/cram_io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index d3c39e47a..4f5aab44c 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -5776,7 +5776,7 @@ int cram_set_voption(cram_fd *fd, enum hts_fmt_option opt, va_list args) { return -1; } - if (major > 3 || (major == 3 && minor > 0)) { + if (major > 3 || (major == 3 && minor > 1)) { hts_log_warning( "CRAM version %s is still a draft and subject to change.\n" "This is a technology demonstration that should not be " From ffd74ecbfdd527e30da4a72e3ecfe069805012a2 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 29 Mar 2023 17:35:10 +0100 Subject: [PATCH 403/488] Remove use of sprintf() from HTSlib source None of these instances were really a problem, but using it upsets some downstream packagers (notably R). 
The easiest way to keep them happy is to stop using it and (mostly) switch to snprintf() instead. Also remove some code from hfile_s3's escape_query() which could never be executed. --- cram/cram_index.c | 21 ++++++++++--------- cram/cram_io.c | 6 +++--- cram/open_trace_file.c | 2 +- hfile_s3.c | 40 +++++++++++++++++------------------- kstring.c | 2 +- plugin.c | 2 +- sam.c | 20 +++++++++--------- test/hfile.c | 2 +- test/sam.c | 2 +- test/test-regidx.c | 22 ++++++++++---------- test/test_mod.c | 46 +++++++++++++++++++++--------------------- test/test_view.c | 3 ++- textutils.c | 4 ++-- 13 files changed, 87 insertions(+), 85 deletions(-) diff --git a/cram/cram_index.c b/cram/cram_index.c index 846dc709d..0cc606f42 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -656,9 +656,10 @@ static int cram_index_build_multiref(cram_fd *fd, } if (ref != -2) { - sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", - ref, ref_start, ref_end - ref_start + 1, - (int64_t)cpos, landmark, sz); + snprintf(buf, sizeof(buf), + "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", + ref, ref_start, ref_end - ref_start + 1, + (int64_t)cpos, landmark, sz); if (bgzf_write(fp, buf, strlen(buf)) < 0) return -4; } @@ -669,9 +670,10 @@ static int cram_index_build_multiref(cram_fd *fd, } if (ref != -2) { - sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", - ref, ref_start, ref_end - ref_start + 1, - (int64_t)cpos, landmark, sz); + snprintf(buf, sizeof(buf), + "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", + ref, ref_start, ref_end - ref_start + 1, + (int64_t)cpos, landmark, sz); if (bgzf_write(fp, buf, strlen(buf)) < 0) return -4; } @@ -701,9 +703,10 @@ int cram_index_slice(cram_fd *fd, if (s->hdr->ref_seq_id == -2) { ret = cram_index_build_multiref(fd, c, s, fp, cpos, spos, sz); } else { - sprintf(buf, "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", - s->hdr->ref_seq_id, s->hdr->ref_seq_start, - s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz); + 
snprintf(buf, sizeof(buf), + "%d\t%"PRId64"\t%"PRId64"\t%"PRId64"\t%d\t%d\n", + s->hdr->ref_seq_id, s->hdr->ref_seq_start, + s->hdr->ref_seq_span, (int64_t)cpos, (int)spos, (int)sz); ret = (bgzf_write(fp, buf, strlen(buf)) >= 0)? 0 : -4; } diff --git a/cram/cram_io.c b/cram/cram_io.c index 4f5aab44c..ca226e29c 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -2531,7 +2531,7 @@ static refs_t *refs_load_fai(refs_t *r_orig, const char *fn, int is_err) { /* Only the reference file provided. Get the index file name from it */ if (!(r->fn = string_dup(r->pool, fn))) goto err; - sprintf(fai_fn, "%.*s.fai", PATH_MAX-5, fn); + snprintf(fai_fn, PATH_MAX, "%.*s.fai", PATH_MAX-5, fn); } } @@ -4816,7 +4816,7 @@ static void full_path(char *out, char *in) { strncpy(out, in, PATH_MAX-1); out[PATH_MAX-1] = 0; } else { - int len; + size_t len; // unable to get dir or out+in is too long if (!getcwd(out, PATH_MAX) || @@ -4826,7 +4826,7 @@ static void full_path(char *out, char *in) { return; } - sprintf(out+len, "/%.*s", PATH_MAX - 2 - len, in); + snprintf(out+len, PATH_MAX - len, "/%s", in); // FIXME: cope with `pwd`/../../../foo.fa ? 
} diff --git a/cram/open_trace_file.c b/cram/open_trace_file.c index 1518396d7..4d617b736 100644 --- a/cram/open_trace_file.c +++ b/cram/open_trace_file.c @@ -242,7 +242,7 @@ static char *expand_path(const char *file, char *dirname, int max_s_digits) { /* Special case for "./" or absolute filenames */ if (*file == '/' || (len==1 && *dirname == '.')) { - sprintf(path, "%s", file); + memcpy(path, file, lenf + 1); } else { /* Handle %[0-9]*s expansions, if required */ char *path_end = path; diff --git a/hfile_s3.c b/hfile_s3.c index ce83875c9..2ce7feb4b 100644 --- a/hfile_s3.c +++ b/hfile_s3.c @@ -451,12 +451,12 @@ static int auth_header_callback(void *ctx, char ***hdrs) { /* like a escape path but for query strings '=' and '&' are untouched */ static char *escape_query(const char *qs) { - size_t i, j = 0, length; + size_t i, j = 0, length, alloced; char *escaped; length = strlen(qs); - - if ((escaped = malloc(length * 3 + 1)) == NULL) { + alloced = length * 3 + 1; + if ((escaped = malloc(alloced)) == NULL) { return NULL; } @@ -467,29 +467,25 @@ static char *escape_query(const char *qs) { c == '_' || c == '-' || c == '~' || c == '.' || c == '/' || c == '=' || c == '&') { escaped[j++] = c; } else { - sprintf(escaped + j, "%%%02X", c); + snprintf(escaped + j, alloced - j, "%%%02X", c); j += 3; } } - if (i != length) { - // in the case of a '?' copy the rest of the qs across unchanged - strcpy(escaped + j, qs + i); - } else { - escaped[j] = '\0'; - } + escaped[j] = '\0'; return escaped; } static char *escape_path(const char *path) { - size_t i, j = 0, length; + size_t i, j = 0, length, alloced; char *escaped; length = strlen(path); + alloced = length * 3 + 1; - if ((escaped = malloc(length * 3 + 1)) == NULL) { + if ((escaped = malloc(alloced)) == NULL) { return NULL; } @@ -502,7 +498,7 @@ static char *escape_path(const char *path) { c == '_' || c == '-' || c == '~' || c == '.' 
|| c == '/') { escaped[j++] = c; } else { - sprintf(escaped + j, "%%%02X", c); + snprintf(escaped + j, alloced - j, "%%%02X", c); j += 3; } } @@ -842,14 +838,14 @@ AWS S3 sig version 4 writing code ****************************************************************/ -static void hash_string(char *in, size_t length, char *out) { +static void hash_string(char *in, size_t length, char *out, size_t out_len) { unsigned char hashed[SHA256_DIGEST_BUFSIZE]; int i, j; s3_sha256((const unsigned char *)in, length, hashed); for (i = 0, j = 0; i < SHA256_DIGEST_BUFSIZE; i++, j+= 2) { - sprintf(out + j, "%02x", hashed[i]); + snprintf(out + j, out_len - j, "%02x", hashed[i]); } } @@ -866,7 +862,7 @@ static void ksfree(kstring_t *s) { } -static int make_signature(s3_auth_data *ad, kstring_t *string_to_sign, char *signature_string) { +static int make_signature(s3_auth_data *ad, kstring_t *string_to_sign, char *signature_string, size_t sig_string_len) { unsigned char date_key[SHA256_DIGEST_BUFSIZE]; unsigned char date_region_key[SHA256_DIGEST_BUFSIZE]; unsigned char date_region_service_key[SHA256_DIGEST_BUFSIZE]; @@ -893,7 +889,7 @@ static int make_signature(s3_auth_data *ad, kstring_t *string_to_sign, char *sig s3_sign_sha256(signing_key, len, (const unsigned char *)string_to_sign->s, string_to_sign->l, signature, &len); for (i = 0, j = 0; i < len; i++, j+= 2) { - sprintf(signature_string + j, "%02x", signature[i]); + snprintf(signature_string + j, sig_string_len - j, "%02x", signature[i]); } ksfree(&secret_access_key); @@ -945,7 +941,7 @@ static int make_authorisation(s3_auth_data *ad, char *http_request, char *conten goto cleanup; } - hash_string(canonical_request.s, canonical_request.l, cr_hash); + hash_string(canonical_request.s, canonical_request.l, cr_hash, sizeof(cr_hash)); ksprintf(&scope, "%s/%s/s3/aws4_request", ad->date_short, ad->region.s); @@ -959,7 +955,7 @@ static int make_authorisation(s3_auth_data *ad, char *http_request, char *conten goto cleanup; } - if 
(make_signature(ad, &string_to_sign, signature_string)) { + if (make_signature(ad, &string_to_sign, signature_string, sizeof(signature_string))) { goto cleanup; } @@ -1094,10 +1090,10 @@ static int write_authorisation_callback(void *auth, char *request, kstring_t *co } if (content) { - hash_string(content->s, content->l, content_hash); + hash_string(content->s, content->l, content_hash, sizeof(content_hash)); } else { // empty hash - hash_string("", 0, content_hash); + hash_string("", 0, content_hash, sizeof(content_hash)); } ad->canonical_query_string.l = 0; @@ -1166,7 +1162,7 @@ static int v4_auth_header_callback(void *ctx, char ***hdrs) { return copy_auth_headers(ad, hdrs); } - hash_string("", 0, content_hash); // empty hash + hash_string("", 0, content_hash, sizeof(content_hash)); // empty hash ad->canonical_query_string.l = 0; diff --git a/kstring.c b/kstring.c index 9b2d60c1f..71facf975 100644 --- a/kstring.c +++ b/kstring.c @@ -57,7 +57,7 @@ int kputd(double d, kstring_t *s) { if (ks_resize(s, s->l + 50) < 0) return EOF; // We let stdio handle the exponent cases - int s2 = sprintf(s->s + s->l, "%g", d); + int s2 = snprintf(s->s + s->l, s->m - s->l, "%g", d); len += s2; s->l += s2; return len; diff --git a/plugin.c b/plugin.c index cec5beefd..670081f84 100644 --- a/plugin.c +++ b/plugin.c @@ -210,7 +210,7 @@ const char *hts_plugin_path(void) { } static char s_path[1024]; - sprintf(s_path, "%.1023s", ks.s ? ks.s : ""); + snprintf(s_path, sizeof(s_path), "%s", ks.s ? 
ks.s : ""); free(ks.s); return s_path; diff --git a/sam.c b/sam.c index 9f415dbc6..c8daa9683 100644 --- a/sam.c +++ b/sam.c @@ -5392,20 +5392,22 @@ int bam_plp_insertion_mod(const bam_pileup1_t *p, for (j = 0; j < nm; j++) { char qual[20]; if (mod[j].qual >= 0) - sprintf(qual, "%d", mod[j].qual); + snprintf(qual, sizeof(qual), "%d", mod[j].qual); else *qual=0; if (mod[j].modified_base < 0) // ChEBI - indel += sprintf(&ins->s[indel], "%c(%d)%s", - "+-"[mod[j].strand], - -mod[j].modified_base, - qual); + indel += snprintf(&ins->s[indel], ins->m - indel, + "%c(%d)%s", + "+-"[mod[j].strand], + -mod[j].modified_base, + qual); else - indel += sprintf(&ins->s[indel], "%c%c%s", - "+-"[mod[j].strand], - mod[j].modified_base, - qual); + indel += snprintf(&ins->s[indel], ins->m - indel, + "%c%c%s", + "+-"[mod[j].strand], + mod[j].modified_base, + qual); } ins->s[indel++] = ']'; ins->l += indel - o_indel; // grow by amount we used diff --git a/test/hfile.c b/test/hfile.c index f6ba0d7cf..8f06a971f 100644 --- a/test/hfile.c +++ b/test/hfile.c @@ -176,7 +176,7 @@ int main(void) original = slurp("vcf.c"); for (i = 1; i <= 6; i++) { char *text; - sprintf(buffer, "test/hfile%d.tmp", i); + snprintf(buffer, sizeof(buffer), "test/hfile%d.tmp", i); text = slurp(buffer); if (strcmp(original, text) != 0) { fprintf(stderr, "%s differs from vcf.c\n", buffer); diff --git a/test/sam.c b/test/sam.c index 28ca1bc5f..eb404bd65 100644 --- a/test/sam.c +++ b/test/sam.c @@ -1504,7 +1504,7 @@ static void faidx1(const char *filename) fin = fopen(filename, "rb"); if (fin == NULL) fail("can't open %s", filename); - sprintf(tmpfilename, "%s.tmp", filename); + snprintf(tmpfilename, sizeof(tmpfilename), "%s.tmp", filename); fout = fopen(tmpfilename, "wb"); if (fout == NULL) fail("can't create temporary %s", tmpfilename); while (fgets(line, sizeof line, fin)) { diff --git a/test/test-regidx.c b/test/test-regidx.c index 90e7244d1..4cad440c7 100644 --- a/test/test-regidx.c +++ b/test/test-regidx.c @@ 
-304,20 +304,20 @@ void test_explicit(char *tgt, char *qry, char *exp) regidx_destroy(idx); } -void create_line_bed(char *line, char *chr, int start, int end) +void create_line_bed(char *line, size_t size, char *chr, int start, int end) { - sprintf(line,"%s\t%d\t%d\n",chr,start-1,end); + snprintf(line,size,"%s\t%d\t%d\n",chr,start-1,end); } -void create_line_tab(char *line, char *chr, int start, int end) +void create_line_tab(char *line, size_t size, char *chr, int start, int end) { - sprintf(line,"%s\t%d\t%d\n",chr,start,end); + snprintf(line,size,"%s\t%d\t%d\n",chr,start,end); } -void create_line_reg(char *line, char *chr, int start, int end) +void create_line_reg(char *line, size_t size, char *chr, int start, int end) { - sprintf(line,"%s:%d-%d\n",chr,start,end); + snprintf(line,size,"%s:%d-%d\n",chr,start,end); } -typedef void (*set_line_f)(char *line, char *chr, int start, int end); +typedef void (*set_line_f)(char *line, size_t size, char *chr, int start, int end); void test(set_line_f set_line, regidx_parse_f parse) { @@ -329,17 +329,17 @@ void test(set_line_f set_line, regidx_parse_f parse) for (i=1; icore.l_qseq; i++) { - char line[8192], *lp = line; + char line[8192], *lp = line, *ep = line + sizeof(line); n = bam_mods_at_next_pos(b, m, mods, 5); - lp += sprintf(lp, "%d\t%c\t", - i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); + lp += snprintf(lp, ep - lp, "%d\t%c\t", + i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); for (j = 0; j < n && j < 5; j++) { if (extended) { int m_strand, m_implicit; @@ -134,18 +134,18 @@ int main(int argc, char **argv) { m_canonical != mods[j].canonical_base || m_strand != mods[j].strand) goto err; - lp += sprintf(lp, "%c%c%s%c%d ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - "?."[m_implicit], - mods[j].qual); + lp += snprintf(lp, ep - lp, "%c%c%s%c%d ", + mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + "?."[m_implicit], + mods[j].qual); } else { - lp += 
sprintf(lp, "%c%c%s%d ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - mods[j].qual); + lp += snprintf(lp, ep - lp, "%c%c%s%d ", + mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + mods[j].qual); } } *lp++ = '\n'; @@ -172,15 +172,15 @@ int main(int argc, char **argv) { int pos; while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) { - char line[8192]={0}, *lp = line; - lp += sprintf(lp, "%d\t%c\t", pos, - seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); + char line[8192]={0}, *lp = line, *ep = line + sizeof(line); + lp += snprintf(lp, ep - lp, "%d\t%c\t", pos, + seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); for (j = 0; j < n && j < 5; j++) { - lp += sprintf(lp, "%c%c%s%d ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - mods[j].qual); + lp += snprintf(lp, ep - lp, "%c%c%s%d ", + mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + mods[j].qual); } *lp++ = '\n'; *lp++ = 0; diff --git a/test/test_view.c b/test/test_view.c index f33c1cdf0..02d109297 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -362,7 +362,8 @@ int main(int argc, char *argv[]) } strcpy(modew, "w"); - if (opts.clevel >= 0 && opts.clevel <= 9) sprintf(modew + 1, "%d", opts.clevel); + if (opts.clevel >= 0 && opts.clevel <= 9) + snprintf(modew + 1, sizeof(modew) - 1, "%d", opts.clevel); if (opts.flag & WRITE_CRAM) strcat(modew, "c"); else if (opts.flag & WRITE_BINARY_COMP) strcat(modew, "b"); else if (opts.flag & WRITE_COMPRESSED) strcat(modew, "z"); diff --git a/textutils.c b/textutils.c index 53a3b252d..0cc2af818 100644 --- a/textutils.c +++ b/textutils.c @@ -453,7 +453,7 @@ const char * hts_strprint(char *buf, size_t buflen, char quote, const char *s, size_t len) { const char *slim = (len < SIZE_MAX)? &s[len] : NULL; - char *t = buf; + char *t = buf, *bufend = buf + buflen; size_t qlen = quote? 
1 : 0; if (quote) *t++ = quote; @@ -482,7 +482,7 @@ hts_strprint(char *buf, size_t buflen, char quote, const char *s, size_t len) } if (clen == 4) { - sprintf(t, "\\x%02X", (unsigned char) c); + snprintf(t, bufend - t, "\\x%02X", (unsigned char) c); t += clen; } else { From 7ed911e0ba88bb85688ece3dfb4b5a43186ede9b Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 30 Mar 2023 09:40:01 +0100 Subject: [PATCH 404/488] Trivial fix to expr, removing "^". Fixes #1592 --- version.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.sh b/version.sh index 65d1ccae6..ba900bdf1 100755 --- a/version.sh +++ b/version.sh @@ -48,7 +48,7 @@ then v1=`expr "$VERSION" : '\([0-9]*\)'` v2=`expr "$VERSION" : '[0-9]*.\([0-9]*\)'` v3=`expr "$VERSION" : '[0-9]*.[0-9]*.\([0-9]*\)'` - if [ -z "`expr "$VERSION" : '^\([0-9.]*\)$'`" ] + if [ -z "`expr "$VERSION" : '\([0-9.]*\)$'`" ] then VERSION="$v1.$v2.255" else From 26f4d4465c9452ca698be79809d7a00d4c377d54 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 15 Mar 2023 11:45:49 +0000 Subject: [PATCH 405/488] Make SIMD tests work when building multiarch binaries MacOS multiarch binaries compile source code for each architecture and then join them together using 'lipo'. This means architecture specific code both in the actual source and configure tests need to be compilable on both architectures. Switch the configure tests and hts_probe_cc.sh so that they check if a given flag is needed to compile the test code instead of just testing to see if the flag works. By adding #ifdef __x86_64__ guards around the test code, compilation will work on non-x86_64 returning the result that no special compiler flag is needed. Similar #ifdef guards are added to the source files so that the SIMD-specific code only gets compiled for x86_64. The htscodecs submodule is updated to pull in these source file changes. The SIMD parts of built-in htscodecs are now compiled unconditionally. 
Tests for NEON have also been removed as they weren't really doing anything. The configure and hts_probe_cc.sh are adjusted to exactly match those used by htscodecs' configure, for ease of maintenance. --- Makefile | 16 +++- configure.ac | 122 ++++++++++++++++----------- hts_probe_cc.sh | 103 ++++++++++++++-------- htscodecs | 2 +- htscodecs_bundled.mk | 8 +- m4/ax_check_compile_flag.m4 | 53 ------------ m4/hts_check_compile_flags_needed.m4 | 63 ++++++++++++++ 7 files changed, 218 insertions(+), 149 deletions(-) delete mode 100644 m4/ax_check_compile_flag.m4 create mode 100644 m4/hts_check_compile_flags_needed.m4 diff --git a/Makefile b/Makefile index 3e95a0bef..9b7f7f2f4 100644 --- a/Makefile +++ b/Makefile @@ -126,10 +126,18 @@ srcdir = . srcprefix = HTSPREFIX = +# Flags for SIMD code HTS_CFLAGS_AVX2 = HTS_CFLAGS_AVX512 = HTS_CFLAGS_SSE4 = +# Control building of SIMD code. Not used if configure has been run. +HTS_BUILD_AVX2 = +HTS_BUILD_AVX512 = +HTS_BUILD_SSSE3 = +HTS_BUILD_POPCNT = +HTS_BUILD_SSE4_1 = + include htslib_vars.mk include htscodecs.mk @@ -274,7 +282,9 @@ config.h: echo '#endif' >> $@ echo '#define HAVE_DRAND48 1' >> $@ echo '#define HAVE_LIBCURL 1' >> $@ - if [ "x$(HTS_CFLAGS_SSE4)" != "x" ] ; then \ + if [ "x$(HTS_BUILD_POPCNT)" != "x" ] && \ + [ "x$(HTS_BUILD_SSE4_1)" != "x" ] && \ + [ "x$(HTS_BUILD_SSSE3)" != "x" ]; then \ echo '#define HAVE_POPCNT 1' >> $@ ; \ echo '#define HAVE_SSE4_1 1' >> $@ ; \ echo '#define HAVE_SSSE3 1' >> $@ ; \ @@ -282,10 +292,10 @@ config.h: echo '#define UBSAN 1' >> $@ ; \ echo '#endif' >> $@ ; \ fi - if [ "x$(HTS_CFLAGS_AVX2)" != "x" ] ; then \ + if [ "x$(HTS_BUILD_AVX2)" != "x" ] ; then \ echo '#define HAVE_AVX2 1' >> $@ ; \ fi - if [ "x$(HTS_CFLAGS_AVX512)" != "x" ] ; then \ + if [ "x$(HTS_BUILD_AVX512)" != "x" ] ; then \ echo '#define HAVE_AVX512 1' >> $@ ; \ fi diff --git a/configure.ac b/configure.ac index 98b0a44c7..ff2367c1b 100644 --- a/configure.ac +++ b/configure.ac @@ -30,7 +30,7 @@ AC_CONFIG_SRCDIR(hts.c) 
AC_CONFIG_HEADERS(config.h) m4_include([m4/hts_prog_cc_warnings.m4]) -m4_include([m4/ax_check_compile_flag.m4]) +m4_include([m4/hts_check_compile_flags_needed.m4]) m4_include([m4/hts_hide_dynamic_syms.m4]) m4_include([m4/pkg.m4]) @@ -71,16 +71,53 @@ dnl later as they can interfere with some of the tests (notably AC_SEARCH_LIBS) HTS_PROG_CC_WERROR(hts_late_cflags) dnl Check for various compiler flags to enable SIMD features -dnl Options for rANS32x16 sse4.1 version -AX_CHECK_COMPILE_FLAG([-mssse3 -mpopcnt -msse4.1], [ - hts_cflags_sse4="-mssse3 -mpopcnt -msse4.1" - AC_SUBST([hts_cflags_sse4]) - AC_DEFINE([HAVE_SSSE3],1, - [Defined to 1 if the compiler can issue SSSE3 instructions.]) - AC_DEFINE([HAVE_POPCNT],1, - [Defined to 1 if the compiler can issue popcnt instructions.]) - AC_DEFINE([HAVE_SSE4_1],1, - [Defined to 1 if the compiler can issue SSE4.1 instructions.]) +dnl Options for rANS32x16 sse4.1 version - ssse3 +hts_cflags_sse4="" +HTS_CHECK_COMPILE_FLAGS_NEEDED([ssse3], [-mssse3], [AC_LANG_PROGRAM([[ + #ifdef __x86_64__ + #include "x86intrin.h" + #endif + ]],[[ + #ifdef __x86_64__ + __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1); + __m128i c = _mm_shuffle_epi8(a, b); + return *((char *) &c); + #endif + ]])], [ + hts_cflags_sse4="$flags_needed $hts_cflags_sse4" + AC_DEFINE([HAVE_SSSE3],1,[Defined to 1 if rANS source using SSSE3 can be compiled.]) +]) + +dnl Options for rANS32x16 sse4.1 version - popcnt +HTS_CHECK_COMPILE_FLAGS_NEEDED([popcnt], [-mpopcnt], [AC_LANG_PROGRAM([[ + #ifdef __x86_64__ + #include "x86intrin.h" + #endif + ]],[[ + #ifdef __x86_64__ + unsigned int i = _mm_popcnt_u32(1); + return i != 1; + #endif + ]])], [ + hts_cflags_sse4="$flags_needed $hts_cflags_sse4" + AC_DEFINE([HAVE_POPCNT],1,[Defined to 1 if rANS source using popcnt can be compiled.]) +]) + +dnl Options for rANS32x16 sse4.1 version - sse4.1 +HTS_CHECK_COMPILE_FLAGS_NEEDED([sse4.1], [-msse4.1], [AC_LANG_PROGRAM([[ + #ifdef __x86_64__ + #include "x86intrin.h" + 
#endif + ]],[[ + #ifdef __x86_64__ + __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1); + __m128i c = _mm_max_epu32(a, b); + return *((char *) &c); + #endif + ]])], [ + hts_cflags_sse4="$flags_needed $hts_cflags_sse4" + AC_DEFINE([HAVE_SSE4_1],1,[Defined to 1 if rANS source using SSE4.1 can be compiled. +]) dnl Propagate HTSlib's unaligned access preference to htscodecs AH_VERBATIM([UBSAN],[ /* Prevent unaligned access in htscodecs SSE4 rANS codec */ @@ -89,60 +126,43 @@ dnl Propagate HTSlib's unaligned access preference to htscodecs #endif ]) AC_DEFINE([UBSAN],1,[]) - ], [], [], [AC_LANG_PROGRAM([[ - #include "x86intrin.h" - ]],[[ - unsigned int i = _mm_popcnt_u32(1); - __m128i a = _mm_set_epi32(1, 2, 3, i), b = _mm_set_epi32(4, 3, 2, 1); - __m128i c = _mm_max_epu32(a, b); - b = _mm_shuffle_epi8(a, c); - return *((char *) &b); - ]])]) +]) +AC_SUBST([hts_cflags_sse4]) dnl Options for rANS32x16 avx2 version -AX_CHECK_COMPILE_FLAG([-mavx2], [ - hts_cflags_avx2="-mavx2" - AC_SUBST([hts_cflags_avx2]) - AC_DEFINE([HAVE_AVX2],1, - [Defined to 1 if the compiler can issue AVX2 instructions.]) - ], [], [], [AC_LANG_PROGRAM([[ - #include "x86intrin.h" +HTS_CHECK_COMPILE_FLAGS_NEEDED([avx2], [-mavx2], [AC_LANG_PROGRAM([[ + #ifdef __x86_64__ + #include "x86intrin.h" + #endif ]],[[ + #ifdef __x86_64__ __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); __m256i b = _mm256_add_epi32(a, a); long long c = _mm256_extract_epi64(b, 0); return (int) c; - ]])]) + #endif + ]])], [ + hts_cflags_avx2="$flags_needed" + AC_SUBST([hts_cflags_avx2]) + AC_DEFINE([HAVE_AVX2],1,[Defined to 1 if rANS source using AVX2 can be compiled.]) +]) dnl Options for rANS32x16 avx512 version -AX_CHECK_COMPILE_FLAG([-mavx512f], [ - hts_cflags_avx512="-mavx512f" - AC_SUBST([hts_cflags_avx512]) - AC_DEFINE([HAVE_AVX512],1, - [Defined to 1 if the compiler can issue AVX512 instructions.]) - ], [], [], [AC_LANG_PROGRAM([[ +HTS_CHECK_COMPILE_FLAGS_NEEDED([avx512f], [-mavx512f], 
[AC_LANG_PROGRAM([[ + #ifdef __x86_64__ #include "x86intrin.h" + #endif ]],[[ + #ifdef __x86_64__ __m512i a = _mm512_set1_epi32(1); __m512i b = _mm512_add_epi32(a, a); return *((char *) &b); - ]])]) - -dnl Detect ARM Neon availability -AC_CACHE_CHECK([whether C compiler supports ARM Neon], [hts_cv_have_neon], [ - AC_COMPILE_IFELSE([ - AC_LANG_PROGRAM([[ - #include "arm_neon.h" - ]], [[ - int32x4_t a = vdupq_n_s32(1); - int32x4_t b = vaddq_s32(a, a); - return *((char *) &b); - ]])], [hts_cv_have_neon=yes], [hts_cv_have_neon=no])]) -if test "$hts_cv_have_neon" = yes; then - hts_have_neon=yes - AC_SUBST([hts_have_neon]) -fi - + #endif + ]])], [ + hts_cflags_avx512="$flags_needed" + AC_SUBST([hts_cflags_avx512]) + AC_DEFINE([HAVE_AVX512],1,[Defined to 1 if rANS source using AVX512F can be compiled.]) +]) dnl Avoid chicken-and-egg problem where pkg-config supplies the dnl PKG_PROG_PKG_CONFIG macro, but we want to use it to check diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh index 37d6bae7e..5e5ddec1e 100755 --- a/hts_probe_cc.sh +++ b/hts_probe_cc.sh @@ -43,30 +43,76 @@ run_compiler () return $retval } +# Run a test. $1 is the flag to try, $2 is the Makefile variable to set +# with the flag probe result, $3 is a Makefile variable which will be +# set to 1 if the code was built successfully. The code to test should +# be passed in via fd 0. +# First try compiling conftest.c without the flag. If that fails, try +# again with it to see if the flag is needed. +run_test () +{ + rm -f conftest conftest.err conftest.c + cat - > conftest.c + if run_compiler ; then + echo "$2 =" + echo "$3 = 1" + elif run_compiler "$1" ; then + echo "$2 = $1" + echo "$3 = 1" + else + echo "$3 =" + fi +} + echo "# Compiler probe results, generated by $0" -# Check for sse4.1 etc. 
support +# Check for ssse3 +run_test "-mssse3" HTS_CFLAGS_SSSE3 HTS_BUILD_SSSE3 <<'EOF' +#ifdef __x86_64__ +#include "x86intrin.h" +int main(int argc, char **argv) { + __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1); + __m128i c = _mm_shuffle_epi8(a, b); + return *((char *) &c); +} +#else +int main(int argc, char **argv) { return 0; } +#endif +EOF -rm -f conftest conftest.err conftest.c -cat - <<'EOF' > conftest.c +# Check for popcnt +run_test "-mpopcnt" HTS_CFLAGS_POPCNT HTS_BUILD_POPCNT <<'EOF' +#ifdef __x86_64__ #include "x86intrin.h" int main(int argc, char **argv) { unsigned int i = _mm_popcnt_u32(1); - __m128i a = _mm_set_epi32(1, 2, 3, i), b = _mm_set_epi32(4, 3, 2, 1); + return i != 1; +} +#else +int main(int argc, char **argv) { return 0; } +#endif +EOF + +# Check for sse4.1 etc. support +run_test "-msse4.1" HTS_CFLAGS_SSE4_1 HTS_BUILD_SSE4_1 <<'EOF' +#ifdef __x86_64__ +#include "x86intrin.h" +int main(int argc, char **argv) { + __m128i a = _mm_set_epi32(1, 2, 3, 4), b = _mm_set_epi32(4, 3, 2, 1); __m128i c = _mm_max_epu32(a, b); - b = _mm_shuffle_epi8(a, c); - return *((char *) &b); + return *((char *) &c); } +#else +int main(int argc, char **argv) { return 0; } +#endif EOF -FLAGS="-mpopcnt -msse4.1 -mssse3" -if run_compiler "$FLAGS" ; then - echo "HTS_CFLAGS_SSE4 = $FLAGS" -fi + +echo 'HTS_CFLAGS_SSE4 = $(HTS_CFLAGS_SSSE3) $(HTS_CFLAGS_POPCNT) $(HTS_CFLAGS_SSE4_1)' # Check for avx2 -rm -f conftest.c -cat - <<'EOF' > conftest.c +run_test -mavx2 HTS_CFLAGS_AVX2 HTS_BUILD_AVX2 <<'EOF' +#ifdef __x86_64__ #include "x86intrin.h" int main(int argc, char **argv) { __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); @@ -74,41 +120,24 @@ int main(int argc, char **argv) { long long c = _mm256_extract_epi64(b, 0); return (int) c; } +#else +int main(int argc, char **argv) { return 0; } +#endif EOF -FLAGS="-mavx2" -if run_compiler "$FLAGS" ; then - echo "HTS_CFLAGS_AVX2 = $FLAGS" -fi # Check for avx512 -rm -f conftest.c -cat - <<'EOF' > 
conftest.c +run_test -mavx512f HTS_CFLAGS_AVX512 HTS_BUILD_AVX512 <<'EOF' +#ifdef __x86_64__ #include "x86intrin.h" int main(int argc, char **argv) { __m512i a = _mm512_set1_epi32(1); __m512i b = _mm512_add_epi32(a, a); return *((char *) &b); } +#else +int main(int argc, char **argv) { return 0; } +#endif EOF -FLAGS="-mavx512f" -if run_compiler "$FLAGS" ; then - echo "HTS_CFLAGS_AVX512 = $FLAGS" -fi - -# Check for neon - -rm -f conftest.c -cat - <<'EOF' > conftest.c -#include "arm_neon.h" -int main(int argc, char **argv) { - int32x4_t a = vdupq_n_s32(1); - int32x4_t b = vaddq_s32(a, a); - return *((char *) &b); -} -EOF -if run_compiler "" ; then - echo "HTS_HAVE_NEON = yes" -fi rm -f conftest.c diff --git a/htscodecs b/htscodecs index cd0737fff..d4aed5859 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit cd0737fff5893b0842b047da5aa3209e5f65442c +Subproject commit d4aed585929e2dab9dd8e6a2b74484dfc347c0f2 diff --git a/htscodecs_bundled.mk b/htscodecs_bundled.mk index 91a9c39e9..6274350f5 100644 --- a/htscodecs_bundled.mk +++ b/htscodecs_bundled.mk @@ -28,10 +28,10 @@ HTSCODECS_SOURCES = $(HTSPREFIX)htscodecs/htscodecs/arith_dynamic.c \ $(HTSPREFIX)htscodecs/htscodecs/htscodecs.c \ $(HTSPREFIX)htscodecs/htscodecs/pack.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static4x16pr.c \ - $(if $(HTS_CFLAGS_AVX2),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx2.c) \ - $(if $(HTS_CFLAGS_AVX512),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx512.c) \ - $(if $(HTS_CFLAGS_SSE4),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_sse4.c) \ - $(if $(HTS_HAVE_NEON),$(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_neon.c) \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx2.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_avx512.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_sse4.c \ + $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr_neon.c \ $(HTSPREFIX)htscodecs/htscodecs/rANS_static32x16pr.c \ 
$(HTSPREFIX)htscodecs/htscodecs/rANS_static.c \ $(HTSPREFIX)htscodecs/htscodecs/rle.c \ diff --git a/m4/ax_check_compile_flag.m4 b/m4/ax_check_compile_flag.m4 deleted file mode 100644 index 16bb46495..000000000 --- a/m4/ax_check_compile_flag.m4 +++ /dev/null @@ -1,53 +0,0 @@ -# =========================================================================== -# https://www.gnu.org/software/autoconf-archive/ax_check_compile_flag.html -# =========================================================================== -# -# SYNOPSIS -# -# AX_CHECK_COMPILE_FLAG(FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT]) -# -# DESCRIPTION -# -# Check whether the given FLAG works with the current language's compiler -# or gives an error. (Warnings, however, are ignored) -# -# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on -# success/failure. -# -# If EXTRA-FLAGS is defined, it is added to the current language's default -# flags (e.g. CFLAGS) when the check is done. The check is thus made with -# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to -# force the compiler to issue an error when a bad flag is given. -# -# INPUT gives an alternative input source to AC_COMPILE_IFELSE. -# -# NOTE: Implementation based on AX_CFLAGS_GCC_OPTION. Please keep this -# macro in sync with AX_CHECK_{PREPROC,LINK}_FLAG. -# -# LICENSE -# -# Copyright (c) 2008 Guido U. Draheim -# Copyright (c) 2011 Maarten Bosmans -# -# Copying and distribution of this file, with or without modification, are -# permitted in any medium without royalty provided the copyright notice -# and this notice are preserved. This file is offered as-is, without any -# warranty. 
- -#serial 6 - -AC_DEFUN([AX_CHECK_COMPILE_FLAG], -[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF -AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl -AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [ - ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS - _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1" - AC_LINK_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])], - [AS_VAR_SET(CACHEVAR,[yes])], - [AS_VAR_SET(CACHEVAR,[no])]) - _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags]) -AS_VAR_IF(CACHEVAR,yes, - [m4_default([$2], :)], - [m4_default([$3], :)]) -AS_VAR_POPDEF([CACHEVAR])dnl -])dnl AX_CHECK_COMPILE_FLAGS diff --git a/m4/hts_check_compile_flags_needed.m4 b/m4/hts_check_compile_flags_needed.m4 new file mode 100644 index 000000000..fb668e86f --- /dev/null +++ b/m4/hts_check_compile_flags_needed.m4 @@ -0,0 +1,63 @@ +# hts_check_compile_flags_needed.m4 +# +# SYNOPSIS +# +# HTS_CHECK_COMPILE_FLAGS_NEEDED(FEATURE, FLAGS, [INPUT], [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS]) +# +# DESCRIPTION +# +# Check whether the given FLAGS are required to build and link INPUT with +# the current language's compiler. Compilation and linking are first +# tries without FLAGS. If that fails it then tries to compile and +# link again with FLAGS. +# +# FEATURE describes the feature being tested, and is used when printing +# messages and to name the cache entry (along with the tested flags). +# +# ACTION-SUCCESS/ACTION-FAILURE are shell commands to execute on +# success/failure. In ACTION-SUCCESS, $flags_needed will be set to +# either an empty string or FLAGS depending on the test results. +# +# If EXTRA-FLAGS is defined, it is added to the current language's default +# flags (e.g. CFLAGS) when the check is done. The check is thus made with +# the flags: "CFLAGS EXTRA-FLAGS FLAG". This can for example be used to +# force the compiler to issue an error when a bad flag is given. 
+# +# If omitted, INPUT defaults to AC_LANG_PROGRAM(), although that probably +# isn't very useful. +# +# NOTE: Implementation based on AX_CHECK_COMPILE_FLAG. +# +# LICENSE +# +# Copyright (c) 2008 Guido U. Draheim +# Copyright (c) 2011 Maarten Bosmans +# Copyright (c) 2023 Robert Davies +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +# AX_CHECK_COMPILE_FLAGS_NEEDED(FEATURE, FLAG, [ACTION-SUCCESS], [ACTION-FAILURE], [EXTRA-FLAGS], [INPUT]) + +AC_DEFUN([HTS_CHECK_COMPILE_FLAGS_NEEDED], +[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF +AS_VAR_PUSHDEF([CACHEVAR],[hts_cv_check_[]_AC_LANG_ABBREV[]flags_needed_$1_$6_$2])dnl +AC_CACHE_CHECK([_AC_LANG compiler flags needed for $1], CACHEVAR, [ + AC_LINK_IFELSE([m4_default([$3],[AC_LANG_PROGRAM()])], + [AS_VAR_SET(CACHEVAR,[none])], + [ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS + _AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $6 $2" + AC_LINK_IFELSE([m4_default([$3],[AC_LANG_PROGRAM()])], + [AS_VAR_SET(CACHEVAR,[$2])], + [AS_VAR_SET(CACHEVAR,[unsupported])]) + _AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])]) +AS_VAR_IF(CACHEVAR,unsupported, [ + m4_default([$5], :) +], [ + AS_VAR_IF(CACHEVAR,none,[flags_needed=""], [flags_needed="$CACHEVAR"]) + m4_default([$4], :) +]) +AS_VAR_POPDEF([CACHEVAR])dnl +])dnl HTS_CHECK_COMPILE_FLAGS_NEEDED From 3e0fd29c75fcabaa01fa5f0f34adb262c1825371 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 22 Mar 2023 17:22:49 +0000 Subject: [PATCH 406/488] Make MacOS tests build a multiarch version of the library --- .cirrus.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index dc93b071d..fc4405b08 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -15,7 +15,7 @@ libdeflate_template: &LIBDEFLATE pushd "$HOME" git clone --depth 1 
https://github.com/ebiggers/libdeflate.git pushd libdeflate - cmake -B build -DLIBDEFLATE_BUILD_SHARED_LIB=OFF -DLIBDEFLATE_BUILD_GZIP=OFF -DCMAKE_C_FLAGS='-g -O3 -fPIC' + cmake -B build -DLIBDEFLATE_BUILD_SHARED_LIB=OFF -DLIBDEFLATE_BUILD_GZIP=OFF -DCMAKE_C_FLAGS="-g -O3 -fPIC $LIBDEFLATE_CFLAGS" cmake --build build --verbose popd popd @@ -186,6 +186,9 @@ macosx_task: environment: CC: clang + CFLAGS: "-Wall -arch arm64 -arch x86_64" + LDFLAGS: "-arch arm64 -arch x86_64" + LIBDEFLATE_CFLAGS: "-arch arm64 -arch x86_64" LC_ALL: C CIRRUS_CLONE_DEPTH: 1 From 93434e041c53860398044ab6c0735230b389278b Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 20 Mar 2023 14:36:29 +0000 Subject: [PATCH 407/488] Fix bug where bin number could overflow when looking for max_off When searching for `max_off`, hts_itr_query() and hts_itr_multi_bam() look for a bin to the right of the end of the region. For whole chromosomes, this would be HTS_POS_MAX, which is far beyond the maximum bin position supported. The `bin` calculation overflowed leading to either a negative bin number or an incorrect positive one, depending on the number of levels in the index. Negative bin numbers simply caused time to be wasted as the search loop eventually counted up to zero, but incorrect positive ones could cause the iterator to finish too early. Fix by catching the out-of-bounds case and setting max_off to UINT64_MAX, which should be used for bins beyond the end of the indexable range.
--- hts.c | 62 ++++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/hts.c b/hts.c index 86b5bb877..0d5abf53a 100644 --- a/hts.c +++ b/hts.c @@ -3145,17 +3145,24 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t } // compute max_off: a virtual offset from a bin to the right of end - bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; - if (bin >= idx->n_bins) bin = 0; - while (1) { - // search for an extant bin by moving right, but moving up to the - // parent whenever we get to a first child (which also covers falling - // off the RHS, which wraps around and immediately goes up to bin 0) - while (bin % 8 == 1) bin = hts_bin_parent(bin); - if (bin == 0) { max_off = (uint64_t)-1; break; } - k = kh_get(bin, bidx, bin); - if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; } - bin++; + // First check if end lies within the range of the index (it won't + // if it's HTS_POS_MAX) + if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) { + bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; + if (bin >= idx->n_bins) bin = 0; + while (1) { + // search for an extant bin by moving right, but moving up to the + // parent whenever we get to a first child (which also covers falling + // off the RHS, which wraps around and immediately goes up to bin 0) + while (bin % 8 == 1) bin = hts_bin_parent(bin); + if (bin == 0) { max_off = UINT64_MAX; break; } + k = kh_get(bin, bidx, bin); + if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { max_off = kh_val(bidx, k).list[0].u; break; } + bin++; + } + } else { + // Searching to end of reference + max_off = UINT64_MAX; } // retrieve bins @@ -3314,20 +3321,27 @@ int hts_itr_multi_bam(const hts_idx_t *idx, hts_itr_t *iter) } // compute max_off: a virtual offset from a bin to the right of end - bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; - if (bin 
>= idx->n_bins) bin = 0; - while (1) { - // search for an extant bin by moving right, but moving up to the - // parent whenever we get to a first child (which also covers falling - // off the RHS, which wraps around and immediately goes up to bin 0) - while (bin % 8 == 1) bin = hts_bin_parent(bin); - if (bin == 0) { max_off = (uint64_t)-1; break; } - k = kh_get(bin, bidx, bin); - if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { - max_off = kh_val(bidx, k).list[0].u; - break; + // First check if end lies within the range of the index (it + // won't if it's HTS_POS_MAX) + if (end < 1LL << (idx->min_shift + 3 * idx->n_lvls)) { + bin = hts_bin_first(idx->n_lvls) + ((end-1) >> idx->min_shift) + 1; + if (bin >= idx->n_bins) bin = 0; + while (1) { + // search for an extant bin by moving right, but moving up to the + // parent whenever we get to a first child (which also covers falling + // off the RHS, which wraps around and immediately goes up to bin 0) + while (bin % 8 == 1) bin = hts_bin_parent(bin); + if (bin == 0) { max_off = UINT64_MAX; break; } + k = kh_get(bin, bidx, bin); + if (k != kh_end(bidx) && kh_val(bidx, k).n > 0) { + max_off = kh_val(bidx, k).list[0].u; + break; + } + bin++; } - bin++; + } else { + // Searching to end of reference + max_off = UINT64_MAX; } //convert coordinates to file offsets From a616e851373a73cdd30ae69ad92351465d1419ea Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 6 Apr 2023 11:56:33 +0100 Subject: [PATCH 408/488] Add MZ:i tag as a check for base modification validity. (#1590) If a sequence is hard-clipped after calling the base modifications, then the tool may, or may not, update the MM and ML tags accordingly. We have no way of distinguishing these two cases. While the base modification parsing code already detects overflows where the coordinates go beyond the sequence end, this isn't fool proof, especially if the clipping is short. 
So instead we have an (as yet unwritten) proposal of MZ:i tag holding the sequence length, to be written at the same time as the MM and ML tags. This can then be used as a sanity check later on, to detect cases where the sequence has changed length via a tool that is unaware of base modifications. TODO: as a separate PR, we should add a new API that can trim bases off the start/end of MM/ML strings to make it trivial for tools that are doing hard clipping via htslib. (Indeed we don't even have an API for SEQ/QUAL either, so it can do all together). This would make it far easier for people to keep everything in sync, and this code could then also update MZ while it's at it. That's new API though so it can arrive as a separate commit. See https://github.com/samtools/hts-specs/issues/646 --- sam.c | 30 ++++++++++++++++++++++-------- test/base_mods/MM-MZf1.sam | 5 +++++ test/base_mods/MM-MZf2.sam | 5 +++++ test/base_mods/MM-MZp.sam | 5 +++++ test/base_mods/MM-multi.sam | 2 +- test/base_mods/base-mods.sh | 1 + test/base_mods/base-mods.tst | 6 ++++++ test/pileup_mod.c | 7 ++++--- 8 files changed, 49 insertions(+), 12 deletions(-) create mode 100644 test/base_mods/MM-MZf1.sam create mode 100644 test/base_mods/MM-MZf2.sam create mode 100644 test/base_mods/MM-MZp.sam diff --git a/sam.c b/sam.c index c8daa9683..8f135c333 100644 --- a/sam.c +++ b/sam.c @@ -6221,14 +6221,24 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { if (!mm) return 0; if (mm[0] != 'Z') { - hts_log_error("MM tag is not of type Z"); + hts_log_error("%s: MM tag is not of type Z", bam_get_qname(b)); + return -1; + } + + uint8_t *mi = bam_aux_get(b, "MZ"); + if (mi && bam_aux2i(mi) != b->core.l_qseq) { + // bam_aux2i with set errno = EINVAL and return 0 if the tag + // isn't integer, but 0 will be a seq-length mismatch anyway so + // triggers an error here too. 
+ hts_log_error("%s: MM/MZ data length is incompatible with" + " SEQ length", bam_get_qname(b)); return -1; } uint8_t *ml = bam_aux_get(b, "ML"); if (!ml) ml = bam_aux_get(b, "Ml"); if (ml && (ml[0] != 'B' || ml[1] != 'C')) { - hts_log_error("ML tag is not of type B,C"); + hts_log_error("%s: ML tag is not of type B,C", bam_get_qname(b)); return -1; } uint8_t *ml_end = ml ? ml+6 + le_to_u32(ml+2) : NULL; @@ -6314,7 +6324,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { delta = strtol(cp, &cp_end, 10); if (cp_end == cp) { - hts_log_error("Hit end of MM tag. Missing semicolon?"); + hts_log_error("%s: Hit end of MM tag. Missing " + "semicolon?", bam_get_qname(b)); return -1; } @@ -6343,8 +6354,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { state->implicit [mod_num] = implicit; if (delta < 0) { - hts_log_error("MM tag refers to bases beyond sequence " - "length"); + hts_log_error("%s: MM tag refers to bases beyond sequence " + "length", bam_get_qname(b)); return -1; } state->MMcount [mod_num] = delta; @@ -6359,7 +6370,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { } if (++mod_num >= MAX_BASE_MOD) { - hts_log_error("Too many base modification types"); + hts_log_error("%s: Too many base modification types", + bam_get_qname(b)); return -1; } ms++; n++; @@ -6377,7 +6389,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { } } if (ml > ml_end) { - hts_log_error("Insufficient number of entries in ML tag"); + hts_log_error("%s: Insufficient number of entries in ML " + "tag", bam_get_qname(b)); return -1; } } else { @@ -6389,7 +6402,8 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { cp++; } if (!*cp) { - hts_log_error("Hit end of MM tag. Missing semicolon?"); + hts_log_error("%s: Hit end of MM tag. 
Missing semicolon?", + bam_get_qname(b)); return -1; } } diff --git a/test/base_mods/MM-MZf1.sam b/test/base_mods/MM-MZf1.sam new file mode 100644 index 000000000..35074fd05 --- /dev/null +++ b/test/base_mods/MM-MZf1.sam @@ -0,0 +1,5 @@ +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:37 +r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 +r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:i:36 diff --git a/test/base_mods/MM-MZf2.sam b/test/base_mods/MM-MZf2.sam new file mode 100644 index 000000000..843f93a1b --- /dev/null +++ b/test/base_mods/MM-MZf2.sam @@ -0,0 +1,5 @@ +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36 +r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 +r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:f:36 diff --git a/test/base_mods/MM-MZp.sam b/test/base_mods/MM-MZp.sam new file mode 100644 index 
000000000..836a09725 --- /dev/null +++ b/test/base_mods/MM-MZp.sam @@ -0,0 +1,5 @@ +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36 +r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 +r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:i:36 diff --git a/test/base_mods/MM-multi.sam b/test/base_mods/MM-multi.sam index b2259a09e..1c7288f50 100644 --- a/test/base_mods/MM-multi.sam +++ b/test/base_mods/MM-multi.sam @@ -3,5 +3,5 @@ @CO r2 has them combined together, for example as produced by @CO a joint basecaller which assigns probabilities to all @CO trained events simultaneously. -r1 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r1 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36 r2 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+mh,2,2,0,0,4,1;N+n,15; Ml:B:C,77,159,103,133,128,108,154,82,179,57,204,31,240 diff --git a/test/base_mods/base-mods.sh b/test/base_mods/base-mods.sh index f3f3ca4b7..388ff369e 100755 --- a/test/base_mods/base-mods.sh +++ b/test/base_mods/base-mods.sh @@ -31,5 +31,6 @@ test_mod="../test_mod" pileup_mod="../pileup_mod" test_driver $@ +rm _err.tmp _out.tmp exit $? 
diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index 3809c0e6e..d246223c8 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -42,3 +42,9 @@ P MM-explicit-x.out $test_mod -x MM-explicit.sam # Pileup testing P MM-pileup.out $pileup_mod < MM-pileup.sam P MM-pileup2.out $pileup_mod < MM-pileup2.sam + +# Validation testing. We just care about exit status here, but the +# test data is a copy of MM-pileup.sam so that suffices too. +P MM-pileup.out $pileup_mod < MM-MZp.sam +F MM-pileup.out $pileup_mod < MM-MZf1.sam +F MM-pileup.out $pileup_mod < MM-MZf2.sam diff --git a/test/pileup_mod.c b/test/pileup_mod.c index 95c353771..323c0c6c2 100644 --- a/test/pileup_mod.c +++ b/test/pileup_mod.c @@ -73,7 +73,8 @@ void process_pileup(sam_hdr_t *h, const bam_pileup1_t *p, // as each new read is added or removed from the pileups. int pileup_cd_create(void *data, const bam1_t *b, bam_pileup_cd *cd) { hts_base_mod_state *m = hts_base_mod_state_alloc(); - bam_parse_basemod(b, m); + if (bam_parse_basemod(b, m) < 0) + return -1; cd->p = m; return 0; } @@ -201,7 +202,7 @@ int main(int argc, char **argv) { bam_plp_destructor(iter, pileup_cd_destroy); const bam_pileup1_t *p; - int tid, pos, n; + int tid, pos, n = 0; while ((p = bam_plp_auto(iter, &tid, &pos, &n)) != 0) { switch (compact) { case 0: @@ -221,5 +222,5 @@ int main(int argc, char **argv) { bam_destroy1(b); sam_hdr_destroy(h); - return 0; + return n != 0; } From dba4bdf1becefd4e5632b682da017e3effc7dc17 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 28 Mar 2023 10:03:53 +0100 Subject: [PATCH 409/488] Make reg2bins faster on whole-chromosome queries It's faster to iterate through the index hash table when requesting whole chromosomes, compared to the old behaviour of iterating through all the bins that could exist and looking them up in the hash table to see if they're present. 
The latter method works better for narrow ranges though, so we choose which to use based on the number of bins covering the range compared to the number in the index. --- hts.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 69 insertions(+), 11 deletions(-) diff --git a/hts.c b/hts.c index 0d5abf53a..89d12f66b 100644 --- a/hts.c +++ b/hts.c @@ -2903,25 +2903,80 @@ uint64_t hts_idx_get_n_no_coor(const hts_idx_t* idx) ****************/ // Note: even with 32-bit hts_pos_t, end needs to be 64-bit here due to 1LL<= end) return 0; - if (end >= 1LL<>s); e = t + (end>>s); n = e - b + 1; - if (itr->bins.n + n > itr->bins.m) { - itr->bins.m = itr->bins.n + n; - kroundup32(itr->bins.m); - itr->bins.a = (int*)realloc(itr->bins.a, sizeof(int) * itr->bins.m); + int i; + b = t + (beg>>s); e = t + (end>>s); + for (i = b; i <= e; ++i) { + if (kh_get(bin, bidx, i) != kh_end(bidx)) { + assert(itr->bins.n < itr->bins.m); + itr->bins.a[itr->bins.n++] = i; + } } - for (i = b; i <= e; ++i) itr->bins.a[itr->bins.n++] = i; } return itr->bins.n; } +static inline int reg2bins_wide(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx) +{ + khint_t i; + hts_pos_t max_shift = 3 * n_lvls + min_shift; + --end; + if (beg < 0) beg = 0; + for (i = kh_begin(bidx); i != kh_end(bidx); i++) { + if (!kh_exist(bidx, i)) continue; + hts_pos_t bin = (hts_pos_t) kh_key(bidx, i); + int level = hts_bin_level(bin); + if (level > n_lvls) continue; // Dodgy index? 
+ hts_pos_t first = hts_bin_first(level); + hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level)); + hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level)); + if (beg_at_level <= bin && bin <= end_at_level) { + assert(itr->bins.n < itr->bins.m); + itr->bins.a[itr->bins.n++] = bin; + } + } + return itr->bins.n; +} + +static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shift, int n_lvls, bidx_t *bidx) +{ + int l, t, s = min_shift + (n_lvls<<1) + n_lvls; + size_t reg_bin_count = 0, hash_bin_count = kh_n_buckets(bidx), max_bins; + hts_pos_t end1; + if (end >= 1LL<= end) return 0; + end1 = end - 1; + + // Count bins to see if it's faster to iterate through the hash table + // or the set of bins covering the region + for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { + reg_bin_count += (end1 >> s) - (beg >> s) + 1; + } + max_bins = reg_bin_count < kh_size(bidx) ? reg_bin_count : kh_size(bidx); + if (itr->bins.m - itr->bins.n < max_bins) { + // Worst-case memory usage. May be wasteful on very sparse + // data, but the bin list usually won't be too big anyway. 
+ size_t new_m = max_bins + itr->bins.n; + if (new_m > INT_MAX || new_m > SIZE_MAX / sizeof(int)) { + errno = ENOMEM; + return -1; + } + int *new_a = realloc(itr->bins.a, new_m * sizeof(*new_a)); + if (!new_a) return -1; + itr->bins.a = new_a; + itr->bins.m = new_m; + } + if (reg_bin_count < hash_bin_count) { + return reg2bins_narrow(beg, end, itr, min_shift, n_lvls, bidx); + } else { + return reg2bins_wide(beg, end, itr, min_shift, n_lvls, bidx); + } +} + static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls) { int l, t, s; @@ -3166,7 +3221,10 @@ hts_itr_t *hts_itr_query(const hts_idx_t *idx, int tid, hts_pos_t beg, hts_pos_t } // retrieve bins - reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls); + if (reg2bins(beg, end, iter, idx->min_shift, idx->n_lvls, bidx) < 0) { + hts_itr_destroy(iter); + return NULL; + } for (i = n_off = 0; i < iter->bins.n; ++i) if ((k = kh_get(bin, bidx, iter->bins.a[i])) != kh_end(bidx)) From 07638e1cac22e76c2f59c9988feabd467a15c340 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 28 Mar 2023 13:33:42 +0100 Subject: [PATCH 410/488] Make reg2intervals() faster on whole-chromosome queries As for reg2bins(), it may be faster to iterate through the hash table entries, depending on the number and how wide the region being searched is. 
--- hts.c | 137 ++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 105 insertions(+), 32 deletions(-) diff --git a/hts.c b/hts.c index 89d12f66b..b3f6b9a08 100644 --- a/hts.c +++ b/hts.c @@ -2977,54 +2977,127 @@ static inline int reg2bins(int64_t beg, int64_t end, hts_itr_t *itr, int min_shi } } +static inline int add_to_interval(hts_itr_t *iter, bins_t *bin, + int tid, uint32_t interval, + uint64_t min_off, uint64_t max_off) +{ + hts_pair64_max_t *off; + int j; + + if (!bin->n) + return 0; + off = realloc(iter->off, (iter->n_off + bin->n) * sizeof(*off)); + if (!off) + return -2; + + iter->off = off; + for (j = 0; j < bin->n; ++j) { + if (bin->list[j].v > min_off && bin->list[j].u < max_off) { + iter->off[iter->n_off].u = min_off > bin->list[j].u + ? min_off : bin->list[j].u; + iter->off[iter->n_off].v = max_off < bin->list[j].v + ? max_off : bin->list[j].v; + // hts_pair64_max_t::max is now used to link + // file offsets to region list entries. + // The iterator can use this to decide if it + // can skip some file regions. 
+ iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval; + iter->n_off++; + } + } + return 0; +} + +static inline int reg2intervals_narrow(hts_itr_t *iter, const bidx_t *bidx, + int tid, int64_t beg, int64_t end, + uint32_t interval, + uint64_t min_off, uint64_t max_off, + int min_shift, int n_lvls) +{ + int l, t, s = min_shift + n_lvls * 3; + hts_pos_t b, e, i; + + for (--end, l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { + b = t + (beg>>s); e = t + (end>>s); + for (i = b; i <= e; ++i) { + khint_t k = kh_get(bin, bidx, i); + if (k != kh_end(bidx)) { + bins_t *bin = &kh_value(bidx, k); + int res = add_to_interval(iter, bin, tid, interval, min_off, max_off); + if (res < 0) + return res; + } + } + } + return 0; +} + +static inline int reg2intervals_wide(hts_itr_t *iter, const bidx_t *bidx, + int tid, int64_t beg, int64_t end, + uint32_t interval, + uint64_t min_off, uint64_t max_off, + int min_shift, int n_lvls) +{ + khint_t i; + hts_pos_t max_shift = 3 * n_lvls + min_shift; + --end; + if (beg < 0) beg = 0; + for (i = kh_begin(bidx); i != kh_end(bidx); i++) { + if (!kh_exist(bidx, i)) continue; + hts_pos_t bin = (hts_pos_t) kh_key(bidx, i); + int level = hts_bin_level(bin); + if (level > n_lvls) continue; // Dodgy index? 
+ hts_pos_t first = hts_bin_first(level); + hts_pos_t beg_at_level = first + (beg >> (max_shift - 3 * level)); + hts_pos_t end_at_level = first + (end >> (max_shift - 3 * level)); + if (beg_at_level <= bin && bin <= end_at_level) { + bins_t *bin = &kh_value(bidx, i); + int res = add_to_interval(iter, bin, tid, interval, min_off, max_off); + if (res < 0) + return res; + } + } + return 0; +} + static inline int reg2intervals(hts_itr_t *iter, const hts_idx_t *idx, int tid, int64_t beg, int64_t end, uint32_t interval, uint64_t min_off, uint64_t max_off, int min_shift, int n_lvls) { int l, t, s; int i, j; - hts_pos_t b, e; - hts_pair64_max_t *off; + hts_pos_t end1; bidx_t *bidx; - khint_t k; - int start_n_off = iter->n_off; + int start_n_off; + size_t reg_bin_count = 0, hash_bin_count; + int res; if (!iter || !idx || (bidx = idx->bidx[tid]) == NULL || beg >= end) return -1; + hash_bin_count = kh_n_buckets(bidx); + s = min_shift + (n_lvls<<1) + n_lvls; if (end >= 1LL<>s); e = t + (end>>s); - - for (i = b; i <= e; ++i) { - if ((k = kh_get(bin, bidx, i)) != kh_end(bidx)) { - bins_t *p = &kh_value(bidx, k); + end1 = end - 1; + // Count bins to see if it's faster to iterate through the hash table + // or the set of bins covering the region + for (l = 0, t = 0; l <= n_lvls; s -= 3, t += 1<<((l<<1)+l), ++l) { + reg_bin_count += (end1 >> s) - (beg >> s) + 1; + } - if (p->n) { - off = realloc(iter->off, (iter->n_off + p->n) * sizeof(*off)); - if (!off) - return -2; + start_n_off = iter->n_off; - iter->off = off; - for (j = 0; j < p->n; ++j) { - if (p->list[j].v > min_off && p->list[j].u < max_off) { - iter->off[iter->n_off].u = min_off > p->list[j].u - ? min_off : p->list[j].u; - iter->off[iter->n_off].v = max_off < p->list[j].v - ? max_off : p->list[j].v; - // hts_pair64_max_t::max is now used to link - // file offsets to region list entries. - // The iterator can use this to decide if it - // can skip some file regions. 
- iter->off[iter->n_off].max = ((uint64_t) tid << 32) | interval; - iter->n_off++; - } - } - } - } - } + // Populate iter->off with the intervals for this region + if (reg_bin_count < hash_bin_count) { + res = reg2intervals_narrow(iter, bidx, tid, beg, end, interval, + min_off, max_off, min_shift, n_lvls); + } else { + res = reg2intervals_wide(iter, bidx, tid, beg, end, interval, + min_off, max_off, min_shift, n_lvls); } + if (res < 0) + return res; if (iter->n_off - start_n_off > 1) { ks_introsort(_off_max, iter->n_off - start_n_off, iter->off + start_n_off); From ca51cc55e8ea1ead5d26aa4ede90b58c07e8d893 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 6 Apr 2023 08:36:18 +0100 Subject: [PATCH 411/488] Fix typo in kh_int_hash_func2 macro. This was fixed upstream in attractivechaos/klib@384277a Fixes #1598 --- htslib/khash.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/khash.h b/htslib/khash.h index 7cd0c9b62..4cea91020 100644 --- a/htslib/khash.h +++ b/htslib/khash.h @@ -447,7 +447,7 @@ static kh_inline khint_t __ac_Wang_hash(khint_t key) key ^= (key >> 16); return key; } -#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) +#define kh_int_hash_func2(key) __ac_Wang_hash((khint_t)(key)) /* --- END OF HASH FUNCTIONS --- */ From fc2448c27362837758f5b777f25fc7d2fe420c82 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Thu, 27 Apr 2023 09:27:38 +0100 Subject: [PATCH 412/488] Switched back to openssl for Alpine. 
--- INSTALL | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/INSTALL b/INSTALL index dd2c3ec90..804593443 100644 --- a/INSTALL +++ b/INSTALL @@ -266,7 +266,9 @@ Alpine Linux ------------ doas apk update # Ensure the package list is up to date -doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev libressl-dev +doas apk add autoconf automake make gcc musl-dev perl bash zlib-dev bzip2-dev xz-dev curl-dev openssl-dev + +Note: some older Alpine versions use libressl-dev rather than openssl-dev. OpenSUSE -------- From 279cc9ec8d3fd5ad94aa0df730dac6e9ec0ead8f Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 2 May 2023 10:24:06 +1200 Subject: [PATCH 413/488] Mention in INSTALL that using plugins may need -rdynamic --- INSTALL | 3 +++ 1 file changed, 3 insertions(+) diff --git a/INSTALL b/INSTALL index 804593443..e0fddd9d7 100644 --- a/INSTALL +++ b/INSTALL @@ -129,6 +129,9 @@ various features and specify further optional external requirements: any enabled pluggable facilities (such as libcurl file access) are built directly within HTSlib. + Programs that are statically linked to a libhts.a with plugins enabled + need to be linked using -rdynamic or a similar linker option. + The repository contains several additional plugins, including the iRODS () file access plugin previously distributed with HTSlib. From 415d2cea57feecdd904f84d8aa25390d96843630 Mon Sep 17 00:00:00 2001 From: kojix2 <2xijok@gmail.com> Date: Tue, 9 May 2023 00:07:46 +0900 Subject: [PATCH 414/488] Fix example in docs for sam_hdr_add_line --- htslib/sam.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/sam.h b/htslib/sam.h index 514a6be04..a0b8438aa 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -485,7 +485,7 @@ int sam_hdr_add_lines(sam_hdr_t *h, const char *lines, size_t len); /// Adds a single line to an existing header. /*! * Specify type and one or more key,value pairs, ending with the NULL key. - * Eg. 
sam_hdr_add_line(h, "SQ", "ID", "foo", "LN", "100", NULL). + * Eg. sam_hdr_add_line(h, "SQ", "SN", "foo", "LN", "100", NULL). * * @param type Type of the added line. Eg. "SQ" * @return 0 on success, -1 on failure From 6125f1c56934745f3a4b81f6e512c11720849b55 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 2 May 2023 12:04:05 +0100 Subject: [PATCH 415/488] Rename aux tag MZ to MN. See https://github.com/samtools/hts-specs/pull/714 --- sam.c | 4 ++-- test/base_mods/{MM-MZf1.sam => MM-MNf1.sam} | 4 ++-- test/base_mods/{MM-MZf2.sam => MM-MNf2.sam} | 4 ++-- test/base_mods/{MM-MZp.sam => MM-MNp.sam} | 4 ++-- test/base_mods/MM-multi.sam | 2 +- test/base_mods/base-mods.tst | 6 +++--- 6 files changed, 12 insertions(+), 12 deletions(-) rename test/base_mods/{MM-MZf1.sam => MM-MNf1.sam} (84%) rename test/base_mods/{MM-MZf2.sam => MM-MNf2.sam} (84%) rename test/base_mods/{MM-MZp.sam => MM-MNp.sam} (84%) diff --git a/sam.c b/sam.c index 8f135c333..05910cf8a 100644 --- a/sam.c +++ b/sam.c @@ -6225,12 +6225,12 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { return -1; } - uint8_t *mi = bam_aux_get(b, "MZ"); + uint8_t *mi = bam_aux_get(b, "MN"); if (mi && bam_aux2i(mi) != b->core.l_qseq) { // bam_aux2i with set errno = EINVAL and return 0 if the tag // isn't integer, but 0 will be a seq-length mismatch anyway so // triggers an error here too. 
- hts_log_error("%s: MM/MZ data length is incompatible with" + hts_log_error("%s: MM/MN data length is incompatible with" " SEQ length", bam_get_qname(b)); return -1; } diff --git a/test/base_mods/MM-MZf1.sam b/test/base_mods/MM-MNf1.sam similarity index 84% rename from test/base_mods/MM-MZf1.sam rename to test/base_mods/MM-MNf1.sam index 35074fd05..f973d274c 100644 --- a/test/base_mods/MM-MZf1.sam +++ b/test/base_mods/MM-MNf1.sam @@ -1,5 +1,5 @@ @SQ SN:I LN:999 -r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:37 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MN:i:37 r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 -r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:i:36 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MN:i:36 diff --git a/test/base_mods/MM-MZf2.sam b/test/base_mods/MM-MNf2.sam similarity index 84% rename from test/base_mods/MM-MZf2.sam rename to test/base_mods/MM-MNf2.sam index 843f93a1b..a88924122 100644 --- a/test/base_mods/MM-MZf2.sam +++ b/test/base_mods/MM-MNf2.sam @@ -1,5 +1,5 @@ @SQ SN:I LN:999 -r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; 
Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MN:i:36 r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 -r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:f:36 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MN:f:36 diff --git a/test/base_mods/MM-MZp.sam b/test/base_mods/MM-MNp.sam similarity index 84% rename from test/base_mods/MM-MZp.sam rename to test/base_mods/MM-MNp.sam index 836a09725..7bdca0f31 100644 --- a/test/base_mods/MM-MZp.sam +++ b/test/base_mods/MM-MNp.sam @@ -1,5 +1,5 @@ @SQ SN:I LN:999 -r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MN:i:36 r1- 16 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:G-m,0,1,4,1,2;G-h,0,7;N-n,17,2; Ml:B:C,230,204,179,153,128,6,159,240,215 r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 -r3 0 I 11 0 10S20M6S * 0 
0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MZ:i:36 +r3 0 I 11 0 10S20M6S * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+mh,2,2,0,0,4,1;N+n,15,2; Ml:B:C,128,0,153,0,0,159,179,0,204,0,230,6,215,240 MN:i:36 diff --git a/test/base_mods/MM-multi.sam b/test/base_mods/MM-multi.sam index 1c7288f50..ac2831bc1 100644 --- a/test/base_mods/MM-multi.sam +++ b/test/base_mods/MM-multi.sam @@ -3,5 +3,5 @@ @CO r2 has them combined together, for example as produced by @CO a joint basecaller which assigns probabilities to all @CO trained events simultaneously. -r1 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MZ:i:36 +r1 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 MN:i:36 r2 0 * 0 0 * * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA * Mm:Z:C+mh,2,2,0,0,4,1;N+n,15; Ml:B:C,77,159,103,133,128,108,154,82,179,57,204,31,240 diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index d246223c8..237f7906c 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -45,6 +45,6 @@ P MM-pileup2.out $pileup_mod < MM-pileup2.sam # Validation testing. We just care about exit status here, but the # test data is a copy of MM-pileup.sam so that suffices too. -P MM-pileup.out $pileup_mod < MM-MZp.sam -F MM-pileup.out $pileup_mod < MM-MZf1.sam -F MM-pileup.out $pileup_mod < MM-MZf2.sam +P MM-pileup.out $pileup_mod < MM-MNp.sam +F MM-pileup.out $pileup_mod < MM-MNf1.sam +F MM-pileup.out $pileup_mod < MM-MNf2.sam From f2d17a7d2c96494337839d257b0f08f28ddbc6fa Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 3 May 2023 15:58:46 +0100 Subject: [PATCH 416/488] Protect against overly large containers. 
It's possible to construct CRAM containers that are extremely large such that building a block of BAM records representing the container overflows due to the size of the combined aux fields. We could change how we construct blocks of data, and work at a more individual read level, but realistically it's just not good form to be handling arbitrarily large containers as they may cause excessive memory issues which bring their own denial-of-service attacks. POTENTIAL SECURITY ISSUE: Note the previous code could overflow cr->aux, which then went negative and caused negative offsets to be passed to memcpy. This would lead to a crash. I cannot see a way to get this to not crash and hence leak data, but it could form a denial of service on a remote server using htslib. --- cram/cram_decode.c | 7 +++++++ cram/cram_structs.h | 18 +++++++++--------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 39869cbdd..47b7ed076 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -2059,6 +2059,13 @@ static int cram_decode_aux(cram_fd *fd, *has_NM = 1; } } + + // We could go to 2^32 fine, but we shouldn't be hitting this anyway, + // and it's protecting against memory hogs too. 
+ if (BLOCK_SIZE(s->aux_blk) > (1u<<31)) { + hts_log_error("CRAM->BAM aux block size overflow"); + goto block_err; + } } return r; diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 0a66d51b9..8b21d29c0 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -500,8 +500,8 @@ typedef struct cram_record { // Auxiliary data int32_t ntags; // TC - int32_t aux; // idx to s->aux_blk - int32_t aux_size; // total size of packed ntags in aux_blk + uint32_t aux; // idx to s->aux_blk + uint32_t aux_size; // total size of packed ntags in aux_blk #ifndef TN_external int32_t TN_idx; // TN; idx to s->TN; #else @@ -509,15 +509,15 @@ typedef struct cram_record { #endif int TL; - int32_t seq; // idx to s->seqs_blk - int32_t qual; // idx to s->qual_blk - int32_t cigar; // idx to s->cigar + uint32_t seq; // idx to s->seqs_blk + uint32_t qual; // idx to s->qual_blk + uint32_t cigar; // idx to s->cigar int32_t ncigar; int64_t aend; // alignment end int32_t mqual; // MQ - int32_t feature; // idx to s->feature - int32_t nfeature; // number of features + uint32_t feature; // idx to s->feature + uint32_t nfeature; // number of features int32_t mate_flags; // MF } cram_record; @@ -623,8 +623,8 @@ struct cram_slice { uint32_t ncigar; cram_feature *features; - int nfeatures; - int afeatures; // allocated size of features + uint32_t nfeatures; + uint32_t afeatures; // allocated size of features #ifndef TN_external // TN field (Tag Name) From f3ad960fa36e263684fd0822a14cb72e8f1b2d5c Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 3 May 2023 16:02:52 +0100 Subject: [PATCH 417/488] Don't create overly large CRAM blocks. Currently CRAM containers can in some circumstances become huge. To prevent this we currently have a limit on the number of sequences (default 10,000) and also on the number of bases (default 500 * number of seqs) so long-read technologies don't put too much in a container. 
However if we have 10k of reads with jointly under 5Mb of sequence that also have over 2GB worth of aux data, then we can trigger the overflow fixed in the previous commit. How do we get >430 bytes worth of aux for every base and >214Kb of aux for every read, in real world data rather than in deliberate stress testing? One possibility is with SEQ "*" (eg secondary alignments from minimap2) on very long-read data with heavy aux tag usage, as this doesn't increase base count at all. The same issue occurs to a lesser extent with supplementaries and hard-clipping. We now create new containers when seq+aux goes beyond the specified limit instead of just seq. In normal circumstances this will have a limited effect. Thanks to Martin Pollard for triggering and reporting this corner case. --- cram/cram_encode.c | 5 +++-- cram/cram_structs.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 5b56aedd5..9797fa7a8 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -3852,7 +3852,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { if (!c->slice || c->curr_rec == c->max_rec || (bam_ref(b) != c->curr_ref && c->curr_ref >= -1) || - (c->s_num_bases >= fd->bases_per_slice)) { + (c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice)) { int slice_rec, curr_rec, multi_seq = fd->multi_seq == 1; int curr_ref = c->slice ? 
c->curr_ref : bam_ref(b); @@ -3885,7 +3885,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { if (CRAM_MAJOR_VERS(fd->version) == 1 || c->curr_rec == c->max_rec || fd->multi_seq != 1 || !c->slice || - c->s_num_bases >= fd->bases_per_slice) { + c->s_num_bases + c->s_aux_bytes >= fd->bases_per_slice) { if (NULL == (c = cram_next_container(fd, b))) { if (fd->ctr) { // prevent cram_close attempting to flush @@ -3997,6 +3997,7 @@ int cram_put_bam_seq(cram_fd *fd, bam_seq_t *b) { c->curr_rec++; c->curr_c_rec++; c->s_num_bases += bam_seq_len(b); + c->s_aux_bytes += bam_get_l_aux(b); c->n_mapped += (bam_flag(b) & BAM_FUNMAP) ? 0 : 1; fd->record_counter++; diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 8b21d29c0..15b7f145b 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -473,6 +473,7 @@ struct cram_container { uint32_t crc32; // CRC32 uint64_t s_num_bases; // number of bases in this slice + uint64_t s_aux_bytes; // number of bytes of aux in BAM uint32_t n_mapped; // Number of mapped reads int ref_free; // whether 'ref' is owned by us and must be freed. From c04f61e9b5747763dcf31c4e26eb3739c16e58a5 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 15 May 2023 14:21:28 +0100 Subject: [PATCH 418/488] Add a missing break statement in cram_codec_to_id. (#1614) Note: This code is currently not used except in the experimental CRAMv4.0. --- cram/cram_codecs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 21240c141..cc5e52b2c 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -3910,7 +3910,8 @@ int cram_codec_to_id(cram_codec *c, int *id2) { switch (c->codec) { case E_CONST_INT: case E_CONST_BYTE: - bnum1 = -2; // no blocks used + bnum1 = -2; // no blocks used + break; case E_HUFFMAN: bnum1 = c->u.huffman.ncodes == 1 ? 
-2 : -1; From e13611a942095dfea0944fe28836934c17c2ef6c Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 11 Apr 2023 12:04:21 +0100 Subject: [PATCH 419/488] Fix fd_seek on pipes on modern MinGW releases. MinGW 12.x started returning non-zero values from lseek when the fd is a pipe. This is unhelpful and it breaks bgzf_check_EOF as seeking to the end is actually seeking to the end of the pipe memory buffer, causing invalid EOFs. (This breaks bcftools CI tests.) Fixes samtools/bcftools#1901 Co-authored-by: John Marshall --- hfile.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/hfile.c b/hfile.c index ebb5b2244..f8d42e49a 100644 --- a/hfile.c +++ b/hfile.c @@ -564,6 +564,16 @@ static ssize_t fd_write(hFILE *fpv, const void *buffer, size_t nbytes) static off_t fd_seek(hFILE *fpv, off_t offset, int whence) { hFILE_fd *fp = (hFILE_fd *) fpv; +#ifdef _WIN32 + // On windows lseek can return non-zero values even on a pipe. Instead + // it's likely to seek somewhere within the pipe memory buffer. + // This breaks bgzf_check_EOF among other things. + if (GetFileType((HANDLE)_get_osfhandle(fp->fd)) == FILE_TYPE_PIPE) { + errno = ESPIPE; + return -1; + } +#endif + return lseek(fp->fd, offset, whence); } From 878cff4a443fe5b3b49aae61953de1bbddeddc8a Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Tue, 23 May 2023 15:36:17 +0100 Subject: [PATCH 420/488] Amalgamate multiple CIGAR ops into single entry. (#1607) Amalgamate multiple CIGAR ops into single entry. Multiple matching (or sequence (mis)matching)) ops (e.g. 10M40M) give a different VCF using BAQ than a single operation of the same length (e.g. 50M). This change compresses the multiple operations into one. 
--- realn.c | 22 +++++++++++++++++++++- sam.c | 2 +- test/realn03.fa | 2 ++ test/realn03.fa.fai | 1 + test/realn03.sam | 4 ++++ test/realn03_exp.sam | 4 ++++ test/test.pl | 3 +++ 7 files changed, 36 insertions(+), 2 deletions(-) create mode 100644 test/realn03.fa create mode 100644 test/realn03.fa.fai create mode 100644 test/realn03.sam create mode 100644 test/realn03_exp.sam diff --git a/realn.c b/realn.c index 5354dee78..d7e8255f8 100644 --- a/realn.c +++ b/realn.c @@ -1,6 +1,6 @@ /* realn.c -- BAQ calculation and realignment. - Copyright (C) 2009-2011, 2014-2016, 2018, 2021 Genome Research Ltd. + Copyright (C) 2009-2011, 2014-2016, 2018, 2021, 2023 Genome Research Ltd. Portions copyright (C) 2009-2011 Broad Institute. Author: Heng Li @@ -268,8 +268,28 @@ int sam_prob_realn(bam1_t *b, const char *ref, hts_pos_t ref_len, int flag) { // tseq,tref are no longer needed, so we can steal them to avoid mallocs uint8_t *left = tseq; uint8_t *rght = tref; + int len = 0; + for (k = 0, x = c->pos, y = 0; k < c->n_cigar; ++k) { int op = cigar[k]&0xf, l = cigar[k]>>4; + + // concatenate alignment matches (including sequence (mis)matches) + // otherwise 50M50M gives a different result to 100M + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + if ((k + 1) < c->n_cigar) { + int next_op = bam_cigar_op(cigar[k + 1]); + + if (next_op == BAM_CMATCH || next_op == BAM_CEQUAL || next_op == BAM_CDIFF) { + len += l; + continue; + } + } + + // last of M/X/= ops + l += len; + len = 0; + } + if (l == 0) continue; if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { // Sanity check running off the end of the sequence diff --git a/sam.c b/sam.c index 05910cf8a..e1cc2988a 100644 --- a/sam.c +++ b/sam.c @@ -5227,7 +5227,7 @@ static inline int resolve_cigar2(bam_pileup1_t *p, hts_pos_t pos, cstate_t *s) uint32_t *cigar = bam_get_cigar(b); int k; // determine the current CIGAR operation - //fprintf(stderr, "%s\tpos=%d\tend=%d\t(%d,%d,%d)\n", bam_get_qname(b), pos, 
s->end, s->k, s->x, s->y); + //fprintf(stderr, "%s\tpos=%ld\tend=%ld\t(%d,%ld,%d)\n", bam_get_qname(b), pos, s->end, s->k, s->x, s->y); if (s->k == -1) { // never processed p->qpos = 0; if (c->n_cigar == 1) { // just one operation, save a loop diff --git a/test/realn03.fa b/test/realn03.fa new file mode 100644 index 000000000..9ac86c11e --- /dev/null +++ b/test/realn03.fa @@ -0,0 +1,2 @@ +>MX +CGTCTACTACG diff --git a/test/realn03.fa.fai b/test/realn03.fa.fai new file mode 100644 index 000000000..dfaa59355 --- /dev/null +++ b/test/realn03.fa.fai @@ -0,0 +1 @@ +MX 11 4 11 12 diff --git a/test/realn03.sam b/test/realn03.sam new file mode 100644 index 000000000..50266242d --- /dev/null +++ b/test/realn03.sam @@ -0,0 +1,4 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:MX LN:11 +M 64 MX 1 60 11M * 0 0 CGTCTCCTACG IIIIIIIIIII +X 64 MX 1 60 5=1X5= * 0 0 CGTCTCCTACG IIIIIIIIIII diff --git a/test/realn03_exp.sam b/test/realn03_exp.sam new file mode 100644 index 000000000..3b608d0c6 --- /dev/null +++ b/test/realn03_exp.sam @@ -0,0 +1,4 @@ +@HD VN:1.6 SO:coordinate +@SQ SN:MX LN:11 +M 64 MX 1 60 11M * 0 0 CGTCTCCTACG IIIIIIIIIII BQ:Z:D@@@@@@@@@D +X 64 MX 1 60 5=1X5= * 0 0 CGTCTCCTACG IIIIIIIIIII BQ:Z:D@@@@@@@@@D diff --git a/test/test.pl b/test/test.pl index 1595557a2..368bd4f18 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1099,6 +1099,9 @@ sub test_realn { # Revert quality values (using data in ZQ tags) test_cmd($opts, cmd => "$test_realn -f $$opts{path}/realn02.fa -i $$opts{path}/realn02_exp-a.sam -o -", out => "realn02_exp.sam"); + + # Make sure multiple matches are treated the same way as a single match of the same length. 
+ test_cmd($opts, cmd => "$test_realn -f $$opts{path}/realn03.fa -e -i $$opts{path}/realn03.sam -o -", out => "realn03_exp.sam"); } sub test_bcf_set_variant_type From 20072b3ed83e961da8a66120368ac591b569bc0e Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 26 May 2023 15:26:00 +0100 Subject: [PATCH 421/488] Update to latest htscodecs - Speed up fqz_qual_stats function - Speed up fqcomp encoding through memory prefetching - Optimise fqzcomp decoder - Remove prefetching from c_simple_mode.h - Autoconf improvements - Updates for 1.5.0 release - Only use the ARM NEON 32-way unrolled rANS on AArch64. - Add include of config.h to test programs. - Add FreeBSD to the CI tests - On AMD, don't always use AVX512 in preference to AVX2. - Add a -b option to change block size for rans4x16 test. --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index d4aed5859..109f06949 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit d4aed585929e2dab9dd8e6a2b74484dfc347c0f2 +Subproject commit 109f069490fca15d85e2d261822c15bc3080db8a From 334c76adaaa2c6c916b9326f3da6ab4981a07c7d Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 26 May 2023 15:53:57 +0100 Subject: [PATCH 422/488] Don't set _POSIX_C_SOURCE for htscodecs tests This caused problems on freebsd, and is no longer needed now the test sources include config.h --- Makefile | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/Makefile b/Makefile index 9b7f7f2f4..0cfec6e6b 100644 --- a/Makefile +++ b/Makefile @@ -732,17 +732,17 @@ htscodecs/tests/tokenise_name3: htscodecs/tests/tokenise_name3_test.o $(HTSCODEC htscodecs/tests/varint: htscodecs/tests/varint_test.o $(HTSCODECS_OBJS) $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread -htscodecs/tests/arith_dynamic_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/arith_dynamic_test.o: CPPFLAGS += -Ihtscodecs htscodecs/tests/arith_dynamic_test.o: 
htscodecs/tests/arith_dynamic_test.c $(htscodecs_arith_dynamic_h) -htscodecs/tests/fqzcomp_qual_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/fqzcomp_qual_test.o: CPPFLAGS += -Ihtscodecs htscodecs/tests/fqzcomp_qual_test.o: htscodecs/tests/fqzcomp_qual_test.c $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) -htscodecs/tests/rANS_static4x16pr_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/rANS_static4x16pr_test.o: CPPFLAGS += -Ihtscodecs htscodecs/tests/rANS_static4x16pr_test.o: htscodecs/tests/rANS_static4x16pr_test.c $(htscodecs_rANS_static4x16_h) -htscodecs/tests/rANS_static_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/rANS_static_test.o: CPPFLAGS += -Ihtscodecs htscodecs/tests/rANS_static_test.o: htscodecs/tests/rANS_static_test.c $(htscodecs_rANS_static_h) -htscodecs/tests/tokenise_name3_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/tokenise_name3_test.o: CPPFLAGS += -Ihtscodecs htscodecs/tests/tokenise_name3_test.o: htscodecs/tests/tokenise_name3_test.c $(htscodecs_tokenise_name3_h) -htscodecs/tests/varint_test.o: CPPFLAGS += -Ihtscodecs -D_POSIX_C_SOURCE=200112L +htscodecs/tests/varint_test.o: CPPFLAGS += -Ihtscodecs htscodecs/tests/varint_test.o: htscodecs/tests/varint_test.c $(htscodecs_varint_h) test/hts_endian.o: test/hts_endian.c config.h $(htslib_hts_endian_h) From abb4c730c6c283018f42500842c734d67deaf619 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Tue, 18 Apr 2023 07:15:00 +0100 Subject: [PATCH 423/488] Set _XOPEN_SOURCE in configure if it's not already set HTSlib uses POSIX and XSI functions, so to be strictly conforming POSIX says it should define _XOPEN_SOURCE before any standard headers are included. Some system headers may use this to enable or disable declarations for these functions. Define it unconditionally in configure, unless it has already been set by the user via CPPFLAGS etc. 
This mirrors what happens if you build HTSlib by typing "make" without running configure first. As it is now set by default, other locations where _XOPEN_SOURCE may be altered are removed. Co-authored-by: Rob Davies --- configure.ac | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index ff2367c1b..b18760da5 100644 --- a/configure.ac +++ b/configure.ac @@ -70,6 +70,19 @@ dnl Flags to treat warnings as errors. These need to be applied to CFLAGS dnl later as they can interfere with some of the tests (notably AC_SEARCH_LIBS) HTS_PROG_CC_WERROR(hts_late_cflags) +# HTSlib uses X/Open-only facilities (M_SQRT2 etc, drand48() etc), and +# various POSIX functions that are provided by various _POSIX_C_SOURCE values +# or by _XOPEN_SOURCE >= 500. It also uses usleep(), which is removed when +# _XOPEN_SOURCE >= 700. Additionally, some definitions may require +# _XOPEN_SOURCE >= 600 on some platforms (snprintf on MinGW, +# PTHREAD_MUTEX_RECURSIVE on some Linux distributions). Hence we set it to 600. + +# Define _XOPEN_SOURCE unless the user has already done so via $CPPFLAGS etc. +AC_CHECK_DECL([_XOPEN_SOURCE], [], + [AC_DEFINE([_XOPEN_SOURCE], [600], [Specify X/Open requirements])], + []) + + dnl Check for various compiler flags to enable SIMD features dnl Options for rANS32x16 sse4.1 version - ssse3 hts_cflags_sse4="" @@ -263,7 +276,9 @@ case $basic_host in # This also sets __USE_MINGW_ANSI_STDIO which in turn makes PRId64, # %lld and %z printf formats work. It also enforces the snprintf to # be C99 compliant so it returns the correct values (in kstring.c). - CPPFLAGS="$CPPFLAGS -D_XOPEN_SOURCE=600" + + # Now set by default, so no need to do it here. + # CPPFLAGS="$CPPFLAGS -D_XOPEN_SOURCE=600" ;; *) host_result="plain .so" @@ -583,7 +598,8 @@ AC_SEARCH_LIBS(regcomp, regex, [libregex=needed], []) dnl Look for PTHREAD_MUTEX_RECURSIVE. dnl This is normally in pthread.h except on some broken glibc implementations. 
-AC_CHECK_DECL(PTHREAD_MUTEX_RECURSIVE, [], [AC_DEFINE([_XOPEN_SOURCE],[600], [Needed for PTHREAD_MUTEX_RECURSIVE])], [[#include ]]) +dnl Now set by default +dnl AC_CHECK_DECL(PTHREAD_MUTEX_RECURSIVE, [], [AC_DEFINE([_XOPEN_SOURCE],[600], [Needed for PTHREAD_MUTEX_RECURSIVE])], [[#include ]]) if test "$s3" = enabled ; then AC_DEFINE([ENABLE_S3], 1, [Define if HTSlib should enable S3 support.]) From ccf7c990c8297ca7089619b629990c79bc32b8ef Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 31 May 2023 14:35:03 +0100 Subject: [PATCH 424/488] Fix trailing space in config.h made by configure To stop check_spaces.pl from tripping up on it. --- configure.ac | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/configure.ac b/configure.ac index b18760da5..3230f3935 100644 --- a/configure.ac +++ b/configure.ac @@ -136,8 +136,7 @@ dnl Propagate HTSlib's unaligned access preference to htscodecs /* Prevent unaligned access in htscodecs SSE4 rANS codec */ #if defined(HTS_ALLOW_UNALIGNED) && HTS_ALLOW_UNALIGNED == 0 #undef UBSAN -#endif - ]) +#endif]) AC_DEFINE([UBSAN],1,[]) ]) AC_SUBST([hts_cflags_sse4]) From e8f773b0e3f82157107ac6eb8d1fb17b2aae5d17 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 2 Jun 2023 11:58:25 +0100 Subject: [PATCH 425/488] Ignore generated config_vars.h file in copyright check --- test/maintainer/check_copyright.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/test/maintainer/check_copyright.pl b/test/maintainer/check_copyright.pl index 43fb5be4e..608182085 100755 --- a/test/maintainer/check_copyright.pl +++ b/test/maintainer/check_copyright.pl @@ -49,6 +49,7 @@ sub check { # Exclusions: my %exclude = map { ("$root/$_", 1) } ( 'config.h', # Auto-generated +'config_vars.h', # Auto-generated 'version.h', # Auto-generated 'cram/rANS_byte.h', # "Public domain" 'os/lzma_stub.h', # "Public domain" From 9936a58154e145f2761db5c6816d700cfb4f3d31 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Wed, 31 May 2023 14:48:59 +0100 Subject: 
[PATCH 426/488] Switch to `/usr/bin/env perl` for all perl scripts Some of them were using it already, and it's slightly more portable on platforms (mainly BSDs) that don't put perl in /usr/bin/. Happily most of them do put env in the same place these days... --- test/compare_sam.pl | 3 ++- test/maintainer/check_copyright.pl | 2 +- test/maintainer/check_spaces.pl | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/test/compare_sam.pl b/test/compare_sam.pl index 23b67d3a4..499cb2390 100755 --- a/test/compare_sam.pl +++ b/test/compare_sam.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl -w +#!/usr/bin/env perl # # Copyright (C) 2013-2018 Genome Research Ltd. # @@ -26,6 +26,7 @@ # Optionally can skip header or ignore specific types of diff. use strict; +use warnings; use Getopt::Long; my %opts; diff --git a/test/maintainer/check_copyright.pl b/test/maintainer/check_copyright.pl index 608182085..22556df01 100755 --- a/test/maintainer/check_copyright.pl +++ b/test/maintainer/check_copyright.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # check_copyright.pl : Basic source file checks for copyright boilerplate # # Author : Rob Davies diff --git a/test/maintainer/check_spaces.pl b/test/maintainer/check_spaces.pl index 81b4ededc..e48518f25 100755 --- a/test/maintainer/check_spaces.pl +++ b/test/maintainer/check_spaces.pl @@ -1,4 +1,4 @@ -#!/usr/bin/perl +#!/usr/bin/env perl # check_spaces.pl : Check source files for tabs and trailing spaces # # Author : Rob Davies From 90af5ee56f84779d27a03fa2aaa59a6bd4ee90cf Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Thu, 1 Jun 2023 10:31:07 +0100 Subject: [PATCH 427/488] Stop the overwriting of the end value. If the end column appeared before the begin column then the end value was being overwritten in begin value initialisation. 
--- tbx.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tbx.c b/tbx.c index d897a21f1..154e2a81b 100644 --- a/tbx.c +++ b/tbx.c @@ -103,10 +103,18 @@ int tbx_parse1(const tbx_conf_t *conf, size_t len, char *line, tbx_intv_t *intv) intv->ss = line + b; intv->se = line + i; } else if (id == conf->bc) { // here ->beg is 0-based. - intv->beg = intv->end = strtoll(line + b, &s, 0); + intv->beg = strtoll(line + b, &s, 0); + + if (conf->bc <= conf->ec) // don't overwrite an already set end point + intv->end = intv->beg; + if ( s==line+b ) return -1; // expected int - if (!(conf->preset&TBX_UCSC)) --intv->beg; - else ++intv->end; + + if (!(conf->preset&TBX_UCSC)) + --intv->beg; + else if (conf->bc <= conf->ec) + ++intv->end; + if (intv->beg < 0) { hts_log_warning("Coordinate <= 0 detected. " "Did you forget to use the -0 option?"); From f613a93feb5f323f08ccfe53549dad8784580e17 Mon Sep 17 00:00:00 2001 From: vasudeva8 <113358286+vasudeva8@users.noreply.github.com> Date: Fri, 9 Jun 2023 11:29:26 +0100 Subject: [PATCH 428/488] Ensure NUL termination of Z/H data in sam_format_aux1; fix base mod state reuse Ensure NUL termination of Z/H data in sam_format_aux1 Avoid failure to get base modifications when reusing the hts_base_mod_state struct. Minor comment updates --- htslib/sam.h | 6 ++++-- sam.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index a0b8438aa..fe5b1ebdc 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1,7 +1,7 @@ /// @file htslib/sam.h /// High-level SAM/BAM/CRAM sequence file operations. /* - Copyright (C) 2008, 2009, 2013-2022 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013-2023 Genome Research Ltd. Copyright (C) 2010, 2012, 2013 Broad Institute. 
Author: Heng Li @@ -1414,7 +1414,7 @@ const char *sam_parse_region(sam_hdr_t *h, const char *s, int *tid, /** @param fp Pointer to the destination file * @param h Pointer to the header structure previously read * @param b Pointer to the record to be written - * @return >= 0 on successfully writing the record, -1 on error + * @return >= 0 on successfully writing the record, -ve on error */ HTSLIB_EXPORT int sam_write1(samFile *fp, const sam_hdr_t *h, const bam1_t *b) HTS_RESULT_USED; @@ -1519,6 +1519,7 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key, r |= kputc_(type, ks) < 0; r |= kputc_(':', ks) < 0; while (s < end && *s) r |= kputc_(*s++, ks) < 0; + r |= kputsn("", 0, ks) < 0; //ensures NUL termination if (s >= end) goto bad_aux; ++s; @@ -2286,6 +2287,7 @@ int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, * @param state The base modification state pointer. * @param mods A supplied array for returning base modifications * @param n_mods The size of the mods array + * @param pos Pointer holding position of modification in sequence * @return The number of modifications found on success, * 0 if no more modifications are present, * -1 on failure. diff --git a/sam.c b/sam.c index e1cc2988a..d77ac7f9d 100644 --- a/sam.c +++ b/sam.c @@ -6215,6 +6215,10 @@ static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; * */ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { + + //reset position, else upcoming calls may fail on seq pos - length comparison + state->seq_pos = 0; + // Read MM and ML tags uint8_t *mm = bam_aux_get(b, "MM"); if (!mm) mm = bam_aux_get(b, "Mm"); @@ -6244,8 +6248,6 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { uint8_t *ml_end = ml ? 
ml+6 + le_to_u32(ml+2) : NULL; if (ml) ml += 6; - state->seq_pos = 0; - // Aggregate freqs of ACGTN if reversed, to get final-delta (later) int freq[16]; if (b->core.flag & BAM_FREVERSE) From 5e0ccef5f3c2864dbd7a966b55ca06282fa3b2c8 Mon Sep 17 00:00:00 2001 From: vasudeva8 <113358286+vasudeva8@users.noreply.github.com> Date: Fri, 9 Jun 2023 11:43:05 +0100 Subject: [PATCH 429/488] Changes to avoid segfault with uncompressed bam (PR #1632) Avoids segfault when writing bam/bcf with mode "wbu" by changing "wbu" to "wb0". This ensures the output file will be properly wrapped in BGZF blocks, even though it's not been compressed. Fixes #1617 --- hts.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/hts.c b/hts.c index b3f6b9a08..f2bc5fcb6 100644 --- a/hts.c +++ b/hts.c @@ -835,7 +835,7 @@ char *hts_format_description(const htsFormat *format) htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) { - char smode[101], *cp, *cp2, *mode_c; + char smode[101], *cp, *cp2, *mode_c, *uncomp = NULL; htsFile *fp = NULL; hFILE *hfile = NULL; char fmt_code = '\0'; @@ -853,8 +853,13 @@ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) fmt_code = 'b'; else if (*cp == 'c') fmt_code = 'c'; - else + else { *cp2++ = *cp; + // Cache the uncompress flag 'u' pos if present + if (!uncomp && (*cp == 'u')) { + uncomp = cp2 - 1; + } + } } mode_c = cp2; *cp2++ = fmt_code; @@ -866,6 +871,11 @@ htsFile *hts_open_format(const char *fn, const char *mode, const htsFormat *fmt) *mode_c = format_to_mode[fmt->format]; } + // Uncompressed bam/bcf is not supported, change 'u' to '0' on write + if (uncomp && *mode_c == 'b' && (strchr(smode, 'w') || strchr(smode, 'a'))) { + *uncomp = '0'; + } + // If we really asked for a compressed text format then mode_c above will // point to nul. We set to 'z' to enable bgzf. 
if (strchr(mode, 'w') && fmt && fmt->compression == bgzf) { From 7f69840c2fbf73dc7601c17a3ade4db676858cf4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Mollier?= Date: Sun, 11 Jun 2023 18:50:59 +0200 Subject: [PATCH 430/488] cram/cram_external.c: fix external htscodecs include This patch fixes a duplicate file extension, which otherwise causes the following build failure when building htslib against an externally built htscodecs: gcc -g -O2 -ffile-prefix-map=/<>=. -fstack-protector-strong -Wformat -Werror=format-security -ffat-lto-objects -ffat-lto-objects -I. -I. -DSAMTOOLS=1 -Wdate-time -D_FORTIFY_SOURCE=2 -c -o cram/cram_external.o cram/cram_external.c cram/cram_external.c:46:10: fatal error: htscodecs/rANS_static4x16.h.h: No such file or directory 46 | #include <htscodecs/rANS_static4x16.h.h> | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --- cram/cram_external.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cram/cram_external.c b/cram/cram_external.c index 26ef3d7d3..7455185ad 100644 --- a/cram/cram_external.c +++ b/cram/cram_external.c @@ -43,7 +43,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #if defined(HAVE_EXTERNAL_LIBHTSCODECS) -#include <htscodecs/rANS_static4x16.h.h> +#include <htscodecs/rANS_static4x16.h> #else #include "../htscodecs/htscodecs/rANS_static4x16.h" #endif From 79b3a42db9059537318d85876623860d212ab159 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=89tienne=20Mollier?= Date: Sun, 11 Jun 2023 19:49:42 +0200 Subject: [PATCH 431/488] htslib-s3-plugin.7: fix whatis entry When preparing the htslib 1.17 upload in Debian, lintian caught a bad-whatis-entry issue. Looking closer, whatis(1) and apropos(1) commands fail to locate htslib-s3-plugin(7) manual and the parsing of the page fails with: $ lexgrog htslib-s3-plugin.7 htslib-s3-plugin.7: parse failed It seems to stem from the two words "s3 plugin" in the name of the manual, instead of having a single word as needed. 
This change names the manual page "htslib-s3-plugin" instead, making the mandb, the lexgrog parser, and lintian happy: $ lexgrog htslib-s3-plugin.7 htslib-s3-plugin.7: "htslib-s3-plugin - htslib AWS S3 plugin" The name could also simply be "s3-plugin", but having htslib in the name felt more apropos. :) --- htslib-s3-plugin.7 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib-s3-plugin.7 b/htslib-s3-plugin.7 index b37eacac3..eee6fb27c 100644 --- a/htslib-s3-plugin.7 +++ b/htslib-s3-plugin.7 @@ -1,6 +1,6 @@ .TH htslib-s3-plugin 7 "21 February 2023" "htslib-1.17" "Bioinformatics tools" .SH NAME -s3 plugin \- htslib AWS S3 plugin +htslib-s3-plugin \- htslib AWS S3 plugin .\" .\" Copyright (C) 2021-2022 Genome Research Ltd. .\" From c11aebe327d4a850fb9cb0af82655539ec768a68 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 21 Jun 2023 21:26:33 +0200 Subject: [PATCH 432/488] Allow repeated calls of bcf_sr_set_regions (PR #1624) and make repeated bcf_sr_seek()+next_line() calls consistent. Resolves #1623 and https://github.com/samtools/bcftools/issues/1918 --- htslib/synced_bcf_reader.h | 8 +++++--- synced_bcf_reader.c | 14 ++++++++++---- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index 78e9a0b4a..bbe5ea2ba 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -1,7 +1,7 @@ /// @file htslib/synced_bcf_reader.h /// Stream through multiple VCF files. /* - Copyright (C) 2012-2017, 2019-2021 Genome Research Ltd. + Copyright (C) 2012-2017, 2019-2023 Genome Research Ltd. Author: Petr Danecek @@ -306,8 +306,10 @@ int bcf_sr_set_samples(bcf_srs_t *readers, const char *samples, int is_file); * Targets (but not regions) can be prefixed with "^" to request logical complement, * for example "^X,Y,MT" indicates that sequences X, Y and MT should be skipped. 
* - * API note: bcf_sr_set_regions/bcf_sr_set_targets MUST be called before the - * first call to bcf_sr_add_reader(). + * API notes: + * - bcf_sr_set_targets MUST be called before the first call to bcf_sr_add_reader() + * - calling bcf_sr_set_regions AFTER readers have been initialized will + * reposition the readers and discard all previous regions. */ HTSLIB_EXPORT int bcf_sr_set_targets(bcf_srs_t *readers, const char *targets, int is_file, int alleles); diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 23e0ecaef..702f260ee 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1,6 +1,6 @@ /* synced_bcf_reader.c -- stream through multiple VCF files. - Copyright (C) 2012-2021 Genome Research Ltd. + Copyright (C) 2012-2023 Genome Research Ltd. Author: Petr Danecek @@ -76,6 +76,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str); static int _regions_match_alleles(bcf_sr_regions_t *reg, int als_idx, bcf1_t *rec); static void _regions_sort_and_merge(bcf_sr_regions_t *reg); static int _bcf_sr_regions_overlap(bcf_sr_regions_t *reg, const char *seq, hts_pos_t start, hts_pos_t end, int missed_reg_handler); +static void bcf_sr_seek_start(bcf_srs_t *readers); char *bcf_sr_strerror(int errnum) { @@ -187,8 +188,10 @@ int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file) { if ( readers->nreaders || readers->regions ) { - hts_log_error("Must call bcf_sr_set_regions() before bcf_sr_add_reader()"); - return -1; + if ( readers->regions ) bcf_sr_regions_destroy(readers->regions); + readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2); + bcf_sr_seek_start(readers); + return 0; } readers->regions = bcf_sr_regions_init(regions,is_file,0,1,-2); @@ -676,7 +679,6 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) hts_log_error("This should never happen, just to keep clang compiler happy: %d",BCF_SR_AUX(files)->targets_overlap); exit(1); } - if ( beg <= files->regions->prev_end || end < 
files->regions->start || beg > files->regions->end ) continue; } @@ -843,7 +845,11 @@ static void bcf_sr_seek_start(bcf_srs_t *readers) for (i=0; i<reg->nseqs; i++) reg->regs[i].creg = -1; reg->iseq = 0; + reg->start = -1; + reg->end = -1; reg->prev_seq = -1; + reg->prev_start = -1; + reg->prev_end = -1; } From 7de2df29cb1c9f6b807b8e4b8bc09a1313303db3 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 4 May 2023 12:22:12 +0100 Subject: [PATCH 433/488] Change bounds checking in probaln_glocal In 3 places when filling out forwards and backwards arrays, the "u" array index has bounds checks of "u < 3 || u >= i_dim-3". Understanding this code is tricky however! My hypothesis is that the upper bounds check here is because we use u, u+1 and u+2 in array indices, and we iterate with "k <= l_ref" so we can access one beyond the end of the array. However the arrays are allocated to be dimension (l_query+1)*i_dim, so (assuming correctness of l_ref vs l_query in bw/i_dim calculation) we have compensated for this over-step already. This has been validated with address sanitiser. The effect of the i_dim-3 limit is that having band width equal to query length causes the final state element to be incorrectly labelled as an insertion. This hypothesis may however be incorrect, as the lower bound "u < 3" also seems redundant, yet changing this to "u < 0" does give different quality scores in about 1 in 4000 sequences (tested on 10 million illumina short read BAQ calculations). Hence for now this is left unchanged. In normal behaviour using a band, tested using "samtools calmd -r -E" to generate BQ tags, this commit does not change output. 
Fixes #1605 --- probaln.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/probaln.c b/probaln.c index 192f4b751..8a60372b3 100644 --- a/probaln.c +++ b/probaln.c @@ -245,10 +245,24 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, { // f[l_query+1] double sum; double M = 1./s[l_query]; + // Note this goes up to <= l_ref, meaning we are accessing 1 beyond + // the end of the sequence. However we allocated above with + // (l_query+1)*i_dim (plus appropriate l_ref vs l_query in band width) + // so this should be sufficient. + // + // This fixes Issue #1605 where band width equal to sequence length + // gives incorrect alignments, due to the last value not being filled + // out correctly. + // + // I am unsure why the limit was previously set at u >= i_dim - 3, but + // can only conjecture it was due to forgetting the l_query+1 alloc. + // I am also unsure why "u < 3" is used instead of "u < 0", however + // changing that does change behaviour for common usage (unlike + // "idim - 3" to "idim"). for (k = 1, sum = 0.; k <= l_ref; ++k) { int u; set_u(u, bw, l_query, k); - if (u < 3 || u >= i_dim - 3) continue; + if (u < 3 || u >= i_dim) continue; sum += M*f[l_query*i_dim + u+0] * sM + M*f[l_query*i_dim + u+1] * sI; } s[l_query+1] = sum; // the last scaling factor @@ -272,7 +286,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int u; double *bi = &b[l_query*i_dim]; set_u(u, bw, l_query, k); - if (u < 3 || u >= i_dim - 3) continue; + if (u < 3 || u >= i_dim) continue; bi[u+0] = sM / s[l_query] / s[l_query+1]; bi[u+1] = sI / s[l_query] / s[l_query+1]; } // b[l_query-1..1] @@ -350,7 +364,7 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, int u; double e = (ref[k - 1] > 3 || query[0] > 3)? 1. : ref[k - 1] == query[0]? 1. 
- qual[0] : qual[0] * EM; set_u(u, bw, 1, k); - if (u < 3 || u >= i_dim - 3) continue; + if (u < 3 || u >= i_dim) continue; sum += e * b[1*i_dim + u+0] * bM + EI * b[1*i_dim + u+1] * bI; } set_u(k, bw, 0, 0); From b52f3fad9f0340ce29f098d048f77e2d8f991e30 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Tue, 13 Jun 2023 17:37:10 +0100 Subject: [PATCH 434/488] Adjust comments in probaln_glocal() Adds a comment explaining that the f[] and b[] arrays count positions from 1, allowing 0 to be used to more easily handle the edges of the alignment matrix. Changes the comment explaining the line: if (u < 3 || u >= i_dim) continue; used in some of the loops over f[] and b[]. While it does prevent overstepping the array boundaries, its main function is to select the parts over which the scores have previously been calculated. A change in 5d7a7823 to fix excess memory usage got the high end slightly wrong (using i_dim - 3). When the query sequence length was less than the band width, this could lead to the last column being incorrectly missed out from parts of the calculation. --- probaln.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/probaln.c b/probaln.c index 8a60372b3..c841c7522 100644 --- a/probaln.c +++ b/probaln.c @@ -140,6 +140,13 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, bM = (1 - c->d) / l_ref; // (bM+bI)*l_ref==1 bI = c->d / l_ref; + // f[] and b[] are 2-d arrays of three scores, with rows along the + // query and columns across the band. The first query base and + // first band position appear at index 1 allowing edge conditions + // to be stored in index 0. Hence the loops below appear to use + // 1-based indexing instead of 0-based as you'd normally expect in C, + // and the sequences are accessed using query[i - 1] and ref[k - 1]. 
+ /*** forward ***/ // f[0] set_u(k, bw, 0, 0); @@ -245,20 +252,19 @@ int probaln_glocal(const uint8_t *ref, int l_ref, const uint8_t *query, { // f[l_query+1] double sum; double M = 1./s[l_query]; - // Note this goes up to <= l_ref, meaning we are accessing 1 beyond - // the end of the sequence. However we allocated above with - // (l_query+1)*i_dim (plus appropriate l_ref vs l_query in band width) - // so this should be sufficient. - // - // This fixes Issue #1605 where band width equal to sequence length - // gives incorrect alignments, due to the last value not being filled - // out correctly. - // - // I am unsure why the limit was previously set at u >= i_dim - 3, but - // can only conjecture it was due to forgetting the l_query+1 alloc. - // I am also unsure why "u < 3" is used instead of "u < 0", however - // changing that does change behaviour for common usage (unlike - // "idim - 3" to "idim"). + // Note that this goes from 1 to l_ref inclusive, but as the + // alignment is banded not all of the values will have been + // calculated (the rest are taken as 0), so the summation + // actually goes over the values set in the last iteration of + // the previous loop (when i = l_query). For some reason lost to + // time this is done by looking for valid values of 'u' instead of + // working out 'beg' and 'end'. + + // From HTSlib 1.8 to 1.17, the endpoint was incorrectly set + // to i_dim - 3. When l_query <= bandwidth, this caused the last + // column to be missed, and if l_ref == l_query then a match at the end + // could incorrectly be reported as an insertion. See #1605. 
+ for (k = 1, sum = 0.; k <= l_ref; ++k) { int u; set_u(u, bw, l_query, k); From c3a6fcdafecf320f65fd80acb1616f2fe81f1273 Mon Sep 17 00:00:00 2001 From: pd3 Date: Mon, 5 Jun 2023 15:19:04 +0200 Subject: [PATCH 435/488] Add support for non-standard chromosome names containing [:-] characters Note hts_parse_region() cannot be used because it requires the header and without the header the caller does not learn the contig name. Resolves #1620 --- htslib/synced_bcf_reader.h | 2 ++ synced_bcf_reader.c | 32 +++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index bbe5ea2ba..9a6b48438 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -338,6 +338,8 @@ int bcf_sr_set_regions(bcf_srs_t *readers, const char *regions, int is_file); * supply 'from' in place of 'to'. When 'to' is negative, first * abs(to) will be attempted and if that fails, 'from' will be used * instead. + * If chromosome name contains the characters ':' or '-', it should + * be put in curly brackets, for example as "{weird-chr-name:1-2}:1000-2000" * * The bcf_sr_regions_t struct returned by a successful call should be freed * via bcf_sr_regions_destroy() when it is no longer needed. diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index 702f260ee..d24e5f444 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1032,6 +1032,9 @@ void _regions_sort_and_merge(bcf_sr_regions_t *reg) } // File name or a list of genomic locations. If file name, NULL is returned. +// Recognises regions in the form chr, chr:pos, chr:beg-end, chr:beg-, {weird-chr-name}:pos. +// Cannot use hts_parse_region() as that requires the header and if header is not present, +// wouldn't learn the chromosome name. 
static bcf_sr_regions_t *_regions_init_string(const char *str) { bcf_sr_regions_t *reg = (bcf_sr_regions_t *) calloc(1, sizeof(bcf_sr_regions_t)); @@ -1043,9 +1046,23 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) hts_pos_t from, to; while ( 1 ) { - while ( *ep && *ep!=',' && *ep!=':' ) ep++; tmp.l = 0; - kputsn(sp,ep-sp,&tmp); + if ( *ep=='{' ) + { + while ( *ep && *ep!='}' ) ep++; + if ( !*ep ) + { + hts_log_error("Could not parse the region, mismatching braces in: \"%s\"", str); + goto exit_nicely; + } + ep++; + kputsn(sp+1,ep-sp-2,&tmp); + } + else + { + while ( *ep && *ep!=',' && *ep!=':' ) ep++; + kputsn(sp,ep-sp,&tmp); + } if ( *ep==':' ) { sp = ep+1; @@ -1053,7 +1070,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( sp==ep ) { hts_log_error("Could not parse the region(s): %s", str); - free(reg); free(tmp.s); return NULL; + goto exit_nicely; } if ( !*ep || *ep==',' ) { @@ -1064,7 +1081,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( *ep!='-' ) { hts_log_error("Could not parse the region(s): %s", str); - free(reg); free(tmp.s); return NULL; + goto exit_nicely; } ep++; sp = ep; @@ -1072,7 +1089,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( *ep && *ep!=',' ) { hts_log_error("Could not parse the region(s): %s", str); - free(reg); free(tmp.s); return NULL; + goto exit_nicely; } if ( sp==ep ) to = MAX_CSI_COOR-1; _regions_add(reg, tmp.s, from, to); @@ -1088,6 +1105,11 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) } free(tmp.s); return reg; + +exit_nicely: + bcf_sr_regions_destroy(reg); + free(tmp.s); + return NULL; } // ichr,ifrom,ito are 0-based; From 3c36c9b9562966ddc00d487447a42c622323ec8b Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 15 Jun 2023 10:18:45 +0100 Subject: [PATCH 436/488] An attempt to parse malformatted region such as {1:1}-2 should fail --- synced_bcf_reader.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff 
--git a/synced_bcf_reader.c b/synced_bcf_reader.c index d24e5f444..a43ab15ae 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1096,12 +1096,17 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) if ( !*ep ) break; sp = ep; } - else + else if ( !*ep || *ep==',' ) { if ( tmp.l ) _regions_add(reg, tmp.s, -1, -1); if ( !*ep ) break; sp = ++ep; } + else + { + hts_log_error("Could not parse the region(s): %s", str); + goto exit_nicely; + } } free(tmp.s); return reg; From 28a8082c096b8ecdc79ddbc33a032c63854c7186 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 19 Jun 2023 16:25:54 +0100 Subject: [PATCH 437/488] Expand test-bcf-sr.c capabilities Add -O,--output-fmt option so it can write vcf or bcf as well as its original summary format. Add -o,--output option so it's possible to write to a file without shell redirection. Add --args option so input files can be listed directly on the command line instead of via a fofn, to make basic tests easier. Add -r,--regions and -t,--targets options, which behave the same as the equivalents in `bcftools view`. Add the --no-index option to the usage text. Simplify writing the original format. Everything can be sent directly to the output file without going via a kstring. The output writing parts are also moved into separate functions to keep main() from getting too big. Add a few extra error checks. Call exit(EXIT_FAILURE) on failure, not exit(-1). Make the -h option return success. --- test/test-bcf-sr.c | 201 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 159 insertions(+), 42 deletions(-) diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c index ee64afe41..e91af7f28 100644 --- a/test/test-bcf-sr.c +++ b/test/test-bcf-sr.c @@ -28,12 +28,17 @@ #include +#include #include #include #include #include +#include +#include #include "../htslib/synced_bcf_reader.h" +#include "../htslib/hts.h" +#include "../htslib/vcf.h" void error(const char *format, ...) 
{ @@ -41,16 +46,78 @@ void error(const char *format, ...) va_start(ap, format); vfprintf(stderr, format, ap); va_end(ap); - exit(-1); + exit(EXIT_FAILURE); } -void usage(void) +void usage(int exit_code) { fprintf(stderr, "Usage: test-bcf-sr [OPTIONS] vcf-list.txt\n"); + fprintf(stderr, " test-bcf-sr [OPTIONS] -args file1.bcf [...]\n"); fprintf(stderr, "Options:\n"); + fprintf(stderr, " --args pass filenames directly in argument list\n"); + fprintf(stderr, " --no-index allow streaming\n"); + fprintf(stderr, " -o, --output output file (stdout if not set)\n"); + fprintf(stderr, " -O, --output-fmt fmt: vcf,bcf,summary\n"); fprintf(stderr, " -p, --pair logic: snps,indels,both,snps+ref,indels+ref,both+ref,exact,some,all\n"); + fprintf(stderr, " -r, --regions comma-separated list of regions\n"); + fprintf(stderr, " -t, --targets comma-separated list of targets\n"); fprintf(stderr, "\n"); - exit(-1); + exit(exit_code); +} + +void write_summary_format(bcf_srs_t *sr, FILE *out) +{ + int n, i, j; + while ((n = bcf_sr_next_line(sr)) > 0) { + for (i=0; i<sr->nreaders; i++) + { + if ( !bcf_sr_has_line(sr,i) ) continue; + bcf1_t *rec = bcf_sr_get_line(sr, i); + if (!rec) error("bcf_sr_get_line() unexpectedly returned NULL\n"); + fprintf(out, "%s:%"PRIhts_pos, + bcf_seqname_safe(bcf_sr_get_header(sr,i),rec),rec->pos+1); + break; + } + + for (i=0; i<sr->nreaders; i++) + { + fprintf(out, "\t"); + + if ( !bcf_sr_has_line(sr,i) ) + { + fprintf(out, "%s","-"); + continue; + } + + bcf1_t *rec = bcf_sr_get_line(sr, i); + if (!rec) error("bcf_sr_get_line() unexpectedly returned NULL\n"); + fprintf(out, "%s", rec->n_allele > 1 ? 
rec->d.allele[1] : "."); + for (j=2; j<rec->n_allele; j++) + { + fprintf(out, ",%s", rec->d.allele[j]); + } + } + fprintf(out, "\n"); + } +} + +void write_vcf_bcf_format(bcf_srs_t *sr, bcf_hdr_t *hdr, vcfFile *vcf_out, + const char *fmt_type) +{ + int i, n; + if (bcf_hdr_write(vcf_out, hdr) != 0) + error("Couldn't write %s header\n", fmt_type); + + while ((n = bcf_sr_next_line(sr)) > 0) { + for (i=0; i<sr->nreaders; i++) + { + if ( !bcf_sr_has_line(sr,i) ) continue; + bcf1_t *rec = bcf_sr_get_line(sr, i); + if (!rec) error("bcf_sr_get_line() unexpectedly returned NULL\n"); + if (vcf_write(vcf_out, hdr, rec) < 0) + error("vcf_write() failed\n"); + } + } } int main(int argc, char *argv[]) @@ -58,16 +125,31 @@ int main(int argc, char *argv[]) static struct option loptions[] = { {"help",no_argument,NULL,'h'}, + {"output-fmt",required_argument,NULL,'O'}, {"pair",required_argument,NULL,'p'}, + {"regions",required_argument,NULL,'r'}, + {"targets",required_argument,NULL,'t'}, {"no-index",no_argument,NULL,1000}, + {"args",no_argument,NULL,1001}, {NULL,0,NULL,0} }; - int c, pair = 0, use_index = 1; - while ((c = getopt_long(argc, argv, "p:h", loptions, NULL)) >= 0) + int c, pair = 0, use_index = 1, use_fofn = 1; + enum htsExactFormat out_fmt = text_format; // for original pos + alleles + const char *out_fn = NULL, *regions = NULL, *targets = NULL; + while ((c = getopt_long(argc, argv, "o:O:p:r:t:h", loptions, NULL)) >= 0) { switch (c) { + case 'o': + out_fn = optarg; + break; + case 'O': + if (!strcasecmp(optarg, "vcf")) out_fmt = vcf; + else if (!strcasecmp(optarg, "bcf")) out_fmt = bcf; + else if (!strcasecmp(optarg, "summary")) out_fmt = text_format; + else error("Unknown output format \"%s\"\n", optarg); + break; case 'p': if ( !strcmp(optarg,"snps") ) pair |= BCF_SR_PAIR_SNPS; else if ( !strcmp(optarg,"snp+ref") ) pair |= BCF_SR_PAIR_SNPS|BCF_SR_PAIR_SNP_REF; @@ -83,68 +165,103 @@ int main(int argc, char *argv[]) else if ( !strcmp(optarg,"exact") ) pair = BCF_SR_PAIR_EXACT; else 
error("The --pair logic \"%s\" not recognised.\n", optarg); break; + case 'r': + regions = optarg; + break; + case 't': + targets = optarg; + break; case 1000: use_index = 0; break; - default: usage(); + case 1001: + use_fofn = 0; + break; + case 'h': + usage(EXIT_SUCCESS); + default: usage(EXIT_FAILURE); } } if ( !pair ) pair = BCF_SR_PAIR_EXACT; - if ( optind == argc ) usage(); + if ( optind == argc ) usage(EXIT_FAILURE); - int i, j, n, nvcf; - char **vcf = hts_readlist(argv[optind], 1, &nvcf); - if ( !vcf ) error("Could not parse %s\n", argv[optind]); + int i, nvcf; + char **vcfs = NULL; + if (use_fofn) { + vcfs = hts_readlist(argv[optind], 1, &nvcf); + if ( !vcfs ) error("Could not parse %s\n", argv[optind]); + } else { + vcfs = &argv[optind]; + nvcf = argc - optind; + } bcf_srs_t *sr = bcf_sr_init(); + if (!sr) error("bcf_sr_init() failed\n"); bcf_sr_set_opt(sr, BCF_SR_PAIR_LOGIC, pair); if (use_index) { bcf_sr_set_opt(sr, BCF_SR_REQUIRE_IDX); } else { bcf_sr_set_opt(sr, BCF_SR_ALLOW_NO_IDX); } - for (i=0; ierrnum)); - kstring_t str = {0,0,0}; - while ( (n=bcf_sr_next_line(sr)) ) + if (regions) { - for (i=0; inreaders; i++) + if (bcf_sr_set_regions(sr, regions, 0) != 0) + error("Failed to set regions\n"); + } + + if (targets) + { + if (bcf_sr_set_targets(sr, targets, 0, 0) != 0) + error("Failed to set targets\n"); + } + + for (i=0; ierrnum)); + + if (!sr->readers || sr->nreaders < 1) + error("No readers set, even though one was added\n"); + + if (out_fmt == text_format) { + FILE *out = stdout; + if (out_fn) { - if ( !bcf_sr_has_line(sr,i) ) continue; - bcf1_t *rec = bcf_sr_get_line(sr, i); - printf("%s:%"PRIhts_pos, bcf_seqname_safe(bcf_sr_get_header(sr,i),rec),rec->pos+1); - break; + out = fopen(out_fn, "w"); + if (!out) error("Couldn't open \"%s\" for writing: %s\n", + out_fn, strerror(errno)); } - - for (i=0; inreaders; i++) + write_summary_format(sr, out); + if (out_fn) { - printf("\t"); + if (fclose(out) != 0) + error("Error on closing %s : %s\n", + 
out_fn, strerror(errno)); + } + } else { + const char *fmt_type = out_fmt == vcf ? "VCF" : "BCF"; - if ( !bcf_sr_has_line(sr,i) ) - { - printf("%s","-"); - continue; - } + bcf_hdr_t *hdr = bcf_sr_get_header(sr, 0); + if (!hdr) error("%s output, but don't have a header\n", fmt_type); - str.l = 0; - bcf1_t *rec = bcf_sr_get_line(sr, i); - kputs(rec->n_allele > 1 ? rec->d.allele[1] : ".", &str); - for (j=2; jn_allele; j++) - { - kputc(',', &str); - kputs(rec->d.allele[j], &str); - } - printf("%s",str.s); - } - printf("\n"); + if (!out_fn) { out_fn = "-"; } + vcfFile *vcf_out = vcf_open(out_fn, out_fmt == vcf ? "w" : "wb"); + if (!vcf_out) error("Couldn't open \"%s\" for writing: %s\n", + out_fn, strerror(errno)); + write_vcf_bcf_format(sr, hdr, vcf_out, fmt_type); + if (vcf_close(vcf_out) != 0) + error("Error on closing \"%s\"\n", out_fn); } - free(str.s); + if (sr->errnum) error("Synced reader error: %s\n", + bcf_sr_strerror(sr->errnum)); + bcf_sr_destroy(sr); - for (i=0; i Date: Tue, 20 Jun 2023 12:25:05 +0100 Subject: [PATCH 438/488] Add synced reader region tests, and move no-index tests Add some tests to exercise the --regions / --targets synced reader options. Currently this only includes tests for the chromosomes with [:-] characters in the name, but it could be expanded easily to do others. Test files have been borrowed from pull request samtools/bcftools#1938. Move the synced reader no-index tests from test-bcf-sr.pl to test.pl. The former isn't a good place for them as it gets called 10 times, but the no-index test only needs to run once. It also allows the code running the test to be simplified a bit. Also fix the exit code on test-bcf-sr.pl failure from -1 to 1. 
Co-authored-by: Petr Danecek --- test/bcf-sr/weird-chr-names.1.out | 9 +++ test/bcf-sr/weird-chr-names.2.out | 8 +++ test/bcf-sr/weird-chr-names.3.out | 9 +++ test/bcf-sr/weird-chr-names.4.out | 8 +++ test/bcf-sr/weird-chr-names.5.out | 9 +++ test/bcf-sr/weird-chr-names.6.out | 8 +++ test/bcf-sr/weird-chr-names.vcf | 12 ++++ test/test-bcf-sr.pl | 60 +--------------- test/test.pl | 109 ++++++++++++++++++++++++++++++ 9 files changed, 173 insertions(+), 59 deletions(-) create mode 100644 test/bcf-sr/weird-chr-names.1.out create mode 100644 test/bcf-sr/weird-chr-names.2.out create mode 100644 test/bcf-sr/weird-chr-names.3.out create mode 100644 test/bcf-sr/weird-chr-names.4.out create mode 100644 test/bcf-sr/weird-chr-names.5.out create mode 100644 test/bcf-sr/weird-chr-names.6.out create mode 100644 test/bcf-sr/weird-chr-names.vcf diff --git a/test/bcf-sr/weird-chr-names.1.out b/test/bcf-sr/weird-chr-names.1.out new file mode 100644 index 000000000..5705c7575 --- /dev/null +++ b/test/bcf-sr/weird-chr-names.1.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . C T . . . +1 2 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.2.out b/test/bcf-sr/weird-chr-names.2.out new file mode 100644 index 000000000..980818a71 --- /dev/null +++ b/test/bcf-sr/weird-chr-names.2.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.3.out b/test/bcf-sr/weird-chr-names.3.out new file mode 100644 index 000000000..5b3ac8e18 --- /dev/null +++ b/test/bcf-sr/weird-chr-names.3.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1 1 . C T . . . +1:1 2 . C T . . . 
diff --git a/test/bcf-sr/weird-chr-names.4.out b/test/bcf-sr/weird-chr-names.4.out new file mode 100644 index 000000000..0d9e274ab --- /dev/null +++ b/test/bcf-sr/weird-chr-names.4.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1 1 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.5.out b/test/bcf-sr/weird-chr-names.5.out new file mode 100644 index 000000000..6cb41e14f --- /dev/null +++ b/test/bcf-sr/weird-chr-names.5.out @@ -0,0 +1,9 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1-1 1 . C T . . . +1:1-1 2 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.6.out b/test/bcf-sr/weird-chr-names.6.out new file mode 100644 index 000000000..a707ed85c --- /dev/null +++ b/test/bcf-sr/weird-chr-names.6.out @@ -0,0 +1,8 @@ +##fileformat=VCFv4.3 +##FILTER= +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1:1-1 1 . C T . . . diff --git a/test/bcf-sr/weird-chr-names.vcf b/test/bcf-sr/weird-chr-names.vcf new file mode 100644 index 000000000..c367be477 --- /dev/null +++ b/test/bcf-sr/weird-chr-names.vcf @@ -0,0 +1,12 @@ +##fileformat=VCFv4.3 +##reference=ref.fa +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +1 1 . C T . . . +1 2 . C T . . . +1:1 1 . C T . . . +1:1 2 . C T . . . +1:1-1 1 . C T . . . +1:1-1 2 . C T . . . 
diff --git a/test/test-bcf-sr.pl b/test/test-bcf-sr.pl index cd9859c14..2e290cb3a 100755 --- a/test/test-bcf-sr.pl +++ b/test/test-bcf-sr.pl @@ -34,7 +34,6 @@ my $opts = parse_params(); run_test($opts); -test_no_index($opts); exit; @@ -58,7 +57,7 @@ sub error " -v, --verbose \n", " -h, -?, --help This help message\n", "\n"; - exit -1; + exit 1; } sub parse_params { @@ -572,60 +571,3 @@ sub pairing_score } return (1<<(28+$min)) + $cnt; } - -sub test_no_index { - my ($opts) = @_; - - my $vcfdir = "$FindBin::Bin/bcf-sr"; - if ($^O =~ /^msys/) { - $vcfdir = `cygpath -w $vcfdir`; - $vcfdir =~ s/\r?\n//; - $vcfdir =~ s/\\/\\\\/g; - } - - # Positive test - open(my $fh, '>', "$$opts{tmp}/no_index_1.txt") - || error("$$opts{tmp}/no_index_1.txt : $!"); - print $fh "$vcfdir/merge.noidx.a.vcf\n"; - print $fh "$vcfdir/merge.noidx.b.vcf\n"; - print $fh "$vcfdir/merge.noidx.c.vcf\n"; - close($fh) || error("$$opts{tmp}/no_index_1.txt : $!"); - - my $cmd = "$FindBin::Bin/test-bcf-sr --no-index -p all $$opts{tmp}/no_index_1.txt > $$opts{tmp}/no_index_1.out 2> $$opts{tmp}/no_index_1.err"; - my ($ret) = _cmd($cmd); - if ($ret) { - error("The command failed [$ret]: $cmd\n"); - } - - if ($^O =~ /^msys/) { - cmd("diff --strip-trailing-cr $vcfdir/merge.noidx.abc.expected.out $$opts{tmp}/no_index_1.out"); - } else { - cmd("cmp $vcfdir/merge.noidx.abc.expected.out $$opts{tmp}/no_index_1.out"); - } - - # Check bad input detection - - my @bad_file_tests = (["out-of-order header", - ["merge.noidx.a.vcf", "merge.noidx.hdr_order.vcf"]], - ["out-of-order records", - ["merge.noidx.a.vcf", "merge.noidx.rec_order.vcf"]], - ["out-of-order records", - ["merge.noidx.rec_order.vcf", "merge.noidx.a.vcf"]]); - my $count = 2; - foreach my $test_params (@bad_file_tests) { - my ($badness, $inputs) = @$test_params; - open($fh, '>', "$$opts{tmp}/no_index_$count.txt") - || error("$$opts{tmp}/no_index_$count.txt : $!"); - foreach my $input (@$inputs) { - print $fh "$vcfdir/$input\n"; - } - close($fh) || 
error("$$opts{tmp}/no_index_$count.txt : $!"); - - $cmd = "$FindBin::Bin/test-bcf-sr --no-index -p all $$opts{tmp}/no_index_$count.txt > $$opts{tmp}/no_index_$count.out 2> $$opts{tmp}/no_index_$count.err"; - my ($ret) = _cmd($cmd); - if ($ret == 0) { - error("Failed to detect $badness: $cmd\n"); - } - $count++; - } -} diff --git a/test/test.pl b/test/test.pl index 368bd4f18..f8e94faa3 100755 --- a/test/test.pl +++ b/test/test.pl @@ -54,6 +54,8 @@ test_vcf_sweep($opts,out=>'test-vcf-sweep.out'); test_vcf_various($opts); test_bcf_sr_sort($opts); +test_bcf_sr_no_index($opts); +test_bcf_sr_range($opts); test_command($opts,cmd=>'test-bcf-translate -',out=>'test-bcf-translate.out'); test_convert_padded_header($opts); test_rebgzip($opts); @@ -1033,6 +1035,113 @@ sub test_bcf_sr_sort } } +sub test_bcf_sr_no_index { + my ($opts) = @_; + + my $test = "test_bcf_sr_no_index"; + + my $vcfdir = "$$opts{path}/bcf-sr"; + + # Positive test + test_cmd($opts, out => "bcf-sr/merge.noidx.abc.expected.out", + cmd => "$$opts{path}/test-bcf-sr --no-index -p all --args $vcfdir/merge.noidx.a.vcf $vcfdir/merge.noidx.b.vcf $vcfdir/merge.noidx.c.vcf 2> $$opts{tmp}/no_index_1.err"); + + # Check bad input detection + + my @bad_file_tests = (["out-of-order header", + ["merge.noidx.a.vcf", "merge.noidx.hdr_order.vcf"]], + ["out-of-order records", + ["merge.noidx.a.vcf", "merge.noidx.rec_order.vcf"]], + ["out-of-order records", + ["merge.noidx.rec_order.vcf", "merge.noidx.a.vcf"]]); + my $count = 2; + foreach my $test_params (@bad_file_tests) { + my ($badness, $inputs) = @$test_params; + my @ins = map { "$vcfdir/$_" } @$inputs; + + my $cmd = "$$opts{path}/test-bcf-sr --no-index -p all --args @ins > $$opts{tmp}/no_index_$count.out 2> $$opts{tmp}/no_index_$count.err"; + print "$test:\n\t$cmd (expected fail)\n"; + my ($ret) = _cmd($cmd); + if ($ret == 0) { + failed($opts, $test, "Failed to detect $badness: $cmd\n"); + } else { + passed($opts, $test); + } + $count++; + } +} + +sub test_bcf_sr_range { + 
my ($opts) = @_; + + my $test = "test_bcf_sr_range"; + + my $vcfdir = "$$opts{path}/bcf-sr"; + + my @tests = (['r', '1', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['r', '1:1-2', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['r', '1:1,1:2', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['r', '1:1-1', 'weird-chr-names.vcf', 'weird-chr-names.2.out'], + ['r', '{1:1}', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['r', '{1:1}:1-2', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['r', '{1:1}:1,{1:1}:2', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['r', '{1:1}:1-1', 'weird-chr-names.vcf', 'weird-chr-names.4.out'], + ['r', '{1:1-1}', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['r', '{1:1-1}:1-2', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['r', '{1:1-1}:1,{1:1-1}:2', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['r', '{1:1-1}:1-1', 'weird-chr-names.vcf', 'weird-chr-names.6.out'], + ['r', '{1:1-1}-2', 'weird-chr-names.vcf', undef], # Expected failure + ['t', '1', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['t', '1:1-2', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['t', '1:1,1:2', 'weird-chr-names.vcf', 'weird-chr-names.1.out'], + ['t', '1:1-1', 'weird-chr-names.vcf', 'weird-chr-names.2.out'], + ['t', '{1:1}', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['t', '{1:1}:1-2', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['t', '{1:1}:1,{1:1}:2', 'weird-chr-names.vcf', 'weird-chr-names.3.out'], + ['t', '{1:1}:1-1', 'weird-chr-names.vcf', 'weird-chr-names.4.out'], + ['t', '{1:1-1}', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['t', '{1:1-1}:1-2', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['t', '{1:1-1}:1,{1:1-1}:2', 'weird-chr-names.vcf', 'weird-chr-names.5.out'], + ['t', '{1:1-1}:1-1', 'weird-chr-names.vcf', 'weird-chr-names.6.out'], + ['t', '{1:1-1}-2', 'weird-chr-names.vcf', undef] # Expected failure + ); + + my $count = 0; + my %converted; + foreach my $tst (@tests) { + my 
($option, $range, $in, $exp_out) = @$tst; + $count++; + if (!$converted{$in}) { + my $cmd = "$$opts{path}/test_view -b -p $$opts{tmp}/$in.bcf -x $$opts{tmp}/$in.bcf.csi $vcfdir/$in"; + print "$test:\n\t$cmd\n"; + my ($ret) = _cmd($cmd); + if ($ret) { + failed($opts, $test); + $converted{$in} = 'fail'; + next; + } else { + passed($opts, $test); + $converted{$in} = "$$opts{tmp}/$in.bcf"; + } + } + next if ($converted{$in} eq 'fail'); + my $cmd = "$$opts{path}/test-bcf-sr -O vcf -o $$opts{tmp}/range_test_$count.out.vcf -$option '$range' --args $converted{$in}"; + if ($exp_out) { + test_compare($opts, $cmd, "$vcfdir/$exp_out", + "$$opts{tmp}/range_test_$count.out.vcf", + fix_newlines => 1); + } else { + print "$test:\n\t$cmd (expected fail)\n"; + my ($ret) = _cmd($cmd); + if ($ret) { + passed($opts, $test); + } else { + failed($opts, $test); + } + } + } +} + sub test_command { my ($opts, %args) = @_; From f4a3b994be8f7904caeb5d58eaa14952ad39f2ea Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 6 Jul 2023 09:38:17 +0100 Subject: [PATCH 439/488] Fix a containment bug in cram_index_last. The index is loaded into a nested containment list, so the last entry in the index array is not necessarily the last slice, as the last slice may be entirely contained within a previous one. Fixes #1639 --- cram/cram_index.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cram/cram_index.c b/cram/cram_index.c index 0cc606f42..39bc7cae0 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -511,7 +511,13 @@ cram_index *cram_index_last(cram_fd *fd, int refid, cram_index *from) { slice = fd->index[refid+1].nslice - 1; - return &from->e[slice]; + // e is the last entry in the nested containment list, but it may + // contain further slices within it. 
+ cram_index *e = &from->e[slice]; + while (e->e_next) + e = e->e_next; + + return e; } /* From 84bf64be293d15dc50ec84d060681e19954b7265 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 28 Jun 2023 12:15:10 +0100 Subject: [PATCH 440/488] Migrate base modification code out of sam.c Sam.c is already a behemoth over 6,000 lines long. It's hard to find things in it, and splitting by domain (with a sam prefix) makes life easier as a developer. Note: when sorted by file size, the split off code is already close to the median, so this isn't creating pointlessly small files. The functions are still declared in sam.h there is no API change. This commit changes no functionality as it's simply code migration. --- Makefile | 2 + sam.c | 514 -------------------------------------------------- sam_mods.c | 543 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 545 insertions(+), 514 deletions(-) create mode 100644 sam_mods.c diff --git a/Makefile b/Makefile index 0cfec6e6b..83b49609d 100644 --- a/Makefile +++ b/Makefile @@ -214,6 +214,7 @@ LIBHTS_OBJS = \ regidx.o \ region.o \ sam.o \ + sam_mods.o \ synced_bcf_reader.o \ vcf_sweep.o \ tbx.o \ @@ -457,6 +458,7 @@ hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(htslib_hts_ hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) +sam_mods.o sam_mods.pico: sam_mods.c config.h $(htslib_sam_h) $(textutils_internal_h) tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) 
$(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h) faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h) $(hts_internal_h) bcf_sr_sort.o bcf_sr_sort.pico: bcf_sr_sort.c config.h $(bcf_sr_sort_h) $(htslib_khash_str2int_h) $(htslib_kbitset_h) diff --git a/sam.c b/sam.c index d77ac7f9d..fc4e677df 100644 --- a/sam.c +++ b/sam.c @@ -6105,517 +6105,3 @@ void bam_mplp_destructor(bam_mplp_t iter, } #endif // ~!defined(BAM_NO_PILEUP) - -// --------------------------- -// Base Modification retrieval -// -// These operate by recording state in an opaque type, allocated and freed -// via the functions below. -// -// Initially we call bam_parse_basemod to process the tags and record the -// modifications in the state structure, and then functions such as -// bam_next_basemod can iterate over this cached state. - -/* - * Base modification are stored in MM/Mm tags as defined as - * - * ::= | "" - * ::= - * - * ::= "A" | "C" | "G" | "T" | "N". - * - * ::= "+" | "-". - * - * ::= | - * ::= | - * ::= - * ::= - * - * ::= "," | ";" - * - * We do not allocate additional memory other than the fixed size - * state, thus we track up to 256 pointers to different locations - * within the MM and ML tags. Each pointer is for a distinct - * modification code (simple or ChEBI), meaning some may point to the - * same delta-list when multiple codes are combined together - * (e.g. "C+mh,1,5,18,3;"). This is the MM[] array. - * - * Each numeric in the delta-list is tracked in MMcount[], counted - * down until it hits zero in which case the next delta is fetched. - * - * ML array similarly holds the locations in the quality (ML) tag per - * type, but these are interleaved so C+mhfc,10,15 will have 4 types - * all pointing to the same delta position, but in ML we store - * Q(m0)Q(h0)Q(f0)Q(c0) followed by Q(m1)Q(h1)Q(f1)Q(c1). 
This ML - * also has MLstride indicating how many positions along ML to jump - * each time we consume a base. (4 in our above example, but usually 1 - * for the simple case). - * - * One complexity of the base modification system is that mods are - * always stored in the original DNA orientation. This is so that - * tools that may reverse-complement a sequence (eg "samtools fastq -T - * MM,ML") can pass through these modification tags irrespective of - * whether they have any knowledge of their internal workings. - * - * Because we don't wish to allocate extra memory, we cannot simply - * reverse the MM and ML tags. Sadly this means we have to manage the - * reverse complementing ourselves on-the-fly. - * For reversed reads we start at the right end of MM and no longer - * stop at the semicolon. Instead we use MMend[] array to mark the - * termination point. - */ -#define MAX_BASE_MOD 256 -struct hts_base_mod_state { - int type[MAX_BASE_MOD]; // char or minus-CHEBI - int canonical[MAX_BASE_MOD];// canonical base, as seqi (1,2,4,8,15) - char strand[MAX_BASE_MOD]; // strand of modification; + or - - int MMcount[MAX_BASE_MOD]; // no. canonical bases left until next mod - char *MM[MAX_BASE_MOD]; // next pos delta (string) - char *MMend[MAX_BASE_MOD]; // end of pos-delta string - uint8_t *ML[MAX_BASE_MOD]; // next qual - int MLstride[MAX_BASE_MOD]; // bytes between quals for this type - int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified? - int seq_pos; // current position along sequence - int nmods; // used array size (0 to MAX_BASE_MOD-1). 
-}; - -hts_base_mod_state *hts_base_mod_state_alloc(void) { - return calloc(1, sizeof(hts_base_mod_state)); -} - -void hts_base_mod_state_free(hts_base_mod_state *state) { - free(state); -} - -/* - * Count frequency of A, C, G, T and N canonical bases in the sequence - */ -static void seq_freq(const bam1_t *b, int freq[16]) { - int i; - - memset(freq, 0, 16*sizeof(*freq)); - uint8_t *seq = bam_get_seq(b); - for (i = 0; i < b->core.l_qseq; i++) - freq[bam_seqi(seq, i)]++; - freq[15] = b->core.l_qseq; // all bases count as N for base mods -} - -//0123456789ABCDEF -//=ACMGRSVTWYHKDBN aka seq_nt16_str[] -//=TGKCYSBAWRDMHVN comp1ement of seq_nt16_str -//084C2A6E195D3B7F -static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; - -/* - * Parse the MM and ML tags to populate the base mod state. - * This structure will have been previously allocated via - * hts_base_mod_state_alloc, but it does not need to be repeatedly - * freed and allocated for each new bam record. (Although obviously - * it requires a new call to this function.) - * - */ -int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { - - //reset position, else upcoming calls may fail on seq pos - length comparison - state->seq_pos = 0; - - // Read MM and ML tags - uint8_t *mm = bam_aux_get(b, "MM"); - if (!mm) mm = bam_aux_get(b, "Mm"); - if (!mm) - return 0; - if (mm[0] != 'Z') { - hts_log_error("%s: MM tag is not of type Z", bam_get_qname(b)); - return -1; - } - - uint8_t *mi = bam_aux_get(b, "MN"); - if (mi && bam_aux2i(mi) != b->core.l_qseq) { - // bam_aux2i with set errno = EINVAL and return 0 if the tag - // isn't integer, but 0 will be a seq-length mismatch anyway so - // triggers an error here too. 
- hts_log_error("%s: MM/MN data length is incompatible with" - " SEQ length", bam_get_qname(b)); - return -1; - } - - uint8_t *ml = bam_aux_get(b, "ML"); - if (!ml) ml = bam_aux_get(b, "Ml"); - if (ml && (ml[0] != 'B' || ml[1] != 'C')) { - hts_log_error("%s: ML tag is not of type B,C", bam_get_qname(b)); - return -1; - } - uint8_t *ml_end = ml ? ml+6 + le_to_u32(ml+2) : NULL; - if (ml) ml += 6; - - // Aggregate freqs of ACGTN if reversed, to get final-delta (later) - int freq[16]; - if (b->core.flag & BAM_FREVERSE) - seq_freq(b, freq); - - char *cp = (char *)mm+1; - int mod_num = 0; - int implicit = 1; - while (*cp) { - for (; *cp; cp++) { - // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*; - unsigned char btype = *cp++; - - if (btype != 'A' && btype != 'C' && - btype != 'G' && btype != 'T' && - btype != 'U' && btype != 'N') - return -1; - if (btype == 'U') btype = 'T'; - - btype = seq_nt16_table[btype]; - - // Strand - if (*cp != '+' && *cp != '-') - return -1; // malformed - char strand = *cp++; - - // List of modification types - char *ms = cp, *me; // mod code start and end - char *cp_end = NULL; - int chebi = 0; - if (isdigit_c(*cp)) { - chebi = strtol(cp, &cp_end, 10); - cp = cp_end; - ms = cp-1; - } else { - while (*cp && isalpha_c(*cp)) - cp++; - if (*cp == '\0') - return -1; - } - - me = cp; - - // Optional explicit vs implicit marker - if (*cp == '.') { - // default is implicit = 1; - cp++; - } else if (*cp == '?') { - implicit = 0; - cp++; - } else if (*cp != ',' && *cp != ';') { - // parse error - return -1; - } - - long delta; - int n = 0; // nth symbol in a multi-mod string - int stride = me-ms; - int ndelta = 0; - - if (b->core.flag & BAM_FREVERSE) { - // We process the sequence in left to right order, - // but delta is successive count of bases to skip - // counting right to left. This also means the number - // of bases to skip at left edge is unrecorded (as it's - // the remainder). 
- // - // To output mods in left to right, we step through the - // MM list in reverse and need to identify the left-end - // "remainder" delta. - int total_seq = 0; - for (;;) { - cp += (*cp == ','); - if (*cp == 0 || *cp == ';') - break; - - delta = strtol(cp, &cp_end, 10); - if (cp_end == cp) { - hts_log_error("%s: Hit end of MM tag. Missing " - "semicolon?", bam_get_qname(b)); - return -1; - } - - cp = cp_end; - total_seq += delta+1; - ndelta++; - } - delta = freq[seqi_rc[btype]] - total_seq; // remainder - } else { - delta = *cp == ',' - ? strtol(cp+1, &cp_end, 10) - : 0; - if (!cp_end) { - // empty list - delta = INT_MAX; - cp_end = cp+1; - } - } - // Now delta is first in list or computed remainder, - // and cp_end is either start or end of the MM list. - while (ms < me) { - state->type [mod_num] = chebi ? -chebi : *ms; - state->strand [mod_num] = (strand == '-'); - state->canonical[mod_num] = btype; - state->MLstride [mod_num] = stride; - state->implicit [mod_num] = implicit; - - if (delta < 0) { - hts_log_error("%s: MM tag refers to bases beyond sequence " - "length", bam_get_qname(b)); - return -1; - } - state->MMcount [mod_num] = delta; - if (b->core.flag & BAM_FREVERSE) { - state->MM [mod_num] = cp+1; - state->MMend[mod_num] = cp_end; - state->ML [mod_num] = ml ? ml+n +(ndelta-1)*stride: NULL; - } else { - state->MM [mod_num] = cp_end; - state->MMend[mod_num] = NULL; - state->ML [mod_num] = ml ? 
ml+n : NULL; - } - - if (++mod_num >= MAX_BASE_MOD) { - hts_log_error("%s: Too many base modification types", - bam_get_qname(b)); - return -1; - } - ms++; n++; - } - - // Skip modification deltas - if (ml) { - if (b->core.flag & BAM_FREVERSE) { - ml += ndelta*stride; - } else { - while (*cp && *cp != ';') { - if (*cp == ',') - ml+=stride; - cp++; - } - } - if (ml > ml_end) { - hts_log_error("%s: Insufficient number of entries in ML " - "tag", bam_get_qname(b)); - return -1; - } - } else { - // cp_end already known if FREVERSE - if (cp_end && (b->core.flag & BAM_FREVERSE)) - cp = cp_end; - else - while (*cp && *cp != ';') - cp++; - } - if (!*cp) { - hts_log_error("%s: Hit end of MM tag. Missing semicolon?", - bam_get_qname(b)); - return -1; - } - } - } - - state->nmods = mod_num; - - return 0; -} - -/* - * Fills out mods[] with the base modifications found. - * Returns the number found (0 if none), which may be more than - * the size of n_mods if more were found than reported. - * Returns <= -1 on error. - * - * This always marches left to right along sequence, irrespective of - * reverse flag or modification strand. - */ -int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, - hts_base_mod *mods, int n_mods) { - if (b->core.flag & BAM_FREVERSE) { - if (state->seq_pos < 0) - return -1; - } else { - if (state->seq_pos >= b->core.l_qseq) - return -1; - } - - int i, j, n = 0; - unsigned char base = bam_seqi(bam_get_seq(b), state->seq_pos); - state->seq_pos++; - if (b->core.flag & BAM_FREVERSE) - base = seqi_rc[base]; - - for (i = 0; i < state->nmods; i++) { - if (state->canonical[i] != base && state->canonical[i] != 15/*N*/) - continue; - - if (state->MMcount[i]-- > 0) - continue; - - char *MMptr = state->MM[i]; - if (n < n_mods) { - mods[n].modified_base = state->type[i]; - mods[n].canonical_base = seq_nt16_str[state->canonical[i]]; - mods[n].strand = state->strand[i]; - mods[n].qual = state->ML[i] ? 
*state->ML[i] : -1; - } - n++; - if (state->ML[i]) - state->ML[i] += (b->core.flag & BAM_FREVERSE) - ? -state->MLstride[i] - : +state->MLstride[i]; - - if (b->core.flag & BAM_FREVERSE) { - // process MM list backwards - char *cp; - for (cp = state->MMend[i]-1; cp != state->MM[i]; cp--) - if (*cp == ',') - break; - state->MMend[i] = cp; - if (cp != state->MM[i]) - state->MMcount[i] = strtol(cp+1, NULL, 10); - else - state->MMcount[i] = INT_MAX; - } else { - if (*state->MM[i] == ',') - state->MMcount[i] = strtol(state->MM[i]+1, &state->MM[i], 10); - else - state->MMcount[i] = INT_MAX; - } - - // Multiple mods at the same coords. - for (j=i+1; j < state->nmods && state->MM[j] == MMptr; j++) { - if (n < n_mods) { - mods[n].modified_base = state->type[j]; - mods[n].canonical_base = seq_nt16_str[state->canonical[j]]; - mods[n].strand = state->strand[j]; - mods[n].qual = state->ML[j] ? *state->ML[j] : -1; - } - n++; - state->MMcount[j] = state->MMcount[i]; - state->MM[j] = state->MM[i]; - if (state->ML[j]) - state->ML[j] += (b->core.flag & BAM_FREVERSE) - ? -state->MLstride[j] - : +state->MLstride[j]; - } - i = j-1; - } - - return n; -} - -/* - * Looks for the next location with a base modification. - */ -int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, - hts_base_mod *mods, int n_mods, int *pos) { - if (state->seq_pos >= b->core.l_qseq) - return 0; - - // Look through state->MMcount arrays to see when the next lowest is - // per base type; - int next[16], freq[16] = {0}, i; - memset(next, 0x7f, 16*sizeof(*next)); - if (b->core.flag & BAM_FREVERSE) { - for (i = 0; i < state->nmods; i++) { - if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i]) - next[seqi_rc[state->canonical[i]]] = state->MMcount[i]; - } - } else { - for (i = 0; i < state->nmods; i++) { - if (next[state->canonical[i]] > state->MMcount[i]) - next[state->canonical[i]] = state->MMcount[i]; - } - } - - // Now step through the sequence counting off base types. 
- for (i = state->seq_pos; i < b->core.l_qseq; i++) { - unsigned char bc = bam_seqi(bam_get_seq(b), i); - if (next[bc] <= freq[bc] || next[15] <= freq[15]) - break; - freq[bc]++; - if (bc != 15) // N - freq[15]++; - } - *pos = state->seq_pos = i; - - if (i >= b->core.l_qseq) { - // Check for more MM elements than bases present. - for (i = 0; i < state->nmods; i++) { - if (!(b->core.flag & BAM_FREVERSE) && - state->MMcount[i] < 0x7f000000) { - hts_log_warning("MM tag refers to bases beyond sequence length"); - return -1; - } - } - return 0; - } - - if (b->core.flag & BAM_FREVERSE) { - for (i = 0; i < state->nmods; i++) - state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]]; - } else { - for (i = 0; i < state->nmods; i++) - state->MMcount[i] -= freq[state->canonical[i]]; - } - - int r = bam_mods_at_next_pos(b, state, mods, n_mods); - return r > 0 ? r : 0; -} - -/* - * As per bam_mods_at_next_pos, but at a specific qpos >= the previous qpos. - * This can only march forwards along the read, but can do so by more than - * one base-pair. - * - * This makes it useful for calling from pileup iterators where qpos may - * start part way through a read for the first occurrence of that record. - */ -int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, - hts_base_mod *mods, int n_mods) { - // FIXME: for now this is inefficient in implementation. - int r = 0; - while (state->seq_pos <= qpos) - if ((r = bam_mods_at_next_pos(b, state, mods, n_mods)) < 0) - break; - - return r; -} - -/* - * Returns the list of base modification codes provided for this - * alignment record as an array of character codes (+ve) or ChEBI numbers - * (negative). - * - * Returns the array, with *ntype filled out with the size. - * The array returned should not be freed. - * It is a valid pointer until the state is freed using - * hts_base_mod_free(). 
- */ -int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) { - *ntype = state->nmods; - return state->type; -} - -/* - * Returns data about a specific modification type for the alignment record. - * Code is either positive (eg 'm') or negative for ChEBI numbers. - * - * Return 0 on success or -1 if not found. The strand, implicit and canonical - * fields are filled out if passed in as non-NULL pointers. - */ -int bam_mods_query_type(hts_base_mod_state *state, int code, - int *strand, int *implicit, char *canonical) { - // Find code entry - int i; - for (i = 0; i < state->nmods; i++) { - if (state->type[i] == code) - break; - } - if (i == state->nmods) - return -1; - - // Return data - if (strand) *strand = state->strand[i]; - if (implicit) *implicit = state->implicit[i]; - if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]]; - - return 0; -} diff --git a/sam_mods.c b/sam_mods.c new file mode 100644 index 000000000..68259a2ea --- /dev/null +++ b/sam_mods.c @@ -0,0 +1,543 @@ +/* sam_mods.c -- Base modification handling in SAM and BAM. + + Copyright (C) 2020-2023 Genome Research Ltd. + + Author: James Bonfield + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#define HTS_BUILDING_LIBRARY // Enables HTSLIB_EXPORT, see htslib/hts_defs.h +#include + +#include "htslib/sam.h" +#include "textutils_internal.h" + +// --------------------------- +// Base Modification retrieval +// +// These operate by recording state in an opaque type, allocated and freed +// via the functions below. +// +// Initially we call bam_parse_basemod to process the tags and record the +// modifications in the state structure, and then functions such as +// bam_next_basemod can iterate over this cached state. + +/* + * Base modification are stored in MM/Mm tags as defined as + * + * ::= | "" + * ::= + * + * ::= "A" | "C" | "G" | "T" | "N". + * + * ::= "+" | "-". + * + * ::= | + * ::= | + * ::= + * ::= + * + * ::= "," | ";" + * + * We do not allocate additional memory other than the fixed size + * state, thus we track up to 256 pointers to different locations + * within the MM and ML tags. Each pointer is for a distinct + * modification code (simple or ChEBI), meaning some may point to the + * same delta-list when multiple codes are combined together + * (e.g. "C+mh,1,5,18,3;"). This is the MM[] array. + * + * Each numeric in the delta-list is tracked in MMcount[], counted + * down until it hits zero in which case the next delta is fetched. + * + * ML array similarly holds the locations in the quality (ML) tag per + * type, but these are interleaved so C+mhfc,10,15 will have 4 types + * all pointing to the same delta position, but in ML we store + * Q(m0)Q(h0)Q(f0)Q(c0) followed by Q(m1)Q(h1)Q(f1)Q(c1). This ML + * also has MLstride indicating how many positions along ML to jump + * each time we consume a base. (4 in our above example, but usually 1 + * for the simple case). 
+ * + * One complexity of the base modification system is that mods are + * always stored in the original DNA orientation. This is so that + * tools that may reverse-complement a sequence (eg "samtools fastq -T + * MM,ML") can pass through these modification tags irrespective of + * whether they have any knowledge of their internal workings. + * + * Because we don't wish to allocate extra memory, we cannot simply + * reverse the MM and ML tags. Sadly this means we have to manage the + * reverse complementing ourselves on-the-fly. + * For reversed reads we start at the right end of MM and no longer + * stop at the semicolon. Instead we use MMend[] array to mark the + * termination point. + */ +#define MAX_BASE_MOD 256 +struct hts_base_mod_state { + int type[MAX_BASE_MOD]; // char or minus-CHEBI + int canonical[MAX_BASE_MOD];// canonical base, as seqi (1,2,4,8,15) + char strand[MAX_BASE_MOD]; // strand of modification; + or - + int MMcount[MAX_BASE_MOD]; // no. canonical bases left until next mod + char *MM[MAX_BASE_MOD]; // next pos delta (string) + char *MMend[MAX_BASE_MOD]; // end of pos-delta string + uint8_t *ML[MAX_BASE_MOD]; // next qual + int MLstride[MAX_BASE_MOD]; // bytes between quals for this type + int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified? + int seq_pos; // current position along sequence + int nmods; // used array size (0 to MAX_BASE_MOD-1). 
+}; + +hts_base_mod_state *hts_base_mod_state_alloc(void) { + return calloc(1, sizeof(hts_base_mod_state)); +} + +void hts_base_mod_state_free(hts_base_mod_state *state) { + free(state); +} + +/* + * Count frequency of A, C, G, T and N canonical bases in the sequence + */ +static void seq_freq(const bam1_t *b, int freq[16]) { + int i; + + memset(freq, 0, 16*sizeof(*freq)); + uint8_t *seq = bam_get_seq(b); + for (i = 0; i < b->core.l_qseq; i++) + freq[bam_seqi(seq, i)]++; + freq[15] = b->core.l_qseq; // all bases count as N for base mods +} + +//0123456789ABCDEF +//=ACMGRSVTWYHKDBN aka seq_nt16_str[] +//=TGKCYSBAWRDMHVN comp1ement of seq_nt16_str +//084C2A6E195D3B7F +static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; + +/* + * Parse the MM and ML tags to populate the base mod state. + * This structure will have been previously allocated via + * hts_base_mod_state_alloc, but it does not need to be repeatedly + * freed and allocated for each new bam record. (Although obviously + * it requires a new call to this function.) + * + */ +int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { + // Reset position, else upcoming calls may fail on + // seq pos - length comparison + state->seq_pos = 0; + + // Read MM and ML tags + uint8_t *mm = bam_aux_get(b, "MM"); + if (!mm) mm = bam_aux_get(b, "Mm"); + if (!mm) + return 0; + if (mm[0] != 'Z') { + hts_log_error("%s: MM tag is not of type Z", bam_get_qname(b)); + return -1; + } + + uint8_t *mi = bam_aux_get(b, "MN"); + if (mi && bam_aux2i(mi) != b->core.l_qseq) { + // bam_aux2i with set errno = EINVAL and return 0 if the tag + // isn't integer, but 0 will be a seq-length mismatch anyway so + // triggers an error here too. 
+ hts_log_error("%s: MM/MN data length is incompatible with" + " SEQ length", bam_get_qname(b)); + return -1; + } + + uint8_t *ml = bam_aux_get(b, "ML"); + if (!ml) ml = bam_aux_get(b, "Ml"); + if (ml && (ml[0] != 'B' || ml[1] != 'C')) { + hts_log_error("%s: ML tag is not of type B,C", bam_get_qname(b)); + return -1; + } + uint8_t *ml_end = ml ? ml+6 + le_to_u32(ml+2) : NULL; + if (ml) ml += 6; + + // Aggregate freqs of ACGTN if reversed, to get final-delta (later) + int freq[16]; + if (b->core.flag & BAM_FREVERSE) + seq_freq(b, freq); + + char *cp = (char *)mm+1; + int mod_num = 0; + int implicit = 1; + while (*cp) { + for (; *cp; cp++) { + // cp should be [ACGTNU][+-]([a-zA-Z]+|[0-9]+)[.?]?(,\d+)*; + unsigned char btype = *cp++; + + if (btype != 'A' && btype != 'C' && + btype != 'G' && btype != 'T' && + btype != 'U' && btype != 'N') + return -1; + if (btype == 'U') btype = 'T'; + + btype = seq_nt16_table[btype]; + + // Strand + if (*cp != '+' && *cp != '-') + return -1; // malformed + char strand = *cp++; + + // List of modification types + char *ms = cp, *me; // mod code start and end + char *cp_end = NULL; + int chebi = 0; + if (isdigit_c(*cp)) { + chebi = strtol(cp, &cp_end, 10); + cp = cp_end; + ms = cp-1; + } else { + while (*cp && isalpha_c(*cp)) + cp++; + if (*cp == '\0') + return -1; + } + + me = cp; + + // Optional explicit vs implicit marker + if (*cp == '.') { + // default is implicit = 1; + cp++; + } else if (*cp == '?') { + implicit = 0; + cp++; + } else if (*cp != ',' && *cp != ';') { + // parse error + return -1; + } + + long delta; + int n = 0; // nth symbol in a multi-mod string + int stride = me-ms; + int ndelta = 0; + + if (b->core.flag & BAM_FREVERSE) { + // We process the sequence in left to right order, + // but delta is successive count of bases to skip + // counting right to left. This also means the number + // of bases to skip at left edge is unrecorded (as it's + // the remainder). 
+ // + // To output mods in left to right, we step through the + // MM list in reverse and need to identify the left-end + // "remainder" delta. + int total_seq = 0; + for (;;) { + cp += (*cp == ','); + if (*cp == 0 || *cp == ';') + break; + + delta = strtol(cp, &cp_end, 10); + if (cp_end == cp) { + hts_log_error("%s: Hit end of MM tag. Missing " + "semicolon?", bam_get_qname(b)); + return -1; + } + + cp = cp_end; + total_seq += delta+1; + ndelta++; + } + delta = freq[seqi_rc[btype]] - total_seq; // remainder + } else { + delta = *cp == ',' + ? strtol(cp+1, &cp_end, 10) + : 0; + if (!cp_end) { + // empty list + delta = INT_MAX; + cp_end = cp+1; + } + } + // Now delta is first in list or computed remainder, + // and cp_end is either start or end of the MM list. + while (ms < me) { + state->type [mod_num] = chebi ? -chebi : *ms; + state->strand [mod_num] = (strand == '-'); + state->canonical[mod_num] = btype; + state->MLstride [mod_num] = stride; + state->implicit [mod_num] = implicit; + + if (delta < 0) { + hts_log_error("%s: MM tag refers to bases beyond sequence " + "length", bam_get_qname(b)); + return -1; + } + state->MMcount [mod_num] = delta; + if (b->core.flag & BAM_FREVERSE) { + state->MM [mod_num] = cp+1; + state->MMend[mod_num] = cp_end; + state->ML [mod_num] = ml ? ml+n +(ndelta-1)*stride: NULL; + } else { + state->MM [mod_num] = cp_end; + state->MMend[mod_num] = NULL; + state->ML [mod_num] = ml ? 
ml+n : NULL; + } + + if (++mod_num >= MAX_BASE_MOD) { + hts_log_error("%s: Too many base modification types", + bam_get_qname(b)); + return -1; + } + ms++; n++; + } + + // Skip modification deltas + if (ml) { + if (b->core.flag & BAM_FREVERSE) { + ml += ndelta*stride; + } else { + while (*cp && *cp != ';') { + if (*cp == ',') + ml+=stride; + cp++; + } + } + if (ml > ml_end) { + hts_log_error("%s: Insufficient number of entries in ML " + "tag", bam_get_qname(b)); + return -1; + } + } else { + // cp_end already known if FREVERSE + if (cp_end && (b->core.flag & BAM_FREVERSE)) + cp = cp_end; + else + while (*cp && *cp != ';') + cp++; + } + if (!*cp) { + hts_log_error("%s: Hit end of MM tag. Missing semicolon?", + bam_get_qname(b)); + return -1; + } + } + } + + state->nmods = mod_num; + + return 0; +} + +/* + * Fills out mods[] with the base modifications found. + * Returns the number found (0 if none), which may be more than + * the size of n_mods if more were found than reported. + * Returns <= -1 on error. + * + * This always marches left to right along sequence, irrespective of + * reverse flag or modification strand. + */ +int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods) { + if (b->core.flag & BAM_FREVERSE) { + if (state->seq_pos < 0) + return -1; + } else { + if (state->seq_pos >= b->core.l_qseq) + return -1; + } + + int i, j, n = 0; + unsigned char base = bam_seqi(bam_get_seq(b), state->seq_pos); + state->seq_pos++; + if (b->core.flag & BAM_FREVERSE) + base = seqi_rc[base]; + + for (i = 0; i < state->nmods; i++) { + if (state->canonical[i] != base && state->canonical[i] != 15/*N*/) + continue; + + if (state->MMcount[i]-- > 0) + continue; + + char *MMptr = state->MM[i]; + if (n < n_mods) { + mods[n].modified_base = state->type[i]; + mods[n].canonical_base = seq_nt16_str[state->canonical[i]]; + mods[n].strand = state->strand[i]; + mods[n].qual = state->ML[i] ? 
*state->ML[i] : -1; + } + n++; + if (state->ML[i]) + state->ML[i] += (b->core.flag & BAM_FREVERSE) + ? -state->MLstride[i] + : +state->MLstride[i]; + + if (b->core.flag & BAM_FREVERSE) { + // process MM list backwards + char *cp; + for (cp = state->MMend[i]-1; cp != state->MM[i]; cp--) + if (*cp == ',') + break; + state->MMend[i] = cp; + if (cp != state->MM[i]) + state->MMcount[i] = strtol(cp+1, NULL, 10); + else + state->MMcount[i] = INT_MAX; + } else { + if (*state->MM[i] == ',') + state->MMcount[i] = strtol(state->MM[i]+1, &state->MM[i], 10); + else + state->MMcount[i] = INT_MAX; + } + + // Multiple mods at the same coords. + for (j=i+1; j < state->nmods && state->MM[j] == MMptr; j++) { + if (n < n_mods) { + mods[n].modified_base = state->type[j]; + mods[n].canonical_base = seq_nt16_str[state->canonical[j]]; + mods[n].strand = state->strand[j]; + mods[n].qual = state->ML[j] ? *state->ML[j] : -1; + } + n++; + state->MMcount[j] = state->MMcount[i]; + state->MM[j] = state->MM[i]; + if (state->ML[j]) + state->ML[j] += (b->core.flag & BAM_FREVERSE) + ? -state->MLstride[j] + : +state->MLstride[j]; + } + i = j-1; + } + + return n; +} + +/* + * Looks for the next location with a base modification. + */ +int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods, int *pos) { + if (state->seq_pos >= b->core.l_qseq) + return 0; + + // Look through state->MMcount arrays to see when the next lowest is + // per base type; + int next[16], freq[16] = {0}, i; + memset(next, 0x7f, 16*sizeof(*next)); + if (b->core.flag & BAM_FREVERSE) { + for (i = 0; i < state->nmods; i++) { + if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i]) + next[seqi_rc[state->canonical[i]]] = state->MMcount[i]; + } + } else { + for (i = 0; i < state->nmods; i++) { + if (next[state->canonical[i]] > state->MMcount[i]) + next[state->canonical[i]] = state->MMcount[i]; + } + } + + // Now step through the sequence counting off base types. 
+ for (i = state->seq_pos; i < b->core.l_qseq; i++) { + unsigned char bc = bam_seqi(bam_get_seq(b), i); + if (next[bc] <= freq[bc] || next[15] <= freq[15]) + break; + freq[bc]++; + if (bc != 15) // N + freq[15]++; + } + *pos = state->seq_pos = i; + + if (i >= b->core.l_qseq) { + // Check for more MM elements than bases present. + for (i = 0; i < state->nmods; i++) { + if (!(b->core.flag & BAM_FREVERSE) && + state->MMcount[i] < 0x7f000000) { + hts_log_warning("MM tag refers to bases beyond sequence length"); + return -1; + } + } + return 0; + } + + if (b->core.flag & BAM_FREVERSE) { + for (i = 0; i < state->nmods; i++) + state->MMcount[i] -= freq[seqi_rc[state->canonical[i]]]; + } else { + for (i = 0; i < state->nmods; i++) + state->MMcount[i] -= freq[state->canonical[i]]; + } + + int r = bam_mods_at_next_pos(b, state, mods, n_mods); + return r > 0 ? r : 0; +} + +/* + * As per bam_mods_at_next_pos, but at a specific qpos >= the previous qpos. + * This can only march forwards along the read, but can do so by more than + * one base-pair. + * + * This makes it useful for calling from pileup iterators where qpos may + * start part way through a read for the first occurrence of that record. + */ +int bam_mods_at_qpos(const bam1_t *b, int qpos, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods) { + // FIXME: for now this is inefficient in implementation. + int r = 0; + while (state->seq_pos <= qpos) + if ((r = bam_mods_at_next_pos(b, state, mods, n_mods)) < 0) + break; + + return r; +} + +/* + * Returns the list of base modification codes provided for this + * alignment record as an array of character codes (+ve) or ChEBI numbers + * (negative). + * + * Returns the array, with *ntype filled out with the size. + * The array returned should not be freed. + * It is a valid pointer until the state is freed using + * hts_base_mod_free(). 
+ */ +int *bam_mods_recorded(hts_base_mod_state *state, int *ntype) { + *ntype = state->nmods; + return state->type; +} + +/* + * Returns data about a specific modification type for the alignment record. + * Code is either positive (eg 'm') or negative for ChEBI numbers. + * + * Return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. + */ +int bam_mods_query_type(hts_base_mod_state *state, int code, + int *strand, int *implicit, char *canonical) { + // Find code entry + int i; + for (i = 0; i < state->nmods; i++) { + if (state->type[i] == code) + break; + } + if (i == state->nmods) + return -1; + + // Return data + if (strand) *strand = state->strand[i]; + if (implicit) *implicit = state->implicit[i]; + if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]]; + + return 0; +} From 60ea4e06f01efa7b4c89c91904ab16eeda6557eb Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 29 Jun 2023 11:49:37 +0100 Subject: [PATCH 441/488] Correct base modification implicit / explicit status when mixed together. We didn't reset back to implicit after an explicit mod, so "C+m?,4;G+o,2;" would set "m" to be explicit (?) and also leave "o" as explicit. --- sam_mods.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sam_mods.c b/sam_mods.c index 68259a2ea..4b6172e16 100644 --- a/sam_mods.c +++ b/sam_mods.c @@ -214,6 +214,7 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { me = cp; // Optional explicit vs implicit marker + implicit = 1; if (*cp == '.') { // default is implicit = 1; cp++; From 27e813cfa3ad1cf866ffb83fc2d9eb41daafe257 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 29 Jun 2023 11:57:33 +0100 Subject: [PATCH 442/488] Add a bam_mods_queryi interface. This allows querying by the i^{th} modificaton type rather than by code. This is useful when we have multiple mods with differing meta-data, such as "C+m.,4;G-m?,1;". 
The previous bam_mods_query_type isn't sufficient as it's the same code "m" being used. Fixes #1550 --- htslib/sam.h | 20 ++++++++++++++++++++ sam_mods.c | 19 +++++++++++++++++++ test/base_mods/MM-chebi.out | 2 +- test/base_mods/MM-double.out | 2 +- test/base_mods/MM-explicit-x.out | 6 +++--- test/base_mods/MM-explicit.out | 6 +++--- test/base_mods/MM-multi.out | 4 ++-- test/test_mod.c | 7 ++++++- 8 files changed, 55 insertions(+), 11 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index fe5b1ebdc..3ba29f001 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -2349,6 +2349,26 @@ HTSLIB_EXPORT int bam_mods_query_type(hts_base_mod_state *state, int code, int *strand, int *implicit, char *canonical); +/// Returns data about the i^th modification type for the alignment record. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param i Modification index, from 0 to ntype-1 + * @param strand Boolean for top (0) or bottom (1) strand + * @param implicit Boolean for whether unlisted positions should be + * implicitly assumed to be unmodified, or require an + * explicit score and should be considered as unknown. + * Returned. + * @param canonical Canonical base type associated with this modification + * Returned. + * + * @return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. + */ +HTSLIB_EXPORT +int bam_mods_queryi(hts_base_mod_state *state, int i, + int *strand, int *implicit, char *canonical); + /// Returns the list of base modification codes provided for this /// alignment record as an array of character codes (+ve) or ChEBI numbers /// (negative). diff --git a/sam_mods.c b/sam_mods.c index 4b6172e16..e5bdb1b6c 100644 --- a/sam_mods.c +++ b/sam_mods.c @@ -542,3 +542,22 @@ int bam_mods_query_type(hts_base_mod_state *state, int code, return 0; } + +/* + * Returns data about the ith modification type for the alignment record. 
+ * + * Return 0 on success or -1 if not found. The strand, implicit and canonical + * fields are filled out if passed in as non-NULL pointers. + */ +int bam_mods_queryi(hts_base_mod_state *state, int i, + int *strand, int *implicit, char *canonical) { + if (i < 0 || i >= state->nmods) + return -1; + + // Return data + if (strand) *strand = state->strand[i]; + if (implicit) *implicit = state->implicit[i]; + if (canonical) *canonical = "?AC?G???T??????N"[state->canonical[i]]; + + return 0; +} diff --git a/test/base_mods/MM-chebi.out b/test/base_mods/MM-chebi.out index a6e7654cf..8df8130df 100644 --- a/test/base_mods/MM-chebi.out +++ b/test/base_mods/MM-chebi.out @@ -35,7 +35,7 @@ 34 C C+m204 C+(76792)33 35 A --- -Present: m #-76792 n +Present: m. #-76792. n. 6 C C+m102 15 N N+n212 17 C C+m128 diff --git a/test/base_mods/MM-double.out b/test/base_mods/MM-double.out index e21ae314e..e346192b8 100644 --- a/test/base_mods/MM-double.out +++ b/test/base_mods/MM-double.out @@ -35,7 +35,7 @@ 34 A 35 T --- -Present: m m o +Present: m. m. o. 1 G G-m115 7 C C+m128 12 G G-m141 diff --git a/test/base_mods/MM-explicit-x.out b/test/base_mods/MM-explicit-x.out index 4abedc719..4078543ca 100644 --- a/test/base_mods/MM-explicit-x.out +++ b/test/base_mods/MM-explicit-x.out @@ -24,7 +24,7 @@ 23 C 24 T --- -Present: m h +Present: m. h. 9 C C+m200 C+h10 10 C C+m50 C+h170 14 C C+m160 C+h20 @@ -57,7 +57,7 @@ Present: m h 23 C 24 T --- -Present: m h +Present: m? h? 9 C C+m200 C+h10 10 C C+m50 C+h170 13 C C+m10 C+h5 @@ -92,7 +92,7 @@ Present: m h 23 C 24 T --- -Present: m h +Present: m. h? 9 C C+m200 C+h10 10 C C+h170 13 C C+h5 diff --git a/test/base_mods/MM-explicit.out b/test/base_mods/MM-explicit.out index f28b25f83..186e790ae 100644 --- a/test/base_mods/MM-explicit.out +++ b/test/base_mods/MM-explicit.out @@ -24,7 +24,7 @@ 23 C 24 T --- -Present: m h +Present: m. h. 
9 C C+m200 C+h10 10 C C+m50 C+h170 14 C C+m160 C+h20 @@ -57,7 +57,7 @@ Present: m h 23 C 24 T --- -Present: m h +Present: m? h? 9 C C+m200 C+h10 10 C C+m50 C+h170 13 C C+m10 C+h5 @@ -92,7 +92,7 @@ Present: m h 23 C 24 T --- -Present: m h +Present: m. h? 9 C C+m200 C+h10 10 C C+h170 13 C C+h5 diff --git a/test/base_mods/MM-multi.out b/test/base_mods/MM-multi.out index e411a81ee..73b480b80 100644 --- a/test/base_mods/MM-multi.out +++ b/test/base_mods/MM-multi.out @@ -35,7 +35,7 @@ 34 C C+m230 C+h6 35 A --- -Present: m h n +Present: m. h. n. 6 C C+m128 15 N N+n215 17 C C+m153 @@ -84,7 +84,7 @@ Present: m h n 34 C C+m204 C+h31 35 A --- -Present: m h n +Present: m. h. n. 6 C C+m77 C+h159 15 N N+n240 17 C C+m103 C+h133 diff --git a/test/test_mod.c b/test/test_mod.c index 42768341d..1b53b9e91 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -166,8 +166,13 @@ int main(int argc, char **argv) { int all_mods_n = 0; all_mods = bam_mods_recorded(m, &all_mods_n); printf("Present:"); - for (i = 0; i < all_mods_n; i++) + for (i = 0; i < all_mods_n; i++) { + int m_strand, m_implicit; + char m_canonical; + bam_mods_queryi(m, i, &m_strand, &m_implicit, &m_canonical); printf(all_mods[i] > 0 ? " %c" : " #%d", all_mods[i]); + putchar("?."[m_implicit]); + } putchar('\n'); int pos; From fea4ef94bda646f17079a8a28e318ff9a46096c8 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 28 Jun 2023 17:20:34 +0100 Subject: [PATCH 443/488] Add bam_parse_basemod2 API with additional flags argument. The only flag at the moment is HTS_MOD_REPORT_UNCHECKED. This changes bam_mods_at_next_pos to report modified bases with qual=HTS_MOD_UNCHECKED for explicit modification types that do not have coverage for this specific position. For consistency, also set the unknown qual from -1 to HTS_MOD_UNKNOWN (#defined to -1). This is used when ML is absent. Arguably this could be 255 to match things like unknown MAPQ, but this ship has sailed.
The test/test_mod tool has a -f INT argument to specify the basemod2 flags. TODO: modify other functions too. TODO: add tests, based on MM-explicit.sam --- htslib/sam.h | 26 ++++++++++++++++++++++++++ sam_mods.c | 31 +++++++++++++++++++++++++++---- test/test_mod.c | 25 ++++++++++++++++++++----- 3 files changed, 73 insertions(+), 9 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index 3ba29f001..483cc67d3 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -2224,6 +2224,12 @@ typedef struct hts_base_mod { int qual; } hts_base_mod; +#define HTS_MOD_UNKNOWN -1 // In MM but no ML +#define HTS_MOD_UNCHECKED -2 // Not in MM and in explicit mode + +// Flags for hts_parse_basemod2 +#define HTS_MOD_REPORT_UNCHECKED 1 + /// Allocates an hts_base_mode_state. /** * @return An hts_base_mode_state pointer on success, @@ -2260,6 +2266,22 @@ void hts_base_mod_state_free(hts_base_mod_state *state); HTSLIB_EXPORT int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state); +/// Parses the Mm and Ml tags out of a bam record. +/** + * @param b BAM alignment record + * @param state The base modification state pointer. + * @param flags A bit-field controlling base modification processing + * + * @return 0 on success, + * -1 on failure. + * + * This fills out the contents of the modification state, resetting the + * iterator location to the first sequence base. + */ +HTSLIB_EXPORT +int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state, + uint32_t flags); + /// Returns modification status for the next base position in the query seq. 
/** * @param b BAM alignment record @@ -2281,6 +2303,10 @@ HTSLIB_EXPORT int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, hts_base_mod *mods, int n_mods); +HTSLIB_EXPORT +int bam_mods_at_next_pos2(const bam1_t *b, hts_base_mod_state *state, + hts_base_mod *mods, int n_mods); + /// Finds the next location containing base modifications and returns them /** * @param b BAM alignment record diff --git a/sam_mods.c b/sam_mods.c index e5bdb1b6c..245e96e1c 100644 --- a/sam_mods.c +++ b/sam_mods.c @@ -99,6 +99,7 @@ struct hts_base_mod_state { int implicit[MAX_BASE_MOD]; // treat unlisted positions as non-modified? int seq_pos; // current position along sequence int nmods; // used array size (0 to MAX_BASE_MOD-1). + uint32_t flags; // Bit-field: see HTS_MOD_REPORT_UNCHECKED }; hts_base_mod_state *hts_base_mod_state_alloc(void) { @@ -135,11 +136,17 @@ static int seqi_rc[] = { 0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15 }; * freed and allocated for each new bam record. (Although obviously * it requires a new call to this function.) * + * Flags are copied into the state and used to control reporting functions. + * Currently the only flag is HTS_MOD_REPORT_UNCHECKED, to control whether + * explicit "C+m?" mods report quality HTS_MOD_UNCHECKED for the bases + * outside the explicitly reported region. */ -int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { +int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state, + uint32_t flags) { // Reset position, else upcoming calls may fail on // seq pos - length comparison state->seq_pos = 0; + state->flags = flags; // Read MM and ML tags uint8_t *mm = bam_aux_get(b, "MM"); @@ -339,6 +346,10 @@ int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { return 0; } +int bam_parse_basemod(const bam1_t *b, hts_base_mod_state *state) { + return bam_parse_basemod2(b, state, 0); +} + /* * Fills out mods[] with the base modifications found. 
* Returns the number found (0 if none), which may be more than @@ -365,20 +376,32 @@ int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, base = seqi_rc[base]; for (i = 0; i < state->nmods; i++) { + int unchecked = 0; if (state->canonical[i] != base && state->canonical[i] != 15/*N*/) continue; - if (state->MMcount[i]-- > 0) - continue; + if (state->MMcount[i]-- > 0) { + if (!state->implicit[i] && + (state->flags & HTS_MOD_REPORT_UNCHECKED)) + unchecked = 1; + else + continue; + } char *MMptr = state->MM[i]; if (n < n_mods) { mods[n].modified_base = state->type[i]; mods[n].canonical_base = seq_nt16_str[state->canonical[i]]; mods[n].strand = state->strand[i]; - mods[n].qual = state->ML[i] ? *state->ML[i] : -1; + mods[n].qual = unchecked + ? HTS_MOD_UNCHECKED + : (state->ML[i] ? *state->ML[i] : HTS_MOD_UNKNOWN); } n++; + + if (unchecked) + continue; + if (state->ML[i]) state->ML[i] += (b->core.flag & BAM_FREVERSE) ? -state->MLstride[i] diff --git a/test/test_mod.c b/test/test_mod.c index 1b53b9e91..43447bb36 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -88,6 +88,7 @@ static char *code(int id) { int main(int argc, char **argv) { char out[1024] = {0}; int extended = 0; + uint32_t flags = 0; if (argc > 1 && strcmp(argv[1], "-x") == 0) { extended = 1; @@ -95,6 +96,12 @@ int main(int argc, char **argv) { argc--; } + if (argc > 2 && strcmp(argv[1], "-f") == 0) { + flags = atoi(argv[2]); + argv+=2; + argc-=2; + } + if (argc < 2) return 1; @@ -110,7 +117,7 @@ int main(int argc, char **argv) { int r; while ((r = sam_read1(in, h, b)) >= 0) { - if (bam_parse_basemod(b, m) < 0) { + if (bam_parse_basemod2(b, m, flags) < 0) { fprintf(stderr, "Failed to parse MM/ML aux tags\n"); goto err; } @@ -124,6 +131,14 @@ int main(int argc, char **argv) { lp += snprintf(lp, ep - lp, "%d\t%c\t", i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); for (j = 0; j < n && j < 5; j++) { + char qstr[10]; + if (mods[j].qual == HTS_MOD_UNCHECKED) + qstr[0] = '#', qstr[1] = 0; + else 
if (mods[j].qual == HTS_MOD_UNKNOWN) + qstr[0] = '.', qstr[1] = 0; + else + snprintf(qstr, 10, "%d", mods[j].qual); + if (extended) { int m_strand, m_implicit; char m_canonical; @@ -134,18 +149,18 @@ int main(int argc, char **argv) { m_canonical != mods[j].canonical_base || m_strand != mods[j].strand) goto err; - lp += snprintf(lp, ep - lp, "%c%c%s%c%d ", + lp += snprintf(lp, ep - lp, "%c%c%s%c%s ", mods[j].canonical_base, "+-"[mods[j].strand], code(mods[j].modified_base), "?."[m_implicit], - mods[j].qual); + qstr); } else { - lp += snprintf(lp, ep - lp, "%c%c%s%d ", + lp += snprintf(lp, ep - lp, "%c%c%s%s ", mods[j].canonical_base, "+-"[mods[j].strand], code(mods[j].modified_base), - mods[j].qual); + qstr); } } *lp++ = '\n'; From a09710cdbef3745933e4c5de26cb76467782acaa Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 29 Jun 2023 16:25:43 +0100 Subject: [PATCH 444/488] Add more internal sam_mods.c documentation --- sam_mods.c | 86 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/sam_mods.c b/sam_mods.c index 245e96e1c..f22baca04 100644 --- a/sam_mods.c +++ b/sam_mods.c @@ -38,6 +38,92 @@ DEALINGS IN THE SOFTWARE. */ // modifications in the state structure, and then functions such as // bam_next_basemod can iterate over this cached state. +/* Overview of API. + +We start by allocating an hts_base_mod_state and parsing the MM, ML and MN +tags into it. This has optional flags controlling how we report base +modifications in "explicit" coordinates. See below + + hts_base_mod_state *m = hts_base_mod_state_alloc(); + bam_parse_basemod2(b, m, HTS_MOD_REPORT_UNCHECKED); + // Or: bam_parse_basemod(b, m), which is equiv to flags==0 + //... do something ... + hts_base_mod_state_free(m); + +In the default implicit MM coordinate system, any location not +reported is implicitly assumed to contain no modification. We only +report the places we think are likely modified. 
+ +Some tools however only look for base modifications in particular +contexts, eg CpG islands. Here we need to distinguish between +not-looked-for and looked-for-but-didn't-find. These calls have an +explicit coordinate system, where we only know information about the +coordinates explicitly listed and everything else is considered to be +unverified. + +By default we don't get reports on the other coordinates in an +explicit MM tag, but the HTS_MOD_REPORT_UNCHECKED flag will report +them (with quality HTS_MOD_UNCHECKED) meaning we can do consensus +modification analysis with accurate counting when dealing with a +mixture of explicit and implicit records. + + +We have different ways of processing the base modifications. We can +iterate either mod-by-mod or position-by-position, or we can simply +query a specific coordinate as may be done when processing a pileup. + +To check for base modifications as a specific location within a +sequence we can use bam_mods_at_qpos. This provides complete random +access within the MM string. However currently this is inefficiently +implemented so should only be used for occasional analysis or as a way +to start iterating at a specific location. It modifies the state +position, so after the first use we can then switch to +bam_mods_at_next_pos to iterate position by position from then on. + + hts_base_mod mods[10]; + int n = bam_mods_at_qpos(b, pos, m, mods, 10); + +For base by base, we have bam_mods_at_next_pos. This strictly starts +at the first base and reports entries one at a time. It's more +efficient than a loop repeatedly calling ...at-pos. + + hts_base_mod mods[10]; + int n = bam_mods_at_next_pos(b, m, mods, 10); + for (int i = 0; i < n; i++) { + // report mod i of n + } + +Iterating over modifications instead of coordinates is simpler and +more efficient as it skips reporting of unmodified bases. This is +done with bam_next_basemod. Note this does not yet honour the +HTS_MOD_REPORT_UNCHECKED flag. 
+ + hts_base_mod mods[10]; + while ((n=bam_next_basemod(b, m, mods, 10, &pos)) > 0) { + for (j = 0; j < n; j++) { + // Report 'n'th mod at sequence position 'pos' + } + } + +There are also functions that query meta-data about the MM line rather +than per-site information. + +bam_mods_recorded returns an array of ints holding the +ve code ('m') +or -ve CHEBI numeric values. + + int ntypes, *types = bam_mods_recorded(m, &ntype); + +We can then query a specific modification type to get further +information on the strand it is operating on, whether it has implicit +or explicit coordinates, and what it's corresponding canonical base it +is (The "C" in "C+m"). bam_mods_query_type does this by code name, +while bam_mods_queryi does this by numeric i^{th} type (from 0 to ntype-1). + + bam_mods_query_type(m, 'c', &strand, &implicit, &canonical); + bam_mods_queryi(m, 2, &strand, &implicit, &canonical); + +*/ + /* * Base modification are stored in MM/Mm tags as defined as * From 7822d8d0553e3d93516956b588f9db8086204499 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 30 Jun 2023 15:25:37 +0100 Subject: [PATCH 445/488] Update bam_next_basemod too to cope with HTS_MOD_REPORT_UNCHECKED. 
Also improve tests --- htslib/sam.h | 4 -- sam_mods.c | 22 ++++-- test/base_mods/MM-explicit-f.out | 111 +++++++++++++++++++++++++++++++ test/base_mods/base-mods.tst | 5 ++ test/test_mod.c | 14 +++- 5 files changed, 143 insertions(+), 13 deletions(-) create mode 100644 test/base_mods/MM-explicit-f.out diff --git a/htslib/sam.h b/htslib/sam.h index 483cc67d3..cffa04701 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -2303,10 +2303,6 @@ HTSLIB_EXPORT int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, hts_base_mod *mods, int n_mods); -HTSLIB_EXPORT -int bam_mods_at_next_pos2(const bam1_t *b, hts_base_mod_state *state, - hts_base_mod *mods, int n_mods); - /// Finds the next location containing base modifications and returns them /** * @param b BAM alignment record diff --git a/sam_mods.c b/sam_mods.c index f22baca04..3eb042328 100644 --- a/sam_mods.c +++ b/sam_mods.c @@ -95,8 +95,7 @@ efficient than a loop repeatedly calling ...at-pos. Iterating over modifications instead of coordinates is simpler and more efficient as it skips reporting of unmodified bases. This is -done with bam_next_basemod. Note this does not yet honour the -HTS_MOD_REPORT_UNCHECKED flag. +done with bam_next_basemod. hts_base_mod mods[10]; while ((n=bam_next_basemod(b, m, mods, 10, &pos)) > 0) { @@ -534,7 +533,13 @@ int bam_mods_at_next_pos(const bam1_t *b, hts_base_mod_state *state, } /* - * Looks for the next location with a base modification. + * Return data at the next modified location. + * + * bam_mods_at_next_pos does quite a bit of work, so we don't want to + * repeatedly call it for every location until we find a mod. Instead + * we check how many base types we can consume before the next mod, + * and scan through the sequence looking for them. Once we're at that + * site, we defer back to bam_mods_at_next_pos for the return values. 
*/ int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, hts_base_mod *mods, int n_mods, int *pos) { @@ -545,16 +550,21 @@ int bam_next_basemod(const bam1_t *b, hts_base_mod_state *state, // per base type; int next[16], freq[16] = {0}, i; memset(next, 0x7f, 16*sizeof(*next)); + const int unchecked = state->flags & HTS_MOD_REPORT_UNCHECKED; if (b->core.flag & BAM_FREVERSE) { for (i = 0; i < state->nmods; i++) { - if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i]) + if (unchecked && !state->implicit[i]) + next[seqi_rc[state->canonical[i]]] = 1; + else if (next[seqi_rc[state->canonical[i]]] > state->MMcount[i]) next[seqi_rc[state->canonical[i]]] = state->MMcount[i]; } } else { for (i = 0; i < state->nmods; i++) { - if (next[state->canonical[i]] > state->MMcount[i]) + if (unchecked && !state->implicit[i]) + next[state->canonical[i]] = 0; + else if (next[state->canonical[i]] > state->MMcount[i]) next[state->canonical[i]] = state->MMcount[i]; - } + } } // Now step through the sequence counting off base types. diff --git a/test/base_mods/MM-explicit-f.out b/test/base_mods/MM-explicit-f.out new file mode 100644 index 000000000..6462c99b9 --- /dev/null +++ b/test/base_mods/MM-explicit-f.out @@ -0,0 +1,111 @@ +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C +14 C C+m160 C+h20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T +--- +Present: m. h. +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 + +=== + +0 A +1 T +2 C C+m# C+h# +3 A +4 T +5 C C+m# C+h# +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C C+m10 C+h5 +14 C C+m160 C+h20 +15 G +16 C C+m10 C+h5 +17 T +18 A +19 T +20 A +21 G +22 C C+m# C+h# +23 C C+m# C+h# +24 T +--- +Present: m? h? 
+2 C C+m# C+h# +5 C C+m# C+h# +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 +22 C C+m# C+h# +23 C C+m# C+h# + +=== + +0 A +1 T +2 C C+h# +3 A +4 T +5 C C+h# +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+h170 +11 T +12 A +13 C C+h5 +14 C C+m160 C+h20 +15 G +16 C C+h5 +17 T +18 A +19 T +20 A +21 G +22 C C+h# +23 C C+h# +24 T +--- +Present: m. h? +2 C C+h# +5 C C+h# +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 +22 C C+h# +23 C C+h# + +=== + diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index 237f7906c..ff1f7651a 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -39,6 +39,11 @@ P MM-multi.out $test_mod MM-multi.sam P MM-explicit.out $test_mod MM-explicit.sam P MM-explicit-x.out $test_mod -x MM-explicit.sam +# Report bases outside the explicitly called ranges, so we could exclude +# these in any depth based consensus analysis and only gather statistics +# for sites known to be have been scanned. +P MM-explicit-f.out $test_mod -f 1 MM-explicit.sam + # Pileup testing P MM-pileup.out $pileup_mod < MM-pileup.sam P MM-pileup2.out $pileup_mod < MM-pileup2.sam diff --git a/test/test_mod.c b/test/test_mod.c index 43447bb36..e59da4827 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -174,7 +174,7 @@ int main(int argc, char **argv) { if (argc > 1) puts("---"); - bam_parse_basemod(b, m); + bam_parse_basemod2(b, m, flags); // List possible mod choices. 
int *all_mods; @@ -196,11 +196,19 @@ int main(int argc, char **argv) { lp += snprintf(lp, ep - lp, "%d\t%c\t", pos, seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); for (j = 0; j < n && j < 5; j++) { - lp += snprintf(lp, ep - lp, "%c%c%s%d ", + char qstr[10]; + if (mods[j].qual == HTS_MOD_UNCHECKED) + qstr[0] = '#', qstr[1] = 0; + else if (mods[j].qual == HTS_MOD_UNKNOWN) + qstr[0] = '.', qstr[1] = 0; + else + snprintf(qstr, 10, "%d", mods[j].qual); + + lp += snprintf(lp, ep - lp, "%c%c%s%s ", mods[j].canonical_base, "+-"[mods[j].strand], code(mods[j].modified_base), - mods[j].qual); + qstr); } *lp++ = '\n'; *lp++ = 0; From 6e3e8dafa3c5c4b016fc1f0ac8f56eefcd1dda60 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 30 Jun 2023 16:16:41 +0100 Subject: [PATCH 446/488] Fix possible double frees in bcf_hdr_add_hrec() error handling bcf_hdr_add_hrec() should neither call bcf_hrec_destroy(hrec) nor store any pointers to hrec when it returns -1, otherwise double frees or stale pointer dereferences may result. Remove bcf_hrec_destroy(hrec) call that was incorrectly made when handling a hash table insert failure, and move hdr->hrec reallocation so that all possible failures occur before hrec is added into the header. Add bcf_hdr_add_hrec() documentation, including a warning that the caller should not touch hrec after a successful return. --- htslib/vcf.h | 16 ++++++++++++++++ vcf.c | 16 ++++++++++------ 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 0d9f812ce..8bbf480c8 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -691,6 +691,22 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). 
HTSLIB_EXPORT int bcf_hrec_format(const bcf_hrec_t *hrec, kstring_t *str); + /// Add a header record into a header + /** + * @param hdr Destination header + * @param hrec Header record + * @return 0 on success, -1 on failure + * + * If this function returns success, ownership of @p hrec will have + * been transferred to the header structure. It may also have been + * freed if it was a duplicate of a record already in the header. + * Therefore the @p hrec pointer should not be used after a successful + * return from this function. + * + * If this function returns failure, ownership will not have been taken + * and the caller is responsible for cleaning up @p hrec. + */ + HTSLIB_EXPORT int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec); diff --git a/vcf.c b/vcf.c index 392d9c932..6dc7aec32 100644 --- a/vcf.c +++ b/vcf.c @@ -974,23 +974,27 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) return 0; } } + + // New record, needs to be added + int n = hdr->nhrec + 1; + bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*)); + if (!new_hrec) { + free(str.s); + return -1; + } + hdr->hrec = new_hrec; + if ( str.s ) { khint_t k = kh_put(hdict, aux->gen, str.s, &res); if ( res<0 ) { - bcf_hrec_destroy(hrec); free(str.s); return -1; } kh_val(aux->gen,k) = hrec; } - // New record, needs to be added - int n = hdr->nhrec + 1; - bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*)); - if (!new_hrec) return -1; - hdr->hrec = new_hrec; hdr->hrec[hdr->nhrec] = hrec; hdr->dirty = 1; hdr->nhrec = n; From 6ad0fffc8c71d617c8433665b75071f09ab423a3 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 6 Jul 2023 15:29:15 +0100 Subject: [PATCH 447/488] Prevent dangling hrec pointer after bcf_hdr_add_hrec() failure Borrow a bit of code from bcf_hdr_remove() that removes hrec pointers from the hdr->dict[] dictionaries, turn it into bcf_hdr_unregister_hrec() and use it to clean up the dictionary should bcf_hdr_add_hrec() fail. 
There's actually only one place in bcf_hdr_add_hrec() where this needs to be called. In all other paths returning an error, either the hrec type is not one that needs to be cleaned up, or hrec will not have been added to the dictionary. bcf_hdr_remove() is updated to call the new function when it's removing all lines of a given type. The code handling lines with a specific key is unchanged as in that case it already has the key to look up in the dictionary and so doesn't need to hunt for it in the header record. --- vcf.c | 32 ++++++++++++++++++++------------ 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/vcf.c b/vcf.c index 6dc7aec32..9d4029dfc 100644 --- a/vcf.c +++ b/vcf.c @@ -886,6 +886,24 @@ static int bcf_hdr_register_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) return 1; } +static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) +{ + if (hrec->type == BCF_HL_FLT || + hrec->type == BCF_HL_INFO || + hrec->type == BCF_HL_FMT || + hrec->type == BCF_HL_CTG) { + int id = bcf_hrec_find_key(hrec, "ID"); + if (id < 0 || !hrec->vals[id]) + return; + vdict_t *dict = (hrec->type == BCF_HL_CTG + ? (vdict_t*)hdr->dict[BCF_DT_CTG] + : (vdict_t*)hdr->dict[BCF_DT_ID]); + khint_t k = kh_get(vdict, dict, hrec->vals[id]); + if (k != kh_end(dict)) + kh_val(dict, k).hrec[hrec->type==BCF_HL_CTG ?
0 : hrec->type] = NULL; + } +} + int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp) { // currently only for bcf_hdr_set_version @@ -980,6 +998,7 @@ int bcf_hdr_add_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) bcf_hrec_t **new_hrec = realloc(hdr->hrec, n*sizeof(bcf_hrec_t*)); if (!new_hrec) { free(str.s); + bcf_hdr_unregister_hrec(hdr, hrec); return -1; } hdr->hrec = new_hrec; @@ -1184,18 +1203,7 @@ void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) { if ( hdr->hrec[i]->type!=type ) { i++; continue; } hrec = hdr->hrec[i]; - - if ( type==BCF_HL_FLT || type==BCF_HL_INFO || type==BCF_HL_FMT || type== BCF_HL_CTG ) - { - int j = bcf_hrec_find_key(hdr->hrec[i], "ID"); - if ( j>=0 ) - { - vdict_t *d = type==BCF_HL_CTG ? (vdict_t*)hdr->dict[BCF_DT_CTG] : (vdict_t*)hdr->dict[BCF_DT_ID]; - khint_t k = kh_get(vdict, d, hdr->hrec[i]->vals[j]); - kh_val(d, k).hrec[type==BCF_HL_CTG?0:type] = NULL; - } - } - + bcf_hdr_unregister_hrec(hdr, hrec); hdr->dirty = 1; hdr->nhrec--; if ( i < hdr->nhrec ) From 10f1516b75a9bb98dbf4a25c8dcfd01ff2d40975 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 6 Jul 2023 17:31:58 +0100 Subject: [PATCH 448/488] Remove items from hdict in bcf_hdr_remove() As bcf_hdr_remove() deletes hrec structs it needs to ensure that the pointers to them in the bcf_hdr_aux_t::gen dictionary are removed as well, otherwise callers to bcf_hdr_get_hrec() may get a stale pointer to the deleted item. Unfortunately bcf_hdr_remove_from_hdict() needs to allocate some memory to find the items in the dictionary. If that fails it falls back to a search through the dictionary values to find the item, so we can be sure that it will always succeed. Enhance tests to ensure bcf_hdr_get_hrec() returns NULL for removed records. 
--- test/test-vcf-api.c | 33 ++++++++++++++++++++++++++++++++- vcf.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/test/test-vcf-api.c b/test/test-vcf-api.c index 51a9f41f6..f016f1b3f 100644 --- a/test/test-vcf-api.c +++ b/test/test-vcf-api.c @@ -125,6 +125,7 @@ void write_bcf(char *fname) check0(bcf_hdr_append(hdr, "##INFO=")); check0(bcf_hdr_append(hdr, "##FILTER=")); check0(bcf_hdr_append(hdr, "##unused=")); + check0(bcf_hdr_append(hdr, "##unused=")); check0(bcf_hdr_append(hdr, "##unused=unformatted text 1")); check0(bcf_hdr_append(hdr, "##unused=unformatted text 2")); check0(bcf_hdr_append(hdr, "##contig=")); @@ -297,12 +298,42 @@ void bcf_to_vcf(char *fname) if (!out) error("Couldn't open \"%s\" : %s\n", gz_fname, strerror(errno)); bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr); - bcf_hdr_remove(hdr_out,BCF_HL_STR,"unused"); + if (!bcf_hdr_get_hrec(hdr_out, BCF_HL_STR,"ID","BB","unused")) + error("Missing header ##unused="); + bcf_hdr_remove(hdr_out,BCF_HL_STR,"BB"); + if (bcf_hdr_get_hrec(hdr_out, BCF_HL_STR,"ID","BB","unused")) + error("Got pointer to deleted header ##unused="); + + if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_GEN,"unused","unformatted text 1",NULL)) + error("Missing header ##unused=unformatted text 1"); bcf_hdr_remove(hdr_out,BCF_HL_GEN,"unused"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_GEN,"unused","unformatted text 1",NULL)) + error("Got pointer to deleted header ##unused=unformatted text 1"); + + if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_FLT,"ID","Flt",NULL)) + error("Missing header ##FILTER="); bcf_hdr_remove(hdr_out,BCF_HL_FLT,"Flt"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_FLT,"ID","Flt",NULL)) + error("Got pointer to deleted header ##FILTER="); + + if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_INFO,"ID","UI",NULL)) + error("Missing header ##INFO="); bcf_hdr_remove(hdr_out,BCF_HL_INFO,"UI"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_INFO,"ID","UI",NULL)) + error("Got pointer to deleted header ##INFO="); + + 
if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_FMT,"ID","UF",NULL)) + error("Missing header ##INFO="); bcf_hdr_remove(hdr_out,BCF_HL_FMT,"UF"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_FMT,"ID","UF",NULL)) + error("Got pointer to deleted header ##INFO="); + + if (!bcf_hdr_get_hrec(hdr_out,BCF_HL_CTG,"ID","Unused",NULL)) + error("Missing header ##contig="); bcf_hdr_remove(hdr_out,BCF_HL_CTG,"Unused"); + if (bcf_hdr_get_hrec(hdr_out,BCF_HL_FMT,"ID","Unused",NULL)) + error("Got pointer to header ##contig="); + if ( bcf_hdr_write(out, hdr_out)!=0 ) error("Failed to write to %s\n", fname); int r; while ((r = bcf_read1(fp, hdr, rec)) >= 0) diff --git a/vcf.c b/vcf.c index 9d4029dfc..9e589f993 100644 --- a/vcf.c +++ b/vcf.c @@ -904,6 +904,47 @@ static void bcf_hdr_unregister_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec) } } +static void bcf_hdr_remove_from_hdict(bcf_hdr_t *hdr, bcf_hrec_t *hrec) +{ + kstring_t str = KS_INITIALIZE; + bcf_hdr_aux_t *aux = get_hdr_aux(hdr); + khint_t k; + int id; + + switch (hrec->type) { + case BCF_HL_GEN: + if (ksprintf(&str, "##%s=%s", hrec->key,hrec->value) < 0) + str.l = 0; + break; + case BCF_HL_STR: + id = bcf_hrec_find_key(hrec, "ID"); + if (id < 0) + return; + if (!hrec->vals[id] || + ksprintf(&str, "##%s=", hrec->key, hrec->vals[id]) < 0) + str.l = 0; + break; + default: + return; + } + if (str.l) { + k = kh_get(hdict, aux->gen, str.s); + } else { + // Couldn't get a string for some reason, so try the hard way... 
+ for (k = kh_begin(aux->gen); k < kh_end(aux->gen); k++) { + if (kh_exist(aux->gen, k) && kh_val(aux->gen, k) == hrec) + break; + } + } + if (k != kh_end(aux->gen) && kh_val(aux->gen, k) == hrec) { + kh_val(aux->gen, k) = NULL; + free((char *) kh_key(aux->gen, k)); + kh_key(aux->gen, k) = NULL; + kh_del(hdict, aux->gen, k); + } + free(str.s); +} + int bcf_hdr_update_hrec(bcf_hdr_t *hdr, bcf_hrec_t *hrec, const bcf_hrec_t *tmp) { // currently only for bcf_hdr_set_version @@ -1204,6 +1245,7 @@ void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) if ( hdr->hrec[i]->type!=type ) { i++; continue; } hrec = hdr->hrec[i]; bcf_hdr_unregister_hrec(hdr, hrec); + bcf_hdr_remove_from_hdict(hdr, hrec); hdr->dirty = 1; hdr->nhrec--; if ( i < hdr->nhrec ) @@ -1245,6 +1287,7 @@ void bcf_hdr_remove(bcf_hdr_t *hdr, int type, const char *key) } if ( i==hdr->nhrec ) return; hrec = hdr->hrec[i]; + bcf_hdr_remove_from_hdict(hdr, hrec); } hdr->nhrec--; From 2e672f33a860e60e6ce42b77a07713558fb0507d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 10 Jul 2023 09:42:22 +0100 Subject: [PATCH 449/488] Fix decompress_peek_gz to cope with files starting on empty gzip blocks. 
--- hts.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/hts.c b/hts.c index f2bc5fcb6..5ce425212 100644 --- a/hts.c +++ b/hts.c @@ -324,10 +324,24 @@ decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize) zs.avail_out = destsize; if (inflateInit2(&zs, 31) != Z_OK) return -1; - while (zs.total_out < destsize) - if (inflate(&zs, Z_SYNC_FLUSH) != Z_OK) break; + int ret; + unsigned char *last_in = buffer; + while (zs.total_out < destsize) { + ret = inflate(&zs, Z_SYNC_FLUSH); + if (ret == Z_STREAM_END && zs.avail_in && zs.total_out < destsize) { + if (last_in == zs.next_in) + break; // paranoia to avoid potential looping + else + last_in = zs.next_in; + inflateReset(&zs); + continue; + } + if (ret != Z_OK) + break; + } - destsize = zs.total_out; + // zs.total_out can sometimes be wrong as inflateReset resets it + destsize = zs.next_out - dest; inflateEnd(&zs); return destsize; From a809db76f4fa9e1425599c9a0ca4999a1fd88b44 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 13 Jul 2023 14:42:35 +0100 Subject: [PATCH 450/488] Ensure number of modifications is always set in bam_parse_basemod2() Set state->nmods to 0 at the start, in case the function returns early. This ensures calls to bam_mods_at_next_pos() won't try to replay the mods from an earlier line. 
--- sam_mods.c | 1 + test/base_mods/MM-not-all-modded.out | 180 +++++++++++++++++++++++++++ test/base_mods/MM-not-all-modded.sam | 5 + test/base_mods/base-mods.tst | 3 + 4 files changed, 189 insertions(+) create mode 100644 test/base_mods/MM-not-all-modded.out create mode 100644 test/base_mods/MM-not-all-modded.sam diff --git a/sam_mods.c b/sam_mods.c index 3eb042328..fe8db85f7 100644 --- a/sam_mods.c +++ b/sam_mods.c @@ -231,6 +231,7 @@ int bam_parse_basemod2(const bam1_t *b, hts_base_mod_state *state, // Reset position, else upcoming calls may fail on // seq pos - length comparison state->seq_pos = 0; + state->nmods = 0; state->flags = flags; // Read MM and ML tags diff --git a/test/base_mods/MM-not-all-modded.out b/test/base_mods/MM-not-all-modded.out new file mode 100644 index 000000000..f7e3906a5 --- /dev/null +++ b/test/base_mods/MM-not-all-modded.out @@ -0,0 +1,180 @@ +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m128 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n215 +16 A +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m204 +32 A +33 C +34 C C+m230 C+h6 +35 A +--- +Present: m. h. n. +6 C C+m128 +15 N N+n215 +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +31 C C+m204 +34 C C+m230 C+h6 + +=== + +0 A +1 G +2 C +3 T +4 C +5 T +6 C +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N +16 A +17 C +18 G +19 C +20 C +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C +32 A +33 C +34 C +35 A +--- +Present: + +=== + +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m128 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n215 +16 A +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m204 +32 A +33 C +34 C C+m230 C+h6 +35 A +--- +Present: m. h. n. 
+6 C C+m128 +15 N N+n215 +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +31 C C+m204 +34 C C+m230 C+h6 + +=== + +0 A +1 G +2 C +3 T +4 C +5 T +6 C +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N +16 A +17 C +18 G +19 C +20 C +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C +32 A +33 C +34 C +35 A +--- +Present: + +=== + diff --git a/test/base_mods/MM-not-all-modded.sam b/test/base_mods/MM-not-all-modded.sam new file mode 100644 index 000000000..0858c766d --- /dev/null +++ b/test/base_mods/MM-not-all-modded.sam @@ -0,0 +1,5 @@ +@SQ SN:I LN:999 +r1 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r1b 0 I 1 0 36M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A +r2 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A Mm:Z:C+m,2,2,1,4,1;C+h,6,7;N+n,15,2; Ml:B:C,128,153,179,204,230,159,6,215,240 +r2b 0 I 4 0 3S33M * 0 0 AGCTCTCCAGAGTCGNACGCCATYCGCGCGCCACCA DF?GCH88.EG8.7@E9G8A?H9.:C?8,@,,9F@A diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index ff1f7651a..aca4cdea1 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -44,6 +44,9 @@ P MM-explicit-x.out $test_mod -x MM-explicit.sam # for sites known to be have been scanned. P MM-explicit-f.out $test_mod -f 1 MM-explicit.sam +# Ensure state gets reset correctly between reads +P MM-not-all-modded.out $test_mod MM-not-all-modded.sam + # Pileup testing P MM-pileup.out $pileup_mod < MM-pileup.sam P MM-pileup2.out $pileup_mod < MM-pileup2.sam From 5dc826f0bbe3b462f1fdcefb8fcfa6830ee7ecc9 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 13 Jul 2023 14:14:46 +0100 Subject: [PATCH 451/488] Fix to 2e672f33 decompress_peek_gz change. The "zs.total_out < destsize" should have been "zs.avail_out" to be more robust to total_out being reset by inflateReset. 
However looking again neither the avail_in nor avail_out checks are necessary, as once we hit the end of either input or output buffer the next cycle triggers ret == Z_BUF_ERROR and we drop out as normal. Thanks to John Marshall for the spot. --- hts.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/hts.c b/hts.c index 5ce425212..9683948b0 100644 --- a/hts.c +++ b/hts.c @@ -326,21 +326,21 @@ decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize) int ret; unsigned char *last_in = buffer; - while (zs.total_out < destsize) { + while (zs.avail_out > 0) { ret = inflate(&zs, Z_SYNC_FLUSH); - if (ret == Z_STREAM_END && zs.avail_in && zs.total_out < destsize) { + if (ret == Z_STREAM_END) { if (last_in == zs.next_in) - break; // paranoia to avoid potential looping + break; // Paranoia to avoid potential looping. Shouldn't happen else last_in = zs.next_in; inflateReset(&zs); - continue; - } - if (ret != Z_OK) + } else if (ret != Z_OK) { + // eg Z_BUF_ERROR due to avail_in/out becoming zero break; + } } - // zs.total_out can sometimes be wrong as inflateReset resets it + // NB: zs.total_out is changed by inflateReset, so use pointer diff instead destsize = zs.next_out - dest; inflateEnd(&zs); From 85d44d0c7f8edc4e0480654ef42790b7a20d59e8 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 14 Jul 2023 09:06:48 +0100 Subject: [PATCH 452/488] Ensure simple_test_driver.sh cleans up its temporary files --- test/simple_test_driver.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test/simple_test_driver.sh b/test/simple_test_driver.sh index 9ab412511..5bc020eb6 100644 --- a/test/simple_test_driver.sh +++ b/test/simple_test_driver.sh @@ -51,6 +51,7 @@ run_test() { else # Expected non-zero exit code and got it r="P" + rm -f _out.tmp _err.tmp fi elif [ "$p" = "N" ] then @@ -69,11 +70,12 @@ run_test() { # Output differed r="F" y="output" + rm -f _out.tmp2 fi else # Expected zero exit code and got it. 
r="P" - rm -f _out.tmp _out.tmp2 _err.tmp + rm -f _out.tmp _err.tmp fi if [ "$r" = "F" ] @@ -107,6 +109,7 @@ run_test() { ;; *) echo "XFAIL: $@" + rm -f _out.tmp _err.tmp nefail=`expr $nefail + 1` ;; esac From 64ae397c16c48274f25b4ea5a30ce65410ca32af Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 14 Jul 2023 09:12:21 +0100 Subject: [PATCH 453/488] Ensure base mod test result is noticed by the Makefile * Remove line from base-mods.sh which caused it to always return success. * Explicitly check for non-zero return from base mod tests that are expected to work that way. --- test/base_mods/base-mods.sh | 1 - test/base_mods/base-mods.tst | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/base_mods/base-mods.sh b/test/base_mods/base-mods.sh index 388ff369e..f3f3ca4b7 100755 --- a/test/base_mods/base-mods.sh +++ b/test/base_mods/base-mods.sh @@ -31,6 +31,5 @@ test_mod="../test_mod" pileup_mod="../pileup_mod" test_driver $@ -rm _err.tmp _out.tmp exit $? diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index aca4cdea1..55d2d0034 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -23,6 +23,7 @@ # First field: # INIT = initialisation, not counted in testing # P = expected to pass +# N = expected to return non-zero # F = expected to fail # Second field: @@ -54,5 +55,5 @@ P MM-pileup2.out $pileup_mod < MM-pileup2.sam # Validation testing. We just care about exit status here, but the # test data is a copy of MM-pileup.sam so that suffices too. 
P MM-pileup.out $pileup_mod < MM-MNp.sam -F MM-pileup.out $pileup_mod < MM-MNf1.sam -F MM-pileup.out $pileup_mod < MM-MNf2.sam +N MM-pileup.out $pileup_mod < MM-MNf1.sam +N MM-pileup.out $pileup_mod < MM-MNf2.sam From b9e33b7be467614bd4601d162292afdbf541d109 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 14 Jul 2023 09:15:59 +0100 Subject: [PATCH 454/488] Improve test/test_mod.c Stop test/test_mod from writing trailing whitespace, to make committing the outputs easier for developers using pre-commit checks for whitespace issues. Remove array `out`, as its contents were never used, or even set due to conditionals on argc. This also allows the output to be written directly to stdout instead of going through an intermediate buffer. --- test/base_mods/MM-chebi.out | 86 +++---- test/base_mods/MM-double.out | 86 +++---- test/base_mods/MM-explicit-f.out | 192 ++++++++-------- test/base_mods/MM-explicit-x.out | 176 +++++++-------- test/base_mods/MM-explicit.out | 176 +++++++-------- test/base_mods/MM-multi.out | 174 +++++++-------- test/base_mods/MM-not-all-modded.out | 320 +++++++++++++-------------- test/test_mod.c | 67 +++--- 8 files changed, 633 insertions(+), 644 deletions(-) diff --git a/test/base_mods/MM-chebi.out b/test/base_mods/MM-chebi.out index 8df8130df..89970ddf9 100644 --- a/test/base_mods/MM-chebi.out +++ b/test/base_mods/MM-chebi.out @@ -1,48 +1,48 @@ -0 A -1 G -2 C -3 T -4 C -5 T -6 C C+m102 -7 C -8 A -9 G -10 A -11 G -12 T -13 C -14 G -15 N N+n212 -16 A -17 C C+m128 -18 G -19 C C+(76792)161 -20 C C+m153 -21 A -22 T -23 Y -24 C -25 G -26 C -27 G -28 C -29 G -30 C -31 C C+m179 -32 A -33 C -34 C C+m204 C+(76792)33 -35 A +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m102 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n212 +16 A +17 C C+m128 +18 G +19 C C+(76792)161 +20 C C+m153 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m179 +32 A +33 C +34 C C+m204 C+(76792)33 +35 A --- Present: m. #-76792. n. 
-6 C C+m102 -15 N N+n212 -17 C C+m128 -19 C C+(76792)161 -20 C C+m153 -31 C C+m179 -34 C C+m204 C+(76792)33 +6 C C+m102 +15 N N+n212 +17 C C+m128 +19 C C+(76792)161 +20 C C+m153 +31 C C+m179 +34 C C+m204 C+(76792)33 === diff --git a/test/base_mods/MM-double.out b/test/base_mods/MM-double.out index e346192b8..431dfff07 100644 --- a/test/base_mods/MM-double.out +++ b/test/base_mods/MM-double.out @@ -1,48 +1,48 @@ -0 A -1 G G-m115 -2 G -3 A -4 T -5 C -6 T -7 C C+m128 -8 T -9 A -10 G -11 C -12 G G-m141 -13 G G-m166 G+o102 -14 A -15 T -16 C -17 G -18 G -19 C -20 G -21 G -22 G G-m192 -23 G -24 G -25 A -26 T -27 A -28 T -29 G -30 C C+m153 -31 C C+m179 -32 A -33 T -34 A -35 T +0 A +1 G G-m115 +2 G +3 A +4 T +5 C +6 T +7 C C+m128 +8 T +9 A +10 G +11 C +12 G G-m141 +13 G G-m166 G+o102 +14 A +15 T +16 C +17 G +18 G +19 C +20 G +21 G +22 G G-m192 +23 G +24 G +25 A +26 T +27 A +28 T +29 G +30 C C+m153 +31 C C+m179 +32 A +33 T +34 A +35 T --- Present: m. m. o. -1 G G-m115 -7 C C+m128 -12 G G-m141 -13 G G-m166 G+o102 -22 G G-m192 -30 C C+m153 -31 C C+m179 +1 G G-m115 +7 C C+m128 +12 G G-m141 +13 G G-m166 G+o102 +22 G G-m192 +30 C C+m153 +31 C C+m179 === diff --git a/test/base_mods/MM-explicit-f.out b/test/base_mods/MM-explicit-f.out index 6462c99b9..0f7326cd8 100644 --- a/test/base_mods/MM-explicit-f.out +++ b/test/base_mods/MM-explicit-f.out @@ -1,111 +1,111 @@ -0 A -1 T -2 C -3 A -4 T -5 C -6 A -7 T -8 T -9 C C+m200 C+h10 -10 C C+m50 C+h170 -11 T -12 A -13 C -14 C C+m160 C+h20 -15 G -16 C -17 T -18 A -19 T -20 A -21 G -22 C -23 C -24 T +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C +14 C C+m160 C+h20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T --- Present: m. h. 
-9 C C+m200 C+h10 -10 C C+m50 C+h170 -14 C C+m160 C+h20 +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 === -0 A -1 T -2 C C+m# C+h# -3 A -4 T -5 C C+m# C+h# -6 A -7 T -8 T -9 C C+m200 C+h10 -10 C C+m50 C+h170 -11 T -12 A -13 C C+m10 C+h5 -14 C C+m160 C+h20 -15 G -16 C C+m10 C+h5 -17 T -18 A -19 T -20 A -21 G -22 C C+m# C+h# -23 C C+m# C+h# -24 T +0 A +1 T +2 C C+m# C+h# +3 A +4 T +5 C C+m# C+h# +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C C+m10 C+h5 +14 C C+m160 C+h20 +15 G +16 C C+m10 C+h5 +17 T +18 A +19 T +20 A +21 G +22 C C+m# C+h# +23 C C+m# C+h# +24 T --- Present: m? h? -2 C C+m# C+h# -5 C C+m# C+h# -9 C C+m200 C+h10 -10 C C+m50 C+h170 -13 C C+m10 C+h5 -14 C C+m160 C+h20 -16 C C+m10 C+h5 -22 C C+m# C+h# -23 C C+m# C+h# +2 C C+m# C+h# +5 C C+m# C+h# +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 +22 C C+m# C+h# +23 C C+m# C+h# === -0 A -1 T -2 C C+h# -3 A -4 T -5 C C+h# -6 A -7 T -8 T -9 C C+m200 C+h10 -10 C C+h170 -11 T -12 A -13 C C+h5 -14 C C+m160 C+h20 -15 G -16 C C+h5 -17 T -18 A -19 T -20 A -21 G -22 C C+h# -23 C C+h# -24 T +0 A +1 T +2 C C+h# +3 A +4 T +5 C C+h# +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+h170 +11 T +12 A +13 C C+h5 +14 C C+m160 C+h20 +15 G +16 C C+h5 +17 T +18 A +19 T +20 A +21 G +22 C C+h# +23 C C+h# +24 T --- Present: m. h? 
-2 C C+h# -5 C C+h# -9 C C+m200 C+h10 -10 C C+h170 -13 C C+h5 -14 C C+m160 C+h20 -16 C C+h5 -22 C C+h# -23 C C+h# +2 C C+h# +5 C C+h# +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 +22 C C+h# +23 C C+h# === diff --git a/test/base_mods/MM-explicit-x.out b/test/base_mods/MM-explicit-x.out index 4078543ca..8acfbf2fe 100644 --- a/test/base_mods/MM-explicit-x.out +++ b/test/base_mods/MM-explicit-x.out @@ -1,103 +1,103 @@ -0 A -1 T -2 C -3 A -4 T -5 C -6 A -7 T -8 T -9 C C+m.200 C+h.10 -10 C C+m.50 C+h.170 -11 T -12 A -13 C -14 C C+m.160 C+h.20 -15 G -16 C -17 T -18 A -19 T -20 A -21 G -22 C -23 C -24 T +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m.200 C+h.10 +10 C C+m.50 C+h.170 +11 T +12 A +13 C +14 C C+m.160 C+h.20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T --- Present: m. h. -9 C C+m200 C+h10 -10 C C+m50 C+h170 -14 C C+m160 C+h20 +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 === -0 A -1 T -2 C -3 A -4 T -5 C -6 A -7 T -8 T -9 C C+m?200 C+h?10 -10 C C+m?50 C+h?170 -11 T -12 A -13 C C+m?10 C+h?5 -14 C C+m?160 C+h?20 -15 G -16 C C+m?10 C+h?5 -17 T -18 A -19 T -20 A -21 G -22 C -23 C -24 T +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m?200 C+h?10 +10 C C+m?50 C+h?170 +11 T +12 A +13 C C+m?10 C+h?5 +14 C C+m?160 C+h?20 +15 G +16 C C+m?10 C+h?5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T --- Present: m? h? -9 C C+m200 C+h10 -10 C C+m50 C+h170 -13 C C+m10 C+h5 -14 C C+m160 C+h20 -16 C C+m10 C+h5 +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 === -0 A -1 T -2 C -3 A -4 T -5 C -6 A -7 T -8 T -9 C C+m.200 C+h?10 -10 C C+h?170 -11 T -12 A -13 C C+h?5 -14 C C+m.160 C+h?20 -15 G -16 C C+h?5 -17 T -18 A -19 T -20 A -21 G -22 C -23 C -24 T +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m.200 C+h?10 +10 C C+h?170 +11 T +12 A +13 C C+h?5 +14 C C+m.160 C+h?20 +15 G +16 C C+h?5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T --- Present: m. h? 
-9 C C+m200 C+h10 -10 C C+h170 -13 C C+h5 -14 C C+m160 C+h20 -16 C C+h5 +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 === diff --git a/test/base_mods/MM-explicit.out b/test/base_mods/MM-explicit.out index 186e790ae..0f3701fcd 100644 --- a/test/base_mods/MM-explicit.out +++ b/test/base_mods/MM-explicit.out @@ -1,103 +1,103 @@ -0 A -1 T -2 C -3 A -4 T -5 C -6 A -7 T -8 T -9 C C+m200 C+h10 -10 C C+m50 C+h170 -11 T -12 A -13 C -14 C C+m160 C+h20 -15 G -16 C -17 T -18 A -19 T -20 A -21 G -22 C -23 C -24 T +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C +14 C C+m160 C+h20 +15 G +16 C +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T --- Present: m. h. -9 C C+m200 C+h10 -10 C C+m50 C+h170 -14 C C+m160 C+h20 +9 C C+m200 C+h10 +10 C C+m50 C+h170 +14 C C+m160 C+h20 === -0 A -1 T -2 C -3 A -4 T -5 C -6 A -7 T -8 T -9 C C+m200 C+h10 -10 C C+m50 C+h170 -11 T -12 A -13 C C+m10 C+h5 -14 C C+m160 C+h20 -15 G -16 C C+m10 C+h5 -17 T -18 A -19 T -20 A -21 G -22 C -23 C -24 T +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+m50 C+h170 +11 T +12 A +13 C C+m10 C+h5 +14 C C+m160 C+h20 +15 G +16 C C+m10 C+h5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T --- Present: m? h? -9 C C+m200 C+h10 -10 C C+m50 C+h170 -13 C C+m10 C+h5 -14 C C+m160 C+h20 -16 C C+m10 C+h5 +9 C C+m200 C+h10 +10 C C+m50 C+h170 +13 C C+m10 C+h5 +14 C C+m160 C+h20 +16 C C+m10 C+h5 === -0 A -1 T -2 C -3 A -4 T -5 C -6 A -7 T -8 T -9 C C+m200 C+h10 -10 C C+h170 -11 T -12 A -13 C C+h5 -14 C C+m160 C+h20 -15 G -16 C C+h5 -17 T -18 A -19 T -20 A -21 G -22 C -23 C -24 T +0 A +1 T +2 C +3 A +4 T +5 C +6 A +7 T +8 T +9 C C+m200 C+h10 +10 C C+h170 +11 T +12 A +13 C C+h5 +14 C C+m160 C+h20 +15 G +16 C C+h5 +17 T +18 A +19 T +20 A +21 G +22 C +23 C +24 T --- Present: m. h? 
-9 C C+m200 C+h10 -10 C C+h170 -13 C C+h5 -14 C C+m160 C+h20 -16 C C+h5 +9 C C+m200 C+h10 +10 C C+h170 +13 C C+h5 +14 C C+m160 C+h20 +16 C C+h5 === diff --git a/test/base_mods/MM-multi.out b/test/base_mods/MM-multi.out index 73b480b80..41054a7c0 100644 --- a/test/base_mods/MM-multi.out +++ b/test/base_mods/MM-multi.out @@ -1,97 +1,97 @@ -0 A -1 G -2 C -3 T -4 C -5 T -6 C C+m128 -7 C -8 A -9 G -10 A -11 G -12 T -13 C -14 G -15 N N+n215 -16 A -17 C C+m153 -18 G N+n240 -19 C C+h159 -20 C C+m179 -21 A -22 T -23 Y -24 C -25 G -26 C -27 G -28 C -29 G -30 C -31 C C+m204 -32 A -33 C -34 C C+m230 C+h6 -35 A +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m128 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n215 +16 A +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m204 +32 A +33 C +34 C C+m230 C+h6 +35 A --- Present: m. h. n. -6 C C+m128 -15 N N+n215 -17 C C+m153 -18 G N+n240 -19 C C+h159 -20 C C+m179 -31 C C+m204 -34 C C+m230 C+h6 +6 C C+m128 +15 N N+n215 +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +31 C C+m204 +34 C C+m230 C+h6 === -0 A -1 G -2 C -3 T -4 C -5 T -6 C C+m77 C+h159 -7 C -8 A -9 G -10 A -11 G -12 T -13 C -14 G -15 N N+n240 -16 A -17 C C+m103 C+h133 -18 G -19 C C+m128 C+h108 -20 C C+m154 C+h82 -21 A -22 T -23 Y -24 C -25 G -26 C -27 G -28 C -29 G -30 C -31 C C+m179 C+h57 -32 A -33 C -34 C C+m204 C+h31 -35 A +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m77 C+h159 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n240 +16 A +17 C C+m103 C+h133 +18 G +19 C C+m128 C+h108 +20 C C+m154 C+h82 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m179 C+h57 +32 A +33 C +34 C C+m204 C+h31 +35 A --- Present: m. h. n. 
-6 C C+m77 C+h159 -15 N N+n240 -17 C C+m103 C+h133 -19 C C+m128 C+h108 -20 C C+m154 C+h82 -31 C C+m179 C+h57 -34 C C+m204 C+h31 +6 C C+m77 C+h159 +15 N N+n240 +17 C C+m103 C+h133 +19 C C+m128 C+h108 +20 C C+m154 C+h82 +31 C C+m179 C+h57 +34 C C+m204 C+h31 === diff --git a/test/base_mods/MM-not-all-modded.out b/test/base_mods/MM-not-all-modded.out index f7e3906a5..64fc847e2 100644 --- a/test/base_mods/MM-not-all-modded.out +++ b/test/base_mods/MM-not-all-modded.out @@ -1,178 +1,178 @@ -0 A -1 G -2 C -3 T -4 C -5 T -6 C C+m128 -7 C -8 A -9 G -10 A -11 G -12 T -13 C -14 G -15 N N+n215 -16 A -17 C C+m153 -18 G N+n240 -19 C C+h159 -20 C C+m179 -21 A -22 T -23 Y -24 C -25 G -26 C -27 G -28 C -29 G -30 C -31 C C+m204 -32 A -33 C -34 C C+m230 C+h6 -35 A +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m128 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n215 +16 A +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m204 +32 A +33 C +34 C C+m230 C+h6 +35 A --- Present: m. h. n. 
-6 C C+m128 -15 N N+n215 -17 C C+m153 -18 G N+n240 -19 C C+h159 -20 C C+m179 -31 C C+m204 -34 C C+m230 C+h6 +6 C C+m128 +15 N N+n215 +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +31 C C+m204 +34 C C+m230 C+h6 === -0 A -1 G -2 C -3 T -4 C -5 T -6 C -7 C -8 A -9 G -10 A -11 G -12 T -13 C -14 G -15 N -16 A -17 C -18 G -19 C -20 C -21 A -22 T -23 Y -24 C -25 G -26 C -27 G -28 C -29 G -30 C -31 C -32 A -33 C -34 C -35 A +0 A +1 G +2 C +3 T +4 C +5 T +6 C +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N +16 A +17 C +18 G +19 C +20 C +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C +32 A +33 C +34 C +35 A --- Present: === -0 A -1 G -2 C -3 T -4 C -5 T -6 C C+m128 -7 C -8 A -9 G -10 A -11 G -12 T -13 C -14 G -15 N N+n215 -16 A -17 C C+m153 -18 G N+n240 -19 C C+h159 -20 C C+m179 -21 A -22 T -23 Y -24 C -25 G -26 C -27 G -28 C -29 G -30 C -31 C C+m204 -32 A -33 C -34 C C+m230 C+h6 -35 A +0 A +1 G +2 C +3 T +4 C +5 T +6 C C+m128 +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N N+n215 +16 A +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C C+m204 +32 A +33 C +34 C C+m230 C+h6 +35 A --- Present: m. h. n. 
-6 C C+m128 -15 N N+n215 -17 C C+m153 -18 G N+n240 -19 C C+h159 -20 C C+m179 -31 C C+m204 -34 C C+m230 C+h6 +6 C C+m128 +15 N N+n215 +17 C C+m153 +18 G N+n240 +19 C C+h159 +20 C C+m179 +31 C C+m204 +34 C C+m230 C+h6 === -0 A -1 G -2 C -3 T -4 C -5 T -6 C -7 C -8 A -9 G -10 A -11 G -12 T -13 C -14 G -15 N -16 A -17 C -18 G -19 C -20 C -21 A -22 T -23 Y -24 C -25 G -26 C -27 G -28 C -29 G -30 C -31 C -32 A -33 C -34 C -35 A +0 A +1 G +2 C +3 T +4 C +5 T +6 C +7 C +8 A +9 G +10 A +11 G +12 T +13 C +14 G +15 N +16 A +17 C +18 G +19 C +20 C +21 A +22 T +23 Y +24 C +25 G +26 C +27 G +28 C +29 G +30 C +31 C +32 A +33 C +34 C +35 A --- Present: diff --git a/test/test_mod.c b/test/test_mod.c index e59da4827..89fb54243 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -86,7 +86,6 @@ static char *code(int id) { } int main(int argc, char **argv) { - char out[1024] = {0}; int extended = 0; uint32_t flags = 0; @@ -126,10 +125,9 @@ int main(int argc, char **argv) { int i, j, n; hts_base_mod mods[5]; for (i = 0; i < b->core.l_qseq; i++) { - char line[8192], *lp = line, *ep = line + sizeof(line); + char sp = '\t'; n = bam_mods_at_next_pos(b, m, mods, 5); - lp += snprintf(lp, ep - lp, "%d\t%c\t", - i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); + printf("%d\t%c", i, seq_nt16_str[bam_seqi(bam_get_seq(b), i)]); for (j = 0; j < n && j < 5; j++) { char qstr[10]; if (mods[j].qual == HTS_MOD_UNCHECKED) @@ -149,30 +147,25 @@ int main(int argc, char **argv) { m_canonical != mods[j].canonical_base || m_strand != mods[j].strand) goto err; - lp += snprintf(lp, ep - lp, "%c%c%s%c%s ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - "?."[m_implicit], - qstr); + printf("%c%c%c%s%c%s", + sp, mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + "?."[m_implicit], + qstr); } else { - lp += snprintf(lp, ep - lp, "%c%c%s%s ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - qstr); + printf("%c%c%c%s%s", 
+ sp, mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + qstr); } + sp = ' '; } - *lp++ = '\n'; - *lp++ = 0; - - if (argc > 1) - printf("%s", line); - else - strcat(out, line); + putchar('\n'); } - if (argc > 1) puts("---"); + puts("---"); bam_parse_basemod2(b, m, flags); @@ -192,9 +185,9 @@ int main(int argc, char **argv) { int pos; while ((n=bam_next_basemod(b, m, mods, 5, &pos)) > 0) { - char line[8192]={0}, *lp = line, *ep = line + sizeof(line); - lp += snprintf(lp, ep - lp, "%d\t%c\t", pos, - seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); + char sp = '\t'; + printf("%d\t%c", pos, + seq_nt16_str[bam_seqi(bam_get_seq(b), pos)]); for (j = 0; j < n && j < 5; j++) { char qstr[10]; if (mods[j].qual == HTS_MOD_UNCHECKED) @@ -204,24 +197,20 @@ int main(int argc, char **argv) { else snprintf(qstr, 10, "%d", mods[j].qual); - lp += snprintf(lp, ep - lp, "%c%c%s%s ", - mods[j].canonical_base, - "+-"[mods[j].strand], - code(mods[j].modified_base), - qstr); + printf("%c%c%c%s%s", + sp, mods[j].canonical_base, + "+-"[mods[j].strand], + code(mods[j].modified_base), + qstr); + sp = ' '; } - *lp++ = '\n'; - *lp++ = 0; - - if (argc > 1) - printf("%s", line); - else - strcat(out, line); + putchar('\n'); } + if (n < 0) goto err; - if (argc > 1) puts("\n===\n"); + puts("\n===\n"); } fflush(stdout); if (sam_close(in) != 0 || r < -1) From 27007583af8884582b57a23d5aa43a5665928de1 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Wed, 19 Jul 2023 23:53:23 +1200 Subject: [PATCH 455/488] Add missing Makefile dependencies [minor] --- Makefile | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/Makefile b/Makefile index 83b49609d..4ed5f1086 100644 --- a/Makefile +++ b/Makefile @@ -478,7 +478,7 @@ textutils.o textutils.pico: textutils.c config.h $(htslib_hfile_h) $(htslib_kstr cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(htslib_hts_endian_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) 
$(cram_h) cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) $(textutils_internal_h) -cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htslib_hfile_h) $(cram_h) +cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htscodecs_rANS_static4x16_h) $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) @@ -735,17 +735,17 @@ htscodecs/tests/varint: htscodecs/tests/varint_test.o $(HTSCODECS_OBJS) $(CC) $(LDFLAGS) -o $@ $^ $(LIBS) -lm -lpthread htscodecs/tests/arith_dynamic_test.o: CPPFLAGS += -Ihtscodecs -htscodecs/tests/arith_dynamic_test.o: htscodecs/tests/arith_dynamic_test.c $(htscodecs_arith_dynamic_h) +htscodecs/tests/arith_dynamic_test.o: htscodecs/tests/arith_dynamic_test.c config.h $(htscodecs_arith_dynamic_h) htscodecs/tests/fqzcomp_qual_test.o: CPPFLAGS += -Ihtscodecs -htscodecs/tests/fqzcomp_qual_test.o: htscodecs/tests/fqzcomp_qual_test.c $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) +htscodecs/tests/fqzcomp_qual_test.o: htscodecs/tests/fqzcomp_qual_test.c config.h $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) htscodecs/tests/rANS_static4x16pr_test.o: CPPFLAGS += -Ihtscodecs -htscodecs/tests/rANS_static4x16pr_test.o: 
htscodecs/tests/rANS_static4x16pr_test.c $(htscodecs_rANS_static4x16_h) +htscodecs/tests/rANS_static4x16pr_test.o: htscodecs/tests/rANS_static4x16pr_test.c config.h $(htscodecs_rANS_static4x16_h) htscodecs/tests/rANS_static_test.o: CPPFLAGS += -Ihtscodecs -htscodecs/tests/rANS_static_test.o: htscodecs/tests/rANS_static_test.c $(htscodecs_rANS_static_h) +htscodecs/tests/rANS_static_test.o: htscodecs/tests/rANS_static_test.c config.h $(htscodecs_rANS_static_h) htscodecs/tests/tokenise_name3_test.o: CPPFLAGS += -Ihtscodecs -htscodecs/tests/tokenise_name3_test.o: htscodecs/tests/tokenise_name3_test.c $(htscodecs_tokenise_name3_h) +htscodecs/tests/tokenise_name3_test.o: htscodecs/tests/tokenise_name3_test.c config.h $(htscodecs_tokenise_name3_h) htscodecs/tests/varint_test.o: CPPFLAGS += -Ihtscodecs -htscodecs/tests/varint_test.o: htscodecs/tests/varint_test.c $(htscodecs_varint_h) +htscodecs/tests/varint_test.o: htscodecs/tests/varint_test.c config.h $(htscodecs_varint_h) test/hts_endian.o: test/hts_endian.c config.h $(htslib_hts_endian_h) test/fuzz/hts_open_fuzzer.o: test/fuzz/hts_open_fuzzer.c config.h $(htslib_hfile_h) $(htslib_hts_h) $(htslib_sam_h) $(htslib_vcf_h) @@ -755,7 +755,7 @@ test/pileup.o: test/pileup.c config.h $(htslib_sam_h) $(htslib_kstring_h) test/pileup_mod.o: test/pileup_mod.c config.h $(htslib_sam_h) test/plugins-dlhts.o: test/plugins-dlhts.c config.h test/sam.o: test/sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_faidx_h) $(htslib_khash_h) $(htslib_hts_log_h) -test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hfile_internal_h) +test/test_bgzf.o: test/test_bgzf.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(htslib_hts_log_h) $(hfile_internal_h) test/test_expr.o: test/test_expr.c config.h $(htslib_hts_expr_h) test/test_kfunc.o: test/test_kfunc.c config.h $(htslib_kfunc_h) test/test_kstring.o: test/test_kstring.c config.h $(htslib_kstring_h) @@ -770,7 +770,7 @@ test/test_faidx.o: test/test_faidx.c 
config.h $(htslib_faidx_h) test/test_index.o: test/test_index.c config.h $(htslib_sam_h) $(htslib_vcf_h) test/test-vcf-api.o: test/test-vcf-api.c config.h $(htslib_hts_h) $(htslib_vcf_h) $(htslib_kstring_h) $(htslib_kseq_h) test/test-vcf-sweep.o: test/test-vcf-sweep.c config.h $(htslib_vcf_sweep_h) -test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h) +test/test-bcf-sr.o: test/test-bcf-sr.c config.h $(htslib_synced_bcf_reader_h) $(htslib_hts_h) $(htslib_vcf_h) test/test-bcf-translate.o: test/test-bcf-translate.c config.h $(htslib_vcf_h) test/test_introspection.o: test/test_introspection.c config.h $(htslib_hts_h) $(htslib_hfile_h) test/test-bcf_set_variant_type.o: test/test-bcf_set_variant_type.c config.h $(htslib_hts_h) vcf.c From 6285a68f81464a5b96639034cf6aa2c49069f920 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 20 Jul 2023 00:13:55 +1200 Subject: [PATCH 456/488] Make last_in a pointer to const [minor] Zlib can be configured such that zs.next_in is a pointer to const. 
--- hts.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hts.c b/hts.c index 9683948b0..b7b528a61 100644 --- a/hts.c +++ b/hts.c @@ -325,7 +325,7 @@ decompress_peek_gz(hFILE *fp, unsigned char *dest, size_t destsize) if (inflateInit2(&zs, 31) != Z_OK) return -1; int ret; - unsigned char *last_in = buffer; + const unsigned char *last_in = buffer; while (zs.avail_out > 0) { ret = inflate(&zs, Z_SYNC_FLUSH); if (ret == Z_STREAM_END) { From 89141250d1c5262b2cfc5d41b05d1a30f2c7092e Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 20 Jul 2023 11:42:58 +0100 Subject: [PATCH 457/488] Switch to htscodecs 1.5.1 * Trivial bug fix to tests/r4x16pr demo * Updates for 1.5.1 release --- htscodecs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htscodecs b/htscodecs index 109f06949..11b5007ff 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 109f069490fca15d85e2d261822c15bc3080db8a +Subproject commit 11b5007ffb68bea9f6c777874a215e4187ce659a From 2051536d4ccc99690db292bd9858f2a455e0fba8 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 13 Jul 2023 09:52:07 +0100 Subject: [PATCH 458/488] Add fai_thread_pool interface. This is a simple skim around bgzf_thread_pool. Also added a missing @param qsize to the bgzf_thread_pool documentation. Fixes #1638 --- faidx.c | 5 +++++ htslib/bgzf.h | 2 ++ htslib/faidx.h | 12 ++++++++++++ 3 files changed, 19 insertions(+) diff --git a/faidx.c b/faidx.c index 2eb0f3edc..5dd4bf1c0 100644 --- a/faidx.c +++ b/faidx.c @@ -975,6 +975,11 @@ void fai_set_cache_size(faidx_t *fai, int cache_size) { bgzf_set_cache_size(fai->bgzf, cache_size); } +// Adds a thread pool to the underlying BGZF layer. 
+int fai_thread_pool(faidx_t *fai, struct hts_tpool *pool, int qsize) { + return bgzf_thread_pool(fai->bgzf, pool, qsize); +} + char *fai_path(const char *fa) { char *fai = NULL; if (!fa) { diff --git a/htslib/bgzf.h b/htslib/bgzf.h index cb789ad53..97788a6fe 100644 --- a/htslib/bgzf.h +++ b/htslib/bgzf.h @@ -322,6 +322,8 @@ typedef struct BGZF BGZF; * * @param fp BGZF file handler * @param pool The thread pool (see hts_create_threads) + * @param qsize The size of the job queue. If 0 this is twice the + * number of threads in the pool. */ HTSLIB_EXPORT int bgzf_thread_pool(BGZF *fp, struct hts_tpool *pool, int qsize); diff --git a/htslib/faidx.h b/htslib/faidx.h index c1b3090a5..c3a4c95f6 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -70,6 +70,9 @@ struct faidx_t; /// Opaque structure representing FASTA index typedef struct faidx_t faidx_t; +/// Opaque structure; sole item needed from htslib/thread_pool.h +struct hts_tpool; + /// File format to be dealing with. enum fai_format_options { FAI_NONE, @@ -357,6 +360,15 @@ int fai_adjust_region(const faidx_t *fai, int tid, HTSLIB_EXPORT void fai_set_cache_size(faidx_t *fai, int cache_size); +/// Adds a thread pool to the underlying BGZF layer. +/** @param fai FAI file handler + * @param pool The thread pool (see hts_create_threads) + * @param qsize The size of the job queue. If 0 this is twice the + * number of threads in the pool. + */ +HTSLIB_EXPORT +int fai_thread_pool(faidx_t *fai, struct hts_tpool *pool, int qsize); + /// Determines the path to the reference index file /** @param fa String with the path to the reference file * @return String with the path to the reference index file, or NULL on failure From 4dbb9913f7d535b85383c4257df8b1a7301bfbd8 Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Thu, 20 Jul 2023 16:01:23 +0100 Subject: [PATCH 459/488] Summer 2023 copyright update. 
--- bgzf.c | 2 +- configure.ac | 2 +- cram/cram_decode.c | 2 +- cram/cram_index.c | 2 +- cram/cram_structs.h | 2 +- hfile_s3.c | 2 +- hts_probe_cc.sh | 2 +- htslib/bgzf.h | 2 +- htslib/faidx.h | 2 +- htslib/kseq.h | 2 +- htslib/vcf.h | 2 +- probaln.c | 2 +- tbx.c | 2 +- test/base_mods/base-mods.tst | 2 +- test/test-bcf-sr.c | 2 +- test/test-bcf-sr.pl | 2 +- test/test-regidx.c | 2 +- test/test-vcf-api.c | 2 +- test/test.pl | 2 +- test/test_bgzf.c | 2 +- test/test_mod.c | 2 +- 21 files changed, 21 insertions(+), 21 deletions(-) diff --git a/bgzf.c b/bgzf.c index 468289106..45f2b1150 100644 --- a/bgzf.c +++ b/bgzf.c @@ -2,7 +2,7 @@ Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos - Copyright (C) 2009, 2013-2021 Genome Research Ltd + Copyright (C) 2009, 2013-2022 Genome Research Ltd Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/configure.ac b/configure.ac index 3230f3935..c1afb38e7 100644 --- a/configure.ac +++ b/configure.ac @@ -35,7 +35,7 @@ m4_include([m4/hts_hide_dynamic_syms.m4]) m4_include([m4/pkg.m4]) dnl Copyright notice to be copied into the generated configure script -AC_COPYRIGHT([Portions copyright (C) 2020-2021 Genome Research Ltd. +AC_COPYRIGHT([Portions copyright (C) 2020-2023 Genome Research Ltd. This configure script is free software: you are free to change and redistribute it. There is NO WARRANTY, to the extent permitted by law.]) diff --git a/cram/cram_decode.c b/cram/cram_decode.c index 47b7ed076..26c7c1fac 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2020, 2022 Genome Research Ltd. +Copyright (c) 2012-2020, 2022-2023 Genome Research Ltd. 
Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/cram/cram_index.c b/cram/cram_index.c index 39bc7cae0..b775e9431 100644 --- a/cram/cram_index.c +++ b/cram/cram_index.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2013-2020 Genome Research Ltd. +Copyright (c) 2013-2020, 2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/cram/cram_structs.h b/cram/cram_structs.h index 15b7f145b..160663392 100644 --- a/cram/cram_structs.h +++ b/cram/cram_structs.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2012-2016, 2018-2020 Genome Research Ltd. +Copyright (c) 2012-2016, 2018-2020, 2023 Genome Research Ltd. Author: James Bonfield Redistribution and use in source and binary forms, with or without diff --git a/hfile_s3.c b/hfile_s3.c index 2ce7feb4b..e2718f656 100644 --- a/hfile_s3.c +++ b/hfile_s3.c @@ -1,6 +1,6 @@ /* hfile_s3.c -- Amazon S3 backend for low-level file streams. - Copyright (C) 2015-2017, 2019-2022 Genome Research Ltd. + Copyright (C) 2015-2017, 2019-2023 Genome Research Ltd. Author: John Marshall diff --git a/hts_probe_cc.sh b/hts_probe_cc.sh index 5e5ddec1e..71c6f5d01 100755 --- a/hts_probe_cc.sh +++ b/hts_probe_cc.sh @@ -2,7 +2,7 @@ # Check compiler options for non-configure builds and create Makefile fragment # -# Copyright (C) 2022 Genome Research Ltd. +# Copyright (C) 2022-2023 Genome Research Ltd. 
# # Author: Rob Davies # diff --git a/htslib/bgzf.h b/htslib/bgzf.h index 97788a6fe..ea4ec3ece 100644 --- a/htslib/bgzf.h +++ b/htslib/bgzf.h @@ -3,7 +3,7 @@ /* Copyright (c) 2008 Broad Institute / Massachusetts Institute of Technology 2011, 2012 Attractive Chaos - Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022 Genome Research Ltd + Copyright (C) 2009, 2013, 2014, 2017, 2018-2019, 2022-2023 Genome Research Ltd Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/htslib/faidx.h b/htslib/faidx.h index c3a4c95f6..4351b3fbe 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -1,7 +1,7 @@ /// @file htslib/faidx.h /// FASTA random access. /* - Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2020, 2022 Genome Research Ltd. + Copyright (C) 2008, 2009, 2013, 2014, 2016, 2017-2020, 2022-2023 Genome Research Ltd. Author: Heng Li diff --git a/htslib/kseq.h b/htslib/kseq.h index 5d573d3d9..ea887f14c 100644 --- a/htslib/kseq.h +++ b/htslib/kseq.h @@ -1,7 +1,7 @@ /* The MIT License Copyright (c) 2008, 2009, 2011 Attractive Chaos - Copyright (C) 2013, 2018, 2020 Genome Research Ltd. + Copyright (C) 2013, 2018, 2020, 2023 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/htslib/vcf.h b/htslib/vcf.h index 8bbf480c8..83659ae12 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -2,7 +2,7 @@ /// High-level VCF/BCF variant calling file operations. /* Copyright (C) 2012, 2013 Broad Institute. - Copyright (C) 2012-2020, 2022 Genome Research Ltd. + Copyright (C) 2012-2020, 2022-2023 Genome Research Ltd. Author: Heng Li diff --git a/probaln.c b/probaln.c index c841c7522..b42f85685 100644 --- a/probaln.c +++ b/probaln.c @@ -1,7 +1,7 @@ /* The MIT License Copyright (C) 2003-2006, 2008-2010 by Heng Li - Copyright (C) 2016-2017, 2020 Genome Research Ltd. 
+ Copyright (C) 2016-2017, 2020, 2023 Genome Research Ltd. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the diff --git a/tbx.c b/tbx.c index 154e2a81b..c2c5c6f9d 100644 --- a/tbx.c +++ b/tbx.c @@ -1,6 +1,6 @@ /* tbx.c -- tabix API functions. - Copyright (C) 2009, 2010, 2012-2015, 2017-2020, 2022 Genome Research Ltd. + Copyright (C) 2009, 2010, 2012-2015, 2017-2020, 2022-2023 Genome Research Ltd. Copyright (C) 2010-2012 Broad Institute. Author: Heng Li diff --git a/test/base_mods/base-mods.tst b/test/base_mods/base-mods.tst index 55d2d0034..889c3780e 100644 --- a/test/base_mods/base-mods.tst +++ b/test/base_mods/base-mods.tst @@ -1,4 +1,4 @@ -# Copyright (C) 2020 Genome Research Ltd. +# Copyright (C) 2020, 2023 Genome Research Ltd. # # Author: James Bonfield # diff --git a/test/test-bcf-sr.c b/test/test-bcf-sr.c index e91af7f28..80daf0423 100644 --- a/test/test-bcf-sr.c +++ b/test/test-bcf-sr.c @@ -1,5 +1,5 @@ /* - Copyright (C) 2017, 2020 Genome Research Ltd. + Copyright (C) 2017, 2020, 2023 Genome Research Ltd. Author: Petr Danecek diff --git a/test/test-bcf-sr.pl b/test/test-bcf-sr.pl index 2e290cb3a..5c32e00f4 100755 --- a/test/test-bcf-sr.pl +++ b/test/test-bcf-sr.pl @@ -1,7 +1,7 @@ #!/usr/bin/env perl # test-bcf-sr.pl -- Test bcf synced reader's allele pairing # -# Copyright (C) 2017-2018, 2020 Genome Research Ltd. +# Copyright (C) 2017-2018, 2020, 2023 Genome Research Ltd. # # Author: petr.danecek@sanger # diff --git a/test/test-regidx.c b/test/test-regidx.c index 4cad440c7..4ba623734 100644 --- a/test/test-regidx.c +++ b/test/test-regidx.c @@ -2,7 +2,7 @@ gcc -g -Wall -O0 -I. -I../htslib/ -L../htslib regidx.c -o test-regidx test-regidx.c -lhts - Copyright (C) 2014,2016,2018, 2020 Genome Research Ltd. + Copyright (C) 2014,2016,2018, 2020, 2023 Genome Research Ltd. 
Author: Petr Danecek diff --git a/test/test-vcf-api.c b/test/test-vcf-api.c index f016f1b3f..eff653686 100644 --- a/test/test-vcf-api.c +++ b/test/test-vcf-api.c @@ -1,6 +1,6 @@ /* test/test-vcf-api.c -- VCF test harness. - Copyright (C) 2013, 2014, 2017-2021 Genome Research Ltd. + Copyright (C) 2013, 2014, 2017-2021, 2023 Genome Research Ltd. Author: Petr Danecek diff --git a/test/test.pl b/test/test.pl index f8e94faa3..566e7cfce 100755 --- a/test/test.pl +++ b/test/test.pl @@ -1,6 +1,6 @@ #!/usr/bin/env perl # -# Copyright (C) 2012-2022 Genome Research Ltd. +# Copyright (C) 2012-2023 Genome Research Ltd. # # Author: Petr Danecek # diff --git a/test/test_bgzf.c b/test/test_bgzf.c index a5084e6c6..6cb6db902 100644 --- a/test/test_bgzf.c +++ b/test/test_bgzf.c @@ -1,6 +1,6 @@ /* test/test_bgzf.c -- bgzf unit tests - Copyright (C) 2017, 2019 Genome Research Ltd + Copyright (C) 2017, 2019, 2022-2023 Genome Research Ltd Author: Robert Davies diff --git a/test/test_mod.c b/test/test_mod.c index 89fb54243..d8a53f3de 100644 --- a/test/test_mod.c +++ b/test/test_mod.c @@ -1,6 +1,6 @@ /* test/test_mod.c -- testing of base modification functions - Copyright (C) 2020-2021 Genome Research Ltd. + Copyright (C) 2020-2021, 2023 Genome Research Ltd. Author: James Bonfield From d06b5988058e49301178d1784b3dc077bde34000 Mon Sep 17 00:00:00 2001 From: vasudeva8 Date: Fri, 3 Mar 2023 11:55:32 +0000 Subject: [PATCH 460/488] Demonstration of htslib/sam api usage. The README.md file is a basic introduction to the sample code in this directory, while DEMO.md has more detailed information per program and API calls introduced. This is a work in progress and updated documentation may be available on www.htslib.org. 
--- samples/DEMO.md | 1432 +++++++++++++++++++++++++++++++++ samples/Makefile | 106 +++ samples/README.md | 229 ++++++ samples/add_header.c | 128 +++ samples/cram.c | 168 ++++ samples/dump_aux.c | 188 +++++ samples/flags_demo.c | 110 +++ samples/flags_htsopt_field.c | 115 +++ samples/index_multireg_read.c | 150 ++++ samples/index_reg_read.c | 143 ++++ samples/index_write.c | 166 ++++ samples/mod_aux.c | 221 +++++ samples/mod_aux_ba.c | 147 ++++ samples/mod_bam.c | 229 ++++++ samples/modstate.c | 190 +++++ samples/mpileup.c | 204 +++++ samples/pileup.c | 183 +++++ samples/pileup_mod.c | 218 +++++ samples/read_aux.c | 207 +++++ samples/read_bam.c | 139 ++++ samples/read_fast.c | 116 +++ samples/read_header.c | 173 ++++ samples/read_refname.c | 125 +++ samples/rem_header.c | 138 ++++ samples/sample.ref.fa | 4 + samples/sample.sam | 29 + samples/split.c | 153 ++++ samples/split2.c | 158 ++++ samples/split_thread1.c | 161 ++++ samples/split_thread2.c | 171 ++++ samples/update_header.c | 131 +++ samples/write_fast.c | 101 +++ 32 files changed, 6133 insertions(+) create mode 100644 samples/DEMO.md create mode 100644 samples/Makefile create mode 100644 samples/README.md create mode 100644 samples/add_header.c create mode 100644 samples/cram.c create mode 100644 samples/dump_aux.c create mode 100644 samples/flags_demo.c create mode 100644 samples/flags_htsopt_field.c create mode 100644 samples/index_multireg_read.c create mode 100644 samples/index_reg_read.c create mode 100644 samples/index_write.c create mode 100644 samples/mod_aux.c create mode 100644 samples/mod_aux_ba.c create mode 100644 samples/mod_bam.c create mode 100644 samples/modstate.c create mode 100644 samples/mpileup.c create mode 100644 samples/pileup.c create mode 100644 samples/pileup_mod.c create mode 100644 samples/read_aux.c create mode 100644 samples/read_bam.c create mode 100644 samples/read_fast.c create mode 100644 samples/read_header.c create mode 100644 samples/read_refname.c create mode 100644 
samples/rem_header.c create mode 100644 samples/sample.ref.fa create mode 100644 samples/sample.sam create mode 100644 samples/split.c create mode 100644 samples/split2.c create mode 100644 samples/split_thread1.c create mode 100644 samples/split_thread2.c create mode 100644 samples/update_header.c create mode 100644 samples/write_fast.c diff --git a/samples/DEMO.md b/samples/DEMO.md new file mode 100644 index 000000000..1f14b7485 --- /dev/null +++ b/samples/DEMO.md @@ -0,0 +1,1432 @@ +# HTS API + +## HTSLib APIs and samtools + +HTSLib is a C library implementation used to access and process the genome +sequence data. HTSLib implements multiple API interfaces, HTS API, VCF API and +SAM API. HTS API provides a framework for use by other APIs and applications, +implements bgzf compression, htscodecs and provides CRAM format support. VCF +APIs work with variant data in VCF and BCF format. + +SAM API works with sequence data of different formats, SAM / BAM / CRAM / +FASTA / FASTQ, and provides methods to do operations on the data. It uses +methods from HTS API. + +'samtools' is the utility used to read and modify sequence data. It uses SAM +APIs from HTSLib to work on the sequence data. + + +## About this document + +There are a number of demonstration utilities and their source code in +'samples' directory of HTSLib and this document gives the description of them +and the usage of API of HTSLib. The samples are for demonstration +purposes only and proper error handling is required for actual usage. This +document is based on HTSLib version 1.17. + +Updates to this document may be made along with later releases when required. + + +## The sample apps + +Flags - This application showcases the basic read of alignment files and flag +access. It reads and shows the count of read1 and read2 alignments. + +Split - This application showcases the basic read and write of alignment data. 
+It saves the read1 and read2 as separate files in given directory, one as sam +and other as bam. + +Split2 - This application showcases the output file format selection. It saves +the read1 and read2 as separate files in given directory, both as compressed +sam though the extensions are different. + +Cram - This application showcases the different ways in which cram reference +data is used for cram output creation. + +Read_fast - This application showcases the fasta/fastq data read. + +Read_header - This application showcases the read and access of header data. +It can show all header lines of a given type, data of a given tag on a specific +header line or for all lines of given type. + +Read_ref - This application showcases the read and access of header data. +It shows all reference names which have a length greater than or equal to the given input. + +Read_bam - This application showcases read of different alignment data fields. +It shows contents of each alignment. + +Read_aux - This application showcases read of specific auxiliary tag data in +alignment. It shows the data retrieved using 2 APIs, one as a string with tag +data and other as raw data alternatively. + +Dump_aux - This application showcases read of all auxiliary tag data one by one +in an alignment. It shows the data retrieved. + +Add_header - This application showcases the write of header lines to a file. +It adds header line of types, SQ, RG, PG and CO and writes to standard output. + +Remove_header - This application showcases removal of header line from a file. +It removes either all header lines of given type or one specific line of given +type with given unique identifier. Modified header is written on standard +output. + +Update_header - This application shows the update of header line fields, where +update is allowed. It takes the header line type, unique identifier for the +line, tag to be modified and the new value. Updated data is written on standard +output.
+ +Mod_bam - This application showcases the update of alignment data. It takes +alignment name, position of field to be modified and new value of it. +Modified data is written on standard output. + +Mod_aux - This application showcases the update of auxiliary data in alignment. +It takes alignment name, tag to be modified, its type and new value. Modified +data is written on standard output. + +Mod_aux_ba - This application showcases the update of auxiliary array data in +alignment. It adds count of ATCGN base as an array in auxiliary data, BA:I. +Modified data is written on standard output. + +Write_fast - This application showcases the fasta/fastq data write. It appends +a dummy data to given file. + +Index_write - This application showcases the creation of index along with +output creation. Based on file type and shift, it creates bai, csi or crai +files. + +Read_reg - This application showcases the usage of region specification in +alignment read. + +Read_multireg - This application showcases the usage of multiple region +specification in alignment read. + +Pileup - This application showcases the pileup api, where all alignments +covering a reference position are accessed together. It displays the bases +covering each position on standard output. + +Mpileup - This application showcases the mpileup api, which supports multiple +input files for pileup and gives a side by side view of them in pileup format. +It displays the bases covering each position on standard output. + +Modstate - This application showcases the access of base modifications in +alignment. It shows the modifications present in an alignment and accesses them +using available APIs. There are 2 APIs and which one to be used can be selected +through input. + +Pileup_mod - This application showcases the base modification access in pileup +mode. It shows the pileup display with base modifications.
+ +Flags_field - This application showcases the read of selected fields alone, +reducing the overhead / increasing the performance. It reads the flag field +alone and shows the count of read1 and read2. This has impact only on CRAM +files. + +Split_thread1 - This application showcases the use of threads in file handling. +It saves the read1 and read2 as separate files in given directory, one as sam +and other as bam. 2 threads are used for read and 1 each dedicated for each +output file. + +Split_thread2 - This application showcases the use of thread pool in file +handling. It saves the read1 and read2 as separate files in given directory, +one as sam and other as bam. A pool of 4 threads is created and shared for both +read and write. + + +## Building the sample apps + +The samples expect the HTSLib is installed, libraries and header file path are +part of the PATH environment variable. If not, these paths need to be explicitly +passed during the build time. + +Gcc and compatible compilers can be used to build the samples. + +These applications can be linked statically or dynamically to HTSLib. +For static linking, along with htslib other libraries and/or headers required +to build are, math, pthread, curl, lzma, z and bz2 libraries. + +A makefile is available along with source files which links statically to +htslib. To use dynamic linking, update the makefile's 'LDFLAGS' and 'rpath' +path. The 'rpath' path to be set as the path to lib directory of htslib +installation. + + +## Usage of HTS APIs +### Sequence data file access for read + +The sequence data file for read may be opened using the sam_open method. It +opens the file and returns samFile (htsFile) pointer on success or NULL on +failure. The input can be path to a file in disk, network, cloud or '-' +designating the standard input. + +SAM, BAM and CRAM file formats are supported and the input file format is +detected from the file content. + +Once done with the file, it needs to be closed with sam_close. 
+ +Many times, header details would be required and can be read using +sam_hdr_read api. It returns sam_hdr_t pointer or NULL. The returned header +needs to be destroyed using sam_hdr_destroy when no longer required. + +The sequence data may be compressed or uncompressed on disk and on memory it +is read and kept as uncompressed BAM format. It can be read from a file using +sam_read1 api. samFile pointer, header and bam storage are to be passed as +argument and it returns 0 on success, -1 on end of file and < -1 in case of +errors. + +The bam storage has to be initialised using bam_init1 api before the call and +can be reused for successive reads. Once done, it needs to be destroyed using +bam_destroy1. The member field named core - bam1_core_t - in bam storage, +bam1_t, has the sequence data in an easily accessible way. Using the fields +and macros, data can easily be read from it. + + #include + + int main(int argc, char *argv[]) + { + ... + //initialize + if (!(bamdata = bam_init1())) { + ... + //open input files - r reading + if (!(infile = sam_open(inname, "r"))) { + ... + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + ... + //read data, check flags and update count + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + cntread1++; + } + ... + //clean up + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; + } +Refer: flags_demo.c + +This shows the count of read1 and read2 alignments. + + ./flags /tmp/sample.sam.gz + +To read CRAM files, reference data is required and if it is not available, based +on configuration, library may try to download it from external repositories. + + +### Sequence data file access for write + +File access for write is similar to read with a few additional optional steps. + +The output file can be opened using sam_open api as in read, with "w" instead +of "r" as mode. 
This opens the file for writing and uses mode to select the +output file type. "w" alone denotes SAM, "wb" denotes BAM and "wc" denotes CRAM. + +Another way is to use sam_open_mode method, which sets the output file type and +compression based on the file name and explicit textual format specification. +This method expects a buffer to append type and compression flags. Usually a +buffer with standard file open flag is used, the buffer past the flag is passed +to the method to ensure existing flags and updates from this method are present +in the same buffer without being overwritten. This method will add more flags +indicating file type and compression based on name. If explicit format detail +given, then extension is ignored and the explicit specification is used. This +updated buffer can be used with sam_open to select the file format. + +sam_open_format method may also be used to open the file for output as more +information on the output file can be specified using this. Can use +mode buffer from sam_open_mode api or explicit format structure for this. + +The header data can be written using the sam_hdr_write api. When the header +data is copied to another variable and has different lifetime, it is good to +increase the reference count of the header using sam_hdr_incr_ref and +sam_hdr_destroy called as many times as required. + +The alignment data can be written using the sam_write1 api. It takes a samFile +pointer, header pointer and the alignment data. The header data is required to +set the reference name in the alignment. It returns -ve value on error. + + int main(int argc, char *argv[]) + { + ... + if (!(infile = sam_open(inname, "r"))) { + ... + outfile1 = sam_open(file1, "w"); //as SAM + outfile2 = sam_open(file2, "wb"); //as BAM + ... + if (!(in_samhdr = sam_hdr_read(infile))) { + ... + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || + (sam_hdr_write(outfile2, in_samhdr) == -1)) { + ... 
+ while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0) { + ... + } +Refer: split.c + +This creates 1.sam and 2.bam in /tmp/ containing read1 and read2 respectively. + + ./split /tmp/sample.sam.gz /tmp/ + +Below code excerpt shows sam_open_mode api usage. + + int main(int argc, char *argv[]) + { + ... + //set file open mode based on file name for 1st and as explicit for 2nd + if ((sam_open_mode(mode1+1, file1, NULL) == -1) || + (sam_open_mode(mode2+1, file2, "sam.gz") == -1)) { + ... + if (!(infile = sam_open(inname, "r"))) { + ... + //open output files + outfile1 = sam_open(file1, mode1); //as compressed SAM through sam_open + outfile2 = sam_open_format(file2, mode2, NULL); //as compressed SAM through sam_open_format + ... + } +Refer: split2.c + +This creates 1.sam.gz and 2.sam in /tmp/ both having compressed data. + + ./split2 /tmp/sample.sam.gz /tmp/ + +An htsFormat structure filled appropriately can also be used to specify output +file format while using sam_open_format api. + + +### CRAM writing + +CRAM files uses reference data and compresses alignment data. A CRAM file may +be created with external reference data file - most appropriate, with embedded +reference in it or with no reference data at all. It can also be created using +an autogenerated reference, based on consensus with-in the alignment data. +The reference detail can be set to an htsFormat structure using hts_parse_format +api and used with sam_open_format api to create appropriate CRAM file. + ... + snprintf(reffmt1, size1, "cram,reference=%s", reffile); + snprintf(reffmt2, size2, "cram,embed_ref=1,reference=%s", reffile); + ... 
+ if (hts_parse_format(&fmt1, reffmt1) == -1 || //using external reference - uses the M5/UR tags to get + reference data during read + hts_parse_format(&fmt2, reffmt2) == -1 || //embed the reference internally + hts_parse_format(&fmt3, "cram,embed_ref=2") == -1 || //embed autogenerated reference + hts_parse_format(&fmt4, "cram,no_ref=1") == -1) { //no reference data encoding at all + ... + outfile1 = sam_open_format(file1, "wc", &fmt1); outfile2 = sam_open_format(file2, "wc", &fmt2); + ... +Refer: cram.c + + +### FASTA/FASTQ data access + +FASTA/FASTQ files have the raw sequence data and the data can be read one by +one using sam_read1 or a selected range using a region. The data can be written +similar to alignment data using sam_write1 api. To write the file, format +can be set by updating mode buffer using sam_open_mode with file name +or explicit format text. This mode buffer can be used with sam_open or can be +used with sam_open_format with explicit format information in htsFormat +structure. + + ... + if (!(bamdata = bam_init1())) { + ... + if (!(infile = sam_open(inname, "r"))) { + ... + if (infile->format.format != fasta_format && infile->format.format != fastq_format) { + ... + if (!(in_samhdr = sam_hdr_read(infile))) { + ... + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + printf("\nsequence: "); + for (c = 0; c < bamdata->core.l_qseq; ++c) { + printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bamdata), c)]); + } + if (infile->format.format == fastq_format) { + printf("\nquality: "); + for (c = 0; c < bamdata->core.l_qseq; ++c) { + printf("%c", bam_get_qual(bamdata)[c]); + ... +Refer: read_fast.c + + ... + char mode[4] = "a"; + ... + if (sam_open_mode(mode + 1, outname, NULL) < 0) { + ... + if (!(outfile = sam_open(outname, mode))) { + ... + if (bam_set1(bamdata, sizeof("test"), "test", BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, 10, "AACTGACTGA", "1234567890", 0) + < 0) { + ... 
+ if (sam_write1(outfile, out_samhdr, bamdata) < 0) { + printf("Failed to write data\n"); + ... +Refer: write_fast.c + + +### Header data read + +The header gives the version, reference details, read group, change history +and comments. These data are stored inside the sam_hdr_t. Each of these +entries, except comments, have their unique identifier and it is required to +access different fields of them. The api sam_hdr_count_lines gives the count +of the specified type of header line. The value of a unique identifier to a +specific type of header line can be retrieved with sam_hdr_line_name api. The +api sam_hdr_find_tag_id and sam_hdr_find_tag_pos can get the field data from a +header line using unique identifier values or using position. The full header +line can be retrieved using sam_hdr_find_line_pos or sam_hdr_find_line_id with +position and unique identifier values respectively. + + ... + if (!(in_samhdr = sam_hdr_read(infile))) { + ... + ret = sam_hdr_find_tag_id(in_samhdr, header, id, idval, tag, &data); + ... + ret = sam_hdr_find_line_id(in_samhdr, header, id, idval, &data); + ... + linecnt = sam_hdr_count_lines(in_samhdr, header); + ... + ret = sam_hdr_find_tag_pos(in_samhdr, header, c, tag, &data); + ... + ret = sam_hdr_find_line_pos(in_samhdr, header, c, &data); + ... +Refer: read_header.c + +This will show the VN tag's value from HD header. + + ./read_header /tmp/sample.sam.gz HD VN + +Shows the 2nd SQ line's LN field value. + + ./read_header /tmp/sample.sam.gz SQ SN T2 LN + +Below code excerpt shows the reference names which have a length above the given value. + ... + linecnt = sam_hdr_count_lines(in_samhdr, "SQ"); //get reference count + ... + //iterate and check each reference's length + for (pos = 1, c = 0; c < linecnt; ++c) { + if ((ret = sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "LN", &data) == -2)) { + ...
+ size = atoll(data.s); + if (size < minsize) { + //not required + continue; + } + if (!(id = sam_hdr_line_name(in_samhdr, "SQ", c))) { + //sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "SN", &data) can also do the same! + ... + printf("%d,%s,%s\n", pos, id, data.s); + ... +Refer: read_refname.c + + +### Alignment data read + +The alignment / sequence data contains many fields. Mainly the read/query +name, flags indicating the properties of the read, reference sequence name, +position in reference to which it matches, quality of the read, CIGAR string +indicating the match status, position of mate / reverse strand, name of +reference sequence to which mate matches, the insert length, base sequence, +quality value of each base and auxiliary fields. + +Header data would be required to retrieve the reference names as alignment +contains the position of the reference in the header. + +A few of the data are directly visible in bam1_t and the rest are hidden +inside data member of bam1_t and can easily be retrieved using macros. +bam_get_qname gives the name of the read, sam_hdr_tid2name gives the reference +name. bam_get_cigar retrieves the cigar operation array, which can be decoded +using bam_cigar_oplen to get count of bases to which that operation applicable +and bam_cigar_opchr to get the cigar operation. bam_seqi retrieves the base +data at a given position in alignment and it can be converted to character by +indexing the seq_nt16_str array. + + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) + { + //QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL [TAG:TYPE:VALUE] + printf("NAME: %s\n", bam_get_qname(bamdata)); //get the query name using the macro + flags = bam_flag2str(bamdata->core.flag); //flags as string + ... + tidname = sam_hdr_tid2name(in_samhdr, bamdata->core.tid); + ... 
+ printf("MQUAL: %d\n", bamdata->core.qual); //map quality value + cigar = bam_get_cigar(bamdata); //retrieves the cigar data + for (i = 0; i < bamdata->core.n_cigar; ++i) { //no. of cigar data entries + printf("%d%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i])); //the macros gives the count of operation + and the symbol of operation for given cigar entry + } + printf("\nTLEN/ISIZE: %"PRIhts_pos"\n", bamdata->core.isize); + data = bam_get_seq(bamdata); + //get the sequence data + if (bamdata->core.l_qseq != bam_cigar2qlen(bamdata->core.n_cigar, cigar)) { //checks the length with CIGAR and query + ... + for (i = 0; i < bamdata->core.l_qseq ; ++i) { //sequence length + printf("%c", seq_nt16_str[bam_seqi(data, i)]); //retrieves the base from (internal compressed) sequence data + ... + printf("%c", bam_get_qual(bamdata)[i]+33); //retrieves the quality value + ... +Refer: read_bam.c + +Shows the data from alignments. + + ./read_bam /tmp/sample.sam.gz + + +### Aux data read + +Auxiliary data gives extra information about the alignment. There can be a +number of such data and can be accessed by specifying required tag or by +iterating one by one through them once the alignment is read as bam1_t. The +auxiliary data are stored along with the variable length data in the data +field of bam1_t. There are macros defined to retrieve information about +auxiliary data from the data field of bam1_t. + +Data for a specific tag can be retrieved as a string or can be retrieved as raw +data. bam_aux_get_str retrieves as a string, with tag name, tag type and data. +bam_aux_get can get raw data and with bam_aux_type and bam_aux2A, bam_aux2f etc. +the raw data can be extracted. + +To iterate through all data, the start of aux data is retrieved using macro +bam_aux_first and successive ones using bam_aux_next. Macro bam_aux_tag gives +the tag of the aux field and bam_aux_type gives the information about type of +the aux field.
+ +bam_aux2i, bam_aux2f and bam_aux2Z retrieve the aux data's value as +integer, float and string respectively. The integer value may be of different +precision / size and the bam_aux_type character indicates how to use the +value. The string/hex data are NUL terminated. + +For array data, bam_aux_type will return 'B' and bam_auxB_len gives the length +of the array. bam_aux_type with the next byte will give the type of data in +the array. bam_auxB2i, bam_auxB2f will give integer and float data from a +given position of the array. + + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (i % 2) { //use options alternatively to demonstrate both + //option 1 - get data as string with tag and type + if ((c = bam_aux_get_str(bamdata, tag, &sdata)) == 1) { + printf("%s\n",sdata.s); + ... + //option 2 - get raw data + if (!(data = bam_aux_get(bamdata, tag))) { + ... + if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) { + ... +Refer: read_aux.c + +Shows the MD aux tag from alignments. + + ./read_aux ../../samtools/test/mpileup/mpileup.1.bam MD + + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + data = bam_aux_first(bamdata); //get the first aux data + while (data) { + printf("%.2s:%c:", bam_aux_tag(data), NULL != strchr("cCsSiI", bam_aux_type(data)) ? 'i' : bam_aux_type(data)); + //macros gets the tag and type of aux data + //dump the data + if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) { + ... + data = bam_aux_next(bamdata, data); //get the next aux data + ... +Refer: dump_aux.c + +Shows all the tags from all alignments. + + ./dump_aux ../../samtools/test/mpileup/mpileup.1.bam + + +### Add/Remove/Update header + +There are specific types of data that can be part of header data. They have +a tag from HD, SQ, RG, PG and CO. Fully formatted header lines, separated by new +line, can be added with sam_hdr_add_lines api.
A single header line can be added +using sam_hdr_add_line api where the header type, tag and value pair are passed +as arguments, terminated with a NULL argument. The PG header lines are special in +that they have a kind of linkage to previous PG lines. This linkage can be auto +generated by using sam_hdr_add_pg api which sets the 'PP' field used in linkage. +sam_hdr_write api does the write of the header data to file. + + ... + //add SQ line with SN as TR1 and TR2 + if (sam_hdr_add_lines(in_samhdr, &sq[0], 0)) { //length as 0 for NULL terminated data + ... + //add RG line with ID as RG1 + if (sam_hdr_add_line(in_samhdr, "RG", "ID", "RG1", "LB", "Test", "SM", "S1", NULL)) { + ... + //add pg line + if (sam_hdr_add_pg(in_samhdr, "add_header", "VN", "Test", "CL", data.s, NULL)) { //NULL is to indicate end of args + ... + if (sam_hdr_add_line(in_samhdr, "CO", "Test data", NULL)) { //NULL is to indicate end of args + ... + //write output + if (sam_hdr_write(outfile, in_samhdr) < 0) { + ... +Refer: add_header.c + +Not all types of header data can be removed but where it is possible, either a +specific header line can be removed or all of a header type can be removed. To +remove a specific line, header type, unique identifier field tag and its value +are to be used. To remove all lines of a type, header type and unique identifier +field tag are to be used. + + ... + //remove specific line + if (sam_hdr_remove_line_id(in_samhdr, header, id, idval)) { + ... + //remove multiple lines of a header type + if (sam_hdr_remove_lines(in_samhdr, header, id, NULL)) { + ... + if (sam_hdr_write(outfile, in_samhdr) < 0) { + ... +Refer: rem_header.c + +Shows the file content after removing SQ line with SN 2. + ./rem_header ../../samtools/test/mpileup/mpileup.1.bam SQ 2 + +The unique identifier for the line needs to be found to update a field, though +not all types in the header may be modifiable.
The api sam_hdr_update_line +takes the unique identifier for the header line type, its value, the field +which needs to be modified and the new value with which to modify it, followed +by a NULL. +e.g. To change LN field from 2000 to 2250 in SQ line with unique identifier SN +as 'chr1', sam_hdr_update_line( header, "SQ", "SN", "chr1", "LN", "2250", +NULL). To change PP field from ABC to DEF in PG line with ID APP.10, +sam_hdr_update_line( header, "PG", "ID", "APP.10", "PP", "DEF", NULL). + + ... + //update with new data + if (sam_hdr_update_line(in_samhdr, header, id, idval, tag, val, NULL) < 0) { + printf("Failed to update data\n"); + goto end; + } + ... +Refer: update_header.c + +Shows new sam file with 2nd SQ line having length as 38. + + ./update_header /tmp/sample.sam.gz SQ T1 LN 38 + + +### Update alignment data + +Many of the bam data fields may be updated by setting new value to appropriate +field in bam1_core_t structure and for a few, creating a new bam1_t record would +be easier than update of existing record. + + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) + { + ... + case 1:// QNAME + ret = bam_set_qname(bamdata, val); + break; + case 2:// FLAG + bamdata->core.flag = atol(val) & 0xFFFF; + break; + case 3:// RNAME + case 7:// RNEXT + if ((ret = sam_hdr_name2tid(in_samhdr, val)) < 0) { + ... + if (field == 3) { + //reference + bamdata->core.tid = ret; + } + else { + //mate reference + bamdata->core.mtid = ret; + } + break; + case 4:// POS + bamdata->core.pos = atoll(val); + break; + case 5:// MAPQ + bamdata->core.qual = atoi(val) & 0x0FF; + break; + case 6:// CIGAR + { + ... + //get cigar array and set all data in new bam record + if ((ncigar = sam_parse_cigar(val, NULL, &cigar, &size)) < 0) { + ... 
+ if (bam_set1(newbam, bamdata->core.l_qname, bam_get_qname(bamdata), bamdata->core.flag, bamdata->core.tid, + bamdata->core.pos, bamdata->core.qual, ncigar, cigar, bamdata->core.mtid, bamdata->core.mpos, + bamdata->core.isize, bamdata->core.l_qseq, (const char*)bam_get_seq(bamdata), + (const char*)bam_get_qual(bamdata), bam_get_l_aux(bamdata)) < 0) { + ... + //correct sequence data as input is expected in ascii format and not as compressed inside bam! + memcpy(bam_get_seq(newbam), bam_get_seq(bamdata), (bamdata->core.l_qseq + 1) / 2); + //copy the aux data + memcpy(bam_get_aux(newbam), bam_get_aux(bamdata), bam_get_l_aux(bamdata)); + ... + break; + case 8:// PNEXT + bamdata->core.mpos = atoll(val); + break; + case 9:// TLEN + bamdata->core.isize = atoll(val); + break; + case 10:// SEQ + ... + for( c = 0; c < i; ++c) { + bam_set_seqi(bam_get_seq(bamdata), c, seq_nt16_table[(unsigned char)val[c]]); + } + break; + case 11:// QUAL + ... + for (c = 0; c < i; ++c) { + val[c] -= 33; //phred score from ascii value + } + memcpy(bam_get_qual(bamdata), val, i); + ... +Refer: mod_bam.c + +Shows data with RNAME modified to T2. + + ./mod_bam /tmp/sample.sam ITR1 3 T2 + +The auxiliary data in bam1_t structure can be modified using +bam_aux_update_float, bam_aux_update_int etc. apis. If the aux field is not +present at all, it can be appended using bam_aux_append. + + ... + //matched to qname, update aux + if (!(data = bam_aux_get(bamdata, tag))) { + //tag not present append + ... + if (bam_aux_append(bamdata, tag, type, length, (const uint8_t*)val)) { + ... + else { + char auxtype = bam_aux_type(data); + //update the tag with newer value + switch (type) { + case 'f': + case 'd': + ... + if (bam_aux_update_float(bamdata, tag, atof(val))) { + ... + case 'C': + case 'S': + case 'I': + ... + if (bam_aux_update_int(bamdata, tag, atoll(val))) { + ... + case 'Z': + ... + if (bam_aux_update_str(bamdata, tag, length, val)) { + ... + case 'A': + ... 
+ //update the char data directly on buffer + *(data+1) = val[0]; + ... +Refer: mod_aux.c + +Shows the given record's MD tag set to Test. + + ./mod_aux samtools/test/mpileup/mpileup.1.bam ERR013140.6157908 MD Z Test + +The array aux fields can be updated using bam_aux_update_array api. + + ... + if (bam_aux_update_array(bamdata, "BA", 'I', sizeof(cnt)/sizeof(cnt[0]), cnt)) { + ... +Refer: mod_aux_ba.c + +Shows the records updated with an array of integers, containing count of ACGT +and N in that order. + + ./mod_aux_ba samtools/test/mpileup/mpileup.1.bam + + +### Create an index + +Indexes help to read data faster without iterating sequentially through the +file. Indexes contain the position information about alignments and that they +can be read easily. There are different type of indices, BAI, CSI, CRAI, TBI, +FAI etc. and are usually used with iterators. + +Indexing of plain/textual files are not supported, compressed SAM&FASTA/Q, BAM, +and CRAM files can be indexed. CRAM files are indexed as .crai and the other two +can be indexed as .bai or .csi files. Each of these types have different +internal representations of the index information. Bai uses a fixed +configuration values where as csi has them dynamically updated based on the +alignment data. + +Indexes can be created either with save of alignment data or explicitly by +read of existing alignment file. + +To create index along with alignment write, the sam_idx_init api need to be +invoked before the start of alignment data write. This api takes the output +samFile pointer, header pointer, minimum shift and index file path. For BAI +index, the min shift has to be 0. + +At the end of write, sam_idx_save api need to be invoked to save the index. + + //write header + if (sam_hdr_write(outfile, in_samhdr)) { + ... + // initialize indexing, before start of write + if (sam_idx_init(outfile, in_samhdr, size, fileidx)) { + ... + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + ... 
+ if (sam_idx_save(outfile)) { +Refer:index_write.c + +Creates mpileup.1.bam and mpileup.1.bam.bai in /tmp/. + + ./idx_on_write ../../samtools/test/mpileup/mpileup.1.bam 0 /tmp/ + +To create index explicitly on an existing alignment data file, the +sam_index_build api or the like can be used. sam_index_build takes the +alignment file path, min shift for the index and creates the index file in +same path. The output name will be based on the alignment file format and min +shift passed. + +The sam_index_build2 api takes the index file path as well and gives more +control than the previous one. The sam_index_build3 api provides an option to +configure the number of threads in index creation. + + +### Read with iterators + +Index file helps to read required data without sequentially accessing the file +and are required to use iterators. The interested reference, start and end +position etc. are required to read data with iterators. With index and these +information, an iterator is created and relevant alignments can be accessed by +iterating it. + +The api sam_index_load and the like does the index loading. It takes input +samFile pointer and file path. It loads the index file based on the input file +name, from the same path and with implicit index file extension - cram file +with .crai and others with .bai. The sam_index_load2 api accepts explicit path +to index file, which allows loading it from a different location and explicit +extensions. The sam_index_load3 api supports download/save of the index +locally from a remote location. These apis return NULL on failure and index +pointer on success. + +The index file path can be appended to alignment file path and used as well. +In this case the paths are expected to be separated by '##idx##'. + +The sam_itr_queryi or sam_itr_querys apis may be used to create an iterator +and sam_itr_next api does the alignment data retrieval. Along with retrieval +of current data, it advances the iterator to next relevant data.
The +sam_itr_queryi takes the interested positions as numeric values and +sam_itr_querys takes the interested position as a string. + +With sam_itr_queryi, the reference id can be the 0 based index of reference +data, -2 for unmapped alignments, -3 to start read from beginning of file, -4 +to continue from current position, -5 to return nothing. Based on the +reference id given, alignment covering the given start and end positions will +be read with sam_itr_next api. + +With sam_itr_querys, the reference sequence is identified with the name and +interested positions can be described with start and end separated by '-' as +string. When sequence is identified as '.', it begins from the start of file +and when it is '*', unmapped alignments are read. Reference with ID[:], +ID:S, ID:S-E, ID:-E retrieves all data, all data covering position +S onwards, all data covering position S to E, all data covering up to position +E of reference with ID respectively on read using sam_itr_next. + +The index and iterator created are to be destroyed once the need is over. +sam_itr_destroy and hts_idx_destroy apis do this. + + ... + //load index file + if (!(idx = sam_index_load2(infile, inname, idxfile))) { + ... + //create iterator + if (!(iter = sam_itr_querys(idx, in_samhdr, region))) { + ... + //read using iterator + while ((c = sam_itr_next(infile, iter, bamdata)) >= 0) { + ... + if (iter) { + sam_itr_destroy(iter); + } + if (idx) { + hts_idx_destroy(idx); + ... +Refer:index_reg_read.c + +With sample.sam, region as \* will show alignments with name UNMAP2 and UNMAP3 + + ./read_reg /tmp/sample.sam.gz \* + +With region as \., it shows all alignments + + ./read_reg /tmp/sample.sam.gz \. + +With region as T1:1-4, start 1 and end 4 it shows nothing and with T1:1-5 it +shows alignment with name ITR1. + + ./read_reg /tmp/sample.sam.gz T1:1-5 + +With region as T2:30-100, it shows alignment with name ITR2M which refers the +reference data T2.
+ + ./read_reg /tmp/sample.sam.gz T2:30-100 + + +Multiple interested regions can be specified for read using sam_itr_regarray. +It takes index pointer, header, count of regions and region descriptions as array +of char array / string. The array passed needs to be released by the user +itself. + + ... + //load index file, assume it to be present in same location + if (!(idx = sam_index_load(infile, inname))) { + ... + //create iterator + if (!(iter = sam_itr_regarray(idx, in_samhdr, regions, regcnt))) { + ... + if (regions) { + //can be freed as it is no longer required + free(regions); + regions = NULL; + } + //get required area + while ((c = sam_itr_multi_next(infile, iter, bamdata)) >= 0) { + ... +Refer:index_multireg_read.c + +With compressed sample.sam and 2 regions from reference T1 (30 to 32) and 1 +region from T2 (34 onwards), alignments with name A1, B1, A2 and ITR2M would +be shown. + + ./read_multireg /tmp/sample.sam.gz 2 T1:30-32,T2:34 + +To use numeric indices instead of textual regions, sam_itr_regions can be used. +It takes index pointer, header, count of regions and an array of region +description (hts_reglist_t*), which has the start end positions as numerals. + +The index and iterators are to be destroyed using the sam_itr_destroy and +hts_idx_destroy. The hts_reglist_t* array passed is destroyed by the library +on iterator destroy. The regions array (array of char array/string) needs to be +destroyed by the user itself. + + +### Pileup and MPileup + +Pileup shows the transposed view of the SAM alignment data, i.e. it shows the +reference positions and bases which cover that position through different +reads side by side. MPileup facilitates the piling up of multiple sam files +against each other and same reference at the same time. + +Mpileup has replaced the pileup. The input expects the data to be sorted by +position.
+ +Pileup needs to be initialized with bam_plp_init method which takes pointer +to a method, which will be called by pileup to read data from required files, +and pointer to data which might be required for this read method to do the +read operation. It returns a pointer to the pileup iterator. + +User can specify methods which need to be invoked during the load and unload +of an alignment, like constructor and destructor of objects. +bam_plp_constructor and bam_plp_destructor methods do the setup of +these methods in the pileup iterator. During invocation of these methods, the +pointer to data passed in the initialization is passed as well. If the user wants +to do any custom status handling or actions during load or unload, it can be +done in these methods. Alignment specific data can be created and stored in +an argument passed to the constructor and the same will be accessible during +pileup status return. The same will be accessible during destructor as well +where any deallocation can be made. + +User is expected to invoke bam_plp_auto api to get the pileup status. It +returns the pileup status or NULL on end. During this all alignments are read +one by one, using the method given in initialization for data read, until one +for a new reference is found or all alignment covering a position is read. On +such condition, the pileup status is returned and the same continues on next +bam_plp_auto call. The pileup status returned is an array for all positions +for which the processing is completed. Along with the result, the reference +index, position in reference data and number of alignments which covers this +position are passed. User can iterate the result array and get bases from each +alignment which covers the given reference position. The alignment specific +custom data which were created in constructor function will also be available +in the result.
+ +The bam_plp_auto api invokes the data read method to load an alignment and the +constructor method is invoked during the load. Once the end of alignment is +passed, it is removed from the processing and destructor method is invoked, +that user could do deallocations and custom actions as in load during this +time. The custom data passed during the initialization is passed to the +constructor and destructor methods during invocation. + +Once the forward and reverse strands are identified, the better of the quality +is identified and used. Both reads are required for this and hence reads are +cached until its mate is read. The maximum number of reads that can be cached +is controlled by bam_plp_set_maxcnt. Reads covering a position are cached and +as soon as mate is found, quality is adjusted and is removed from cache. Reads +above the cache limit are discarded. + +Once done, the pileup iterator is to be discarded using the bam_plp_destroy api. + + ... + if (!(plpiter = bam_plp_init(readdata, &conf))) { + ... + //set constructor destructor callbacks + bam_plp_constructor(plpiter, plpconstructor); + bam_plp_destructor(plpiter, plpdestructor); + + while ((plp = bam_plp_auto(plpiter, &tid, &refpos, &n))) { + printf("%d\t%d\t", tid+1, refpos+1); + for (j = 0; j < n; ++j) { + //doesnt detect succeeding insertion and deletion together here, only insertion is identified + //deletion is detected in plp->is_del as and when pos reaches the position + //if detection ahead is required, use bam_plp_insertion here which gives deletion length along with insertion + if (plp[j].is_del || plp[j].is_refskip) { + printf("*"); + continue; + } + //start and end are displayed in UPPER and rest on LOWER + printf("%c", plp[j].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + (plp[j].is_tail ?
toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]))); + if (plp[j].indel > 0) { + //insertions, anyway not start or end + printf("+%d", plp[j].indel); + for (k = 0; k < plp[j].indel; ++k) { + printf("%c", tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos + k + 1)])); + } + } + else if (plp[j].indel < 0) { + printf("%d", plp[j].indel); + for (k = 0; k < -plp[j].indel; ++k) { + printf("?"); + } + ... + if (plpiter) { + bam_plp_destroy(plpiter); + ... +Refer:pileup.c + +The read method may use a simple read or it could be an advanced read using +indices, iterators and region specifications based on the need. The constructor +method may create any custom data and store it in the pointer passed to it. The +same need to be released by use on destructor method. + +MPileup works same as the pileup and supports multiple inputs against the same +reference, giving side by side view of reference and alignments from different +inputs. + +MPileup needs to be initialized with bam_mpileup_init method which takes +pointer to a method, which will be called by pileup to read data from required +files, and an array of pointer to data which might be required for this read +method to do the read operation. It returns a pointer to the mpileup iterator. + +User can specify methods which need to be invoked during the load and unload +of an alignment, like constructor and destructor of objects. +bam_mplp_constructor and bam_mplp_destructor methods does the setup +of these methods in the pileup iterator. During invocation of these methods, +the pointer to data passed in the initialization is passed as well. If user +want to do any custom status handling or actions during load or unload, it can +be done on these methods. Alignment specific data can be created and +stored in the custom data pointer and the same will be accessible during +return of pileup status. 
The same will be accessible during destructor as well +where any deallocation can be made. + +User is expected to invoke bam_mplp_auto api to get the pileup status. It +returns the pileup status. During this all alignments are read one by one, +using the method given in initialization for data read, until one for a new +reference is found or all alignment covering a position is read. On such +condition, the pileup status is returned and the same continues on next +bam_mplp_auto call. + +The pileup status is returned through a parameter in the method itself and is an +array for all inputs, each containing array for positions on which the +processing is completed. Along with the result, the reference index, position +in reference data and number of alignments which covers this position are +passed. User can iterate the result array and get bases from each alignment +which covers the given reference position. The alignment specific custom data +which were created in constructor function will also be available in the +result. + +Once the forward and reverse strands are identified, the better of the quality +is identified and used. Both reads are required for this and hence reads are +cached until its mate is read. The maximum number of reads that can be cached +is controlled by bam_mplp_set_maxcnt. Reads covering a position are cached and +as soon as mate is found, quality is adjusted and is removed from cache. Reads +above the cache limit are discarded. + +Once done, the pileup iterator is to be discarded using the bam_mplp_destroy api. + + ... + if (!(mplpiter = bam_mplp_init(argc - 1, readdata, (void**) conf))) { + ...
+ //set constructor destructor callbacks + bam_mplp_constructor(mplpiter, plpconstructor); + bam_mplp_destructor(mplpiter, plpdestructor); + + while (bam_mplp64_auto(mplpiter, &tid, &refpos, depth, plp) > 0) { + printf("%d\t%"PRIhts_pos"\t", tid+1, refpos+1); + + for (input = 0; input < argc - 1; ++input) { + for (dpt = 0; dpt < depth[input]; ++dpt) { + if (plp[input][dpt].is_del || plp[input][dpt].is_refskip) { + printf("*"); + continue; + } + //start and end are displayed in UPPER and rest on LOWER + printf("%c", plp[input][dpt].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), + plp[input][dpt].qpos)]) : (plp[input][dpt].is_tail ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), + plp[input][dpt].qpos)]) : tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), + plp[input][dpt].qpos)]))); + if (plp[input][dpt].indel > 0) { + //insertions, anyway not start or end + printf("+%d", plp[input][dpt].indel); + for (k = 0; k < plp[input][dpt].indel; ++k) { + printf("%c", tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[input][dpt].b), + plp[input][dpt].qpos + k + 1)])); + } + } + else if (plp[input][dpt].indel < 0) { + printf("%d", plp[input][dpt].indel); + for (k = 0; k < -plp[input][dpt].indel; ++k) { + printf("?"); + ... + if (mplpiter) { + bam_mplp_destroy(mplpiter); + } + ... + if (plp) { + free(plp); + ... +Refer:mpileup.c + +This sample takes multiple sam files and shows the pileup of data side by side. + + ./mpileup /tmp/mp.bam /tmp/mp.sam + + +### Base modifications + +The alignment data may contain base modification information as well. This +gives the base, modifications found, orientation in which it was found and the +quality for the modification. The base modification can be identified using +bam_parse_basemod api. It stores the modification details on hts_base_mod_state +and this has to be initialized using hts_base_mod_state_alloc api.
+ +Once the modifications are identified, they can be accessed through different +ways. bam_mods_recorded api gives the modifications identified for an alignment. +Modifications can be queried for each base position iteratively using +bam_mods_at_next_pos api. Check the returned value with buffer size to see +whether the buffer is big enough to retrieve all modifications. +Instead of querying for each position, the next modified position can be +retrieved directly using bam_next_basemod api. An alignment can be +queried to have a specific modification using bam_mods_query_type api. At the +end of processing, the state needs to be released using hts_base_mod_state_free +api. + + ... + if (!(ms = hts_base_mod_state_alloc())) { + ... + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) + { + ... + if (bam_parse_basemod(bamdata, ms)) { + ... + bm = bam_mods_recorded(ms, &cnt); + for (k = 0; k < cnt; ++k) { + printf("%c", bm[k]); + } + printf("\n"); + hts_base_mod mod[5] = {0}; //for ATCGN + if (opt) { + //option 1 + for (; i < bamdata->core.l_qseq; ++i) { + if ((r = bam_mods_at_next_pos(bamdata, ms, mod, sizeof(mod)/sizeof(mod[0]))) <= -1) { + printf("Failed to get modifications\n"); + goto end; + } + else if (r > (sizeof(mod) / sizeof(mod[0]))) { + printf("More modifications than this app can handle, update the app\n"); + goto end; + } + else if (!r) { + //no modification at this pos + printf("%c", seq_nt16_str[bam_seqi(data, i)]); + } + //modifications + for (j = 0; j < r; ++j) { + printf("%c%c%c", mod[j].canonical_base, mod[j].strand ? '-' : '+', mod[j].modified_base); + ... + else { + //option 2 + while ((r = bam_next_basemod(bamdata, ms, mod, sizeof(mod)/sizeof(mod[0]), &pos)) >= 0) { + for (; i < bamdata->core.l_qseq && i < pos; ++i) { + printf("%c", seq_nt16_str[bam_seqi(data, i)]); + } + //modifications + for (j = 0; j < r; ++j) { + printf("%c%c%c", mod[j].canonical_base, mod[j].strand ? '-' : '+', mod[j].modified_base); + } + ...
+ //check last alignment's base modification + int strand = 0, impl = 0; + char canonical = 0, modification[] = "mhfcgebaon"; //possible modifications + printf("\n\nLast alignment has \n"); + for (k = 0; k < sizeof(modification) - 1; ++k) { //avoiding NUL termination + if (bam_mods_query_type(ms, modification[k], &strand, &impl, &canonical)) { + printf ("No modification of %c type\n", modification[k]); + } + else { + printf("%s strand has %c modified with %c, can %sassume unlisted as unmodified\n", strand ? "-/bottom/reverse" : + "+/top/forward", canonical, modification[k], impl?"" : "not " ); + } + } + ... + if (ms) { + hts_base_mod_state_free(ms); + ... +Refer:modstate.c + +The modification can be accessed in pileup mode as well. bam_mods_at_qpos gives +the modification at given pileup position. Insertion and deletion to the given +position with possible modification can be retrieved using bam_plp_insertion_mod +api. + + ... + int plpconstructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { + //when using cd, initialize and use as it will be reused after destructor + cd->p = hts_base_mod_state_alloc(); + //parse the bam data and gather modification data from MM tags + return (-1 == bam_parse_basemod(b, (hts_base_mod_state*)cd->p)) ? 1 : 0; + } + + int plpdestructor(void *data, const bam1_t *b, bam_pileup_cd *cd) { + if (cd->p) { + hts_base_mod_state_free((hts_base_mod_state *)cd->p); + cd->p = NULL; + } + return 0; + } + + int main(int argc, char *argv[]) + { + ... + if (!(plpiter = bam_plp_init(readdata, &conf))) { + ... 
+ //set constructor destructor callbacks + bam_plp_constructor(plpiter, plpconstructor); + bam_plp_destructor(plpiter, plpdestructor); + + while ((plp = bam_plp_auto(plpiter, &tid, &refpos, &depth))) { + memset(&mods, 0, sizeof(mods)); + printf("%d\t%d\t", tid+1, refpos+1); + + for (j = 0; j < depth; ++j) { + dellen = 0; + if (plp[j].is_del || plp[j].is_refskip) { + printf("*"); + continue; + } + /*invoke bam mods_mods_at_qpos before bam_plp_insertion_mod that the base modification + is retrieved before change in pileup pos thr' plp_insertion_mod call*/ + if ((modlen = bam_mods_at_qpos(plp[j].b, plp[j].qpos, plp[j].cd.p, mods, NMODS)) == -1) { + ... + //use plp_insertion/_mod to get insertion and del at the same position + if ((inslen = bam_plp_insertion_mod(&plp[j], (hts_base_mod_state*)plp[j].cd.p, &insdata, &dellen)) == -1) { + ... + //start and end are displayed in UPPER and rest on LOWER, only 1st modification considered + //base and modification + printf("%c%c%c", plp[j].is_head ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + (plp[j].is_tail ? toupper(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)]) : + tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)])), + modlen > 0 ? mods[0].strand ? '-' : '+' : '\0', modlen > 0 ? mods[0].modified_base : '\0'); + //insertion and deletions + if (plp[j].indel > 0) { + //insertion + /*insertion data from plp_insertion_mod, note this shows the quality value as well + which is different from base and modification above;the lower case display is not attempted either*/ + printf("+%d%s", plp[j].indel, insdata.s); + //handle deletion if any + if (dellen) { + printf("-%d", dellen); + for (k = 0; k < dellen; ++k) { + printf("?"); + ... + else if (plp[j].indel < 0) { + //deletion + printf("%d", plp[j].indel); + for (k = 0; k < -plp[j].indel; ++k) { + printf("?"); + } + } + ... 
+Refer:pileup_mod.c + + +### Read selected fields + +At times the whole alignment data may not be of interest and it would be +better to read required fields alone from the alignment data. CRAM file format +supports such specific data read and HTSLib provides an option to use this. +This can improve the performance on read operation. + +The hts_set_opt method does the selection of specified fields. There are flags +indicating specific fields, like SAM_FLAG, SAM_SEQ, SAM_QNAME, in alignment +data and a combination of flags for the required fields can be passed with +CRAM_OPT_REQUIRED_FIELDS to this api. + + ... + //select required field alone, this is useful for CRAM alone + if (hts_set_opt(infile, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG) < 0) { + ... + //read header + in_samhdr = sam_hdr_read(infile); + ... + //read data, check flags and update count + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + cntread1++; + ... +Refer: flags_htsopt_field.c + + +### Thread-pool to read / write + +The HTSLib api supports thread pooling for better performance. There are a few +ways in which this can be used. The pool can be made specific for a file or a +generic pool can be created and shared across multiple files. Another way to +use thread pool is to schedule tasks explicitly to queues which gets executed +using threads in pool. + +To have a thread pool specific for a file, hts_set_opt api can be used with the +file pointer, HTS_OPT_NTHREADS and the number of threads to use in the pool. +Closure of file releases the thread pool as well. To have a thread pool which +can be shared across different files, it needs to be initialized using +hts_tpool_init api, passing number of threads as argument. This thread pool can +be associated with a file using hts_set_opt api. The file pointer, +HTS_OPT_THREAD_POOL and the thread pool address are to be passed as arguments +to api. The thread pool has to be released with hts_tpool_destroy. 
+ +Below excerpt shows file specific thread pool, + ... + //create file specific threads + if (hts_set_opt(infile, HTS_OPT_NTHREADS, 2) < 0 || //2 thread specific for reading + hts_set_opt(outfile1, HTS_OPT_NTHREADS, 1) < 0 || //1 thread specific for sam write + hts_set_opt(outfile2, HTS_OPT_NTHREADS, 1) < 0) { //1 thread specific for bam write + printf("Failed to set thread options\n"); + goto end; + } +Refer: split_thread1.c + +Below excerpt shows thread pool shared across files, + ... + //create a pool of 4 threads + if (!(tpool.pool = hts_tpool_init(4))) { + ... + //share the pool with all the 3 files + if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 || + hts_set_opt(outfile1, HTS_OPT_THREAD_POOL, &tpool) < 0 || + hts_set_opt(outfile2, HTS_OPT_THREAD_POOL, &tpool) < 0) { + ... + if (tpool.pool) { + hts_tpool_destroy(tpool.pool); + } + ... +Refer: split_thread2.c + + +## More Information + +### CRAM reference files + +The cram reference data is required for the read of sequence data in CRAM +format. The sequence data file may have it as embedded or as a reference to +the actual file. When it is a reference, it is downloaded locally, in the +cache directory for later usage. It will be stored in a directory structure +based on the MD5 checksum in the cache directory. + +Each chromosome in a reference file gets saved as a separate file with md5sum +as its path and name. The initial 4 numerals make the directory name and rest +as the file name (/<1st 2 of md5sum>/<2nd 2 of md5sum>/). + +The download would be attempted from standard location, EBI ENA +(https://www.ebi.ac.uk/ena). + + +### Bam1_t + +This structure holds the sequence data in BAM format. There are fixed and +variable size fields, basic and extended information on sequence +data. Variable size data and extended information are kept together in a +buffer, named data in bam1_t. 
Fields in the member named core, bam1_core_t, +and a few macros together support the storage and handling of the whole +sequence data. + +- core has a link to reference as a 0 based index in field tid. The mate / + reverse strand's link to reference is given by mtid. + +- Fields pos and mpos give the position in reference to which the sequence and + its mate / reverse strand match. + +- Field flag gives the properties of the given alignment. It shows the + alignment's orientation, mate status, read order etc. + +- Field qual gives the quality of the alignment read. + +- l_qname gives the length of the name of the alignment / read, l_extranul gives + the extra space used internally in the data field. + +- l_qseq gives the length of the alignment / read in the data field. + +- n_cigar gives the number of CIGAR operations for the given alignment. + +- isize gives the insert size of the read / alignment. + +The bases in sequence data are stored by compressing 2 bases together in a +byte. When the reverse flag is set, the base data is reversed and +complemented from the actual read (i.e. if the forward read is ACTG, the +reverse read would be CAGT; it will be stored in SAM format with reversed and +complemented format as ACTG with reverse flag set). + +Macros bam_get_qname, bam_get_seq, bam_get_qual, bam_get_aux, bam_get_l_aux, +bam_seqi etc access the data field and retrieve the required data. The aux +macros support the retrieval of auxiliary data from the data field. + + +### Sam_hdr_t + +This structure holds the header information. This holds the number of targets +/ SQ lines in the file, each one's length, name and reference count to this +structure. It also has this information in an internal data structure for +easier access of each field of this data. + +When this data is shared or assigned to another variable of a different scope +or purpose, the reference count needs to be incremented to ensure that it is +valid till the end of the variable's scope.
Use sam_hdr_incr_ref for this, and the header then needs to
+be destroyed as many times with sam_hdr_destroy api.
+
+
+### Index
+
+Indices need the data to be sorted by position.  They can be of different
+types with extension .bai, .csi or .tbi for compressed SAM/BAM files and .crai
+for CRAM files.  The index name can be passed along with the alignment file
+itself by appending a specific character sequence.  The apis can detect this
+sequence and extract the index path.  ##idx## is the sequence which separates
+the file path and index path.
+
+
+### Data files
+
+The data files can be a local file, a network file, a file accessible through
+the web or in cloud storage like google and amazon.  The data files can be
+represented with URIs like file://, file://localhost/.., ftp://..,
+gs+http[s].., s3+http[s]://
+
diff --git a/samples/Makefile b/samples/Makefile
new file mode 100644
index 000000000..40991d78f
--- /dev/null
+++ b/samples/Makefile
@@ -0,0 +1,106 @@
+HTS_DIR = ../
+include $(HTS_DIR)/htslib_static.mk
+
+CC = gcc
+CFLAGS = -Wall -g -O0
+
+#to statically link to libhts (each target lists its source so edits trigger a rebuild)
+LDFLAGS = $(HTS_DIR)/libhts.a -L$(HTS_DIR) $(HTSLIB_static_LDFLAGS) $(HTSLIB_static_LIBS)
+
+#to dynamically link to libhts, use this instead and set the rpath to the htslib lib directory
+#LDFLAGS = -L $(HTS_DIR) -lhts -Wl,-rpath,<path to lib installation>
+
+PRGS = flags split split2 cram read_fast read_header read_ref read_bam \
+        read_aux dump_aux add_header rem_header update_header mod_bam mod_aux \
+        mod_aux_ba write_fast idx_on_write read_reg read_multireg pileup \
+        mpileup modstate pileup_mod flags_field split_t1 split_t2
+
+all: $(PRGS)
+
+flags: flags_demo.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) flags_demo.c -o $@ $(LDFLAGS)
+
+split: split.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) split.c -o $@ $(LDFLAGS)
+
+split2: split2.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) split2.c -o $@ $(LDFLAGS)
+
+cram: cram.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) cram.c -o $@ $(LDFLAGS)
+
+read_fast: read_fast.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) read_fast.c -o $@ $(LDFLAGS)
+
+read_header: read_header.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) read_header.c -o $@ $(LDFLAGS)
+
+read_ref: read_refname.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) read_refname.c -o $@ $(LDFLAGS)
+
+read_bam: read_bam.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) read_bam.c -o $@ $(LDFLAGS)
+
+read_aux: read_aux.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) read_aux.c -o $@ $(LDFLAGS)
+
+dump_aux: dump_aux.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) dump_aux.c -o $@ $(LDFLAGS)
+
+add_header: add_header.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) add_header.c -o $@ $(LDFLAGS)
+
+rem_header: rem_header.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) rem_header.c -o $@ $(LDFLAGS)
+
+update_header: update_header.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) update_header.c -o $@ $(LDFLAGS)
+
+mod_bam: mod_bam.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) mod_bam.c -o $@ $(LDFLAGS)
+
+mod_aux: mod_aux.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) mod_aux.c -o $@ $(LDFLAGS)
+
+mod_aux_ba: mod_aux_ba.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) mod_aux_ba.c -o $@ $(LDFLAGS)
+
+write_fast: write_fast.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) write_fast.c -o $@ $(LDFLAGS)
+
+idx_on_write: index_write.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) index_write.c -o $@ $(LDFLAGS)
+
+read_reg: index_reg_read.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) index_reg_read.c -o $@ $(LDFLAGS)
+
+read_multireg: index_multireg_read.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) index_multireg_read.c -o $@ $(LDFLAGS)
+
+pileup: pileup.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) pileup.c -o $@ $(LDFLAGS)
+
+mpileup: mpileup.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) mpileup.c -o $@ $(LDFLAGS)
+
+modstate: modstate.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) modstate.c -o $@ $(LDFLAGS)
+
+pileup_mod: pileup_mod.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) pileup_mod.c -o $@ $(LDFLAGS)
+
+flags_field: flags_htsopt_field.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) flags_htsopt_field.c -o $@ $(LDFLAGS)
+
+split_t1: split_thread1.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) split_thread1.c -o $@ $(LDFLAGS)
+
+split_t2: split_thread2.c
+	$(CC) $(CFLAGS) -I $(HTS_DIR) split_thread2.c -o $@ $(LDFLAGS)
+
+clean:
+	find . -name "*.o" | xargs rm -rf
+	find . -name "*.dSYM" | xargs rm -rf
+	rm -f $(PRGS)
+
+
diff --git a/samples/README.md b/samples/README.md
new file mode 100644
index 000000000..0438e1fc4
--- /dev/null
+++ b/samples/README.md
@@ -0,0 +1,229 @@
+[![Github All Releases](https://img.shields.io/github/downloads/samtools/htslib/total.svg)](https://github.com/samtools/htslib/samples)
+
+HTSlib is an implementation of a unified C library for accessing common file
+formats, such as [SAM, CRAM and VCF][1], used for high-throughput sequencing
+data, and is the core library used by [samtools][2] and [bcftools][3].
+
+A set of sample programs is available which showcases the usage of APIs in HTSlib.
+They are based on version 1.17 of HTSLib and are mainly for demonstration of API usage.
+Further optimization and error handling might be required for actual usage.
+
+
+[1]: http://samtools.github.io/hts-specs/
+[2]: http://github.com/samtools/samtools
+[3]: http://samtools.github.io/bcftools/
+
+### Building and using sample programs
+
+GCC and compatible compilers can be used to build these samples.
+
+A makefile is available along with source files which links statically to
+htslib. To use dynamic linking, update the makefile's 'LDFLAGS' and 'rpath'
+path. The 'rpath' path is to be set as the path to lib directory of htslib
+installation.
+
+```sh
+
+# linking statically on a linux machine
+gcc -g -o <binary name> <source name> -I <htslib include path> \
+    <htslib lib path>/libhts.a -lcrypto -lm -lpthread -lcurl -llzma -lz -lbz2
+
+# dynamically linking with libhts
+gcc -g -o <binary name> <source name> -I <htslib include path> \
+    -L <htslib lib path> -lhts -Wl,-rpath,<htslib lib path>
+
+```
+
+In many cases, the alignment data are expected as sorted, compressed and
+indexed.
+
+### The samples...
+
+[Flags][Flags]
+
+  This application showcases the basic read of alignment files and flag
+  access. It reads and shows the count of read1 and read2 alignments.
+
+[Split][Split]
+
+  This application showcases the basic read and write of alignment data. It
+  saves the read1 and read2 as separate files in given directory, one as sam
+  and other as bam.
+ +[Split2][Split2] + + This application showcases the output file format selection. It saves the + read1 and read2 as separate files in given directory, both as compressed + sam though the extensions are different. + +[Cram][Cram] + + This application showcases the different ways in which cram reference data + is used for cram output creation. + +[Read_fast][Read_fast] + + This application showcases the fasta/fastq data read. + +[Read_header][Read_header] + + This application showcases the read and access of header data. It can show + all header lines of given type, data of a given tag on a specific header + line or for all lines of given type. + +[Read_ref][Read_ref] + + This application showcases the read and access of header data. It shows + all reference names which have length greater than or equal to given input. + +[Read_bam][Read_bam] + + This application showcases read of different alignment data fields. It + shows contents of each alignment. + +[Read_aux][Read_aux] + + This application showcases read of specific auxiliary tag data in + alignment. It shows the data retrieved using 2 APIs, one as a string with + tag data and other as raw data alternatively. + +[Dump_aux][Dump_aux] + + This application showcases read of all auxiliary tag data one by one in an + alignment. It shows the data retrieved. + +[Add_header][Add_header] + + This application showcases the write of header lines to a file. It adds + header line of types, SQ, RG, PG and CO and writes to standard output. + +[Remove_header][Remove_header] + + This application showcases removal of header line from a file. It removes + either all header lines of given type or one specific line of given type + with given unique identifier. Modified header is written on standard + output. + +[Update_header][Update_header] + + This application shows the update of header line fields, where update is + allowed. It takes the header line type, unique identifier for the line, + tag to be modified and the new value.
Updated data is written on standard + output. + +[Mod_bam][Mod_bam] + + This application showcases the update of alignment data. It takes + alignment name, position of field to be modified and new value of + it. Modified data is written on standard output. + +[Mod_aux][Mod_aux] + + This application showcases the update of auxiliary data in alignment. It + takes alignment name, tag to be modified, its type and new value. Modified + data is written on standard output. + +[Mod_aux_ba][Mod_aux_ba] + + This application showcases the update of auxiliary array data in + alignment. It adds count of ATCGN base as an array in auxiliary data, + BA:I. Modified data is written on standard output. + +[Write_fast][Write_fast] + + This application showcases the fasta/fastq data write. It appends a dummy + data to given file. + +[Index_write][Index_write] + + This application showcases the creation of index along with output + creation. Based on file type and shift, it creates bai, csi or crai files. + +[Read_reg][Read_reg]: + + This application showcases the usage of region specification in alignment + read. + +[Read_multireg][Read_multireg]: + + This application showcases the usage of multiple region specification in + alignment read. + +[Pileup][Pileup]: + + This application showcases the pileup api, where all alignments covering a + reference position are accessed together. It displays the bases covering + each position on standard output. + +[Mpileup][Mpileup]: + + This application showcases the mpileup api, which supports multiple input + files for pileup and gives a side by side view of them in pileup + format. It displays the bases covering each position on standard output. + +[Modstate][Modstate]: + + This application showcases the access of base modifications in + alignment. It shows the modifications present in an alignment and accesses + them using available APIs. There are 2 APIs and which one is used can + be selected through input.
+ +[Pileup_mod][Pileup_mod]: + + This application showcases the base modification access in pileup mode. It + shows the pileup display with base modifications. + +[Flags_field][Flags_field] + + This application showcases the read of selected fields alone, reducing the + overhead / increasing the performance. It reads the flag field alone and + shows the count of read1 and read2. This has impact only on CRAM files. + +[Split_thread1][Split_thread1] + + This application showcases the use of threads in file handling. It saves + the read1 and read2 as separate files in given directory, one as sam and + other as bam. 2 threads are used for read and 1 each dedicated for each + output file. + +[Split_thread2][Split_thread2] + + This application showcases the use of thread pool in file handling. It + saves the read1 and read2 as separate files in given directory, one as sam + and other as bam. A pool of 4 threads is created and shared for both read + and write. + +### More Information + +More detailed documentation is available in the [DEMO.md][DEMO] with worked +examples per demonstration tool. 
+ + +[Flags]: flags_demo.c +[Split]: split.c +[Split2]: split2.c +[Cram]: cram.c +[Read_fast]: read_fast.c +[Read_header]: read_header.c +[Read_ref]: read_refname.c +[Read_bam]: read_bam.c +[Read_aux]: read_aux.c +[Dump_aux]: dump_aux.c +[Add_header]: add_header.c +[Remove_header]: rem_header.c +[Update_header]: update_header.c +[Mod_bam]: mod_bam.c +[Mod_aux]: mod_aux.c +[Mod_aux_ba]: mod_aux_ba.c +[Write_fast]: write_fast.c +[Index_write]: index_write.c +[Read_reg]: index_reg_read.c +[Read_multireg]: index_multireg_read.c +[Pileup]: pileup.c +[Mpileup]: mpileup.c +[Modstate]: modstate.c +[Pileup_mod]: pileup_mod.c +[Flags_field]: flags_htsopt_field.c +[Split_thread1]: split_thread1.c +[Split_thread2]: split_thread2.c +[DEMO]: DEMO.md diff --git a/samples/add_header.c b/samples/add_header.c new file mode 100644 index 000000000..d1a2fc13c --- /dev/null +++ b/samples/add_header.c @@ -0,0 +1,128 @@ +/* add_header.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis; it needs proper error handling and optimization before real use */
+
+#include <getopt.h>
+#include <unistd.h>
+#include <htslib/sam.h>
+
+/// print_usage - print the demo usage
+/** @param fp pointer to the file / terminal to which the usage text is to be written
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: add_header infile\n\
+Adds new header lines of SQ, RG, PG and CO types\n");
+    return;
+}
+
+/// main_demo - start of the demo; reads the header of infile, adds SQ, RG, PG and CO lines and writes it to stdout
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, sq[] = "@SQ\tSN:TR1\tLN:100\n@SQ\tSN:TR2\tLN:50";
+    int c = 0, ret = EXIT_FAILURE;
+    samFile *infile = NULL, *outfile = NULL;
+    sam_hdr_t *in_samhdr = NULL;
+    kstring_t data = KS_INITIALIZE;
+
+    //add_header infile
+    if (argc != 2) {
+        print_usage(stderr);
+        goto end;
+    }
+    inname = argv[1];
+
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+    if (!(outfile = sam_open("-", "w"))) {   //use stdout as the output file for ease of display of update
+        printf("Could not open stdout\n");
+        goto end;
+    }
+
+    //read header
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    //collect the command line arguments, space separated, as the CL value of the PG line
+    for (c = 0; c < argc; ++c) {
+        kputs(argv[c], &data);
+        kputc(' ', &data);
+    }
+
+    //add SQ line with SN as TR1 and TR2
+    if (sam_hdr_add_lines(in_samhdr, &sq[0], 0)) {   //length as 0 for NUL terminated data
+        printf("Failed to add SQ lines\n");
+        goto end;
+    }
+
+    //add RG line with ID as RG1
+    if (sam_hdr_add_line(in_samhdr, "RG", "ID", "RG1", "LB", "Test", "SM", "S1", NULL)) {
+        printf("Failed to add RG line\n");
+        goto end;
+    }
+
+    //add pg line
+    if (sam_hdr_add_pg(in_samhdr, "add_header", "VN", "Test", "CL", data.s, NULL)) {     //NULL is to indicate end of args
+        printf("Failed to add PG line\n");
+        goto end;
+    }
+
+    if (sam_hdr_add_line(in_samhdr, "CO", "Test data", NULL)) {     //CO line is given only its text here; NULL is to indicate end of args
+        printf("Failed to add CO line\n");
+        goto end;
+    }
+
+    //write output
+    if (sam_hdr_write(outfile, in_samhdr) < 0) {
+        printf("Failed to write output\n");
+        goto end;
+    }
+    ret = EXIT_SUCCESS;
+    //bam data write to follow....
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (outfile) {
+        sam_close(outfile);
+    }
+    ks_free(&data);
+    return ret;
+}
diff --git a/samples/cram.c b/samples/cram.c
new file mode 100644
index 000000000..5f55e65d2
--- /dev/null
+++ b/samples/cram.c
@@ -0,0 +1,168 @@
+/*  cram.c --  showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis; it needs proper error handling and optimization before real use */
+
+#include <getopt.h>
+#include <unistd.h>
+#include <htslib/sam.h>
+
+/// print_usage - print the demo usage
+/** @param fp pointer to the file / terminal to which the usage text is to be written
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: cram infile reffile outdir\n\
+Dumps the input file alignments in cram format in given directory\n\
+1.cram has external reference\n\
+2.cram has reference embedded\n\
+3.cram has autogenerated reference\n\
+4.cram has no reference data in it\n");
+    return;
+}
+
+/// main_demo - start of the demo; copies every alignment of infile to 4 cram files which differ in reference handling
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, *outdir = NULL, *reffile = NULL;
+    char *file1 = NULL, *file2 = NULL, *file3 = NULL, *file4 = NULL, *reffmt1 = NULL, *reffmt2 = NULL;
+    int c = 0, ret = EXIT_FAILURE, size1 = 0, size2 = 0, size3 = 0;
+    samFile *infile = NULL, *outfile1 = NULL, *outfile2 = NULL, *outfile3 = NULL, *outfile4 = NULL;
+    sam_hdr_t *in_samhdr = NULL;
+    bam1_t *bamdata = NULL;
+    htsFormat fmt1 = {0}, fmt2 = {0}, fmt3 = {0}, fmt4 = {0};
+
+    //cram infile reffile outdir
+    if (argc != 4) {
+        print_usage(stdout);
+        goto end;
+    }
+    inname = argv[1];
+    reffile = argv[2];
+    outdir = argv[3];
+
+    //allocate space for option string and output file names; sizeof of the literal already counts its NUL
+    size1 = sizeof(char) * (strlen(reffile) + sizeof("cram,reference=") + 1);
+    size2 = sizeof(char) * (strlen(reffile) + sizeof("cram,embed_ref=1,reference=") + 1);
+    size3 = sizeof(char) * (strlen(outdir) + sizeof("/1.cram") + 1);
+
+    reffmt1 = malloc(size1); reffmt2 = malloc(size2);
+    file1 = malloc(size3); file2 = malloc(size3);
+    file3 = malloc(size3); file4 = malloc(size3);
+
+    if (!file1 || !file2 || !file3 || !file4 || !reffmt1 || !reffmt2) {
+        printf("Failed to create buffers\n");
+        goto end;
+    }
+
+    snprintf(reffmt1, size1, "cram,reference=%s", reffile);
+    snprintf(reffmt2, size2, "cram,embed_ref=1,reference=%s", reffile);
+    snprintf(file1, size3, "%s/1.cram", outdir); snprintf(file2, size3, "%s/2.cram", outdir);
+    snprintf(file3, size3, "%s/3.cram", outdir); snprintf(file4, size3, "%s/4.cram", outdir);
+
+    if (hts_parse_format(&fmt1, reffmt1) == -1 ||                   //using external reference - uses the M5/UR tags to get reference data during read
+            hts_parse_format(&fmt2, reffmt2) == -1 ||               //embed the reference internally
+            hts_parse_format(&fmt3, "cram,embed_ref=2") == -1 ||    //embed autogenerated reference
+            hts_parse_format(&fmt4, "cram,no_ref=1") == -1) {       //no reference data encoding at all
+        printf("Failed to set output option\n");
+        goto end;
+    }
+
+    //bam data storage
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to initialize bamdata\n");
+        goto end;
+    }
+    //open input file - r reading
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+    //open output files - w write as SAM, wb write as BAM, wc as CRAM (equivalent to fmt3)
+    outfile1 = sam_open_format(file1, "wc", &fmt1); outfile2 = sam_open_format(file2, "wc", &fmt2);
+    outfile3 = sam_open_format(file3, "wc", &fmt3); outfile4 = sam_open_format(file4, "wc", &fmt4);
+    if (!outfile1 || !outfile2 || !outfile3 || !outfile4) {
+        printf("Could not open output file\n");
+        goto end;
+    }
+
+    //read header, required to resolve the target names to proper ids
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+    //write header
+    if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1) ||
+            (sam_hdr_write(outfile3, in_samhdr) == -1) || (sam_hdr_write(outfile4, in_samhdr) == -1)) {
+        printf("Failed to write header\n");
+        goto end;
+    }
+
+    //read each alignment and write it to all 4 outputs
+    while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+        if (sam_write1(outfile1, in_samhdr, bamdata) < 0 ||
+                sam_write1(outfile2, in_samhdr, bamdata) < 0 ||
+                sam_write1(outfile3, in_samhdr, bamdata) < 0 ||
+                sam_write1(outfile4, in_samhdr, bamdata) < 0) {
+            printf("Failed to write output data\n");
+            goto end;
+        }
+    }
+    if (-1 == c) {
+        //EOF
+        ret = EXIT_SUCCESS;
+    }
+    else {
+        printf("Error in reading data\n");
+    }
+end:
+#define IF_OL(X,Y) if((X)) {(Y);}        //if one liner
+    //cleanup
+    IF_OL(in_samhdr, sam_hdr_destroy(in_samhdr));
+    IF_OL(infile, sam_close(infile));
+    IF_OL(outfile1, sam_close(outfile1));
+    IF_OL(outfile2, sam_close(outfile2));
+    IF_OL(outfile3, sam_close(outfile3));
+    IF_OL(outfile4, sam_close(outfile4));
+    IF_OL(file1, free(file1));
+    IF_OL(file2, free(file2));
+    IF_OL(file3, free(file3));
+    IF_OL(file4, free(file4));
+    IF_OL(reffmt1, free(reffmt1));
+    IF_OL(reffmt2, free(reffmt2));
+    IF_OL(fmt1.specific, hts_opt_free(fmt1.specific));
+    IF_OL(fmt2.specific, hts_opt_free(fmt2.specific));
+    IF_OL(fmt3.specific, hts_opt_free(fmt3.specific));
+    IF_OL(fmt4.specific, hts_opt_free(fmt4.specific));
+    IF_OL(bamdata, bam_destroy1(bamdata));
+
+    return ret;
+}
diff --git a/samples/dump_aux.c b/samples/dump_aux.c
new file mode 100644
index 000000000..49251fe04
--- /dev/null
+++ b/samples/dump_aux.c
@@ -0,0 +1,188 @@
+/*  dump_aux.c --  showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis; it needs proper error handling and optimization before real use */
+
+#include <getopt.h>
+#include <unistd.h>
+#include <htslib/sam.h>
+
+/// print_usage - print the demo usage
+/** @param fp pointer to the file / terminal to which the usage text is to be written
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: dump_aux infile\n\
+Dump the aux tags from alignments\n");
+}
+
+/// printauxdata - prints aux data
+/** @param fp - file to which it is to be printed
+ *  @param type - aux type
+ *  @param idx - index in array, -1 when not an array type
+ *  @param data - data
+ *  recurses once per element when the data is array type
+returns 1 on failure 0 on success
+*/
+int printauxdata(FILE *fp, char type, int32_t idx, const uint8_t *data)
+{
+    uint32_t auxBcnt = 0;
+    int i = 0;
+    char auxBType = 'Z';
+
+    //the tag is already queried and ensured to exist and the type is retrieved from the tag data, also iterated within index for arrays, so no error is expected here.
+    //when these apis are used explicitly, these error conditions need to be handled based on return value and errno
+    switch(type) {
+    case 'A':
+        fprintf(fp, "%c", bam_aux2A(data));                                                         //byte data
+        break;
+    case 'c':
+        fprintf(fp, "%d", (int8_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data)));            //signed 1 byte data; bam_auxB2i - from array or bam_aux2i - non array data
+        break;
+    case 'C':
+        fprintf(fp, "%u", (uint8_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data)));           //unsigned 1 byte data
+        break;
+    case 's':
+        fprintf(fp, "%d", (int16_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data)));           //signed 2 byte data
+        break;
+    case 'S':
+        fprintf(fp, "%u", (uint16_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data)));          //unsigned 2 byte data
+        break;
+    case 'i':
+        fprintf(fp, "%d", (int32_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data)));           //signed 4 byte data
+        break;
+    case 'I':
+        fprintf(fp, "%u", (uint32_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data)));          //unsigned 4 byte data
+        break;
+    case 'f':
+    case 'd':
+        fprintf(fp, "%g", (double)(idx > -1 ? bam_auxB2f(data, idx) : bam_aux2f(data)));            //floating point data; kept as double so that a 'd' value is not narrowed to float
+        break;
+    case 'H':
+    case 'Z':
+        fprintf(fp, "%s", bam_aux2Z(data));                                                         //array of char or hex data
+        break;
+    case 'B':                                                                                       //array of char/int/float
+        auxBcnt = bam_auxB_len(data);                                                               //length of array
+        auxBType = bam_aux_type(data + 1);                                                          //type of element in array
+        fprintf(fp, "%c", auxBType);
+        for (i = 0; i < auxBcnt; ++i) {                                                             //iterate the array
+            fprintf(fp, ",");
+            //calling recursively with index to reuse a few lines
+            if (printauxdata(fp, auxBType, i, data) == EXIT_FAILURE) {
+                return EXIT_FAILURE;
+            }
+        }
+        break;
+    default:
+        printf("Invalid aux tag?\n");
+        return EXIT_FAILURE;
+        break;
+    }
+    return EXIT_SUCCESS;
+}
+
+/// main_demo - start of the demo; prints every aux tag of every alignment in infile, one alignment per line
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL;
+    int ret = EXIT_FAILURE;
+    sam_hdr_t *in_samhdr = NULL;
+    samFile *infile = NULL;
+    int ret_r = 0;
+    bam1_t *bamdata = NULL;
+    uint8_t *data = NULL;
+
+    //dump_aux infile
+    if (argc != 2) {
+        print_usage(stderr);
+        goto end;
+    }
+    inname = argv[1];
+
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to allocate data memory!\n");
+        goto end;
+    }
+
+    //open input file
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+        errno = 0;
+        data = NULL;
+        data = bam_aux_first(bamdata);                                              //get the first aux data
+        while (data) {
+            printf("%.2s:%c:", bam_aux_tag(data), NULL != strchr("cCsSiI", bam_aux_type(data)) ? 'i' : bam_aux_type(data));    //macros get the tag and type of aux data; all integer types are shown as 'i'
+            //dump the data
+            if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) {
+                printf("Failed to dump aux data\n");
+                goto end;
+            }
+            else {
+                printf(" ");
+            }
+            data = bam_aux_next(bamdata, data);                                     //get the next aux data
+        }
+        if (ENOENT != errno) {
+            printf("\nFailed to get aux data\n");
+            goto end;
+        }
+        printf("\n");
+    }
+    if (ret_r < -1) {
+        //read error
+        printf("Failed to read data\n");
+        goto end;
+    }
+
+    ret = EXIT_SUCCESS;
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    return ret;
+}
diff --git a/samples/flags_demo.c b/samples/flags_demo.c
new file mode 100644
index 000000000..e03fc6cd8
--- /dev/null
+++ b/samples/flags_demo.c
@@ -0,0 +1,110 @@
+/*  flags_demo.c --  showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis; it needs proper error handling and optimization before real use */
+
+#include <getopt.h>
+#include <unistd.h>
+#include <htslib/sam.h>
+
+/// print_usage - show flags_demo usage
+/** @param fp pointer to the file / terminal to which the usage text is to be written
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: flags <infile>\n\
+Shows the count of read1 and read2 alignments\n\
+This shows basic reading and alignment flag access\n");
+    return;
+}
+
+/// main_demo - start of the demo; counts the alignments flagged as read1 and as read2 in infile
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments
+returns 1 on failure 0 on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL;          //input file name
+    int c = 0, ret = EXIT_FAILURE;
+    int64_t cntread1 = 0, cntread2 = 0; //count
+    samFile *infile = NULL;             //sam file
+    sam_hdr_t *in_samhdr = NULL;        //header of file
+    bam1_t *bamdata = NULL;             //to hold the read data
+
+    if (argc != 2) {
+        print_usage(stdout);
+        goto end;
+    }
+    inname = argv[1];
+
+    //initialize
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to initialize bamdata\n");
+        goto end;
+    }
+    //open input files - r reading
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+    //read header
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file\n");
+        goto end;
+    }
+
+    //read data, check flags and update count
+    while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+        if (bamdata->core.flag & BAM_FREAD1) {
+            cntread1++;
+        }
+        if (bamdata->core.flag & BAM_FREAD2) {
+            cntread2++;
+        }
+    }
+    if (c != -1) {
+        //error
+        printf("Failed to get data\n");
+        goto end;
+    }
+    //else -1 / EOF; the counters are int64_t, so PRId64 is the matching format
+    printf("File %s has %"PRId64" read1 and %"PRId64" read2 alignments\n", inname, cntread1, cntread2);
+    ret = EXIT_SUCCESS;
+end:
+    //clean up
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    return ret;
+}
diff --git a/samples/flags_htsopt_field.c b/samples/flags_htsopt_field.c
new file mode 100644
index 000000000..4b64445e3
--- /dev/null
+++ b/samples/flags_htsopt_field.c
@@ -0,0 +1,115 @@
+/*  flags_htsopt_field.c --  showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - show flags_demo usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: flags_field \n\ +Shows the count of read1 and read2 alignments\n\ +This shows reading selected fields from CRAM file\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; //input file name + int c = 0, ret = EXIT_FAILURE; + int64_t cntread1 = 0, cntread2 = 0; //count + samFile *infile = NULL; //sam file + sam_hdr_t *in_samhdr = NULL; //header of file + bam1_t *bamdata = NULL; //to hold the read data + + if (argc != 2) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + + //initialize + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input files - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //select required field alone, this is useful for CRAM alone + if (hts_set_opt(infile, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG) < 0) { + printf("Failed to set htsoption\n"); + goto end; + } + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file\n"); + goto end; + } + + //read data, check flags and update count + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + 
cntread1++; + } + if (bamdata->core.flag & BAM_FREAD2) { + cntread2++; + } + } + if (c != -1) { + //error + printf("Failed to get data\n"); + goto end; + } + //else -1 / EOF + printf("File %s has %"PRIhts_pos" read1 and %"PRIhts_pos" read2 alignments\n", inname, cntread1, cntread2); + ret = EXIT_SUCCESS; +end: + //clean up + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/index_multireg_read.c b/samples/index_multireg_read.c new file mode 100644 index 000000000..dbe8f15f9 --- /dev/null +++ b/samples/index_multireg_read.c @@ -0,0 +1,150 @@ +/* index_multireg_read.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the print_usage +/** @param fp pointer to the file / terminal to which print_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_multireg infile count regspec_csv\n\ + Reads alignment of a target matching to given region specifications\n\ + read_multireg infile.sam 2 R1:10-100,R2:200"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; + char *ptr = NULL; + int c = 0, ret = EXIT_FAILURE; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + hts_idx_t *idx = NULL; + hts_itr_t *iter = NULL; + unsigned int regcnt = 0; + char **regions = NULL; + + //read_multireg infile count regspec_csv + if (argc != 4) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + regcnt = atoi(argv[2]); + regions = calloc(regcnt, sizeof(char*)); + //set each regspec as separate entry in region array + ptr = argv[3]; + for (c = 0; ptr && (c < regcnt); ++c) { + regions[c] = ptr; + ptr = strchr(ptr, ','); + if (ptr) { *ptr = '\0'; ++ptr; } + } + + if (regcnt == 0) { + printf("Region count can not be 0\n"); + goto end; + } + //initialize bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open files, use stdout as output SAM file for ease of display + infile = sam_open(inname, "r"); + outfile = 
sam_open("-", "w"); + if (!outfile || !infile) { + printf("Could not open in/out files\n"); + goto end; + } + //load index file, assume it to be present in same location + if (!(idx = sam_index_load(infile, inname))) { + printf("Failed to load the index\n"); + goto end; + } + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //create iterator + if (!(iter = sam_itr_regarray(idx, in_samhdr, regions, regcnt))) { + printf("Failed to get iterator\n"); + goto end; + } + if (regions) { + //can be freed as it is no longer required + free(regions); + regions = NULL; + } + + //get required area + while ((c = sam_itr_multi_next(infile, iter, bamdata) >= 0)) { + //write to output + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + printf("Failed to write output\n"); + goto end; + } + } + if (c != -1) { + printf("Error during read\n"); + goto end; + } + ret = EXIT_SUCCESS; + +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (iter) { + sam_itr_destroy(iter); + } + if (idx) + hts_idx_destroy(idx); + return ret; +} diff --git a/samples/index_reg_read.c b/samples/index_reg_read.c new file mode 100644 index 000000000..346d5428f --- /dev/null +++ b/samples/index_reg_read.c @@ -0,0 +1,143 @@ +/* index_reg_read.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the print_usage +/** @param fp pointer to the file / terminal to which print_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: readreg infile idxfile region\n\ +Reads alignments matching to a specific region\n\ +\\. 
from start of file\n\ +\\* only unmapped reads\n\ +REFNAME all reads referring REFNAME\n\ +REFNAME:S all reads referring REFNAME and overlapping from S onwards\n\ +REFNAME:S-E all reads referring REFNAME overlapping from S to E\n\ +REFNAME:-E all reads referring REFNAME overlapping upto E\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *region = NULL; + char *idxfile = NULL; + int c = 0, ret = EXIT_FAILURE; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + hts_idx_t *idx = NULL; + hts_itr_t *iter = NULL; + + //readreg infile indexfile region + if (argc != 4) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + idxfile = argv[2]; + region = argv[3]; + + //initialize bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + + //open files + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open input file\n"); + goto end; + } + //using stdout as output file for ease of dumping data + if (!(outfile = sam_open("-", "w"))) { + printf("Could not open out file\n"); + goto end; + } + //load index file + if (!(idx = sam_index_load2(infile, inname, idxfile))) { + printf("Failed to load the index\n"); + goto end; + } + //can use sam_index_load if the index file is present in same location and follows standard naming conventions (i.e. .) 
+ + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //create iterator + if (!(iter = sam_itr_querys(idx, in_samhdr, region))) { + printf("Failed to get iterator\n"); + goto end; + } + //read using iterator + while ((c = sam_itr_next(infile, iter, bamdata)) >= 0) { + //write to output + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + printf("Failed to write output\n"); + goto end; + } + } + if (c != -1) { + printf("Error during read\n"); + goto end; + } + ret = EXIT_SUCCESS; + +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (iter) { + sam_itr_destroy(iter); + } + if (idx) { + hts_idx_destroy(idx); + } + return ret; +} diff --git a/samples/index_write.c b/samples/index_write.c new file mode 100644 index 000000000..8fd2bc968 --- /dev/null +++ b/samples/index_write.c @@ -0,0 +1,166 @@ +/* index_write.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: idx_on_write infile shiftsize outdir\n\ +Creates compressed sam file and index file for it in given directory\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *outdir = NULL; + char *inname = NULL, *fileidx = NULL, *outname = NULL, outmode[4] = "w"; + int c = 0, ret = EXIT_FAILURE, size = 0; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + + //idx_on_write infile sizeshift outputdirectory + if (argc != 4) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + size = atoi(argv[2]); + outdir = argv[3]; + + //allocate space for output name - outdir/filename.ext.idxextNUL + c = strlen(basename(inname)) + strlen(outdir) + 10; + fileidx = malloc(sizeof(char) * c); + outname = malloc(sizeof(char) * c); + if (!fileidx || !outname) { + printf("Couldnt allocate memory\n"); + goto end; + } + //initialize bam storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + + //open files + if ((infile = sam_open(inname, "r"))) { + //get file type and create output names + if (infile->format.format == cram) { + //set as crai + snprintf(fileidx, c, "%s/%s.crai", outdir, 
basename(inname)); + snprintf(outname, c, "%s/%s", outdir, basename(inname)); + } + else { + //set as either bai or csi based on interval + if (infile->format.format == sam && infile->format.compression == no_compression) { + //create as gzip compressed + snprintf(outname, c, "%s/%s.gz", outdir, basename(inname)); + snprintf(fileidx, c, "%s/%s.gz.%s", outdir, basename(inname), !size ? "bai" : "csi"); + } + else { + //with same name as input + snprintf(outname, c, "%s/%s", outdir, basename(inname)); + snprintf(fileidx, c, "%s/%s.%s", outdir, basename(inname), !size ? "bai" : "csi"); + } + } + } + c = 0; + sam_open_mode(outmode + 1, outname, NULL); //set extra write options based on name + outfile = sam_open(outname, outmode); + if (!outfile || !infile) { + printf("Could not open files\n"); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if (sam_hdr_write(outfile, in_samhdr)) { + printf("Failed to write header\n"); + goto end; + } + + // initialize indexing, before start of write + if (sam_idx_init(outfile, in_samhdr, size, fileidx)) { + printf("idx initialization failed\n"); + goto end; + } + //read and write alignments + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + printf("Failed to write data\n"); + goto end; + } + } + if (c != -1) { + printf("Error in reading data\n"); + goto end; + } + //else EOF, save index + if (sam_idx_save(outfile)) { + printf("Could not save index\n"); + goto end; + } + ret = EXIT_SUCCESS; +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (fileidx) { + free(fileidx); + } + if (outname) { + free(outname); + } + if (outfile) { + sam_close(outfile); + } + return ret; +} diff --git a/samples/mod_aux.c b/samples/mod_aux.c new file mode 100644 index 
000000000..d5ed18cde --- /dev/null +++ b/samples/mod_aux.c @@ -0,0 +1,221 @@ +/* mod_aux.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: mod_aux infile QNAME tag type val\n\ +Add/update the given aux tag to all alignments\n\ +type A-char C-int F-float Z-string\n"); +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *tag = NULL, *qname = NULL, *val = NULL; + 
char type = '\0'; + int ret = EXIT_FAILURE, ret_r = 0, length = 0; + sam_hdr_t *in_samhdr = NULL; + samFile *infile = NULL, *outfile = NULL; + bam1_t *bamdata = NULL; + uint8_t *data = NULL; + + //mod_aux infile QNAME tag type val + if (argc != 6) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + qname = argv[2]; + tag = argv[3]; + type = argv[4][0]; + val = argv[5]; + + if (!(bamdata = bam_init1())) { + printf("Failed to allocate data memory!\n"); + goto end; + } + + //open input file + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //open output file + if (!(outfile = sam_open("-", "w"))) { + printf("Could not open std output\n"); + goto end; + } + + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + if (sam_hdr_write(outfile, in_samhdr) == -1) { + printf("Failed to write header\n"); + goto end; + } + + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (strcasecmp(bam_get_qname(bamdata), qname)) { + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + printf("Failed to write output\n"); + goto end; + } + continue; //not matching + } + + errno = 0; + //matched to qname, update aux + if (!(data = bam_aux_get(bamdata, tag))) { + int i = 0; float f = 0; + //tag not present append + switch (type) { + case 'f': + case 'd': + length = sizeof(float); + f = atof(val); + val = (const char*) &f; + type = 'f'; + break; + case 'C': + case 'S': + case 'I': + length = sizeof(int); + i = atoi(val); + val = (const char*) &i; + break; + case 'Z': + length = strlen(val) + 1; //1 for NUL termination + break; + case 'A': + length = 1; + break; + default: + printf("Invalid type mentioned\n"); + goto end; + break; + } + if (bam_aux_append(bamdata, tag, type, length, (const uint8_t*)val)) { + printf("Failed to append aux data, errno: %d\n", errno); + goto end; + } + } + else { + char auxtype = bam_aux_type(data); + //update the tag with newer 
value + switch (type) { + case 'f': + case 'd': + if (auxtype != 'f' && auxtype != 'd') { + printf("Invalid aux type passed\n"); + goto end; + } + if (bam_aux_update_float(bamdata, tag, atof(val))) { + printf("Failed to update float data, errno: %d\n", errno); + goto end; + } + break; + case 'C': + case 'S': + case 'I': + if (auxtype != 'c' && auxtype != 'C' && auxtype != 's' && auxtype != 'S' && auxtype != 'i' && auxtype != 'I') { + printf("Invalid aux type passed\n"); + goto end; + } + if (bam_aux_update_int(bamdata, tag, atoll(val))) { + printf("Failed to update int data, errno: %d\n", errno); + goto end; + } + break; + case 'Z': + if (auxtype != 'Z') { + printf("Invalid aux type passed\n"); + goto end; + } + length = strlen(val) + 1; //1 for NUL termination + if (bam_aux_update_str(bamdata, tag, length, val)) { + //with length as -1, length will be detected based on null terminated val data + printf("Failed to update string data, errno: %d\n", errno); + goto end; + } + break; + case 'A': + if (auxtype != 'A') { + printf("Invalid aux type passed\n"); + goto end; + } + //update the char data directly on buffer + *(data+1) = val[0]; + break; + default: + printf("Invalid data type\n"); + goto end; + break; + } + } + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + printf("Failed to write output\n"); + goto end; + } + } + if (ret_r < -1) { + //read error + printf("Failed to read data\n"); + goto end; + } + + ret = EXIT_SUCCESS; +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/mod_aux_ba.c b/samples/mod_aux_ba.c new file mode 100644 index 000000000..8ef90ee1e --- /dev/null +++ b/samples/mod_aux_ba.c @@ -0,0 +1,147 @@ +/* mod_aux_ba.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: mod_aux_ba infile\n\ +Updates the count of bases as an aux array on all alignments\n\ +BA:B:I,count of ACTGN\n"); +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; + int i = 0, ret = EXIT_FAILURE, ret_r = 0; + uint32_t cnt[5] = {0}; //A C G T N + sam_hdr_t *in_samhdr = NULL; + samFile *infile = NULL, *outfile = NULL; + bam1_t *bamdata = NULL; + + //mod_aux infile + if (argc != 2) { + 
print_usage(stderr); + goto end; + } + inname = argv[1]; + + if (!(bamdata = bam_init1())) { + printf("Failed to allocate data memory!\n"); + goto end; + } + + //open input file + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + + //open output file + if (!(outfile = sam_open("-", "w"))) { + printf("Could not open std output\n"); + goto end; + } + + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + if (sam_hdr_write(outfile, in_samhdr) == -1) { + printf("Failed to write header\n"); + goto end; + } + + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + errno = 0; + memset(cnt, 0, sizeof(cnt)); + for (i = 0; i < bamdata->core.l_qseq; ++i) { + switch (seq_nt16_str[bam_seqi(bam_get_seq(bamdata),i)]) { + case 'A': + ++cnt[0]; + break; + case 'C': + ++cnt[1]; + break; + case 'G': + ++cnt[2]; + break; + case 'T': + ++cnt[3]; + break; + default: //N + ++cnt[4]; + break; + } + } + + if (bam_aux_update_array(bamdata, "BA", 'I', sizeof(cnt)/sizeof(cnt[0]), cnt)) { + printf("Failed to update base array, errno %d", errno); + goto end; + } + + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + printf("Failed to write output\n"); + goto end; + } + } + if (ret_r < -1) { + //read error + printf("Failed to read data\n"); + goto end; + } + + ret = EXIT_SUCCESS; +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/mod_bam.c b/samples/mod_bam.c new file mode 100644 index 000000000..9f1eb324e --- /dev/null +++ b/samples/mod_bam.c @@ -0,0 +1,229 @@ +/* mod_bam.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: mod_bam infile QNAME fieldpos newval\n\ +Modifies the alignment data field\n\ +fieldpos - 1 QNAME 2 FLAG 3 RNAME 4 POS 5 MAPQ 6 CIGAR 7 RNEXT 8 PNEXT 9 TLEN 10 SEQ 11 QUAL\n"); +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *qname = NULL; + char *val = NULL; + int c = 0, ret = EXIT_FAILURE, field = 0; + sam_hdr_t *in_samhdr = NULL; + samFile *infile = NULL, *outfile = NULL; + int ret_r = 
0, i = 0; + bam1_t *bamdata = NULL; + + //mod_bam infile QNAME fieldpos newval + if (argc != 5) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + qname = argv[2]; + //1 QNAME 2 FLAG 3 RNAME 4 POS 5 MAPQ 6 CIGAR 7 RNEXT 8 PNEXT 9 TLEN 10 SEQ 11 QUAL + field = atoi(argv[3]); + val = argv[4]; + + if (!(bamdata = bam_init1())) { + printf("Failed to allocate data memory!\n"); + goto end; + } + + //open input file + if (!(infile = sam_open(inname, "r")) || !(outfile = sam_open("-", "w"))) { + printf("Could not open input/output\n"); + goto end; + } + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + if (sam_hdr_write(outfile, in_samhdr) == -1) { + printf("Failed to write header\n"); + goto end; + } + + while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) + { + //QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL [TAG:TYPE:VALUE]… + ret = 0; + if (!strcasecmp(qname, bam_get_qname(bamdata))) { + //the required one + switch(field) { + case 1:// QNAME + ret = bam_set_qname(bamdata, val); + break; + case 2:// FLAG + bamdata->core.flag = atol(val) & 0xFFFF; + break; + case 3:// RNAME + case 7:// RNEXT + if ((ret = sam_hdr_name2tid(in_samhdr, val)) < 0) { + printf("Invalid reference name\n"); + ret = -1; + break; + } + if (field == 3) { + //reference + bamdata->core.tid = ret; + } + else { + //mate reference + bamdata->core.mtid = ret; + } + break; + case 4:// POS + bamdata->core.pos = atoll(val); + break; + case 5:// MAPQ + bamdata->core.qual = atoi(val) & 0x0FF; + break; + case 6:// CIGAR + { + uint32_t *cigar = NULL; + size_t size = 0; + ssize_t ncigar = 0; + bam1_t *newbam = bam_init1(); + if (!newbam) { + printf("Failed to create new bam data\n"); + ret = -1; + break; + } + //get cigar array and set all data in new bam record + if ((ncigar = sam_parse_cigar(val, NULL, &cigar, &size)) < 0) { + printf("Failed to parse cigar\n"); + ret = -1; + break; + } + if 
(bam_set1(newbam, bamdata->core.l_qname, bam_get_qname(bamdata), bamdata->core.flag, bamdata->core.tid, bamdata->core.pos, bamdata->core.qual, + ncigar, cigar, bamdata->core.mtid, bamdata->core.mpos, bamdata->core.isize, bamdata->core.l_qseq, (const char*)bam_get_seq(bamdata), (const char*)bam_get_qual(bamdata), bam_get_l_aux(bamdata)) < 0) { + printf("Failed to set bamdata\n"); + ret = -1; + break; + } + //correct sequence data as input is expected in ascii format and not as compressed inside bam! + memcpy(bam_get_seq(newbam), bam_get_seq(bamdata), (bamdata->core.l_qseq + 1) / 2); + //copy the aux data + memcpy(bam_get_aux(newbam), bam_get_aux(bamdata), bam_get_l_aux(bamdata)); + + bam_destroy1(bamdata); + bamdata = newbam; + } + break; + case 8:// PNEXT + bamdata->core.mpos = atoll(val); + break; + case 9:// TLEN + bamdata->core.isize = atoll(val); + break; + case 10:// SEQ + i = strlen(val); + if (bamdata->core.l_qseq != i) { + printf("SEQ length different\n"); + ret = -1; + //as it is different, have to update quality data and cigar data as well and more info is required for it, which is not handled in this sample + //accessing raw memory and moving is one option; creating and using new bam1_t object is another option. 
+ break; + } + for( c = 0; c < i; ++c) { + bam_set_seqi(bam_get_seq(bamdata), c, seq_nt16_table[(unsigned char)val[c]]); + } + break; + case 11:// QUAL + i = strlen(val); + if (i != bamdata->core.l_qseq) { + printf("Qual length different than sequence\n"); + ret = -1; + break; + } + for (c = 0; c < i; ++c) { + val[c] -= 33; //phred score from ascii value + } + memcpy(bam_get_qual(bamdata), val, i); + break; + default: + printf("Invalid input\n"); + goto end; + break; + } + if (ret < 0) { + printf("Failed to set new data\n"); + ret = EXIT_FAILURE; + goto end; + } + } + if (sam_write1(outfile, in_samhdr, bamdata) < 0) { + printf("Failed to write bam data\n"); + ret = EXIT_FAILURE; + goto end; + } + } + + if (ret_r == -1 || ret != EXIT_FAILURE) { + // no error! + ret = EXIT_SUCCESS; + } + else { + printf("Failed to read data\n"); + } +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/modstate.c b/samples/modstate.c new file mode 100644 index 000000000..976391684 --- /dev/null +++ b/samples/modstate.c @@ -0,0 +1,190 @@ +/* modstate.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis; it still needs proper error handling and optimization */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <htslib/sam.h>
+
+/// print_usage - print the demo usage
+/** @param fp pointer to the file / terminal to which the usage is to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: modstate infile option\n\
+Shows the base modifications on the alignment\n\
+Option can be 1 or 2 to select the api to use\n");
+}
+
+/// main - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments; argv[1] is the input file,
+ *  argv[2] is 1 (use bam_next_basemod) or 2 (use bam_mods_at_next_pos)
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL;
+    int ret = EXIT_FAILURE;
+    sam_hdr_t *in_samhdr = NULL;
+    samFile *infile = NULL;
+
+    int ret_r = 0, i = 0 , r = 0, j = 0, pos = 0, opt = 0, k = 0, cnt = 0, *bm = NULL;
+    bam1_t *bamdata = NULL;
+    uint8_t *data = NULL;
+    hts_base_mod_state *ms = NULL;
+
+    //modstate infile 1/2
+    if (argc != 3) {
+        print_usage(stderr);
+        goto end;
+    }
+    inname = argv[1];
+    opt = atoi(argv[2]) - 1;    //0 when option 1 is given, 1 when option 2 is given
+    if (opt != 0 && opt != 1) {
+        //anything other than 1 or 2 (non numeric input included) is not a valid option
+        print_usage(stderr);
+        goto end;
+    }
+
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to allocate data memory!\n");
+        goto end;
+    }
+
+    if (!(ms = hts_base_mod_state_alloc())) {
+        printf("Failed to allocate state memory\n");
+        goto end;
+    }
+
+    //open input file
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+    //read header
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0)
+    {
+        hts_base_mod mod[5] = {0};  //for ATCGN
+        const int nmod = (int)(sizeof(mod) / sizeof(mod[0]));
+
+        i = 0;
+        data = bam_get_seq(bamdata);
+        if (bam_parse_basemod(bamdata, ms)) {
+            printf("Failed to parse the base mods\n");
+            goto end;
+        }
+        //dump the modifications
+        printf("Modifications:");
+        bm = bam_mods_recorded(ms, &cnt);
+        for (k = 0; k < cnt; ++k) {
+            printf("%c", bm[k]);
+        }
+        printf("\n");
+        if (opt) {
+            //option 2 - bam_mods_at_next_pos, invoked once for every base of the read
+            for (; i < bamdata->core.l_qseq; ++i) {
+                if ((r = bam_mods_at_next_pos(bamdata, ms, mod, nmod)) <= -1) {
+                    printf("Failed to get modifications\n");
+                    goto end;
+                }
+                else if (r > nmod) {
+                    printf("More modifications than this app can handle, update the app\n");
+                    goto end;
+                }
+                else if (!r) {
+                    //no modification at this pos
+                    printf("%c", seq_nt16_str[bam_seqi(data, i)]);
+                }
+                //modifications
+                for (j = 0; j < r; ++j) {
+                    printf("%c%c%c", mod[j].canonical_base, mod[j].strand ? '-' : '+', mod[j].modified_base);
+                }
+            }
+        }
+        else {
+            //option 1 - bam_next_basemod, moves directly to the next modified position
+            while ((r = bam_next_basemod(bamdata, ms, mod, nmod, &pos)) >= 0) {
+                if (r > nmod) {
+                    //only nmod entries of mod[] are valid, reading further would overrun the array
+                    printf("More modifications than this app can handle, update the app\n");
+                    goto end;
+                }
+                //unmodified bases up to the reported position
+                for (; i < bamdata->core.l_qseq && i < pos; ++i) {
+                    printf("%c", seq_nt16_str[bam_seqi(data, i)]);
+                }
+                //modifications
+                for (j = 0; j < r; ++j) {
+                    printf("%c%c%c", mod[j].canonical_base, mod[j].strand ? '-' : '+', mod[j].modified_base);
+                }
+                if (i == pos)
+                    i++;    //skip the modification already displayed
+                if (!r) {
+                    //no more modifications, dump the rest of the sequence
+                    for (; i < bamdata->core.l_qseq; ++i) {
+                        printf("%c", seq_nt16_str[bam_seqi(data, i)]);
+                    }
+                    break;
+                }
+            }
+            if (r <= -1) {
+                printf("Failed to get modifications\n");
+                goto end;
+            }
+        }
+        printf("\n");
+    }
+
+    if (ret_r == -1) {
+        //check last alignment's base modification
+        int strand = 0, impl = 0;
+        char canonical = 0, modification[] = "mhfcgebaon";  //possible modifications
+        printf("\n\nLast alignment has \n");
+        for (k = 0; k < (int)sizeof(modification) - 1; ++k) {  //avoiding NUL termination
+            if (bam_mods_query_type(ms, modification[k], &strand, &impl, &canonical)) {
+                printf ("No modification of %c type\n", modification[k]);
+            }
+            else {
+                printf("%s strand has %c modified with %c, can %sassume unlisted as unmodified\n", strand?"-/bottom/reverse":"+/top/forward", canonical, modification[k], impl?"" : "not " );
+            }
+        }
+        // no error!
+        ret = EXIT_SUCCESS;
+    }
+    else {
+        printf("Failed to read data\n");
+    }
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+
+    if (ms) {
+        hts_base_mod_state_free(ms);
+    }
+    return ret;
+}
diff --git a/samples/mpileup.c b/samples/mpileup.c
new file mode 100644
index 000000000..fe933748e
--- /dev/null
+++ b/samples/mpileup.c
@@ -0,0 +1,204 @@
+/* mpileup.c -- showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis; it still needs proper error handling and optimization */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <htslib/sam.h>
+
+/// print_usage - show the mpileup demo usage
+/** @param fp pointer to the file / terminal to which the usage is to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: mpileup infile ...\n\
+Shows the mpileup api usage.\n");
+    return;
+}
+
+/// per input file data, handed to the pileup callbacks as client data
+typedef struct plpconf {
+    char *inname;           //input file name, points into argv
+    samFile *infile;        //opened input file
+    sam_hdr_t *in_samhdr;   //header read from infile
+} plpconf;
+
+/// @brief plpconstructor - constructor callback, nothing to set up in this demo
+/// @param data client data passed at iterator initialization
+/// @param b bam being loaded
+/// @param cd per alignment client data
+/// @return 0 always
+int plpconstructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    return 0;
+}
+
+/// @brief plpdestructor - destructor callback, nothing to release in this demo
+/// @param data client data passed at iterator initialization
+/// @param b bam being removed
+/// @param cd per alignment client data
+/// @return 0 always
+int plpdestructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    return 0;
+}
+
+/// @brief bam_plp_auto_f reads alignment data for pileup operation
+/// @param data client callback data holding alignment file handle and header
+/// @param b bamdata read
+/// @return same as sam_read1, -2 when the client data is not usable
+int readdata(void *data, bam1_t *b)
+{
+    plpconf *conf = (plpconf*)data;
+    if (!conf || !conf->infile || !conf->in_samhdr) {
+        return -2;  //cant read data
+    }
+
+    //read alignment using the header this demo read and owns
+    return sam_read1(conf->infile, conf->in_samhdr, b);
+}
+
+/// main - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments, one or more input files
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main(int argc, char *argv[])
+{
+    int ret = EXIT_FAILURE;
+    plpconf** conf = NULL;
+    bam_mplp_t mplpiter = NULL;
+    int tid = -1, input = 0, k = 0, dpt = 0, ret_p = 0, *depth = NULL;
+    hts_pos_t refpos = -1;
+    const bam_pileup1_t **plp = NULL;
+
+    //infile ...
+    if (argc < 2) {
+        print_usage(stderr);
+        goto end;
+    }
+    if ((conf = calloc(argc - 1, sizeof(plpconf*)))) {
+        for (input = 0; input < argc - 1; ++input) {
+            if (!(conf[input] = calloc(1, sizeof(plpconf)))) {
+                break;  //input < argc - 1 marks the failure below
+            }
+        }
+    }
+    depth = calloc(argc - 1, sizeof(int));
+    plp = calloc(argc - 1, sizeof(bam_pileup1_t*));
+    if (!conf || input < argc - 1 || !depth || !plp) {
+        printf("Failed to allocate memory\n");
+        goto end;
+    }
+    for (input = 0; input < argc - 1; ++input) {
+        conf[input]->inname = argv[input+1];
+    }
+
+    //open input files
+    for(input = 0; input < argc - 1; ++input) {
+        if (!(conf[input]->infile = sam_open(conf[input]->inname, "r"))) {
+            printf("Could not open %s\n", conf[input]->inname);
+            goto end;
+        }
+        //read header
+        if (!(conf[input]->in_samhdr = sam_hdr_read(conf[input]->infile))) {
+            printf("Failed to read header from file!\n");
+            goto end;
+        }
+    }
+
+    if (!(mplpiter = bam_mplp_init(argc - 1, readdata, (void**) conf))) {
+        printf("Failed to initialize mpileup data\n");
+        goto end;
+    }
+
+    //set constructor destructor callbacks
+    bam_mplp_constructor(mplpiter, plpconstructor);
+    bam_mplp_destructor(mplpiter, plpdestructor);
+
+    while ((ret_p = bam_mplp64_auto(mplpiter, &tid, &refpos, depth, plp)) > 0) {
+        printf("%d\t%"PRIhts_pos"\t", tid+1, refpos+1);
+
+        for (input = 0; input < argc - 1; ++input) {
+            for (dpt = 0; dpt < depth[input]; ++dpt) {
+                const bam_pileup1_t *p = &plp[input][dpt];
+                int base = 0;
+
+                if (p->is_del || p->is_refskip) {
+                    printf("*");
+                    continue;
+                }
+                //start and end are displayed in UPPER and rest on LOWER
+                base = (unsigned char)seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos)];
+                printf("%c", (p->is_head || p->is_tail) ? toupper(base) : tolower(base));
+                if (p->indel > 0) {
+                    //insertions, anyway not start or end
+                    printf("+%d", p->indel);
+                    for (k = 0; k < p->indel; ++k) {
+                        printf("%c", tolower(seq_nt16_str[bam_seqi(bam_get_seq(p->b), p->qpos + k + 1)]));
+                    }
+                }
+                else if (p->indel < 0) {
+                    //deletions, the deleted bases are not part of the read
+                    printf("%d", p->indel);
+                    for (k = 0; k < -p->indel; ++k) {
+                        printf("?");
+                    }
+                }
+            }
+            printf(" ");
+        }
+        printf("\n");
+        fflush(stdout);
+    }
+    if (ret_p < 0) {
+        //0 is end of data, negative is a failure in read or pileup
+        printf("Failed to get pileup data\n");
+        goto end;
+    }
+
+    ret = EXIT_SUCCESS;
+end:
+    //clean up
+    //iterator goes first, it holds the client data pointers released below
+    if (mplpiter) {
+        bam_mplp_destroy(mplpiter);
+    }
+    if (conf) {
+        for (input = 0; input < argc - 1; ++input) {
+            if (!conf[input]) {
+                continue;
+            }
+            if (conf[input]->in_samhdr) {
+                sam_hdr_destroy(conf[input]->in_samhdr);
+            }
+            if (conf[input]->infile) {
+                sam_close(conf[input]->infile);
+            }
+            free(conf[input]);
+        }
+        free(conf);
+    }
+    free(depth);
+    free(plp);
+    return ret;
+}
diff --git a/samples/pileup.c b/samples/pileup.c
new file mode 100644
index 000000000..11e2fb02f
--- /dev/null
+++ b/samples/pileup.c
@@ -0,0 +1,183 @@
+/* pileup.c -- showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis; it still needs proper error handling and optimization */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <htslib/sam.h>
+
+/// print_usage - show the pileup demo usage
+/** @param fp pointer to the file / terminal to which the usage is to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: pileup infile\n\
+Shows the pileup api usage.\n");
+    return;
+}
+
+/// input file data, handed to the pileup callbacks as client data
+typedef struct plpconf {
+    char *inname;           //input file name, points into argv
+    samFile *infile;        //opened input file
+    sam_hdr_t *in_samhdr;   //header read from infile
+} plpconf;
+
+/// @brief plpconstructor - constructor callback, nothing to set up in this demo
+/// @param data client data passed at iterator initialization
+/// @param b bam being loaded
+/// @param cd per alignment client data
+/// @return 0 always
+int plpconstructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    /*plpconf *conf= (plpconf*)data;
+    can access the data passed to pileup init from data
+    can do any alignment specific allocation / data storage here in param cd
+    it can hold either a float, 64 bit int or a pointer
+    when using cd, initialize and use as it will be reused after destructor*/
+    return 0;
+}
+
+/// @brief plpdestructor - destructor callback, nothing to release in this demo
+/// @param data client data passed at iterator initialization
+/// @param b bam being removed
+/// @param cd per alignment client data
+/// @return 0 always
+int plpdestructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    /*plpconf *conf= (plpconf*)data;
+    can access the data passed to pileup init from data
+    deallocate any alignment specific allocation made in constructor and stored in param cd*/
+    return 0;
+}
+
+/// @brief bam_plp_auto_f reads alignment data for pileup operation
+/// @param data client callback data holding alignment file handle and header
+/// @param b bamdata read
+/// @return same as sam_read1, -2 when the client data is not usable
+int readdata(void *data, bam1_t *b)
+{
+    plpconf *conf = (plpconf*)data;
+    if (!conf || !conf->infile || !conf->in_samhdr) {
+        return -2;  //cant read data
+    }
+
+    //read alignment using the header this demo read and owns
+    return sam_read1(conf->infile, conf->in_samhdr, b);
+}
+
+/// main - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments, argv[1] is the input file
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main(int argc, char *argv[])
+{
+    int ret = EXIT_FAILURE;
+    plpconf conf = {0};
+    bam_plp_t plpiter = NULL;
+    int tid = -1, n = -1, j = 0, k = 0;
+    int refpos = -1;
+    const bam_pileup1_t *plp = NULL;
+
+    //infile
+    if (argc != 2) {
+        print_usage(stderr);
+        goto end;
+    }
+    conf.inname = argv[1];
+
+    //open input files
+    if (!(conf.infile = sam_open(conf.inname, "r"))) {
+        printf("Could not open %s\n", conf.inname);
+        goto end;
+    }
+    //read header
+    if (!(conf.in_samhdr = sam_hdr_read(conf.infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    if (!(plpiter = bam_plp_init(readdata, &conf))) {
+        printf("Failed to initialize pileup data\n");
+        goto end;
+    }
+
+    //set constructor destructor callbacks
+    bam_plp_constructor(plpiter, plpconstructor);
+    bam_plp_destructor(plpiter, plpdestructor);
+
+    while ((plp = bam_plp_auto(plpiter, &tid, &refpos, &n))) {
+        printf("%d\t%d\t", tid+1, refpos+1);
+
+        for (j = 0; j < n; ++j) {
+            int base = 0;
+
+            //doesnt detect succeeding insertion and deletion together here, only insertion is identified
+            //deletion is detected in plp->is_del as and when pos reaches the position
+            //if detection ahead is required, use bam_plp_insertion here which gives deletion length along with insertion
+            if (plp[j].is_del || plp[j].is_refskip) {
+                printf("*");
+                continue;
+            }
+            //start and end are displayed in UPPER and rest on LOWER
+            base = (unsigned char)seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)];
+            printf("%c", (plp[j].is_head || plp[j].is_tail) ? toupper(base) : tolower(base));
+            if (plp[j].indel > 0) {
+                //insertions, anyway not start or end
+                printf("+%d", plp[j].indel);
+                for (k = 0; k < plp[j].indel; ++k) {
+                    printf("%c", tolower(seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos + k + 1)]));
+                }
+            }
+            else if (plp[j].indel < 0) {
+                //deletions, the deleted bases are not part of the read
+                printf("%d", plp[j].indel);
+                for (k = 0; k < -plp[j].indel; ++k) {
+                    printf("?");
+                }
+            }
+            printf(" ");
+        }
+        printf("\n");
+        fflush(stdout);
+    }
+    if (n < 0) {
+        //NULL is returned for both end of data and failure, the count tells them apart
+        printf("Failed to get pileup data\n");
+        goto end;
+    }
+
+    ret = EXIT_SUCCESS;
+end:
+    //clean up
+    if (plpiter) {
+        bam_plp_destroy(plpiter);
+    }
+    if (conf.in_samhdr) {
+        sam_hdr_destroy(conf.in_samhdr);
+    }
+    if (conf.infile) {
+        sam_close(conf.infile);
+    }
+    return ret;
+}
diff --git a/samples/pileup_mod.c b/samples/pileup_mod.c
new file mode 100644
index 000000000..24d6cf539
--- /dev/null
+++ b/samples/pileup_mod.c
@@ -0,0 +1,218 @@
+/* pileup_mod.c -- showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis; it still needs proper error handling and optimization */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <htslib/sam.h>
+
+/// print_usage - show the pileup_mod demo usage
+/** @param fp pointer to the file / terminal to which the usage is to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: pileup_mod infile\n\
+Shows the pileup api usage with base modification.\n");
+    return;
+}
+
+/// input file data, handed to the pileup callbacks as client data
+typedef struct plpconf {
+    char *inname;           //input file name, points into argv
+    samFile *infile;        //opened input file
+    sam_hdr_t *in_samhdr;   //header read from infile
+} plpconf;
+
+/// @brief plpconstructor - allocates and fills the base modification state of an alignment
+/// @param data client data passed at iterator initialization
+/// @param b bam being loaded
+/// @param cd per alignment client data, cd->p receives the modification state
+/// @return 0 on success, -1 on failure
+int plpconstructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    //plpconf *conf= (plpconf*)data; can use this to access anything required from the data in pileup init
+
+    //when using cd, initialize and use as it will be reused after destructor
+    cd->p = hts_base_mod_state_alloc();
+    if (!cd->p) {
+        printf("Failed to allocate base modification state\n");
+        return -1;  //NOTE(review): the pileup api is understood to treat only negative returns as failure - confirm in sam.h
+    }
+
+    //parse the bam data and gather modification data from MM tags
+    if (-1 == bam_parse_basemod(b, (hts_base_mod_state*)cd->p)) {
+        //release here, do not depend on the destructor being invoked for a failed construction
+        hts_base_mod_state_free((hts_base_mod_state *)cd->p);
+        cd->p = NULL;
+        return -1;
+    }
+    return 0;
+}
+
+/// @brief plpdestructor - releases the base modification state made in plpconstructor
+/// @param data client data passed at iterator initialization
+/// @param b bam being removed
+/// @param cd per alignment client data, cd->p holds the modification state
+/// @return 0 always
+int plpdestructor(void *data, const bam1_t *b, bam_pileup_cd *cd) {
+    if (cd->p) {
+        hts_base_mod_state_free((hts_base_mod_state *)cd->p);
+        cd->p = NULL;
+    }
+    return 0;
+}
+
+/// @brief bam_plp_auto_f reads alignment data for pileup operation
+/// @param data client callback data holding alignment file handle and header
+/// @param b bamdata read
+/// @return same as sam_read1, -2 when the client data is not usable
+int readdata(void *data, bam1_t *b)
+{
+    plpconf *conf = (plpconf*)data;
+    if (!conf || !conf->infile || !conf->in_samhdr) {
+        return -2;  //cant read data
+    }
+
+    //read alignment using the header this demo read and owns
+    return sam_read1(conf->infile, conf->in_samhdr, b);
+}
+
+/// main - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments, argv[1] is the input file
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main(int argc, char *argv[])
+{
+    int ret = EXIT_FAILURE;
+    plpconf conf = {0};
+    bam_plp_t plpiter = NULL;
+    int tid = -1, depth = -1, j = 0, k = 0, inslen = 0, dellen = 0, modlen = 0;
+    #define NMODS 5
+    hts_base_mod mods[NMODS] = {0};  //ACGT N
+    int refpos = -1;
+    const bam_pileup1_t *plp = NULL;
+    kstring_t insdata = KS_INITIALIZE;
+
+    //infile
+    if (argc != 2) {
+        print_usage(stderr);
+        goto end;
+    }
+    conf.inname = argv[1];
+
+    //open input files
+    if (!(conf.infile = sam_open(conf.inname, "r"))) {
+        printf("Could not open %s\n", conf.inname);
+        goto end;
+    }
+    //read header
+    if (!(conf.in_samhdr = sam_hdr_read(conf.infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    if (!(plpiter = bam_plp_init(readdata, &conf))) {
+        printf("Failed to initialize pileup data\n");
+        goto end;
+    }
+
+    //set constructor destructor callbacks
+    bam_plp_constructor(plpiter, plpconstructor);
+    bam_plp_destructor(plpiter, plpdestructor);
+
+    while ((plp = bam_plp_auto(plpiter, &tid, &refpos, &depth))) {
+        memset(&mods, 0, sizeof(mods));
+        printf("%d\t%d\t", tid+1, refpos+1);
+
+        for (j = 0; j < depth; ++j) {
+            int base = 0;
+
+            dellen = 0;
+
+            if (plp[j].is_del || plp[j].is_refskip) {
+                printf("*");
+                continue;
+            }
+            /*invoke bam_mods_at_qpos before bam_plp_insertion_mod so that the base modification
+            is retrieved before the change in pileup pos through the plp_insertion_mod call*/
+            if ((modlen = bam_mods_at_qpos(plp[j].b, plp[j].qpos, plp[j].cd.p, mods, NMODS)) == -1) {
+                printf("Failed to get modifications\n");
+                goto end;
+            }
+
+            //use plp_insertion/_mod to get insertion and del at the same position
+            if ((inslen = bam_plp_insertion_mod(&plp[j], (hts_base_mod_state*)plp[j].cd.p, &insdata, &dellen)) == -1) {
+                printf("Failed to get insertion status\n");
+                goto end;
+            }
+
+            //start and end are displayed in UPPER and rest on LOWER, only 1st modification considered
+            base = (unsigned char)seq_nt16_str[bam_seqi(bam_get_seq(plp[j].b), plp[j].qpos)];
+            printf("%c", (plp[j].is_head || plp[j].is_tail) ? toupper(base) : tolower(base));
+            if (modlen > 0) {
+                //strand and modification only when present, no NUL bytes on output otherwise
+                printf("%c%c", mods[0].strand ? '-' : '+', mods[0].modified_base);
+            }
+            //insertion and deletions
+            if (plp[j].indel > 0) {
+                //insertion
+                /*insertion data from plp_insertion_mod, note this shows the quality value as well
+                which is different from base and modification above;the lower case display is not attempted either*/
+                printf("+%d%s", plp[j].indel, insdata.s);
+                //handle deletion if any
+                if (dellen) {
+                    printf("-%d", dellen);
+                    for (k = 0; k < dellen; ++k) {
+                        printf("?");
+                    }
+                }
+            }
+            else if (plp[j].indel < 0) {
+                //deletion
+                printf("%d", plp[j].indel);
+                for (k = 0; k < -plp[j].indel; ++k) {
+                    printf("?");
+                }
+            }
+            printf(" ");
+        }
+        printf("\n");
+        fflush(stdout);
+    }
+    if (depth < 0) {
+        //NULL is returned for both end of data and failure, the count tells them apart
+        printf("Failed to get pileup data\n");
+        goto end;
+    }
+
+    ret = EXIT_SUCCESS;
+end:
+    //clean up
+    if (plpiter) {
+        bam_plp_destroy(plpiter);
+    }
+    if (conf.in_samhdr) {
+        sam_hdr_destroy(conf.in_samhdr);
+    }
+    if (conf.infile) {
+        sam_close(conf.infile);
+    }
+    ks_free(&insdata);
+    return ret;
+}
diff --git a/samples/read_aux.c b/samples/read_aux.c
new file mode 100644
index 000000000..cbf972b98
--- /dev/null
+++ b/samples/read_aux.c
@@ -0,0 +1,207 @@
+/* read_aux.c -- showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis; it still needs proper error handling and optimization */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <htslib/sam.h>
+
+/// print_usage - print the demo usage
+/** @param fp pointer to the file / terminal to which the usage is to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: read_aux infile tag\n\
+Read the given aux tag from alignments either as SAM string or as raw data\n");
+}
+
+/// printauxdata - prints aux data
+/** @param fp - file to which it is to be printed
+ *  @param type - aux type
+ *  @param idx - index in array, -1 when not an array type
+ *  @param data - data
+ *  recurses when the data is array type
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int printauxdata(FILE *fp, char type, int32_t idx, const uint8_t *data)
+{
+    uint32_t auxBcnt = 0, i = 0;
+    char auxBType = 'Z';
+
+    //the tag is already queried and ensured to exist and the type is retrieved from the tag data, also iterated within index for arrays, so no error is expected here.
+    //when these apis are used explicitly, these error conditions need to be handled based on return value and errno
+    switch(type) {
+    case 'A':
+        fprintf(fp, "%c", bam_aux2A(data));     //byte data
+        break;
+    case 'c':
+        fprintf(fp, "%d", (int8_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data)));    //signed 1 byte data; bam_auxB2i - from array or bam_aux2i - non array data
+        break;
+    case 'C':
+        fprintf(fp, "%u", (uint8_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data)));   //unsigned 1 byte data
+        break;
+    case 's':
+        fprintf(fp, "%d", (int16_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data)));   //signed 2 byte data
+        break;
+    case 'S':
+        fprintf(fp, "%u", (uint16_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data)));  //unsigned 2 byte data
+        break;
+    case 'i':
+        fprintf(fp, "%d", (int32_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data)));   //signed 4 byte data
+        break;
+    case 'I':
+        fprintf(fp, "%u", (uint32_t)(idx > -1 ? bam_auxB2i(data, idx) : bam_aux2i(data)));  //unsigned 4 byte data
+        break;
+    case 'f':
+    case 'd':
+        //floating point data; passed on as is, narrowing to float would truncate 'd' values
+        fprintf(fp, "%g", idx > -1 ? bam_auxB2f(data, idx) : bam_aux2f(data));
+        break;
+    case 'H':
+    case 'Z':
+        fprintf(fp, "%s", bam_aux2Z(data));     //array of char or hex data
+        break;
+    case 'B':   //array of char/int/float
+        auxBcnt = bam_auxB_len(data);           //length of array
+        auxBType = bam_aux_type(data + 1);      //type of element in array
+        fprintf(fp, "%c", auxBType);
+        for (i = 0; i < auxBcnt; ++i) {         //iterate the array
+            fprintf(fp, ",");
+            //calling recursively with index to reuse a few lines
+            if (printauxdata(fp, auxBType, (int32_t)i, data) == EXIT_FAILURE) {
+                return EXIT_FAILURE;
+            }
+        }
+        break;
+    default:
+        printf("Invalid aux tag?\n");
+        return EXIT_FAILURE;
+        break;
+    }
+    return EXIT_SUCCESS;
+}
+
+/// main - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments, argv[1] is the input file and
+ *  argv[2] is the 2 character aux tag to show
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, *tag = NULL;
+    int c = 0, ret = EXIT_FAILURE, ret_r = 0, i = 0;
+    sam_hdr_t *in_samhdr = NULL;
+    samFile *infile = NULL;
+    bam1_t *bamdata = NULL;
+    uint8_t *data = NULL;
+    kstring_t sdata = KS_INITIALIZE;
+
+    //read_aux infile tag
+    if (argc != 3) {
+        print_usage(stderr);
+        goto end;
+    }
+    inname = argv[1];
+    tag = argv[2];
+    if (!tag[0] || !tag[1] || tag[2]) {
+        //aux tags are exactly 2 characters, the lookups below read tag[0] and tag[1]
+        print_usage(stderr);
+        goto end;
+    }
+
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to allocate data memory!\n");
+        goto end;
+    }
+
+    //open input file
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+
+    //read header
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+        errno = 0; i++;     //errno reset to tell a missing tag from an error after the lookups
+        ks_clear(&sdata);
+        if (i % 2) {  //use options alternatively to demonstrate both
+            //option 1 - get data as string with tag and type
+            if ((c = bam_aux_get_str(bamdata, tag, &sdata)) == 1) {
+                printf("%s\n",sdata.s);
+            }
+            else if (c == 0 && errno == ENOENT) {
+                //tag not present
+                printf("Tag not present\n");
+            }
+            else {
+                //error
+                printf("Failed to get tag\n");
+                goto end;
+            }
+        }
+        else {
+            //option 2 - get raw data
+            if (!(data = bam_aux_get(bamdata, tag))) {
+                //tag data not returned, errno gives the reason
+                if (errno == ENOENT) {
+                    printf("Tag not present\n");
+                }
+                else {
+                    printf("Invalid aux data\n");
+                }
+            }
+            else {
+                //got the tag, read and print
+                if (printauxdata(stdout, bam_aux_type(data), -1, data) == EXIT_FAILURE) {
+                    printf("Failed to read aux data\n");
+                    goto end;
+                }
+                printf("\n");
+            }
+        }
+    }
+    if (ret_r < -1) {
+        //read error
+        printf("Failed to read data\n");
+        goto end;
+    }
+
+    ret = EXIT_SUCCESS;
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    ks_free(&sdata);
+    return ret;
+}
diff --git a/samples/read_bam.c b/samples/read_bam.c
new file mode 100644
index 000000000..7fca8c55d
--- /dev/null
+++ b/samples/read_bam.c
@@ -0,0 +1,139 @@
+/* read_bam.c -- showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+
+    Author: Vasudeva Sarma
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library apis; it still needs proper error handling and optimization */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <htslib/sam.h>
+
+/// print_usage - print the demo usage
+/** @param fp pointer to the file / terminal to which the usage is to be dumped
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: read_bam infile\n\
+Shows the alignment data from file\n");
+}
+
+/// main - start of the demo
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments, argv[1] is the input file
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, *tidname = NULL;
+    char *flags = NULL;
+    int ret = EXIT_FAILURE;
+    sam_hdr_t *in_samhdr = NULL;
+    samFile *infile = NULL;
+
+    int ret_r = 0, i = 0;
+    bam1_t *bamdata = NULL;
+    uint8_t *data = NULL, *qual = NULL;
+    uint32_t *cigar = NULL;
+
+    //read_bam infile
+    if (argc != 2) {
+        print_usage(stderr);
+        goto end;
+    }
+    inname = argv[1];
+
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to allocate data memory!\n");
+        goto end;
+    }
+
+    //open input file
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+    //read header
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+
+    while ((ret_r = sam_read1(infile, in_samhdr, bamdata)) >= 0)
+    {
+        //QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL [TAG:TYPE:VALUE]…
+        printf("NAME: %s\n", bam_get_qname(bamdata));                                   //get the query name using the macro
+        flags = bam_flag2str(bamdata->core.flag);                                       //flags as string, to be freed by caller
+        printf("FLG: %d - %s\n", bamdata->core.flag, flags ? flags : "");               //flag is available in core structure
+        free(flags);
+        flags = NULL;
+        tidname = sam_hdr_tid2name(in_samhdr, bamdata->core.tid);
+        printf("RNAME/TID: %d - %s\n", bamdata->core.tid, tidname? tidname: "" );       //retrieves the target name using the value in bam and by referring the header
+        printf("POS: %"PRIhts_pos"\n", bamdata->core.pos + 1);                          //internally position is 0 based and on text output / SAM it is 1 based
+        printf("MQUAL: %d\n", bamdata->core.qual);                                      //map quality value
+
+        cigar = bam_get_cigar(bamdata);                                                 //retrieves the cigar data
+        printf("CGR: ");
+        for (i = 0; i < bamdata->core.n_cigar; ++i) {                                   //no. of cigar data entries
+            printf("%d%c", bam_cigar_oplen(cigar[i]), bam_cigar_opchr(cigar[i]));       //the macros gives the count of operation and the symbol of operation for given cigar entry
+        }
+        printf("\nTLEN/ISIZE: %"PRIhts_pos"\n", bamdata->core.isize);
+
+        data = bam_get_seq(bamdata);                                                    //get the sequence data
+        //the lengths can be compared only when both CIGAR and SEQ are present, either can be absent ('*') in a valid record
+        if (bamdata->core.n_cigar > 0 && bamdata->core.l_qseq > 0 &&
+            bamdata->core.l_qseq != bam_cigar2qlen(bamdata->core.n_cigar, cigar)) {     //checks the length with CIGAR and query
+            printf("\nLength doesnt matches to cigar data\n");
+            goto end;
+        }
+
+        printf("SEQ: ");
+        for (i = 0; i < bamdata->core.l_qseq ; ++i) {                                   //sequence length
+            printf("%c", seq_nt16_str[bam_seqi(data, i)]);                              //retrieves the base from (internal compressed) sequence data
+        }
+        printf("\nQUAL: ");
+        qual = bam_get_qual(bamdata);
+        if (bamdata->core.l_qseq > 0 && qual[0] == 0xff) {
+            printf("*");                                                                //quality not stored for this alignment
+        }
+        else {
+            for (i = 0; i < bamdata->core.l_qseq ; ++i) {
+                printf("%c", qual[i]+33);                                               //retrieves the quality value, phred score to ascii
+            }
+        }
+        printf("\n\n");
+    }
+
+    if (ret_r == -1) {
+        // no error!
+        ret = EXIT_SUCCESS;
+    }
+    else {
+        printf("Failed to read data\n");
+    }
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    return ret;
+}
diff --git a/samples/read_fast.c b/samples/read_fast.c
new file mode 100644
index 000000000..f74b25515
--- /dev/null
+++ b/samples/read_fast.c
@@ -0,0 +1,116 @@
+/* read_fast.c -- showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - show flags_demo usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_fast \n\ +Reads the fasta/fastq file and shows the content.\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL; //input file name + int c = 0, ret = EXIT_FAILURE; + samFile *infile = NULL; //sam file + sam_hdr_t *in_samhdr = NULL; //header of file + bam1_t *bamdata = NULL; //to hold the read data + + if (argc != 2) { + print_usage(stdout); + goto end; + } + inname = 
argv[1]; + + //initialize + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input files - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + if (infile->format.format != fasta_format && infile->format.format != fastq_format) { + printf("Invalid file specified\n"); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf( "Failed to read header from file\n"); + goto end; + } + + //read data + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + printf("\nsequence: "); + for (c = 0; c < bamdata->core.l_qseq; ++c) { + printf("%c", seq_nt16_str[bam_seqi(bam_get_seq(bamdata), c)]); + } + if (infile->format.format == fastq_format) { + printf("\nquality: "); + for (c = 0; c < bamdata->core.l_qseq; ++c) { + printf("%c", bam_get_qual(bamdata)[c]); + } + } + } + if (c != -1) { + //error + printf("Failed to get data\n"); + goto end; + } + //else -1 / EOF + ret = EXIT_SUCCESS; +end: + //clean up + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + return ret; +} diff --git a/samples/read_header.c b/samples/read_header.c new file mode 100644 index 000000000..eb14daea5 --- /dev/null +++ b/samples/read_header.c @@ -0,0 +1,173 @@ +/* read_header.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. 
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_header infile header [id val] [tag]\n\ +This shows given tag from given header or the whole line\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *header = NULL, *tag = NULL, *idval = NULL; + char *id = NULL; + int c = 0, ret = EXIT_FAILURE, linecnt = 0; + samFile *infile = NULL; + sam_hdr_t *in_samhdr = NULL; + kstring_t data = KS_INITIALIZE; + + //read_header infile 
header tag + if (argc < 3 || argc > 6) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + header = argv[2]; + if (argc == 4) { //header and tag + tag = argv[3]; + //find unique identifier field name for requested header type + if (header[0] == 'H' && header[1] == 'D') { + id = NULL; + } + else if (header[0] == 'S' && header[1] == 'Q') { + id = "SN"; + } + else if (header[0] == 'R' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'P' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'C' && header[1] == 'O') { + id = ""; + } + else { + printf("Invalid header type\n"); + goto end; + } + } + else if (argc == 5) { //header id val + id = argv[3]; + idval = argv[4]; + } + else if (argc == 6) { //header id val tag + id = argv[3]; + idval = argv[4]; + tag = argv[5]; + } + + //open input files + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + if (id && idval) { + if (tag) { + ret = sam_hdr_find_tag_id(in_samhdr, header, id, idval, tag, &data); + } + else { + ret = sam_hdr_find_line_id(in_samhdr, header, id, idval, &data); + } + + if (ret == 0) { + printf("%s\n", data.s); + } + else if (ret == -1) { + printf("No matching tag found\n"); + goto end; + } + else { + printf("Failed to find header line\n"); + goto end; + } + } + else { + //get count of given header type + linecnt = sam_hdr_count_lines(in_samhdr, header); + if (linecnt == 0) { + printf("No matching line found\n"); + goto end; + } + for (c = 0; c < linecnt; ++c ) { + if (tag) { + //non CO, get the tag requested + ret = sam_hdr_find_tag_pos(in_samhdr, header, c, tag, &data); + } + else { + //CO header, there are no tags but the whole line + ret = sam_hdr_find_line_pos(in_samhdr, header, c, &data); + } + + if (ret == 0) { + printf("%s\n", data.s); + continue; + } + else if (ret == -1) { + printf("Tag 
not present\n"); + continue; + } + else { + printf("Failed to get tag\n"); + goto end; + } + } + } + ret = EXIT_SUCCESS; + +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + ks_free(&data); + return ret; +} diff --git a/samples/read_refname.c b/samples/read_refname.c new file mode 100644 index 000000000..adbc71183 --- /dev/null +++ b/samples/read_refname.c @@ -0,0 +1,125 @@ +/* read_refname.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: read_refname infile minsize\n\ +This shows name of references which has length above the given size\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *id = NULL; + int c = 0, ret = EXIT_FAILURE, linecnt = 0, pos = 0; + samFile *infile = NULL; + sam_hdr_t *in_samhdr = NULL; + kstring_t data = KS_INITIALIZE; + int64_t minsize = 0, size = 0; + + if (argc != 3 && argc != 2) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + if (argc == 3) { + minsize = atoll(argv[2]); + } + + //open input files + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + linecnt = sam_hdr_count_lines(in_samhdr, "SQ"); //get reference count + if (linecnt <= 0) { + if (!linecnt) { + printf("No reference line present\n"); + } + else { + printf("Failed to get reference line count\n"); + } + goto end; + } + //iterate and check each reference's length + for (pos = 1, c = 0; c < linecnt; ++c) { + if ((ret = sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "LN", &data) == -2)) { + printf("Failed to get length\n"); + goto 
end; + } + else if (ret == -1) { + //length not present, ignore + continue; + } + //else have length + size = atoll(data.s); + if (size < minsize) { + //not required + continue; + } + if (!(id = sam_hdr_line_name(in_samhdr, "SQ", c))) { //sam_hdr_find_tag_pos(in_samhdr, "SQ", c, "SN", &data) can also do the same! + printf("Failed to get id for reference data\n"); + goto end; + } + printf("%d,%s,%s\n", pos, id, data.s); + pos++; + } + + ret = EXIT_SUCCESS; + +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + ks_free(&data); + return ret; +} diff --git a/samples/rem_header.c b/samples/rem_header.c new file mode 100644 index 000000000..a0b6510fb --- /dev/null +++ b/samples/rem_header.c @@ -0,0 +1,138 @@ +/* rem_header.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library APIs; it still needs proper error handling and optimization */
+
+#include
+#include
+#include
+
+/// print_usage - print the usage text of the rem_header demo
+/** @param fp pointer to the file / terminal to which the usage text is written
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: rem_header infile header [id]\n\
+Removes header line of given type and id\n");
+    return;
+}
+
+/// main - removes header line(s) of the given type and writes the resulting header to stdout
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments: infile header [id]
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, *header = NULL, *idval = NULL;
+    char *id = NULL;
+    int ret = EXIT_FAILURE;
+    samFile *infile = NULL, *outfile = NULL;
+    sam_hdr_t *in_samhdr = NULL;
+
+    //rem_header infile header [id]
+    if (argc <3 || argc > 4) {
+        //3 & 4 are ok, 3-> all of given header type, 4->given id of given header type to be removed
+        print_usage(stderr);
+        goto end;
+    }
+    inname = argv[1];
+    header = argv[2];
+    if (argc == 4) {
+        idval = argv[3];
+    }
+
+    //unique identifier for each of the header types
+    if (header[0] == 'H' && header[1] == 'D') {
+        id = NULL;
+    }
+    else if (header[0] == 'S' && header[1] == 'Q') {
+        id = "SN";
+    }
+    else if (header[0] == 'R' && header[1] == 'G') {
+        id = "ID";
+    }
+    else if (header[0] == 'P' && header[1] == 'G') {
+        id = "ID";
+    }
+    else if (header[0] == 'C' && header[1] == 'O') {
+        //CO field can be removed using the position of it using sam_hdr_remove_line_pos
+        id = "";
+    }
+    else {
+        printf("Invalid header type\n");
+        goto end;
+    }
+
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+    if (!(outfile = sam_open("-", "w"))) {  //use stdout as the output file for ease of display of update
+        printf("Could not open stdout\n");
+        goto end;
+    }
+
+    //read header
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+    if (idval) {
+        //remove specific line
+        if (sam_hdr_remove_line_id(in_samhdr, header, id, idval)) {
+            printf("Failed to remove header line\n");
+            goto end;
+        }
+    }
+    else {
+        //remove multiple lines of a header type
+        if (sam_hdr_remove_lines(in_samhdr, header, id, NULL)) {
+            printf("Failed to remove header line\n");
+            goto end;
+        }
+    }
+    //write output
+    if (sam_hdr_write(outfile, in_samhdr) < 0) {
+        printf("Failed to write output\n");
+        goto end;
+    }
+    ret = EXIT_SUCCESS;
+    //bam data write to follow....
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (outfile) {
+        sam_close(outfile);
+    }
+    return ret;
+}
diff --git a/samples/sample.ref.fa b/samples/sample.ref.fa
new file mode 100644
index 000000000..5789e8c42
--- /dev/null
+++ b/samples/sample.ref.fa
@@ -0,0 +1,4 @@
+>T1 T1:1-40
+AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT
+>T2 T2:1:40
+TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT
diff --git a/samples/sample.sam b/samples/sample.sam
new file mode 100644
index 000000000..e56efd69f
--- /dev/null
+++ b/samples/sample.sam
@@ -0,0 +1,29 @@
+@HD VN:1.17 SO:unknown
+@SQ SN:T1 LN:40
+@SQ SN:T2 LN:40
+@CO @SQ SN* LN* AH AN AS DS M5 SP TP UR
+@CO @RG ID* BC CN DS DT FO KS LB PG PI PL PM PU SM
+@CO @PG ID* PN CL PP DS VN
+@CO this is a dummy alignment file to demonstrate different abilities of hts apis
+@CO QNAME FLAG RNAME POS MAPQ CIGAR RNEXT PNEXT TLEN SEQ QUAL [TAG:TYPE:VALUE]…
+@CO 1234567890123456789012345678901234567890
+@CO AAAAACTGAAAACCCCTTTTGGGGACTGTTAACAGTTTTT T1
+@CO TTTTCCCCACTGAAAACCCCTTTTGGGGACTGTTAACAGT T2
+@CO ITR1-ITR2M, ITR2-ITR2M are proper pairs in T1 and T2, UNMP1 is partly mapped and pair is
unmapped, UNMP2 & 3 are unmappped +@CO A1-A2, A4-A3 are proper pairs with A4-A3 in different read order. A5 is secondary alignment +ITR1 99 T1 5 40 4M = 33 10 ACTG ()() +ITR2 147 T2 23 49 2M = 35 -10 TT ** +ITR2M 99 T2 35 51 2M = 23 10 AA && +ITR1M 147 T1 33 37 4M = 5 -10 ACTG $$$$ +UNMP1 73 T1 21 40 3M * 0 5 GGG &&1 +UNMP2 141 * 0 0 * * 0 7 AA && +UNMP3 77 * 0 0 * * 0 5 GGG &&2 +A1 99 T1 25 35 6M = 31 8 ACTGTT ****** +A2 147 T1 31 33 6M = 25 -8 ACTGTT ()()() +A3 147 T2 23 47 2M1X = 12 -5 TTG ((( +A4 99 T2 12 50 3M = 23 5 GAA ()( +A5 355 T1 25 35 4M = 33 5 ACTG PPPP +B1 99 T1 25 35 6M = 31 8 GCTATT ****** +B3 147 T2 23 47 2M1X = 12 -5 TAG ((( +B4 99 T2 12 50 3M = 23 5 GAT ()( +B5 355 T1 25 35 4M = 33 5 AGTG PPPP diff --git a/samples/split.c b/samples/split.c new file mode 100644 index 000000000..2eb9e6b79 --- /dev/null +++ b/samples/split.c @@ -0,0 +1,153 @@ +/* split.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library APIs; it still needs proper error handling and optimization */
+
+#include
+#include
+#include
+
+/// print_usage - print the usage text of the split demo
+/** @param fp pointer to the file / terminal to which the usage text is written
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: split infile outdir\n\
+Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given directory\n\
+Shows the basic writing of output\n");
+    return;
+}
+
+/// main - writes READ1 alignments to outdir/1.sam and READ2 alignments to outdir/2.bam
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments: infile outdir
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, *outdir = NULL;
+    char *file1 = NULL, *file2 = NULL;
+    int c = 0, ret = EXIT_FAILURE, size = 0;
+    samFile *infile = NULL, *outfile1 = NULL, *outfile2 = NULL;
+    sam_hdr_t *in_samhdr = NULL;
+    bam1_t *bamdata = NULL;
+
+    if (argc != 3) {
+        print_usage(stdout);
+        goto end;
+    }
+    inname = argv[1];
+    outdir = argv[2];
+
+    //allocate space for output
+    size = sizeof(char) * (strlen(outdir) + sizeof("/1.sam") + 1);  //space for output file name and null termination; both names have the same length
+    file1 = malloc(size);
+    file2 = malloc(size);
+    if (!file1 || !file2) {
+        printf("Failed to set output path\n");
+        goto end;
+    }
+
+    //output file names
+    snprintf(file1, size, "%s/1.sam", outdir);  //for SAM output
+    snprintf(file2, size, "%s/2.bam", outdir);  //for BAM output
+    //bam data storage
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to initialize bamdata\n");
+        goto end;
+    }
+    //open input file - r reading
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+    //open output files - w write as SAM, wb write as BAM
+    outfile1 = sam_open(file1, "w");  //as SAM
+    outfile2 = sam_open(file2, "wb");  //as BAM
+    if (!outfile1 || !outfile2) {
+        printf("Could not open output file\n");
+        goto end;
+    }
+
+    //read header, required to resolve the target names to proper ids
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+    //write header to both outputs
+    if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1)) {
+        printf("Failed to write header\n");
+        goto end;
+    }
+
+    //check flags and write; records with neither READ1 nor READ2 set are dropped
+    while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+        if (bamdata->core.flag & BAM_FREAD1) {
+            if (sam_write1(outfile1, in_samhdr, bamdata) < 0) {
+                printf("Failed to write output data\n");
+                goto end;
+            }
+        }
+        else if (bamdata->core.flag & BAM_FREAD2) {
+            if (sam_write1(outfile2, in_samhdr, bamdata) < 0) {
+                printf("Failed to write output data\n");
+                goto end;
+            }
+        }
+    }
+    if (-1 == c) {
+        //EOF
+        ret = EXIT_SUCCESS;
+    }
+    else {
+        printf("Error in reading data\n");
+    }
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    if (file1) {
+        free(file1);
+    }
+    if (file2) {
+        free(file2);
+    }
+    if (outfile1) {
+        sam_close(outfile1);
+    }
+    if (outfile2) {
+        sam_close(outfile2);
+    }
+    return ret;
+}
diff --git a/samples/split2.c b/samples/split2.c
new file mode 100644
index 000000000..2354abfe3
--- /dev/null
+++ b/samples/split2.c
@@ -0,0 +1,158 @@
+/* split2.c -- showcases the htslib api usage
+
+    Copyright (C) 2023 Genome Research Ltd.
+ + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The pupose of this code is to demonstrate the library apis and need proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: split infile outdir\n\ +Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given directory\n\ +Shows file type selection through name and format api\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *outdir = NULL; + char *file1 = NULL, *file2 = NULL, mode1[5] = "w", mode2[5] = "w"; + int c = 0, ret = EXIT_FAILURE, size = 0; + samFile *infile = 
NULL, *outfile1 = NULL, *outfile2 = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + + if (argc != 3) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + outdir = argv[2]; + + //allocate space for output + size = sizeof(char) * (strlen(outdir) + sizeof("/1.sam.gz") + 1); //space for output file name and null termination + file1 = malloc(size); + file2 = malloc(size); + if (!file1 || !file2) { + printf("Failed to set output path\n"); + goto end; + } + + //output file names + snprintf(file1, size, "%s/1.sam.gz", outdir); //name of Read1 file + snprintf(file2, size, "%s/2.sam", outdir); //name of Read2 file + //bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //set file open mode based on file name for 1st and as explicit for 2nd + if ((sam_open_mode(mode1+1, file1, NULL) == -1) || (sam_open_mode(mode2+1, file2, "sam.gz") == -1)) { + printf("Failed to set open mode\n"); + goto end; + } + //open input file + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //open output files + outfile1 = sam_open(file1, mode1); //as compressed SAM through sam_open + outfile2 = sam_open_format(file2, mode2, NULL); //as compressed SAM through sam_open_format + if (!outfile1 || !outfile2) { + printf("Could not open output file\n"); + goto end; + } + + //read header, required to resolve the target names to proper ids + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1)) { + printf("Failed to write header\n"); + goto end; + } + + //check flags and write + while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + else if 
(bamdata->core.flag & BAM_FREAD2) { + if (sam_write1(outfile2, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + } + if (-1 == c) { + //EOF + ret = EXIT_SUCCESS; + } + else { + printf("Error in reading data\n"); + } +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (file1) { + free(file1); + } + if (file2) { + free(file2); + } + if (outfile1) { + sam_close(outfile1); + } + if (outfile2) { + sam_close(outfile2); + } + return ret; +} diff --git a/samples/split_thread1.c b/samples/split_thread1.c new file mode 100644 index 000000000..40d2dfdc2 --- /dev/null +++ b/samples/split_thread1.c @@ -0,0 +1,161 @@ +/* split_thread1.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE
+
+*/
+
+/* The purpose of this code is to demonstrate the library APIs; it still needs proper error handling and optimization */
+
+#include
+#include
+#include
+
+/// print_usage - print the usage text of the split_t1 demo
+/** @param fp pointer to the file / terminal to which the usage text is written
+returns nothing
+*/
+static void print_usage(FILE *fp)
+{
+    fprintf(fp, "Usage: split_t1 infile outdir\n\
+Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given directory\n\
+Shows the usage of basic thread in htslib\n");
+    return;
+}
+
+/// main - same split as the basic demo, with per-file worker threads requested through hts_set_opt
+/** @param argc - count of arguments
+ *  @param argv - pointer to array of arguments: infile outdir
+returns EXIT_FAILURE on failure, EXIT_SUCCESS on success
+*/
+int main(int argc, char *argv[])
+{
+    const char *inname = NULL, *outdir = NULL;
+    char *file1 = NULL, *file2 = NULL;
+    int c = 0, ret = EXIT_FAILURE, size = 0;
+    samFile *infile = NULL, *outfile1 = NULL, *outfile2 = NULL;
+    sam_hdr_t *in_samhdr = NULL;
+    bam1_t *bamdata = NULL;
+
+    if (argc != 3) {
+        print_usage(stdout);
+        goto end;
+    }
+    inname = argv[1];
+    outdir = argv[2];
+
+    //allocate space for output
+    size = sizeof(char) * (strlen(outdir) + sizeof("/1.sam") + 1);  //space for output file name and null termination; both names have the same length
+    file1 = malloc(size);
+    file2 = malloc(size);
+    if (!file1 || !file2) {
+        printf("Failed to set output path\n");
+        goto end;
+    }
+
+    //output file names
+    snprintf(file1, size, "%s/1.sam", outdir);  //for SAM output
+    snprintf(file2, size, "%s/2.bam", outdir);  //for BAM output
+    //bam data storage
+    if (!(bamdata = bam_init1())) {
+        printf("Failed to initialize bamdata\n");
+        goto end;
+    }
+    //open input file - r reading
+    if (!(infile = sam_open(inname, "r"))) {
+        printf("Could not open %s\n", inname);
+        goto end;
+    }
+    //open output files - w write as SAM, wb write as BAM
+    outfile1 = sam_open(file1, "w");  //as SAM
+    outfile2 = sam_open(file2, "wb");  //as BAM
+    if (!outfile1 || !outfile2) {
+        printf("Could not open output file\n");
+        goto end;
+    }
+
+    //create file specific threads
+    if (hts_set_opt(infile, HTS_OPT_NTHREADS, 2) < 0 ||  //2 thread specific for reading
+        hts_set_opt(outfile1, HTS_OPT_NTHREADS, 1) < 0 ||  //1 thread specific for sam write
+        hts_set_opt(outfile2, HTS_OPT_NTHREADS, 1) < 0) {  //1 thread specific for bam write
+        printf("Failed to set thread options\n");
+        goto end;
+    }
+
+    //read header, required to resolve the target names to proper ids
+    if (!(in_samhdr = sam_hdr_read(infile))) {
+        printf("Failed to read header from file!\n");
+        goto end;
+    }
+    //write header to both outputs
+    if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1)) {
+        printf("Failed to write header\n");
+        goto end;
+    }
+
+    //check flags and write; records with neither READ1 nor READ2 set are dropped
+    while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) {
+        if (bamdata->core.flag & BAM_FREAD1) {
+            if (sam_write1(outfile1, in_samhdr, bamdata) < 0) {
+                printf("Failed to write output data\n");
+                goto end;
+            }
+        }
+        else if (bamdata->core.flag & BAM_FREAD2) {
+            if (sam_write1(outfile2, in_samhdr, bamdata) < 0) {
+                printf("Failed to write output data\n");
+                goto end;
+            }
+        }
+    }
+    if (-1 == c) {
+        //EOF
+        ret = EXIT_SUCCESS;
+    }
+    else {
+        printf("Error in reading data\n");
+    }
+end:
+    //cleanup
+    if (in_samhdr) {
+        sam_hdr_destroy(in_samhdr);
+    }
+    if (infile) {
+        sam_close(infile);
+    }
+    if (bamdata) {
+        bam_destroy1(bamdata);
+    }
+    if (file1) {
+        free(file1);
+    }
+    if (file2) {
+        free(file2);
+    }
+    if (outfile1) {
+        sam_close(outfile1);
+    }
+    if (outfile2) {
+        sam_close(outfile2);
+    }
+    return ret;
+}
diff --git a/samples/split_thread2.c b/samples/split_thread2.c
new file mode 100644
index 000000000..dab897b5f
--- /dev/null
+++ b/samples/split_thread2.c
@@ -0,0 +1,171 @@
+/*
split_thread2.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library APIs; it needs proper error handling and optimization */ + +#include +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: split_t2 infile outdir\n\ +Splits the input file alignments to read1 and read2 and saves as 1.sam and 2.bam in given directory\n\ +Shows the usage of thread pool\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *outdir = NULL; + char *file1 = NULL, *file2 = NULL; + int c = 
0, ret = EXIT_FAILURE, size = 0; + samFile *infile = NULL, *outfile1 = NULL, *outfile2 = NULL; + sam_hdr_t *in_samhdr = NULL; + bam1_t *bamdata = NULL; + htsThreadPool tpool = {NULL, 0}; + + if (argc != 3) { + print_usage(stdout); + goto end; + } + inname = argv[1]; + outdir = argv[2]; + + //allocate space for output + size = sizeof(char) * (strlen(outdir) + sizeof("/1.sam") + 1); //space for output file name and null termination + file1 = malloc(size); + file2 = malloc(size); + if (!file1 || !file2) { + printf("Failed to set output path\n"); + goto end; + } + + //output file names + snprintf(file1, size, "%s/1.sam", outdir); //for SAM output + snprintf(file2, size, "%s/2.bam", outdir); //for BAM output + //bam data storage + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + //open input file - r reading + if (!(infile = sam_open(inname, "r"))) { + printf("Could not open %s\n", inname); + goto end; + } + //open output files - w write as SAM, wb write as BAM + outfile1 = sam_open(file1, "w"); //as SAM + outfile2 = sam_open(file2, "wb"); //as BAM + if (!outfile1 || !outfile2) { + printf("Could not open output file\n"); + goto end; + } + + //create a pool of 4 threads + if (!(tpool.pool = hts_tpool_init(4))) { + printf("Failed to initialize the thread pool\n"); + goto end; + } + //share the pool with all the 3 files + if (hts_set_opt(infile, HTS_OPT_THREAD_POOL, &tpool) < 0 || + hts_set_opt(outfile1, HTS_OPT_THREAD_POOL, &tpool) < 0 || + hts_set_opt(outfile2, HTS_OPT_THREAD_POOL, &tpool) < 0) { + printf("Failed to set thread options\n"); + goto end; + } + + //read header, required to resolve the target names to proper ids + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + //write header + if ((sam_hdr_write(outfile1, in_samhdr) == -1) || (sam_hdr_write(outfile2, in_samhdr) == -1)) { + printf("Failed to write header\n"); + goto end; + } + + //check flags and write + 
while ((c = sam_read1(infile, in_samhdr, bamdata)) >= 0) { + if (bamdata->core.flag & BAM_FREAD1) { + if (sam_write1(outfile1, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + else if (bamdata->core.flag & BAM_FREAD2) { + if (sam_write1(outfile2, in_samhdr, bamdata) < 0) { + printf("Failed to write output data\n"); + goto end; + } + } + } + if (-1 == c) { + //EOF + ret = EXIT_SUCCESS; + } + else { + printf("Error in reading data\n"); + } +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (bamdata) { + bam_destroy1(bamdata); + } + if (file1) { + free(file1); + } + if (file2) { + free(file2); + } + if (outfile1) { + sam_close(outfile1); + } + if (outfile2) { + sam_close(outfile2); + } + if (tpool.pool) { + hts_tpool_destroy(tpool.pool); + } + return ret; +} diff --git a/samples/update_header.c b/samples/update_header.c new file mode 100644 index 000000000..f6b1680cd --- /dev/null +++ b/samples/update_header.c @@ -0,0 +1,131 @@ +/* update_header.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library APIs; it needs proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - print the demo_usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: update_header infile header idval tag value\n\ +Updates the tag's value on line given in id on header of given type\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *inname = NULL, *tag = NULL, *idval = NULL, *val = NULL, *header = NULL; + char *id = NULL; + int ret = EXIT_FAILURE; + samFile *infile = NULL, *outfile = NULL; + sam_hdr_t *in_samhdr = NULL; + + //update_header infile header idval tag value + if (argc != 6) { + print_usage(stderr); + goto end; + } + inname = argv[1]; + header = argv[2]; + idval = argv[3]; + tag = argv[4]; + val = argv[5]; + + //unique identifier for each of the header types + if (header[0] == 'H' && header[1] == 'D') { + id = NULL; + printf("This sample doesnt not support modifying HD fields\n"); + } + else if (header[0] == 'S' && header[1] == 'Q') { + id = "SN"; + } + else if (header[0] == 'R' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'P' && header[1] == 'G') { + id = "ID"; + } + else if (header[0] == 'C' && header[1] == 'O') { + tag = NULL; + id = ""; + printf("This sample doesnt not support modifying CO fields\n"); + } + else { + printf("Invalid header type\n"); + goto end; + } + + if (!(infile = sam_open(inname, "r"))) { + 
printf("Could not open %s\n", inname); + goto end; + } + if (!(outfile = sam_open("-", "w"))) { //use stdout as the output file for ease of display of update + printf("Could not open stdout\n"); + goto end; + } + + //read header + if (!(in_samhdr = sam_hdr_read(infile))) { + printf("Failed to read header from file!\n"); + goto end; + } + + //update with new data + if (sam_hdr_update_line(in_samhdr, header, id, idval, tag, val, NULL) < 0) { + printf("Failed to update data\n"); + goto end; + } + //write output + if (sam_hdr_write(outfile, in_samhdr) < 0) { + printf("Failed to write output\n"); + goto end; + } + ret = EXIT_SUCCESS; + //bam data write to follow.... +end: + //cleanup + if (in_samhdr) { + sam_hdr_destroy(in_samhdr); + } + if (infile) { + sam_close(infile); + } + if (outfile) { + sam_close(outfile); + } + return ret; +} diff --git a/samples/write_fast.c b/samples/write_fast.c new file mode 100644 index 000000000..ef7817683 --- /dev/null +++ b/samples/write_fast.c @@ -0,0 +1,101 @@ +/* write_fast.c -- showcases the htslib api usage + + Copyright (C) 2023 Genome Research Ltd. + + Author: Vasudeva Sarma + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE + +*/ + +/* The purpose of this code is to demonstrate the library APIs; it needs proper error handling and optimization */ + +#include +#include +#include + +/// print_usage - show write_fast usage +/** @param fp pointer to the file / terminal to which demo_usage to be dumped +returns nothing +*/ +static void print_usage(FILE *fp) +{ + fprintf(fp, "Usage: write_fast \n\ +Appends a fasta/fastq file.\n"); + return; +} + +/// main_demo - start of the demo +/** @param argc - count of arguments + * @param argv - pointer to array of arguments +returns 1 on failure 0 on success +*/ +int main(int argc, char *argv[]) +{ + const char *outname = NULL; //output file name + int ret = EXIT_FAILURE; + samFile *outfile = NULL; //sam file + sam_hdr_t *out_samhdr = NULL; //header of file + bam1_t *bamdata = NULL; //to hold the read data + char mode[4] = "a"; + + if (argc != 2) { + print_usage(stdout); + goto end; + } + outname = argv[1]; + + //initialize + if (!(bamdata = bam_init1())) { + printf("Failed to initialize bamdata\n"); + goto end; + } + if (sam_open_mode(mode + 1, outname, NULL) < 0) { + printf("Invalid file name\n"); + goto end; + } + //open output file + if (!(outfile = sam_open(outname, mode))) { + printf("Could not open %s\n", outname); + goto end; + } + //dummy data + if (bam_set1(bamdata, sizeof("test"), "test", BAM_FUNMAP, -1, -1, 0, 0, NULL, -1, -1, 0, 10, "AACTGACTGA", "1234567890", 0) < 0) { + printf("Failed to set data\n"); + goto end; + } + if (sam_write1(outfile, out_samhdr, bamdata) < 0) { + printf("Failed to write data\n"); + goto end; + } + + ret = EXIT_SUCCESS; +end: + //clean up + if (out_samhdr) { + sam_hdr_destroy(out_samhdr); + } + if (outfile) { + sam_close(outfile); + } + if (bamdata) { + 
bam_destroy1(bamdata); + } + return ret; +} From 62909e2a9f1b9e3ad9a7ba56b073242ef8f8ab8e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 3 Jul 2023 15:34:48 +0100 Subject: [PATCH 461/488] NEWS updates for pending release Co-authored-by: Rob Davies --- NEWS | 164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 164 insertions(+) diff --git a/NEWS b/NEWS index d2c168ee7..10fdc1b5a 100644 --- a/NEWS +++ b/NEWS @@ -1,6 +1,170 @@ Noteworthy changes in release a.b ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +Updates +------- + +* Using CRAM 3.1 no longer gives a warning about the specification + being draft. Note CRAM 3.0 is still the default output format. + (PR#1583) + +* Replaced use of sprintf with snprintf, to silence potential warnings + from Apple's compilers and those who implement similar checks. + (PR#1594, fixes #1586. Reported by Oleksii Nikolaienko) + +* Fastq output will now generate empty records for reads with no + sequence data (i.e. sequence is "*" in SAM format). (PR#1576, + fixes samtools/samtools#1576. Reported by Nils Homer) + +* CRAM decoding speed-ups. (PR#1580) + +* A new MN aux tag can now be used to verify that MM/ML base modification + data has not been broken by hard clipping. (PR#1590, PR#1612. See also + PR samtools/hts-specs#714 and issue samtools/hts-specs#646. + Reported by Jared Simpson) + +* The base modification API has been improved to make it easier for callers + to tell unchecked bases from unmodified ones. (PR#1636, fixes #1550. + Requested by Chris Wright) + +* A new bam_mods_queryi() API has been added to return additional + data about the i-th base modification returned by bam_mods_recorded(). + (PR#1636, fixes #1550 and #1635. Requested by Jared Simpson) + +* Speed up index look-ups for whole-chromosome queries. (PR#1596) + +* Mpileup now merges adjacent (mis)match CIGAR operations, so CIGARs + using the X/= operators give the same results as if the M operator + was used. (PR#1607, fixes #1597. 
Reported by Marcel Martin) + +* It's now possible to call bcf_sr_set_regions() after adding readers + using bcf_sr_add_reader() (previously this returned an error). Doing so + will discard any unread data, and reset the readers so they iterate over + the new regions. (PR#1624, fixes samtools/bcftools#1918. Reported by + Gregg Thomas) + +* The synced BCF reader can now accept regions with reference names including + colons and hyphens, by enclosing them in curly braces. For example, + {chr_part:1-1001}:10-20 will return bases 10 to 20 from reference + "chr_part:1-1001". (PR#1630, fixes #1620. Reported by Bren) + +* Add a "samples" directory with code demonstrating usage of HTSlib plus + a tutorial document. (PR#1589) + +Build changes +------------- + +* Htscodecs has been updated to 1.5.1 (PR#1654) + +* Htscodecs SIMD code now works with Apple multiarch binaries. + (PR#1587, HTSlib fix for samtools/htscodecs#76. Reported by John Marshall) + +* Improve portability of "expr" usage in version.sh. + (PR#1593, fixes #1592. Reported by John Marshall) + +* Improve portability to *BSD targets by ensuring _XOPEN_SOURCE is defined + correctly and that source files properly include "config.h". Perl + scripts also now all use #!/usr/bin/env instead of assuming that + it's in /usr/bin/perl. (PR#1628, fixes #1606. + Reported by Robert Clausecker) + +* Fixed NAME entry in htslib-s3-plugin man page so the whatis and apropos + commands find it. (PR#1634, thanks to Étienne Mollier) + +* Assorted dependency tracking fixes. (PR#1653, thanks to John Marshall) + +Documentation updates +--------------------- + +* Changed Alpine build instructions as they've switched back to using openssl. + (PR#1609) + +* Recommend using -rdynamic when statically linking a libhts.a with + plugins enabled. (PR#1611, thanks to John Marshall. Fixes #1600, + reported by Jack Wimberley) + +* Fixed example in docs for sam_hdr_add_line(). 
(PR#1618, thanks to kojix2) + +* Improved test harness for base modifications API. (PR#1648) + +Bug fixes +--------- + +* Fix a major bug when searching against a CRAM index where one container + has start and end coordinates entirely contained within the previous + container. This would occasionally miss data, and sometimes return much + more than required. The bug affected versions 1.11 to 1.17, although the + change in 1.11 was bug-fixing multi-threaded index queries. This bug did + not affect index building. There is no need to reindex your CRAM files. + (PR#1574, PR#1640. Fixes #1569, #1639, samtools/samtools#1808, + samtools/samtools#1819. Reported by xuxif, Jens Reeder and Jared Simpson) + +* Prevent CRAM blocks from becoming too big in files with short + sequences but very long aux tags. (PR #1613) + +* Fix bug where the CRAM decoder for CONST_INT and CONST_BYTE + codecs may incorrectly look for extra data in the CORE block. + Note that this bug only affected the experimental CRAM v4.0 decoder. + (PR#1614) + +* Fix crypt4gh redirection so it works in conjunction with non-file + IO, such as using htsget. (PR#1577) + +* Improve error checking for the VCF POS column, when facing invalid + data. (PR#1575, replaces #1570 originally reported and fixed + by Colin Nolan.) + +* Improved error checking on VCF indexing to validate the data is BGZF + compressed. (PR#1581) + +* Fix bug where bin number calculation could overflow when making iterators + over regions that go to the end of a chromosome. (PR#1595) + +* Backport attractivechaos/klib#78 (by Pall Melsted) to HTSlib. + Prevents infinite loops in kseq_read() when reading broken gzip files. + (PR#1582, fixes #1579. Reported by Goran Vinterhalter) + +* Backport attractivechaos/klib@384277a (by innoink) to HTSlib. + Fixes the kh_int_hash_func2() macro definition. + (PR#1599, fixes #1598. Reported by fanxinping) + +* Remove a compilation warning on systems with newer libcurl releases. 
+ (PR#1572) + +* Windows: Fixed BGZF EOF check for recent MinGW releases. (PR#1601, + fixes samtools/bcftools#1901) + +* Fixed bug where tabix would not return the correct regions for files + where the column ordering is end, ..., begin instead of begin, ..., end. + (PR#1626, fixes #1622. Reported by Hiruna Samarakoon) + +* sam_format_aux1() now always NUL-terminates Z/H tags. (PR#1631) + +* Ensure base modification iterator is reset when no MM tag is present. + (PR#1631, PR#1647) + +* Fix segfault when attempting to write an uncompressed BAM file opened using + hts_open(name, "wbu"). This was attempting to write BAM data without + wrapping it in BGZF blocks, which is invalid according to the BAM + specification. "wbu" is now internally converted to "wb0" to output + uncompressed data wrapped in BGZF blocks. (PR#1632, fixes #1617. + Reported by Joyjit Daw) + +* Fixed over-strict bounds check in probaln_glocal() which caused it to make + sub-optimal alignments when the requested band width was greater than the + query length. (PR#1616, fixes #1605. Reported by Jared Simpson) + +* Fixed possible double frees when handling errors in bcf_hdr_add_hrec(), + if particular memory allocations fail. (PR#1637) + +* Ensure that bcf_hdr_remove() clears up all pointers to the items removed + from dictionaries. Failing to do this could have resulted in a call + requesting a deleted item via bcf_hdr_get_hrec() returning a stale pointer. + (PR#1637) + +* Stop the gzip decompresser from finishing prematurely when an empty + gzip block is followed by more data. 
(PR#1643, PR#1646) + Noteworthy changes in release 1.17 (21st February 2023) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ From 2425ce962eed21f216d87470cef26c59805fc5c4 Mon Sep 17 00:00:00 2001 From: vasudeva8 Date: Fri, 3 Mar 2023 11:55:32 +0000 Subject: [PATCH 462/488] formatting update --- samples/DEMO.md | 5 +++++ samples/README.md | 2 -- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/samples/DEMO.md b/samples/DEMO.md index 1f14b7485..911792899 100644 --- a/samples/DEMO.md +++ b/samples/DEMO.md @@ -311,6 +311,7 @@ reference in it or with no reference data at all. It can also be created using an autogenerated reference, based on consensus with-in the alignment data. The reference detail can be set to an htsFormat structure using hts_parse_format api and used with sam_open_format api to create appropriate CRAM file. + ... snprintf(reffmt1, size1, "cram,reference=%s", reffile); snprintf(reffmt2, size2, "cram,embed_ref=1,reference=%s", reffile); @@ -410,6 +411,7 @@ Shows the 2nd SQ line's LN field value. ./read_header /tmp/sample.sam.gz SQ SN T2 LN Below code excerpt shows the reference names which has length above given value. + ... linecnt = sam_hdr_count_lines(in_samhdr, "SQ"); //get reference count ... @@ -784,6 +786,7 @@ At the end of write, sam_idx_save api need to be invoked to save the index. if (sam_write1(outfile, in_samhdr, bamdata) < 0) { ... if (sam_idx_save(outfile)) { + ... Refer:index_write.c Creates mpileup.1.bam and mpileup.1.bam.bai in /tmp/. @@ -1314,6 +1317,7 @@ HTS_OPT_THREAD_POOL and the thread pool address are to be passed as arguments to api. The thread pool has to be released with hts_tpool_destroy. Below excerpt shows file specific thread pool, + ... //create file specific threads if (hts_set_opt(infile, HTS_OPT_NTHREADS, 2) < 0 || //2 thread specific for reading @@ -1325,6 +1329,7 @@ Below excerpt shows file specific thread pool, Refer: split_thread1.c Below excerpt shows thread pool shared across files, + ... 
//create a pool of 4 threads if (!(tpool.pool = hts_tpool_init(4))) { diff --git a/samples/README.md b/samples/README.md index 0438e1fc4..ab5481dea 100644 --- a/samples/README.md +++ b/samples/README.md @@ -1,5 +1,3 @@ -[![Github All Releases](https://img.shields.io/github/downloads/samtools/htslib/total.svg)](https://github.com/samtools/htslib/samples) - HTSlib is an implementation of a unified C library for accessing common file formats, such as [SAM, CRAM and VCF][1], used for high-throughput sequencing data, and is the core library used by [samtools][2] and [bcftools][3]. From aa570fbffd267c3795492be67348975affbc780e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 17 Jul 2023 16:41:36 +0100 Subject: [PATCH 463/488] Makes bam_parse_cigar able to modify existing BAM records rather than partially parsed ones. It makes little sense for this to exist as a public API when it's only capable of handling the internal during-SAM-parse situation, and the changes are relatively minor. Also fixes an undocumented assumption that end == &in. Fixes #1650 --- htslib/sam.h | 6 ++++++ sam.c | 33 ++++++++++++++++++++++++--------- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index cffa04701..c95c749df 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1133,6 +1133,12 @@ ssize_t sam_parse_cigar(const char *in, char **end, uint32_t **a_cigar, size_t * can be NULL @param b [in/out] address of the destination bam1_t struct @return number of processed CIGAR operators; -1 on error + + @discussion The BAM record may be partial and empty of existing cigar, seq + and quality, as is the case during SAM parsing, or it may be an existing + BAM record in which case this function replaces the existing CIGAR field + and shuffles data accordingly. A CIGAR of "*" will remove the CIGAR, + returning zero. 
*/ HTSLIB_EXPORT ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b); diff --git a/sam.c b/sam.c index fc4e677df..0e466df5e 100644 --- a/sam.c +++ b/sam.c @@ -2750,7 +2750,6 @@ int sam_parse1(kstring_t *s, sam_hdr_t *h, bam1_t *b) int n_cigar = bam_parse_cigar(p, &p, b); if (n_cigar < 1 || *p++ != '\t') goto err_ret; cigar = (uint32_t *)(b->data + old_l_data); - c->n_cigar = n_cigar; // can't use bam_endpos() directly as some fields not yet set up cigreflen = (!(c->flag&BAM_FUNMAP))? bam_cigar2rlen(c->n_cigar, cigar) : 1; @@ -2926,20 +2925,36 @@ ssize_t bam_parse_cigar(const char *in, char **end, bam1_t *b) { } if (end) *end = (char *)in; - if (*in == '*') { - if (end) (*end)++; + n_cigar = (*in == '*') ? 0 : read_ncigar(in); + if (!n_cigar && b->core.n_cigar == 0) { + if (end) *end = (char *)in+1; return 0; } - n_cigar = read_ncigar(in); - if (!n_cigar) return 0; - if (possibly_expand_bam_data(b, n_cigar * sizeof(uint32_t)) < 0) { + + ssize_t cig_diff = n_cigar - b->core.n_cigar; + if (cig_diff > 0 && + possibly_expand_bam_data(b, cig_diff * sizeof(uint32_t)) < 0) { hts_log_error("Memory allocation error"); return -1; } - if (!(diff = parse_cigar(in, (uint32_t *)(b->data + b->l_data), n_cigar))) return -1; - b->l_data += (n_cigar * sizeof(uint32_t)); - if (end) *end = (char *)in+diff; + uint32_t *cig = bam_get_cigar(b); + if ((uint8_t *)cig != b->data + b->l_data) { + // Modifying an existing BAM record + uint8_t *seq = bam_get_seq(b); + memmove(cig + n_cigar, seq, (b->data + b->l_data) - seq); + } + + if (n_cigar) { + if (!(diff = parse_cigar(in, cig, n_cigar))) + return -1; + } else { + diff = 1; // handle "*" + } + + b->l_data += cig_diff * sizeof(uint32_t); + b->core.n_cigar = n_cigar; + if (end) *end = (char *)in + diff; return n_cigar; } From f1b89dffb48a242dbac4787a500ed380daab33df Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 17 Jul 2023 16:44:33 +0100 Subject: [PATCH 464/488] Fix cut and paste errors in bam_aux2f documentation 
--- htslib/sam.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/htslib/sam.h b/htslib/sam.h index c95c749df..512db5420 100644 --- a/htslib/sam.h +++ b/htslib/sam.h @@ -1713,11 +1713,11 @@ static inline int bam_aux_get_str(const bam1_t *b, HTSLIB_EXPORT int64_t bam_aux2i(const uint8_t *s); -/// Get an integer aux value +/// Get a float aux value /** @param s Pointer to the tag data, as returned by bam_aux_get() - @return The value, or 0 if the tag was not an integer type + @return The value, or 0 if the tag was not a float type If the tag is not an numeric type, errno is set to EINVAL. The value of - integer flags will be returned cast to a double. + the float will be returned cast to a double. */ HTSLIB_EXPORT double bam_aux2f(const uint8_t *s); From 5098983e426ac74bf0d1edaf5e8902bab72e5947 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 21 Jul 2023 20:06:02 +1200 Subject: [PATCH 465/488] Add "uncompressed" in hts_format_description() where appropriate For formats like BAM and BCF that are normally compressed, report it explicitly when encountering a raw uncompressed such file. See samtools/samtools#1884 for motivation. 
--- hts.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/hts.c b/hts.c index b7b528a61..d90fea61f 100644 --- a/hts.c +++ b/hts.c @@ -794,6 +794,7 @@ char *hts_format_description(const htsFormat *format) case zstd_compression: kputs(" Zstandard-compressed", &str); break; case custom: kputs(" compressed", &str); break; case gzip: kputs(" gzip-compressed", &str); break; + case bgzf: switch (format->format) { case bam: @@ -808,6 +809,22 @@ char *hts_format_description(const htsFormat *format) break; } break; + + case no_compression: + switch (format->format) { + case bam: + case bcf: + case cram: + case csi: + case tbi: + // These are normally compressed, so emphasise that this one isn't + kputs(" uncompressed", &str); + break; + default: + break; + } + break; + default: break; } From 7c1d3ccd1ae10dd81cbcfc84d1dd984534cf2caa Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 4 Jul 2023 15:49:44 +0100 Subject: [PATCH 466/488] The first stage of vcf_parse_format speed improvements. This simply turns the monolithic vcf_parse_format function into a series of numbered sub-functions whose primary purpose is to localise the variables to that code block and to make it easier to see the structure of the tasks being performed. There is no code optimisation here and the main algorithm is unchanged, so this is just moving of code from 1 function to multiple functions. However it makes the next commit easier to understand as we're not trying to see a delta mixed in with restructuring. An unexpected consequence however of making the variables local to their blocks is that it also speeds up the code. The first step in separating this code into functions was simply adding curly braces around each code segment and moving the function-global variables into their respective blocks. 
The before/after benchmarks on 100,000 lines of a multi-sample G1K VCF are ("perf stat" cycle counts): ORIG LOCALISED gcc7 29335942870 27353443704 gcc13 31974757094 31452452908 clang13 31290989382 29508665020 (Benchmarked again after moving to actual functions, but the difference was tiny in comparison.) --- vcf.c | 146 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 114 insertions(+), 32 deletions(-) diff --git a/vcf.c b/vcf.c index 9e589f993..4dbac2580 100644 --- a/vcf.c +++ b/vcf.c @@ -2633,22 +2633,11 @@ static inline int align_mem(kstring_t *s) return e == 0 ? 0 : -1; } -// p,q is the start and the end of the FORMAT field #define MAX_N_FMT 255 /* Limited by size of bcf1_t n_fmt field */ -static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) -{ - if ( !bcf_hdr_nsamples(h) ) return 0; - - static int extreme_val_warned = 0; - char *r, *t; - int j, l, m, g, overflow = 0; - khint_t k; - ks_tokaux_t aux1; - vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; - kstring_t *mem = (kstring_t*)&h->mem; - fmt_aux_t fmt[MAX_N_FMT]; - mem->l = 0; +// detect FORMAT "." +static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q) { char *end = s->s + s->l; if ( q>=end ) { @@ -2661,10 +2650,19 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p if ( p[0]=='.' && p[1]==0 ) // FORMAT field is empty "." 
{ v->n_sample = bcf_hdr_nsamples(h); - return 0; + return 1; } - // get format information from the dictionary + return 0; +} + +// get format information from the dictionary +static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q, fmt_aux_t *fmt) { + const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; + char *t; + int j; + ks_tokaux_t aux1; for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { if (j >= MAX_N_FMT) { v->errcode |= BCF_ERR_LIMITS; @@ -2674,7 +2672,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } *(char*)aux1.p = 0; - k = kh_get(vdict, d, t); + khint_t k = kh_get(vdict, d, t); if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_FMT] == 15) { if ( t[0]=='.' && t[1]==0 ) { @@ -2706,10 +2704,17 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT]; v->n_fmt++; } - // compute max + return 0; +} + +// compute max +static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q, fmt_aux_t *fmt) { int n_sample_ori = -1; - r = q + 1; // r: position in the format string - l = 0, m = g = 1, v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles + char *r = q + 1; // r: position in the format string + int l = 0, m = 1, g = 1, j; + v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles + char *end = s->s + s->l; while ( rmem; + + int j; for (j = 0; j < v->n_fmt; ++j) { fmt_aux_t *f = &fmt[j]; if ( !f->max_m ) f->max_m = 1; // omitted trailing format field @@ -2804,11 +2817,25 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } mem->l += v->n_sample * f->size; } - for (j = 0; j < v->n_fmt; ++j) - fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset; - // fill the sample fields; at beginning of the loop, t points to the first char of a format - n_sample_ori = -1; - t = q + 
1; m = 0; // m: sample id + + { + int j; + for (j = 0; j < v->n_fmt; ++j) + fmt[j].buf = (uint8_t*)mem->s + fmt[j].offset; + } + + return 0; +} + +// fill the sample fields; at beginning of the loop +static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q, fmt_aux_t *fmt) { + static int extreme_val_warned = 0; + int n_sample_ori = -1; + // t points to the first char of a format + char *t = q + 1; + int m = 0; // m: sample id + char *end = s->s + s->l; while ( ty>>4&0xf) == BCF_HT_STR) { + int l; if (z->is_gt) { // genotypes int32_t is_phased = 0; uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m); uint32_t unreadable = 0; uint32_t max = 0; - overflow = 0; + int overflow = 0; for (l = 0;; ++t) { if (*t == '.') { ++t, x[l++] = is_phased; @@ -2867,16 +2895,17 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; } else { char *x = (char*)z->buf + z->size * (size_t)m; - for (r = t, l = 0; *t != ':' && *t; ++t) x[l++] = *t; + for (l = 0; *t != ':' && *t; ++t) x[l++] = *t; for (; l < z->size; ++l) x[l] = 0; } } else if ((z->y>>4&0xf) == BCF_HT_INT) { int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); + int l; for (l = 0;; ++t) { if (*t == '.') { x[l++] = bcf_int32_missing, ++t; // ++t to skip "." } else { - overflow = 0; + int overflow = 0; char *te; long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow); if ( te==t || overflow || tmp_valBCF_MAX_BT_INT32 ) @@ -2897,11 +2926,12 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; } else if ((z->y>>4&0xf) == BCF_HT_REAL) { float *x = (float*)(z->buf + z->size * (size_t)m); + int l; for (l = 0;; ++t) { if (*t == '.' && !isdigit_c(t[1])) { bcf_float_set_missing(x[l++]), ++t; // ++t to skip "." 
} else { - overflow = 0; + int overflow = 0; char *te; float tmp_val = hts_str2dbl(t, &te, &overflow); if ( (te==t || overflow) && !extreme_val_warned ) @@ -2940,6 +2970,7 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p for (; j < v->n_fmt; ++j) { // fill end-of-vector values fmt_aux_t *z = &fmt[j]; + int l; if ((z->y>>4&0xf) == BCF_HT_STR) { if (z->is_gt) { int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); @@ -2964,7 +2995,12 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p m++; t++; } - // write individual genotype information + return 0; +} + +// write individual genotype information +static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q, fmt_aux_t *fmt) { kstring_t *str = &v->indiv; int i; if (v->n_sample > 0) { @@ -2988,6 +3024,11 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p } } + return 0; +} + +// validity checking +static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) { if ( v->n_sample!=bcf_hdr_nsamples(h) ) { hts_log_error("Number of columns at %s:%"PRIhts_pos" does not match the number of samples (%d vs %d)", @@ -3008,6 +3049,47 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p return 0; } +// p,q is the start and the end of the FORMAT field +static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) +{ + if ( !bcf_hdr_nsamples(h) ) return 0; + kstring_t *mem = (kstring_t*)&h->mem; + mem->l = 0; + + fmt_aux_t fmt[MAX_N_FMT]; + + // detect FORMAT "." + int ret; // +ve = ok, -ve = err + if ((ret = vcf_parse_format_empty1(s, h, v, p, q))) + return ret ? 
0 : -1; + + // get format information from the dictionary + if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0) + return -1; + + // compute max + if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0) + return -1; + + // allocate memory for arrays + if (vcf_parse_format_alloc4(s, h, v, p, q, fmt) < 0) + return -1; + + // fill the sample fields; at beginning of the loop + if (vcf_parse_format_fill5(s, h, v, p, q, fmt) < 0) + return -1; + + // write individual genotype information + if (vcf_parse_format_gt6(s, h, v, p, q, fmt) < 0) + return -1; + + // validity checking + if (vcf_parse_format_check7(h, v) < 0) + return -1; + + return 0; +} + static khint_t fix_chromosome(const bcf_hdr_t *h, vdict_t *d, const char *p) { // Simple error recovery for chromosomes not defined in the header. It will not help when VCF header has // been already printed, but will enable tools like vcfcheck to proceed. From 231897568c45c5821c6515ca88c98d1885060f97 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 4 Jul 2023 17:06:42 +0100 Subject: [PATCH 467/488] Further VCF reading speeds optimisations. This is the main meat of the VCF read speedup, following on from the previous code refactoring. Combined timings on testing GNOMAD very INFO heavy single-sample file, a many-sample (approx 4000) FORMAT rich file for different compilers, and the GIAB HG002 VCF truth set are: INFO heavy (15-29% speedup) (34-39% speedup) dev(s) PR(s) dev(s) PR(s) clang13 6.29 5.34 2.84 1.85 gcc13 6.74 5.22 2.93 1.93 gcc7 7.96 5.65 3.25 1.98 FORMAT heavy (6-19% speedup) (18-22% speedup) dev PR dev PR clang13 9.17 8.58 5.45 4.48 gcc13 9.88 8.04 5.08 3.95 gcc7 9.12 8.33 4.87 3.98 GIAB HG002 (28-29% speedup) (33-37% speedup) dev PR dev PR clang13 12.88 9.30 5.12 3.29 gcc13 12.04 8.60 4.74 3.19 gcc7 12.87 9.37 5.32 3.34 (Tested on Intel Xeon Gold 6142 and an AMD Zen4 respectively) Bigger speedups (see first message in PR) were seen on some older hardware.
Specific optimisations along with estimates of their benefit include, in approximate order of writing / testing: - Adding consts and caching of bcf_hdr_nsamples(h). No difference on system gcc (gcc7) and clang13, but a couple percent gain on gcc13. - Remove the need for most calls to hts_str2uint by recognising that most GT numbers are single digits. This was 4-5% saving for gcc and 9-10% on clang. - Remove kputc calls in bcf_enc_vint / bcf_enc_size, avoiding repeated ks_resize checking. This is a further ~10% speedup. - Unrolling in bcf_enc_vint to encourage SIMD. - Improve speed of bgzf_getline and kstrtok via memchr/strchr. In tabix timings indexing VCF, bgzf_getline change is 9-22% quicker with clang 13 and 19-25% quicker with gcc 7. I did investigate a manually unrolled 64-bit search, before I remembered the existence of memchr (doh!). This is often faster on clang (17-19%), but marginally slower on gcc. The actual speed up to this function however is considerably more (3-4x quicker). For interest, I include the equivalent code here, as it may be useful in other contexts: #if HTS_ALLOW_UNALIGNED != 0 && ULONG_MAX == 0xffffffffffffffff // 64-bit unrolled delim detection #define haszero(x) (((x)-0x0101010101010101UL)&~(x)&0x8080808080808080UL) // Quicker broadcast on clang than bit shuffling in delim union { uint64_t d8; uint8_t d[8]; } u; memset(u.d, delim, 8); const uint64_t d8 = u.d8; uint64_t *b8 = (uint64_t *)(&buf[fp->block_offset]); const int l8 = (fp->block_length-fp->block_offset)/8; for (l = 0; l < (l8 & ~3); l+=4) { if (haszero(b8[l+0] ^ d8)) break; if (haszero(b8[l+1] ^ d8)) { l++; break; } if (haszero(b8[l+2] ^ d8)) { l+=2; break; } if (haszero(b8[l+3] ^ d8)) { l+=3; break; } } l *= 8; for (l += fp->block_offset; l < fp->block_length && buf[l] != delim; l++); The analogous kstrtok change is using strchr+strlen instead of memchr as we don't know the string end. This makes kstrtok around 150% quicker when parsing a single sample VCF.
When not finding aux->sep in the string, strchr returns NULL rather than end of string, so we need an additional strlen to set aux->p. However there is also glibc's strchrnul which solves this in a single call. This makes kstrtok another 40% quicker on this test, but overall it's not a big bottleneck any more. - Use strchr in vcf_parse_info. This is a major speed increase over manual searching on Linux. TODO: is this just glibc? Eg libmusl speeds, etc? Other OSes? It saves about 33% of time in vcf_parse (vcf_parse_info inlined to it) with gcc. Even more with clang. The total speed gain on a single sample VCF view (GIAB truth set) is 12-19% fewer cycles: - Minor "GT" check improvement. This has no real effect on gcc13 and clang13, but the system gcc (gcc7) speeds up single sample VCF decoding by 7%. - Speed up the unknown value check (strcmp(p, ".")). Helps gcc7 the most (9%), with gcc13/clang13 in the 3-4% gains. - Speed up vcf_parse_format_max3. This is the first parse through the FORMAT fields. Ideally we'd merge this and fill5 (the other parse through), but that is harder due to the data pivot / rotate. For now we just optimise the existing code path. Instead of a laborious switch character by character, we have an initial tight loop to find the first meta-character and then a switch to do char dependent code. This is 5% to 13% speed up depending on data set. - Remove kputc and minimise resize for bcf_enc_int1. 3-8% speedup depending on data / compiler. - Use memcmp instead of strcmp for "END" and ensure we have room. Also memset over explicit nulling of arrays. - Force BCF header dicts to be larger than needed. This is a tactic to reduce hash collisions due to the use of overly simple hash functions. It seems to typically be around 3-8% speed gain. - Restructure of main vcf_parse function. This can speed things up by 6-7% on basic single-sample files.
The previous loop caused lots of branch prediction misses due to the counter 'i' being used to do 8 different parts of code depending on token number. Additionally it's got better error checking now as previously running out of tokens early just did a return 0 rather than complaining about missing columns. --- bgzf.c | 8 +- htslib/vcf.h | 85 +++++--- kstring.c | 13 +- vcf.c | 559 +++++++++++++++++++++++++++++++++++++-------------- 4 files changed, 476 insertions(+), 189 deletions(-) diff --git a/bgzf.c b/bgzf.c index 45f2b1150..5ef433c20 100644 --- a/bgzf.c +++ b/bgzf.c @@ -2280,7 +2280,13 @@ int bgzf_getline(BGZF *fp, int delim, kstring_t *str) if (fp->block_length == 0) { state = -1; break; } } unsigned char *buf = fp->uncompressed_block; - for (l = fp->block_offset; l < fp->block_length && buf[l] != delim; ++l); + + // Equivalent to a naive byte by byte search from + // buf + block_offset to buf + block_length. + void *e = memchr(&buf[fp->block_offset], delim, + fp->block_length - fp->block_offset); + l = e ? 
(unsigned char *)e - buf : fp->block_length; + if (l < fp->block_length) state = 1; l -= fp->block_offset; if (ks_expand(str, l + 2) < 0) { state = -3; break; } diff --git a/htslib/vcf.h b/htslib/vcf.h index 83659ae12..70cf95372 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -1522,26 +1522,37 @@ static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) static inline int bcf_enc_size(kstring_t *s, int size, int type) { - uint32_t e = 0; - uint8_t x[4]; - if (size >= 15) { - e |= kputc(15<<4|type, s) < 0; - if (size >= 128) { - if (size >= 32768) { - i32_to_le(size, x); - e |= kputc(1<<4|BCF_BT_INT32, s) < 0; - e |= kputsn((char*)&x, 4, s) < 0; - } else { - i16_to_le(size, x); - e |= kputc(1<<4|BCF_BT_INT16, s) < 0; - e |= kputsn((char*)&x, 2, s) < 0; - } + // Most common case is first + if (size < 15) { + if (ks_resize(s, s->l + 1) < 0) + return -1; + uint8_t *p = (uint8_t *)s->s + s->l; + *p++ = (size<<4) | type; + s->l++; + return 0; + } + + if (ks_resize(s, s->l + 6) < 0) + return -1; + uint8_t *p = (uint8_t *)s->s + s->l; + *p++ = 15<<4|type; + + if (size < 128) { + *p++ = 1<<4|BCF_BT_INT8; + *p++ = size; + s->l += 3; + } else { + if (size < 32768) { + *p++ = 1<<4|BCF_BT_INT16; + i16_to_le(size, p); + s->l += 4; } else { - e |= kputc(1<<4|BCF_BT_INT8, s) < 0; - e |= kputc(size, s) < 0; + *p++ = 1<<4|BCF_BT_INT32; + i32_to_le(size, p); + s->l += 6; } - } else e |= kputc(size<<4|type, s) < 0; - return e == 0 ? 0 : -1; + } + return 0; } static inline int bcf_enc_inttype(long x) @@ -1553,27 +1564,35 @@ static inline int bcf_enc_inttype(long x) static inline int bcf_enc_int1(kstring_t *s, int32_t x) { - uint32_t e = 0; - uint8_t z[4]; + if (ks_resize(s, s->l + 5) < 0) + return -1; + uint8_t *p = (uint8_t *)s->s + s->l; + if (x == bcf_int32_vector_end) { - e |= bcf_enc_size(s, 1, BCF_BT_INT8); - e |= kputc(bcf_int8_vector_end, s) < 0; + // An inline implementation of bcf_enc_size with size==1 and + // memory allocation already accounted for. 
+ *p = (1<<4) | BCF_BT_INT8; + p[1] = bcf_int8_vector_end; + s->l+=2; } else if (x == bcf_int32_missing) { - e |= bcf_enc_size(s, 1, BCF_BT_INT8); - e |= kputc(bcf_int8_missing, s) < 0; + *p = (1<<4) | BCF_BT_INT8; + p[1] = bcf_int8_missing; + s->l+=2; } else if (x <= BCF_MAX_BT_INT8 && x >= BCF_MIN_BT_INT8) { - e |= bcf_enc_size(s, 1, BCF_BT_INT8); - e |= kputc(x, s) < 0; + *p = (1<<4) | BCF_BT_INT8; + p[1] = x; + s->l+=2; } else if (x <= BCF_MAX_BT_INT16 && x >= BCF_MIN_BT_INT16) { - i16_to_le(x, z); - e |= bcf_enc_size(s, 1, BCF_BT_INT16); - e |= kputsn((char*)&z, 2, s) < 0; + *p = (1<<4) | BCF_BT_INT16; + i16_to_le(x, p+1); + s->l+=3; } else { - i32_to_le(x, z); - e |= bcf_enc_size(s, 1, BCF_BT_INT32); - e |= kputsn((char*)&z, 4, s) < 0; + *p = (1<<4) | BCF_BT_INT32; + i32_to_le(x, p+1); + s->l+=5; } - return e == 0 ? 0 : -1; + + return 0; } /// Return the value of a single typed integer. diff --git a/kstring.c b/kstring.c index 71facf975..f8e0f9f3d 100644 --- a/kstring.c +++ b/kstring.c @@ -204,8 +204,17 @@ char *kstrtok(const char *str, const char *sep_in, ks_tokaux_t *aux) for (p = start; *p; ++p) if (aux->tab[*p>>6]>>(*p&0x3f)&1) break; } else { - for (p = start; *p; ++p) - if (*p == aux->sep) break; + // Using strchr is fast for next token, but slower for + // last token due to extra pass from strlen. Overall + // on a VCF parse this func was 146% faster with // strchr. + // Equiv to: + // for (p = start; *p; ++p) if (*p == aux->sep) break; + + // NB: We could use strchrnul() here from glibc if detected, + // which is ~40% faster again, but it's not so portable. + // i.e. p = (uint8_t *)strchrnul((char *)start, aux->sep); + uint8_t *p2 = (uint8_t *)strchr((char *)start, aux->sep); + p = p2 ? p2 : start + strlen((char *)start); } aux->p = (const char *) p; // end of token if (*p == 0) aux->finished = 1; // no more tokens diff --git a/vcf.c b/vcf.c index 4dbac2580..ce54dc227 100644 --- a/vcf.c +++ b/vcf.c @@ -46,9 +46,29 @@ DEALINGS IN THE SOFTWARE. 
*/ #include "htslib/khash_str2int.h" #include "htslib/kstring.h" #include "htslib/sam.h" - #include "htslib/khash.h" + +#if 0 +// This helps on Intel a bit, often 6-7% faster VCF parsing. +// Conversely sometimes harms AMD Zen4 as ~9% slower. +// Possibly related to IPC differences. However for now it's just a +// curiousity we ignore and stick with the simpler code. +// +// Left here as a hint for future explorers. +static inline int xstreq(const char *a, const char *b) { + while (*a && *a == *b) + a++, b++; + return *a == *b; +} + +#define KHASH_MAP_INIT_XSTR(name, khval_t) \ + KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, xstreq) + +KHASH_MAP_INIT_XSTR(vdict, bcf_idinfo_t) +#else KHASH_MAP_INIT_STR(vdict, bcf_idinfo_t) +#endif + typedef khash_t(vdict) vdict_t; KHASH_MAP_INIT_STR(hdict, bcf_hrec_t*) @@ -1370,8 +1390,12 @@ bcf_hdr_t *bcf_hdr_init(const char *mode) bcf_hdr_t *h; h = (bcf_hdr_t*)calloc(1, sizeof(bcf_hdr_t)); if (!h) return NULL; - for (i = 0; i < 3; ++i) + for (i = 0; i < 3; ++i) { if ((h->dict[i] = kh_init(vdict)) == NULL) goto fail; + // Supersize the hash to make collisions very unlikely + static int dsize[3] = {16384,16384,2048}; // info, contig, format + if (kh_resize(vdict, h->dict[i], dsize[i]) < 0) goto fail; + } bcf_hdr_aux_t *aux = (bcf_hdr_aux_t*)calloc(1,sizeof(bcf_hdr_aux_t)); if ( !aux ) goto fail; @@ -2463,25 +2487,64 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) { int32_t max = INT32_MIN, min = INT32_MAX; int i; - if (n <= 0) bcf_enc_size(s, 0, BCF_BT_NULL); - else if (n == 1) bcf_enc_int1(s, a[0]); - else { + if (n <= 0) { + return bcf_enc_size(s, 0, BCF_BT_NULL); + } else if (n == 1) { + return bcf_enc_int1(s, a[0]); + } else { if (wsize <= 0) wsize = n; - for (i = 0; i < n; ++i) { - if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) continue; + + // Equivalent to: + // for (i = 0; i < n; ++i) { + // if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) + // continue; + // if 
(max < a[i]) max = a[i]; + // if (min > a[i]) min = a[i]; + // } + int max4[4] = {INT32_MIN, INT32_MIN, INT32_MIN, INT32_MIN}; + int min4[4] = {INT32_MAX, INT32_MAX, INT32_MAX, INT32_MAX}; + for (i = 0; i < (n&~3); i+=4) { + // bcf_int32_missing == INT32_MIN and + // bcf_int32_vector_end == INT32_MIN+1. + // We skip these, but can mostly avoid explicit checking + if (max4[0] < a[i+0]) max4[0] = a[i+0]; + if (max4[1] < a[i+1]) max4[1] = a[i+1]; + if (max4[2] < a[i+2]) max4[2] = a[i+2]; + if (max4[3] < a[i+3]) max4[3] = a[i+3]; + if (min4[0] > a[i+0] && a[i+0] > INT32_MIN+1) min4[0] = a[i+0]; + if (min4[1] > a[i+1] && a[i+1] > INT32_MIN+1) min4[1] = a[i+1]; + if (min4[2] > a[i+2] && a[i+2] > INT32_MIN+1) min4[2] = a[i+2]; + if (min4[3] > a[i+3] && a[i+3] > INT32_MIN+1) min4[3] = a[i+3]; + } + min = min4[0]; + if (min > min4[1]) min = min4[1]; + if (min > min4[2]) min = min4[2]; + if (min > min4[3]) min = min4[3]; + max = max4[0]; + if (max < max4[1]) max = max4[1]; + if (max < max4[2]) max = max4[2]; + if (max < max4[3]) max = max4[3]; + for (; i < n; ++i) { if (max < a[i]) max = a[i]; - if (min > a[i]) min = a[i]; + if (min > a[i] && a[i] > INT32_MIN+1) min = a[i]; } + if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) { - bcf_enc_size(s, wsize, BCF_BT_INT8); - for (i = 0; i < n; ++i) - if ( a[i]==bcf_int32_vector_end ) kputc(bcf_int8_vector_end, s); - else if ( a[i]==bcf_int32_missing ) kputc(bcf_int8_missing, s); - else kputc(a[i], s); + if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 || + ks_resize(s, s->l + n) < 0) + return -1; + uint8_t *p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i, p++) { + if ( a[i]==bcf_int32_vector_end ) *p = bcf_int8_vector_end; + else if ( a[i]==bcf_int32_missing ) *p = bcf_int8_missing; + else *p = a[i]; + } + s->l += n; } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) { uint8_t *p; - bcf_enc_size(s, wsize, BCF_BT_INT16); - ks_resize(s, s->l + n * sizeof(int16_t)); + if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 || + 
ks_resize(s, s->l + n * sizeof(int16_t)) < 0) + return -1; p = (uint8_t *) s->s + s->l; for (i = 0; i < n; ++i) { @@ -2495,8 +2558,9 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) s->l += n * sizeof(int16_t); } else { uint8_t *p; - bcf_enc_size(s, wsize, BCF_BT_INT32); - ks_resize(s, s->l + n * sizeof(int32_t)); + if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 || + ks_resize(s, s->l + n * sizeof(int32_t)) < 0) + return -1; p = (uint8_t *) s->s + s->l; for (i = 0; i < n; ++i) { i32_to_le(a[i], p); @@ -2506,7 +2570,7 @@ int bcf_enc_vint(kstring_t *s, int n, int32_t *a, int wsize) } } - return 0; // FIXME: check for errs in this function + return 0; } #ifdef VCF_ALLOW_INT64 @@ -2616,13 +2680,36 @@ uint8_t *bcf_fmt_sized_array(kstring_t *s, uint8_t *ptr) ********************/ typedef struct { - int key, max_m, size, offset; - uint32_t is_gt:1, max_g:31; - uint32_t max_l; - uint32_t y; - uint8_t *buf; + int key; // Key for h->id[BCF_DT_ID][key] vdict + int max_m; // number of elements in field array (ie commas) + int size; // field size (max_l or max_g*4 if is_gt) + int offset; // offset of buf into h->mem + uint32_t is_gt:1, // is genotype + max_g:31; // maximum number of genotypes + uint32_t max_l; // length of field + uint32_t y; // h->id[0][fmt[j].key].val->info[BCF_HL_FMT] + uint8_t *buf; // Pointer into h->mem } fmt_aux_t; +// fmt_aux_t field notes: +// max_* are biggest sizes of the various FORMAT fields across all samples. +// We use these after pivoting the data to ensure easy random access +// of a specific sample. +// +// max_m is only used for type BCF_HT_REAL or BCF_HT_INT +// max_g is only used for is_gt == 1 (will be BCF_HT_STR) +// max_l is only used for is_gt == 0 (will be BCF_HT_STR) +// +// These are computed in vcf_parse_format_max3 and used in +// vcf_parse_format_alloc4 to get the size. +// +// size is computed from max_g, max_l, max_m and is_gt. Once computed +// the max values are never accessed again. 
+// +// In theory all 4 vars could be coalesced into a single variable, but this +// significantly harms speed (even if done via a union). It's about 25-30% +// slower. + static inline int align_mem(kstring_t *s) { int e = 0; @@ -2637,8 +2724,8 @@ static inline int align_mem(kstring_t *s) // detect FORMAT "." static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, - char *p, char *q) { - char *end = s->s + s->l; + const char *p, const char *q) { + const char *end = s->s + s->l; if ( q>=end ) { hts_log_error("FORMAT column with no sample columns starting at %s:%"PRIhts_pos"", bcf_seqname_safe(h,v), v->pos+1); @@ -2658,11 +2745,12 @@ static int vcf_parse_format_empty1(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, // get format information from the dictionary static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, - char *p, char *q, fmt_aux_t *fmt) { + const char *p, const char *q, fmt_aux_t *fmt) { const vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; char *t; int j; ks_tokaux_t aux1; + for (j = 0, t = kstrtok(p, ":", &aux1); t; t = kstrtok(0, 0, &aux1), ++j) { if (j >= MAX_N_FMT) { v->errcode |= BCF_ERR_LIMITS; @@ -2700,7 +2788,7 @@ static int vcf_parse_format_dict2(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, } fmt[j].max_l = fmt[j].max_m = fmt[j].max_g = 0; fmt[j].key = kh_val(d, k).id; - fmt[j].is_gt = !strcmp(t, "GT"); + fmt[j].is_gt = (t[0] == 'G' && t[1] == 'T' && !t[2]); fmt[j].y = h->id[0][fmt[j].key].val->info[BCF_HL_FMT]; v->n_fmt++; } @@ -2714,7 +2802,8 @@ static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *r = q + 1; // r: position in the format string int l = 0, m = 1, g = 1, j; v->n_sample = 0; // m: max vector size, l: max field len, g: max number of alleles - char *end = s->s + s->l; + const char *end = s->s + s->l; + while ( rmax_m < m) f->max_m = m; if (f->max_l < l) f->max_l = l; if (f->is_gt && f->max_g < g) f->max_g = g; @@ -2764,7 +2871,7 @@ static int 
vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, break; } if ( r>=end ) break; - r++; l++; + r++; } end_for: v->n_sample++; @@ -2777,23 +2884,25 @@ static int vcf_parse_format_max3(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, // allocate memory for arrays static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, - char *p, char *q, fmt_aux_t *fmt) { + const char *p, const char *q, + fmt_aux_t *fmt) { kstring_t *mem = (kstring_t*)&h->mem; int j; for (j = 0; j < v->n_fmt; ++j) { fmt_aux_t *f = &fmt[j]; if ( !f->max_m ) f->max_m = 1; // omitted trailing format field + if ((f->y>>4&0xf) == BCF_HT_STR) { f->size = f->is_gt? f->max_g << 2 : f->max_l; } else if ((f->y>>4&0xf) == BCF_HT_REAL || (f->y>>4&0xf) == BCF_HT_INT) { f->size = f->max_m << 2; - } else - { + } else { hts_log_error("The format type %d at %s:%"PRIhts_pos" is currently not supported", f->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_TAG_INVALID; return -1; } + if (align_mem(mem) < 0) { hts_log_error("Memory allocation failure at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; @@ -2827,15 +2936,17 @@ static int vcf_parse_format_alloc4(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, return 0; } -// fill the sample fields; at beginning of the loop +// Fill the sample fields static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, - char *p, char *q, fmt_aux_t *fmt) { - static int extreme_val_warned = 0; + const char *p, const char *q, fmt_aux_t *fmt) { + static int extreme_val_warned = 0; int n_sample_ori = -1; - // t points to the first char of a format - char *t = q + 1; + // At beginning of the loop t points to the first char of a format + const char *t = q + 1; int m = 0; // m: sample id - char *end = s->s + s->l; + const int nsamples = bcf_hdr_nsamples(h); + + const char *end = s->s + s->l; while ( ty>>4&0xf; if (!z->buf) { hts_log_error("Memory allocation failure for FORMAT field type %d at 
%s:%"PRIhts_pos, z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; return -1; } - if ((z->y>>4&0xf) == BCF_HT_STR) { + + if (htype == BCF_HT_STR) { int l; - if (z->is_gt) { // genotypes + if (z->is_gt) { + // Genotypes. + // ([|/])+... where is [0-9]+ or ".". int32_t is_phased = 0; uint32_t *x = (uint32_t*)(z->buf + z->size * (size_t)m); uint32_t unreadable = 0; @@ -2873,9 +2988,19 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, if (*t == '.') { ++t, x[l++] = is_phased; } else { - char *tt = t; - uint32_t val = hts_str2uint(t, &t, sizeof(val) * CHAR_MAX - 2, &overflow); - unreadable |= tt == t; + const char *tt = t; + uint32_t val; + // Or "v->n_allele < 10", but it doesn't + // seem to be any faster and this feels safer. + if (*t >= '0' && *t <= '9' && + !(t[1] >= '0' && t[1] <= '9')) { + val = *t++ - '0'; + } else { + val = hts_str2uint(t, (char **)&t, + sizeof(val) * CHAR_MAX - 2, + &overflow); + unreadable |= tt == t; + } if (max < val) max = val; x[l++] = (val + 1) << 1 | is_phased; } @@ -2892,13 +3017,20 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, return -1; } if ( !l ) x[l++] = 0; // An empty field, insert missing value - for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; + for (; l < z->size>>2; ++l) + x[l] = bcf_int32_vector_end; + } else { + // Otherwise arbitrary strings char *x = (char*)z->buf + z->size * (size_t)m; - for (l = 0; *t != ':' && *t; ++t) x[l++] = *t; - for (; l < z->size; ++l) x[l] = 0; + for (l = 0; *t != ':' && *t; ++t) + x[l++] = *t; + if (z->size > l) + memset(&x[l], 0, (z->size-l) * sizeof(*x)); } - } else if ((z->y>>4&0xf) == BCF_HT_INT) { + + } else if (htype == BCF_HT_INT) { + // One or more integers in an array int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); int l; for (l = 0;; ++t) { @@ -2912,7 +3044,8 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, { if ( !extreme_val_warned ) { - 
hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos, h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1); + hts_log_warning("Extreme FORMAT/%s value encountered and set to missing at %s:%"PRIhts_pos, + h->id[BCF_DT_ID][fmt[j-1].key].key, bcf_seqname_safe(h,v), v->pos+1); extreme_val_warned = 1; } tmp_val = bcf_int32_missing; @@ -2922,9 +3055,13 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, } if (*t != ',') break; } - if ( !l ) x[l++] = bcf_int32_missing; - for (; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; - } else if ((z->y>>4&0xf) == BCF_HT_REAL) { + if ( !l ) + x[l++] = bcf_int32_missing; + for (; l < z->size>>2; ++l) + x[l] = bcf_int32_vector_end; + + } else if (htype == BCF_HT_REAL) { + // One of more floating point values in an array float *x = (float*)(z->buf + z->size * (size_t)m); int l; for (l = 0;; ++t) { @@ -2944,10 +3081,13 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, } if (*t != ',') break; } - if ( !l ) bcf_float_set_missing(x[l++]); // An empty field, insert missing value - for (; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]); + if ( !l ) + // An empty field, insert missing value + bcf_float_set_missing(x[l++]); + for (; l < z->size>>2; ++l) + bcf_float_set_vector_end(x[l]); } else { - hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, z->y>>4&0xf, bcf_seqname_safe(h,v), v->pos+1); + hts_log_error("Unknown FORMAT field type %d at %s:%"PRIhts_pos, htype, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_TAG_INVALID; return -1; } @@ -2968,24 +3108,28 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, } } - for (; j < v->n_fmt; ++j) { // fill end-of-vector values + // fill end-of-vector values + for (; j < v->n_fmt; ++j) { fmt_aux_t *z = &fmt[j]; + const int htype = z->y>>4&0xf; int l; - if ((z->y>>4&0xf) == BCF_HT_STR) { + if (htype == BCF_HT_STR) { if (z->is_gt) 
{ int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); if (z->size) x[0] = bcf_int32_missing; for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; } else { char *x = (char*)z->buf + z->size * (size_t)m; - if ( z->size ) x[0] = '.'; - for (l = 1; l < z->size; ++l) x[l] = 0; + if ( z->size ) { + x[0] = '.'; + memset(&x[1], 0, (z->size-1) * sizeof(*x)); + } } - } else if ((z->y>>4&0xf) == BCF_HT_INT) { + } else if (htype == BCF_HT_INT) { int32_t *x = (int32_t*)(z->buf + z->size * (size_t)m); x[0] = bcf_int32_missing; for (l = 1; l < z->size>>2; ++l) x[l] = bcf_int32_vector_end; - } else if ((z->y>>4&0xf) == BCF_HT_REAL) { + } else if (htype == BCF_HT_REAL) { float *x = (float*)(z->buf + z->size * (size_t)m); bcf_float_set_missing(x[0]); for (l = 1; l < z->size>>2; ++l) bcf_float_set_vector_end(x[l]); @@ -3000,7 +3144,7 @@ static int vcf_parse_format_fill5(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, // write individual genotype information static int vcf_parse_format_gt6(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, - char *p, char *q, fmt_aux_t *fmt) { + const char *p, const char *q, fmt_aux_t *fmt) { kstring_t *str = &v->indiv; int i; if (v->n_sample > 0) { @@ -3050,7 +3194,8 @@ static int vcf_parse_format_check7(const bcf_hdr_t *h, bcf1_t *v) { } // p,q is the start and the end of the FORMAT field -static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p, char *q) +static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, + char *p, char *q) { if ( !bcf_hdr_nsamples(h) ) return 0; kstring_t *mem = (kstring_t*)&h->mem; @@ -3067,6 +3212,23 @@ static int vcf_parse_format(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v, char *p if (vcf_parse_format_dict2(s, h, v, p, q, fmt) < 0) return -1; + // FORMAT data is per-sample A:B:C A:B:C A:B:C ... but in memory it is + // stored as per-type arrays AAA... BBB... CCC... This is basically + // a data rotation or pivot. 
+ + // The size of elements in the array grow to their maximum needed, + // permitting fast random access. This means however we have to first + // scan the whole FORMAT line to find the maximum of each type, and + // then scan it again to find the store the data. + // We break this down into compute-max, allocate, fill-out-buffers + + // TODO: ? + // The alternative would be to pivot on the first pass, with fixed + // size entries for numerics and concatenated strings otherwise, also + // tracking maximum sizes. Then on a second pass we reallocate and + // copy the data again to a uniformly sized array. Two passes through + // memory, but without doubling string parsing. + // compute max if (vcf_parse_format_max3(s, h, v, p, q, fmt) < 0) return -1; @@ -3174,25 +3336,50 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p v->n_info = 0; if (*(q-1) == ';') *(q-1) = 0; - for (r = key = p;; ++r) { - int c; - char *val, *end; - if (*r != ';' && *r != '=' && *r != 0) continue; - if (v->n_info == UINT16_MAX) { - hts_log_error("Too many INFO entries at %s:%"PRIhts_pos, - bcf_seqname_safe(h,v), v->pos+1); - v->errcode |= BCF_ERR_LIMITS; - goto fail; + + // Parsing consists of processing key=value; or key; so we only need + // to track the next = and ; symbols, plus end of string. + // We could process 1 char at a time, but this is inefficient compared + // to strchr which can process word by word. Hence even doing two strchrs + // is quicker than byte by byte processing. + char *next_equals = strchr(p, '='); + char *next_semicolon = strchr(p, ';'); + r = p; + while (*r) { + // Look for key=value or just key. + char *val, *end, *from; + key = r; + if (next_equals && (!next_semicolon || next_equals < next_semicolon)) { + // key=value; + *next_equals = 0; + from = val = next_equals+1; + // Prefetching d->keys[hash] helps here provided we avoid + // computing hash twice (needs API change), but not universally. 
+ // It may be significant for other applications though so it's + // something to consider for the future. + } else { + // key; + val = NULL; + from = key; + } + + // Update location of next ; and if used = + if (next_semicolon) { + end = next_semicolon; + r = end+1; + next_semicolon = strchr(end+1, ';'); + if (val) + next_equals = strchr(end, '='); + } else { + // find nul location, starting from key or val. + r = end = from + strlen(from); } - val = end = 0; - c = *r; *r = 0; - if (c == '=') { - val = r + 1; - for (end = val; *end != ';' && *end != 0; ++end); - c = *end; *end = 0; - } else end = r; - if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO + + *end = 0; + + // We've now got key and val (maybe NULL), so process it k = kh_get(vdict, d, key); + // 15 is default (unknown) type. See bcf_idinfo_def at top if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15) { hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key); @@ -3294,7 +3481,8 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p } else { bcf_enc_vint(str, n_val, a_val, -1); } - if (n_val==1 && (val1!=bcf_int32_missing || is_int64) && strcmp(key, "END") == 0) + if (n_val==1 && (val1!=bcf_int32_missing || is_int64) && + memcmp(key, "END", 4) == 0) { if ( val1 <= v->pos ) { @@ -3320,9 +3508,6 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p bcf_enc_vfloat(str, n_val, val_f); } } - if (c == 0) break; - r = end; - key = r + 1; } free(a_val); @@ -3335,94 +3520,162 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) { - int i = 0, ret = -2, overflow = 0; + int ret = -2, overflow = 0; char *p, *q, *r, *t; kstring_t *str; khint_t k; ks_tokaux_t aux; +//#define NOT_DOT(p) strcmp((p), ".") +//#define NOT_DOT(p) (!(*p == '.' && !p[1])) +//#define NOT_DOT(p) ((*p) != '.' 
|| (p)[1]) +//#define NOT_DOT(p) (q-p != 1 || memcmp(p, ".\0", 2)) +#define NOT_DOT(p) (memcmp(p, ".\0", 2)) + if (!s || !h || !v || !(s->s)) return ret; // Assumed in lots of places, but we may as well spot this early assert(sizeof(float) == sizeof(int32_t)); + // Ensure string we parse has space to permit some over-flow when during + // parsing. Eg to do memcmp(key, "END", 4) in vcf_parse_info over + // the more straight forward looking strcmp, giving a speed advantage. + if (ks_resize(s, s->l+4) < 0) + return -1; + + // Force our memory to be initialised so we avoid the technicality of + // undefined behaviour in using a 4-byte memcmp. (The reality is this + // almost certainly is never detected by the compiler so has no impact, + // but equally so this code has minimal (often beneficial) impact on + // performance too.) + s->s[s->l+0] = 0; + s->s[s->l+1] = 0; + s->s[s->l+2] = 0; + s->s[s->l+3] = 0; + bcf_clear1(v); str = &v->shared; memset(&aux, 0, sizeof(ks_tokaux_t)); - for (p = kstrtok(s->s, "\t", &aux), i = 0; p; p = kstrtok(0, 0, &aux), ++i) { - q = (char*)aux.p; - *q = 0; - if (i == 0) { // CHROM - vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; - k = kh_get(vdict, d, p); - if (k == kh_end(d)) - { - hts_log_warning("Contig '%s' is not defined in the header. (Quick workaround: index the file with tabix.)", p); - v->errcode = BCF_ERR_CTG_UNDEF; - if ((k = fix_chromosome(h, d, p)) == kh_end(d)) { - hts_log_error("Could not add dummy header for contig '%s'", p); - v->errcode |= BCF_ERR_CTG_INVALID; + + // CHROM + if (!(p = kstrtok(s->s, "\t", &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + vdict_t *d = (vdict_t*)h->dict[BCF_DT_CTG]; + k = kh_get(vdict, d, p); + if (k == kh_end(d)) { + hts_log_warning("Contig '%s' is not defined in the header. 
(Quick workaround: index the file with tabix.)", p); + v->errcode = BCF_ERR_CTG_UNDEF; + if ((k = fix_chromosome(h, d, p)) == kh_end(d)) { + hts_log_error("Could not add dummy header for contig '%s'", p); + v->errcode |= BCF_ERR_CTG_INVALID; + goto err; + } + } + v->rid = kh_val(d, k).id; + + // POS + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + overflow = 0; + char *tmp = p; + v->pos = hts_str2uint(p, &p, 63, &overflow); + if (overflow) { + hts_log_error("Position value '%s' is too large", tmp); + goto err; + } else if ( *p ) { + hts_log_error("Could not parse the position '%s'", tmp); + goto err; + } else { + v->pos -= 1; + } + if (v->pos >= INT32_MAX) + v->unpacked |= BCF_IS_64BIT; + + // ID + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + if (NOT_DOT(p)) bcf_enc_vchar(str, q - p, p); + else bcf_enc_size(str, 0, BCF_BT_CHAR); + + // REF + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + bcf_enc_vchar(str, q - p, p); + v->n_allele = 1, v->rlen = q - p; + + // ALT + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + if (NOT_DOT(p)) { + for (r = t = p;; ++r) { + if (*r == ',' || *r == 0) { + if (v->n_allele == UINT16_MAX) { + hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos, + bcf_seqname_safe(h,v), v->pos+1); + v->errcode |= BCF_ERR_LIMITS; goto err; } + bcf_enc_vchar(str, r - t, t); + t = r + 1; + ++v->n_allele; } - v->rid = kh_val(d, k).id; - } else if (i == 1) { // POS - overflow = 0; - char *tmp = p; - v->pos = hts_str2uint(p, &p, 63, &overflow); - if (overflow) { - hts_log_error("Position value '%s' is too large", tmp); - goto err; - } else if ( *p ) { - hts_log_error("Could not parse the position '%s'", tmp); - goto err; - } else { - v->pos -= 1; - } - if (v->pos >= INT32_MAX) - v->unpacked |= BCF_IS_64BIT; - } else if (i == 2) { // ID - if (strcmp(p, ".")) bcf_enc_vchar(str, q - p, p); - else bcf_enc_size(str, 0, BCF_BT_CHAR); - } else if (i == 
3) { // REF - bcf_enc_vchar(str, q - p, p); - v->n_allele = 1, v->rlen = q - p; - } else if (i == 4) { // ALT - if (strcmp(p, ".")) { - for (r = t = p;; ++r) { - if (*r == ',' || *r == 0) { - if (v->n_allele == UINT16_MAX) { - hts_log_error("Too many ALT alleles at %s:%"PRIhts_pos, - bcf_seqname_safe(h,v), v->pos+1); - v->errcode |= BCF_ERR_LIMITS; - goto err; - } - bcf_enc_vchar(str, r - t, t); - t = r + 1; - ++v->n_allele; - } - if (r == q) break; - } - } - } else if (i == 5) { // QUAL - if (strcmp(p, ".")) v->qual = atof(p); - else bcf_float_set_missing(v->qual); - if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR - } else if (i == 6) { // FILTER - if (strcmp(p, ".")) { - if (vcf_parse_filter(str, h, v, p, q)) goto err; - } else bcf_enc_vint(str, 0, 0, -1); - if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT - } else if (i == 7) { // INFO - if (strcmp(p, ".")) { - if (vcf_parse_info(str, h, v, p, q)) goto err; - } - if ( v->max_unpack && !(v->max_unpack>>3) ) goto end; - } else if (i == 8) {// FORMAT - return vcf_parse_format(s, h, v, p, q) == 0 ? 
0 : -2; + if (r == q) break; + } + } + + // QUAL + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + if (NOT_DOT(p)) v->qual = atof(p); + else bcf_float_set_missing(v->qual); + if ( v->max_unpack && !(v->max_unpack>>1) ) goto end; // BCF_UN_STR + + // FILTER + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + if (NOT_DOT(p)) { + if (vcf_parse_filter(str, h, v, p, q)) { + goto err; + } + } else bcf_enc_vint(str, 0, 0, -1); + if ( v->max_unpack && !(v->max_unpack>>2) ) goto end; // BCF_UN_FLT + + // INFO + if (!(p = kstrtok(0, 0, &aux))) + goto err; + *(q = (char*)aux.p) = 0; + + if (NOT_DOT(p)) { + if (vcf_parse_info(str, h, v, p, q)) { + goto err; } } + if ( v->max_unpack && !(v->max_unpack>>3) ) goto end; + + // FORMAT; optional + p = kstrtok(0, 0, &aux); + if (p) { + *(q = (char*)aux.p) = 0; + + return vcf_parse_format(s, h, v, p, q) == 0 ? 0 : -2; + } else { + return 0; + } end: ret = 0; From ac70212aefe0d27662dc48bcce4302057dd5f507 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 9 Aug 2023 15:19:59 +0100 Subject: [PATCH 468/488] Revert most of the vcf_parse_info improvements. The two we keep are the internal while loop to find the next ; or = instead of iterating back in the outer for loop, and memcmp instead of strcmp for "END". The strchr changes do help glibc on excessively long INFO tokens, seen in the GIAB truth set and GNOMAD files, but they have no impact on most mainstream VCF outputs. Furthermore, other C libraries, such as MUSL, are considerably slowed down by the use of strchr. Hence this isn't a particularly robust or warranted change. 
--- vcf.c | 65 ++++++++++++++++++++--------------------------------------- 1 file changed, 22 insertions(+), 43 deletions(-) diff --git a/vcf.c b/vcf.c index ce54dc227..afd60fcaa 100644 --- a/vcf.c +++ b/vcf.c @@ -3336,50 +3336,26 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p v->n_info = 0; if (*(q-1) == ';') *(q-1) = 0; - - // Parsing consists of processing key=value; or key; so we only need - // to track the next = and ; symbols, plus end of string. - // We could process 1 char at a time, but this is inefficient compared - // to strchr which can process word by word. Hence even doing two strchrs - // is quicker than byte by byte processing. - char *next_equals = strchr(p, '='); - char *next_semicolon = strchr(p, ';'); - r = p; - while (*r) { - // Look for key=value or just key. - char *val, *end, *from; - key = r; - if (next_equals && (!next_semicolon || next_equals < next_semicolon)) { - // key=value; - *next_equals = 0; - from = val = next_equals+1; - // Prefetching d->keys[hash] helps here provided we avoid - // computing hash twice (needs API change), but not universally. - // It may be significant for other applications though so it's - // something to consider for the future. - } else { - // key; - val = NULL; - from = key; - } - - // Update location of next ; and if used = - if (next_semicolon) { - end = next_semicolon; - r = end+1; - next_semicolon = strchr(end+1, ';'); - if (val) - next_equals = strchr(end, '='); - } else { - // find nul location, starting from key or val. 
- r = end = from + strlen(from); + for (r = key = p;; ++r) { + int c; + char *val, *end; + while (*r > '=' || (*r != ';' && *r != '=' && *r != 0)) r++; + if (v->n_info == UINT16_MAX) { + hts_log_error("Too many INFO entries at %s:%"PRIhts_pos, + bcf_seqname_safe(h,v), v->pos+1); + v->errcode |= BCF_ERR_LIMITS; + goto fail; } + val = end = NULL; + c = *r; *r = 0; + if (c == '=') { + val = r + 1; - *end = 0; - - // We've now got key and val (maybe NULL), so process it + for (end = val; *end != ';' && *end != 0; ++end); + c = *end; *end = 0; + } else end = r; + if ( !*key ) { if (c==0) break; r = end; key = r + 1; continue; } // faulty VCF, ";;" in the INFO k = kh_get(vdict, d, key); - // 15 is default (unknown) type. See bcf_idinfo_def at top if (k == kh_end(d) || kh_val(d, k).info[BCF_HL_INFO] == 15) { hts_log_warning("INFO '%s' is not defined in the header, assuming Type=String", key); @@ -3481,8 +3457,8 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p } else { bcf_enc_vint(str, n_val, a_val, -1); } - if (n_val==1 && (val1!=bcf_int32_missing || is_int64) && - memcmp(key, "END", 4) == 0) + if (n_val==1 && (val1!=bcf_int32_missing || is_int64) + && memcmp(key, "END", 4) == 0) { if ( val1 <= v->pos ) { @@ -3508,6 +3484,9 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p bcf_enc_vfloat(str, n_val, val_f); } } + if (c == 0) break; + r = end; + key = r + 1; } free(a_val); From 6806b2d7f6bc892a66f26c2ace83ad3f7b1f50d9 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 9 Aug 2023 12:01:59 +0100 Subject: [PATCH 469/488] Add an hclen SAM filter function. 
See samtools/samtools#813 (Will close issue with man page update which needs applying there) --- sam.c | 20 ++++++++++++++++++++ test/sam_filter/filter.tst | 1 + test/sam_filter/func8.out | 1 + 3 files changed, 22 insertions(+) create mode 100644 test/sam_filter/func8.out diff --git a/sam.c b/sam.c index 0e466df5e..b4545ff47 100644 --- a/sam.c +++ b/sam.c @@ -1255,6 +1255,26 @@ static int bam_sym_lookup(void *data, char *str, char **end, } break; + case 'h': + if (memcmp(str, "hclen", 5) == 0) { + int hclen = 0; + uint32_t *cigar = bam_get_cigar(b); + uint32_t ncigar = b->core.n_cigar; + + // left + if (ncigar > 0 && bam_cigar_op(cigar[0]) == BAM_CHARD_CLIP) + hclen = bam_cigar_oplen(cigar[0]); + + // right + if (ncigar > 1 && bam_cigar_op(cigar[ncigar-1]) == BAM_CHARD_CLIP) + hclen += bam_cigar_oplen(cigar[ncigar-1]); + + *end = str+5; + res->d = hclen; + return 0; + } + break; + case 'l': if (memcmp(str, "library", 7) == 0) { *end = str+7; diff --git a/test/sam_filter/filter.tst b/test/sam_filter/filter.tst index 13d2c340e..e74c307e6 100644 --- a/test/sam_filter/filter.tst +++ b/test/sam_filter/filter.tst @@ -56,3 +56,4 @@ P func4.out $tv -i 'filter=avg(qual) >= 20 && avg(qual) <= 30' ../ce#1000.sam P func5.out $tv -i 'filter=sclen>=20' ../realn02.sam | grep -E -v '^@' P func6.out $tv -i 'filter=rlen<50' ../realn02.sam | grep -E -v '^@' P func7.out $tv -i 'filter=qlen>100' ../realn02.sam | grep -E -v '^@' +P func8.out $tv -i 'filter=hclen>=4' ../c1#clip.sam | grep -E -v '^@' diff --git a/test/sam_filter/func8.out b/test/sam_filter/func8.out new file mode 100644 index 000000000..8553866ed --- /dev/null +++ b/test/sam_filter/func8.out @@ -0,0 +1 @@ +s0C 0 c1 3 0 2H6M2H * 0 0 CCGCGG ****** From d89f1fe0e1dfd23187aeccd518cb988e7083ec7d Mon Sep 17 00:00:00 2001 From: Andrew Whitwham Date: Tue, 30 May 2023 15:34:08 +0100 Subject: [PATCH 470/488] Speed up removal of lines in large headers. 
sam_hdr_remove_lines and sam_hdr_remove_except are very slow when removing large numbers of header lines due to the effort of deleting hash entries one by one. This commit instead rebuilds the hashes at the end of the deletion process. --- header.c | 69 ++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 60 insertions(+), 9 deletions(-) diff --git a/header.c b/header.c index 1d2fee491..43fd52c57 100644 --- a/header.c +++ b/header.c @@ -1,5 +1,5 @@ /* -Copyright (c) 2018-2020 Genome Research Ltd. +Copyright (c) 2018-2020, 2023 Genome Research Ltd. Authors: James Bonfield , Valeriu Ohan Redistribution and use in source and binary forms, with or without @@ -687,7 +687,7 @@ static void sam_hrecs_free_tags(sam_hrecs_t *hrecs, sam_hrec_tag_t *tag) { pool_free(hrecs->tag_pool, tag); } -static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_hrec_type_t *type_found) { +static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_hrec_type_t *type_found, int remove_hash) { if (!hrecs || !type_name || !type_found) return -1; @@ -715,7 +715,7 @@ static int sam_hrecs_remove_line(sam_hrecs_t *hrecs, const char *type_name, sam_ } } - if (!strncmp(type_name, "SQ", 2) || !strncmp(type_name, "RG", 2)) + if (remove_hash && (!strncmp(type_name, "SQ", 2) || !strncmp(type_name, "RG", 2))) sam_hrecs_remove_hash_entry(hrecs, itype, type_found); sam_hrecs_free_tags(hrecs, type_found->tag); @@ -1429,7 +1429,7 @@ int sam_hdr_remove_line_id(sam_hdr_t *bh, const char *type, const char *ID_key, if (!type_found) return 0; - int ret = sam_hrecs_remove_line(hrecs, type, type_found); + int ret = sam_hrecs_remove_line(hrecs, type, type_found, 1); if (ret == 0) { if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) return -1; @@ -1469,7 +1469,7 @@ int sam_hdr_remove_line_pos(sam_hdr_t *bh, const char *type, int position) { if (!type_found) return -1; - int ret = sam_hrecs_remove_line(hrecs, type, type_found); + int ret = 
sam_hrecs_remove_line(hrecs, type, type_found, 1); if (ret == 0) { if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) return -1; @@ -1609,6 +1609,37 @@ int sam_hdr_update_line(sam_hdr_t *bh, const char *type, return ret; } +static int rebuild_hash(sam_hrecs_t *hrecs, const char *type) { + sam_hrec_type_t *head, *step; + khiter_t k; + + if (strncmp(type, "SQ", 2) == 0) { + hrecs->nref = 0; + kh_clear(m_s2i, hrecs->ref_hash); + } else if (strncmp(type, "RG", 2) == 0) { + hrecs->nrg = 0; + kh_clear(m_s2i, hrecs->rg_hash); + } + + k = kh_get(sam_hrecs_t, hrecs->h, TYPEKEY(type)); + + if (k != kh_end(hrecs->h)) { // something to rebuild + head = kh_val(hrecs->h, k); + step = head; + + do { + if (sam_hrecs_update_hashes(hrecs, TYPEKEY(type), step) == -1) { + hts_log_error("Unable to rebuild hashes"); + return -1; + } + + step = step->next; + } while (step != head); + } + + return 0; +} + int sam_hdr_remove_except(sam_hdr_t *bh, const char *type, const char *ID_key, const char *ID_value) { sam_hrecs_t *hrecs; if (!bh || !type) @@ -1643,11 +1674,21 @@ int sam_hdr_remove_except(sam_hdr_t *bh, const char *type, const char *ID_key, c while (step != type_found) { sam_hrec_type_t *to_remove = step; step = step->next; - ret &= sam_hrecs_remove_line(hrecs, type, to_remove); + ret &= sam_hrecs_remove_line(hrecs, type, to_remove, 0); } if (remove_all) - ret &= sam_hrecs_remove_line(hrecs, type, type_found); + ret &= sam_hrecs_remove_line(hrecs, type, type_found, 0); + + /* if RG or SQ, delete then rebuild the hashes (as it is faster + to rebuild than delete one by one). 
+ */ + + if ((strncmp(type, "SQ", 2) == 0) || (strncmp(type, "RG", 2) == 0)) { + if (rebuild_hash(hrecs, type)) { + return -1; + } + } if (!ret && hrecs->dirty) redact_header_text(bh); @@ -1691,7 +1732,7 @@ int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void * if (k == kh_end(rh)) { // value is not in the hash table, so remove sam_hrec_type_t *to_remove = step; step = step->next; - ret |= sam_hrecs_remove_line(hrecs, type, to_remove); + ret |= sam_hrecs_remove_line(hrecs, type, to_remove, 0); } else { step = step->next; } @@ -1707,10 +1748,20 @@ int sam_hdr_remove_lines(sam_hdr_t *bh, const char *type, const char *id, void * if (k == kh_end(rh)) { // value is not in the hash table, so remove sam_hrec_type_t *to_remove = head; head = head->next; - ret |= sam_hrecs_remove_line(hrecs, type, to_remove); + ret |= sam_hrecs_remove_line(hrecs, type, to_remove, 0); } } + /* if RG or SQ, delete then rebuild the hashes (as it is faster + to rebuild than delete one by one). + */ + + if ((strncmp(type, "SQ", 2) == 0) || (strncmp(type, "RG", 2) == 0)) { + if (rebuild_hash(hrecs, type)) { + return -1; + } + } + if (!ret && hrecs->dirty) redact_header_text(bh); From 9c7e845e25cdb899cd833e96a46ef7d6dd8fb6be Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Fri, 11 Aug 2023 13:47:04 +0100 Subject: [PATCH 471/488] Skip CRC checks when fuzzing As it's difficult for fuzzers to get CRC checksums right, prevent CRC checks from failing in fuzzing build mode. This should expand the number of code paths that the fuzzer can explore. 
--- bgzf.c | 8 ++++++++ cram/cram_io.c | 10 ++++++++++ 2 files changed, 18 insertions(+) diff --git a/bgzf.c b/bgzf.c index 5ef433c20..a775b1b83 100644 --- a/bgzf.c +++ b/bgzf.c @@ -733,6 +733,10 @@ static int bgzf_uncompress(uint8_t *dst, size_t *dlen, } uint32_t crc = libdeflate_crc32(0, (unsigned char *)dst, *dlen); +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Pretend the CRC was OK so the fuzzer doesn't have to get it right + crc = expected_crc; +#endif if (crc != expected_crc) { hts_log_error("CRC32 checksum mismatch"); return -2; @@ -775,6 +779,10 @@ static int bgzf_uncompress(uint8_t *dst, size_t *dlen, *dlen = *dlen - zs.avail_out; uint32_t crc = crc32(crc32(0L, NULL, 0L), (unsigned char *)dst, *dlen); +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Pretend the CRC was OK so the fuzzer doesn't have to get it right + crc = expected_crc; +#endif if (crc != expected_crc) { hts_log_error("CRC32 checksum mismatch"); return -2; diff --git a/cram/cram_io.c b/cram/cram_io.c index ca226e29c..1cce86d4b 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1568,6 +1568,11 @@ int cram_uncompress_block(cram_block *b) { char *uncomp; size_t uncomp_size = 0; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Pretend the CRC was OK so the fuzzer doesn't have to get it right + b->crc32_checked = 1; +#endif + if (b->crc32_checked == 0) { uint32_t crc = crc32(b->crc_part, b->data ? b->data : (uc *)"", b->alloc); b->crc32_checked = 1; @@ -3875,6 +3880,11 @@ cram_container *cram_read_container(cram_fd *fd) { rd+=4; } +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // Pretend the CRC was OK so the fuzzer doesn't have to get it right + crc = c->crc32; +#endif + if (crc != c->crc32) { hts_log_error("Container header CRC32 failure"); cram_free_container(c); From a121c8450cebc1cffae34b787138618a7fa8bdde Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Mon, 7 Aug 2023 09:39:46 +0100 Subject: [PATCH 472/488] Prevent out-of-memory reports when fuzzing * Limit max. 
IDX numbers in VCF to prevent large allocations * Limit max. sum of shared_len + indiv_len in bcf_read1_core() * Limit max. header size in bcf_hdr_read() * Limit max. header size in cram_read_SAM_hdr() * Limit max. header size in bam_hdr_read() * Limit max. n_targets in bam_hdr_read() * Limit max. number of landmarks in cram_read_container() * Limit max. number of huffman codes in cram_huffman_decode_init() * Limit max. record size in sam_realloc_bam_data() Adds a header where the memory limit for fuzzing can be set. This involves a bit more work, but there is benefit to having this in one clearly defined place. --- Makefile | 9 +++++---- cram/cram_codecs.c | 11 ++++++++++- cram/cram_io.c | 17 ++++++++++++++++- fuzz_settings.h | 35 +++++++++++++++++++++++++++++++++++ sam.c | 17 +++++++++++++++++ vcf.c | 18 +++++++++++++++++- 6 files changed, 100 insertions(+), 7 deletions(-) create mode 100644 fuzz_settings.h diff --git a/Makefile b/Makefile index eec8164d5..5597d67d0 100644 --- a/Makefile +++ b/Makefile @@ -252,6 +252,7 @@ cram_samtools_h = cram/cram_samtools.h $(htslib_sam_h) cram_structs_h = cram/cram_structs.h $(htslib_thread_pool_h) $(htslib_cram_h) cram/string_alloc.h cram/mFILE.h $(htslib_khash_h) cram_open_trace_file_h = cram/open_trace_file.h cram/mFILE.h bcf_sr_sort_h = bcf_sr_sort.h $(htslib_synced_bcf_reader_h) $(htslib_kbitset_h) +fuzz_settings_h = fuzz_settings.h header_h = header.h cram/string_alloc.h cram/pooled_alloc.h $(htslib_khash_h) $(htslib_kstring_h) $(htslib_sam_h) hfile_internal_h = hfile_internal.h $(htslib_hts_defs_h) $(htslib_hfile_h) $(textutils_internal_h) hts_internal_h = hts_internal.h $(htslib_hts_h) $(textutils_internal_h) @@ -456,8 +457,8 @@ hfile_s3.o hfile_s3.pico: hfile_s3.c config.h $(hfile_internal_h) $(htslib_hts_h hts.o hts.pico: hts.c config.h os/lzma_stub.h $(htslib_hts_h) $(htslib_bgzf_h) $(cram_h) $(htslib_hfile_h) $(htslib_hts_endian_h) version.h config_vars.h $(hts_internal_h) $(hfile_internal_h) $(sam_internal_h) 
$(htslib_hts_expr_h) $(htslib_hts_os_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_ksort_h) $(htslib_tbx_h) $(htscodecs_htscodecs_h) hts_expr.o hts_expr.pico: hts_expr.c config.h $(htslib_hts_expr_h) $(htslib_hts_log_h) $(textutils_internal_h) hts_os.o hts_os.pico: hts_os.c config.h $(htslib_hts_defs_h) os/rand.c -vcf.o vcf.pico: vcf.c config.h $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) -sam.o sam.pico: sam.c config.h $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) +vcf.o vcf.pico: vcf.c config.h $(fuzz_settings_h) $(htslib_vcf_h) $(htslib_bgzf_h) $(htslib_tbx_h) $(htslib_hfile_h) $(hts_internal_h) $(htslib_khash_str2int_h) $(htslib_kstring_h) $(htslib_sam_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_hts_endian_h) +sam.o sam.pico: sam.c config.h $(fuzz_settings_h) $(htslib_hts_defs_h) $(htslib_sam_h) $(htslib_bgzf_h) $(cram_h) $(hts_internal_h) $(sam_internal_h) $(htslib_hfile_h) $(htslib_hts_endian_h) $(htslib_hts_expr_h) $(header_h) $(htslib_khash_h) $(htslib_kseq_h) $(htslib_kstring_h) sam_mods.o sam_mods.pico: sam_mods.c config.h $(htslib_sam_h) $(textutils_internal_h) tbx.o tbx.pico: tbx.c config.h $(htslib_tbx_h) $(htslib_bgzf_h) $(htslib_hts_endian_h) $(hts_internal_h) $(htslib_khash_h) faidx.o faidx.pico: faidx.c config.h $(htslib_bgzf_h) $(htslib_faidx_h) $(htslib_hfile_h) $(htslib_khash_h) $(htslib_kstring_h) $(hts_internal_h) @@ -475,12 +476,12 @@ probaln.o probaln.pico: probaln.c config.h $(htslib_hts_h) realn.o realn.pico: realn.c config.h $(htslib_hts_h) $(htslib_sam_h) textutils.o textutils.pico: textutils.c config.h $(htslib_hfile_h) $(htslib_kstring_h) $(htslib_sam_h) $(hts_internal_h) -cram/cram_codecs.o 
cram/cram_codecs.pico: cram/cram_codecs.c config.h $(htslib_hts_endian_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(cram_h) +cram/cram_codecs.o cram/cram_codecs.pico: cram/cram_codecs.c config.h $(fuzz_settings_h) $(htslib_hts_endian_h) $(htscodecs_varint_h) $(htscodecs_pack_h) $(htscodecs_rle_h) $(cram_h) cram/cram_decode.o cram/cram_decode.pico: cram/cram_decode.c config.h $(cram_h) $(cram_os_h) $(htslib_hts_h) cram/cram_encode.o cram/cram_encode.pico: cram/cram_encode.c config.h $(cram_h) $(cram_os_h) $(sam_internal_h) $(htslib_hts_h) $(htslib_hts_endian_h) $(textutils_internal_h) cram/cram_external.o cram/cram_external.pico: cram/cram_external.c config.h $(htscodecs_rANS_static4x16_h) $(htslib_hfile_h) $(cram_h) cram/cram_index.o cram/cram_index.pico: cram/cram_index.c config.h $(htslib_bgzf_h) $(htslib_hfile_h) $(hts_internal_h) $(cram_h) $(cram_os_h) -cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) +cram/cram_io.o cram/cram_io.pico: cram/cram_io.c config.h os/lzma_stub.h $(fuzz_settings_h) $(cram_h) $(cram_os_h) $(htslib_hts_h) $(cram_open_trace_file_h) $(htscodecs_rANS_static_h) $(htscodecs_rANS_static4x16_h) $(htscodecs_arith_dynamic_h) $(htscodecs_tokenise_name3_h) $(htscodecs_fqzcomp_qual_h) $(htscodecs_varint_h) $(htslib_hfile_h) $(htslib_bgzf_h) $(htslib_faidx_h) $(hts_internal_h) cram/cram_stats.o cram/cram_stats.pico: cram/cram_stats.c config.h $(cram_h) $(cram_os_h) cram/mFILE.o cram/mFILE.pico: cram/mFILE.c config.h $(htslib_hts_log_h) $(cram_os_h) cram/mFILE.h cram/open_trace_file.o cram/open_trace_file.pico: cram/open_trace_file.c config.h $(cram_os_h) $(cram_open_trace_file_h) $(cram_misc_h) 
$(htslib_hfile_h) $(htslib_hts_log_h) $(htslib_hts_h) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index cc5e52b2c..683badf93 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -44,6 +44,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +#include "../fuzz_settings.h" +#endif + #include "../htslib/hts_endian.h" #if defined(HAVE_EXTERNAL_LIBHTSCODECS) @@ -2795,7 +2799,12 @@ cram_codec *cram_huffman_decode_init(cram_block_compression_hdr *hdr, errno = ENOMEM; return NULL; } - +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (ncodes > FUZZ_ALLOC_LIMIT / sizeof(*codes)) { + errno = ENOMEM; + return NULL; + } +#endif h = calloc(1, sizeof(*h)); if (!h) return NULL; diff --git a/cram/cram_io.c b/cram/cram_io.c index 1cce86d4b..92a31583f 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -69,6 +69,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define crc32(a,b,c) libdeflate_crc32((a),(b),(c)) #endif +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +#include "../fuzz_settings.h" +#endif + #include "cram.h" #include "os.h" #include "../htslib/hts.h" @@ -3857,7 +3861,13 @@ cram_container *cram_read_container(cram_fd *fd) { return NULL; *c = c2; - +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (c->num_landmarks > FUZZ_ALLOC_LIMIT/sizeof(int32_t)) { + fd->err = errno = ENOMEM; + cram_free_container(c); + return NULL; + } +#endif if (c->num_landmarks && !(c->landmark = malloc(c->num_landmarks * sizeof(int32_t)))) { fd->err = errno; cram_free_container(c); @@ -4689,6 +4699,11 @@ sam_hdr_t *cram_read_SAM_hdr(cram_fd *fd) { if (-1 == int32_decode(fd, &header_len)) return NULL; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (header_len > FUZZ_ALLOC_LIMIT) + return NULL; +#endif + /* Alloc and read */ if (header_len < 0 || NULL == (header = malloc((size_t) header_len+1))) return NULL; diff --git a/fuzz_settings.h 
b/fuzz_settings.h new file mode 100644 index 000000000..821581927 --- /dev/null +++ b/fuzz_settings.h @@ -0,0 +1,35 @@ +/* fuzz_settings.h -- fuzz-tester specific definitions + + Copyright (C) 2023 Genome Research Ltd. + + Author: Rob Davies + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#ifndef HTSLIB_FUZZ_SETTINGS_H +#define HTSLIB_FUZZ_SETTINGS_H +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + +#ifndef FUZZ_ALLOC_LIMIT +// By default libfuzzer reports out-of-memory on allocations > 2 Gbytes +#define FUZZ_ALLOC_LIMIT 2000000000ULL +#endif + +#endif +#endif diff --git a/sam.c b/sam.c index b4545ff47..de603cf16 100644 --- a/sam.c +++ b/sam.c @@ -37,6 +37,10 @@ DEALINGS IN THE SOFTWARE. 
*/ #include #include +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +#include "fuzz_settings.h" +#endif + // Suppress deprecation message for cigar_tab, which we initialise #include "htslib/hts_defs.h" #undef HTS_DEPRECATED @@ -251,6 +255,9 @@ sam_hdr_t *bam_hdr_read(BGZF *fp) bufsize = h->l_text + 1; if (bufsize < h->l_text) goto nomem; // so large that adding 1 overflowed +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (bufsize > FUZZ_ALLOC_LIMIT) goto nomem; +#endif h->text = (char*)malloc(bufsize); if (!h->text) goto nomem; h->text[h->l_text] = 0; // make sure it is NULL terminated @@ -264,6 +271,10 @@ sam_hdr_t *bam_hdr_read(BGZF *fp) if (h->n_targets < 0) goto invalid; // read reference sequence names and lengths +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (h->n_targets > (FUZZ_ALLOC_LIMIT - bufsize)/(sizeof(char*)+sizeof(uint32_t))) + goto nomem; +#endif if (h->n_targets > 0) { h->target_name = (char**)calloc(h->n_targets, sizeof(char*)); if (!h->target_name) goto nomem; @@ -425,6 +436,12 @@ int sam_realloc_bam_data(bam1_t *b, size_t desired) errno = ENOMEM; // Not strictly true but we can't store the size return -1; } +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (new_m_data > FUZZ_ALLOC_LIMIT) { + errno = ENOMEM; + return -1; + } +#endif if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { new_data = realloc(b->data, new_m_data); } else { diff --git a/vcf.c b/vcf.c index afd60fcaa..c126f7354 100644 --- a/vcf.c +++ b/vcf.c @@ -37,6 +37,10 @@ DEALINGS IN THE SOFTWARE. */ #include #include +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +#include "fuzz_settings.h" +#endif + #include "htslib/vcf.h" #include "htslib/bgzf.h" #include "htslib/tbx.h" @@ -703,6 +707,11 @@ static int bcf_hdr_set_idx(bcf_hdr_t *hdr, int dict_type, const char *tag, bcf_i } new_n = idinfo->id >= hdr->n[dict_type] ? 
idinfo->id+1 : hdr->n[dict_type]; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // hts_resize() can attempt to allocate up to 2 * requested items + if (new_n > FUZZ_ALLOC_LIMIT/(2 * sizeof(bcf_idpair_t))) + return -1; +#endif if (hts_resize(bcf_idpair_t, new_n, &hdr->m[dict_type], &hdr->id[dict_type], HTS_RESIZE_CLEAR)) { return -1; @@ -1489,6 +1498,9 @@ bcf_hdr_t *bcf_hdr_read(htsFile *hfp) if (bgzf_read(fp, buf, 4) != 4) goto fail; hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24); if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; } +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (hlen > FUZZ_ALLOC_LIMIT) { errno = ENOMEM; goto fail; } +#endif htxt = (char*)malloc(hlen + 1); if (!htxt) goto fail; if (bgzf_read(fp, htxt, hlen) != hlen) goto fail; @@ -1615,8 +1627,12 @@ static inline int bcf_read1_core(BGZF *fp, bcf1_t *v) shared_len = le_to_u32(x); if (shared_len < 24) return -2; shared_len -= 24; // to exclude six 32-bit integers - if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2; indiv_len = le_to_u32(x + 4); +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + // ks_resize() normally allocates 1.5 * requested size to allow for growth + if ((uint64_t) shared_len + indiv_len > FUZZ_ALLOC_LIMIT / 3 * 2) return -2; +#endif + if (ks_resize(&v->shared, shared_len ? shared_len : 1) != 0) return -2; if (ks_resize(&v->indiv, indiv_len ? 
indiv_len : 1) != 0) return -2; v->rid = le_to_i32(x + 8); v->pos = le_to_u32(x + 12); From 1abb46f872a75576c441149ed24c7ca940e12295 Mon Sep 17 00:00:00 2001 From: Rob Davies Date: Thu, 17 Aug 2023 12:10:34 +0100 Subject: [PATCH 473/488] Add missing dependency on libhts.a for hts_open_fuzzer --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5597d67d0..88af18adc 100644 --- a/Makefile +++ b/Makefile @@ -620,7 +620,7 @@ check test: all $(HTSCODECS_TEST_TARGETS) test/hts_endian: test/hts_endian.o $(CC) $(LDFLAGS) -o $@ test/hts_endian.o $(LIBS) -test/fuzz/hts_open_fuzzer: test/fuzz/hts_open_fuzzer.o +test/fuzz/hts_open_fuzzer: test/fuzz/hts_open_fuzzer.o libhts.a $(CC) $(LDFLAGS) -o $@ test/fuzz/hts_open_fuzzer.o libhts.a $(LIBS) -lpthread test/fieldarith: test/fieldarith.o libhts.a From 7e20d76f9c735be9bd209b743d3c41d35dee9590 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 24 Aug 2023 16:40:15 -0700 Subject: [PATCH 474/488] Apply the packed attribute to uint*_u types for Clang ... so that the following code ``` static inline void u32_to_le(uint32_t val, uint8_t *buf) { *((uint32_u *) buf) = val; ... ``` will not cause -fsanitize=alignment failures when building with Clang. --- htslib/hts_endian.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/htslib/hts_endian.h b/htslib/hts_endian.h index 30ad8055d..12effab7b 100644 --- a/htslib/hts_endian.h +++ b/htslib/hts_endian.h @@ -100,7 +100,7 @@ DEALINGS IN THE SOFTWARE. */ #endif #if HTS_ALLOW_UNALIGNED != 0 -# if defined (__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) +# if defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) || defined(__clang__) // This prevents problems with gcc's vectoriser generating the wrong // instructions for unaligned data. 
typedef uint16_t uint16_u __attribute__ ((__aligned__ (1))); From 6ba09a2453e4a70a32ae826c4486cb89c6e1eaae Mon Sep 17 00:00:00 2001 From: eneskuluk <54481799+eneskuluk@users.noreply.github.com> Date: Mon, 14 Aug 2023 14:55:34 -0700 Subject: [PATCH 475/488] initial commit, htslib genomicsdb pull --- Makefile | 24 +- faidx.c | 52 +++++ hfile.c | 2 +- htscodecs | 2 +- htslib/faidx.h | 4 + htslib/synced_bcf_reader.h | 1 + htslib/vcf.h | 68 +++++- synced_bcf_reader.c | 35 ++- vcf.c | 463 ++++++++++++++++++++++++++++++++++++- 9 files changed, 627 insertions(+), 24 deletions(-) diff --git a/Makefile b/Makefile index 88af18adc..ef5d61147 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ CC = gcc AR = ar RANLIB = ranlib - +SOURCE_DIR = . # Default libraries to link if configure is not used htslib_default_libs = -lz -lm -lbz2 -llzma -lcurl @@ -35,10 +35,20 @@ CPPFLAGS = # TODO: probably update cram code to make it compile cleanly with -Wc++-compat # For testing strict C99 support add -std=c99 -D_XOPEN_SOURCE=600 #CFLAGS = -g -Wall -O2 -pedantic -std=c99 -D_XOPEN_SOURCE=600 -CFLAGS = -g -Wall -O2 -fvisibility=hidden +ifdef DEBUG + CFLAGS = -DDEBUG -g3 -gdwarf-3 + LDFLAGS = -g3 -gdwarf-3 +else + CFLAGS = -O3 + LDFLAGS = +endif +ifdef PROFILE + CFLAGS += -pg +endif +CFLAGS += -Wall -fPIC EXTRA_CFLAGS_PIC = -fpic TARGET_CFLAGS = -LDFLAGS = -fvisibility=hidden +LDFLAGS = VERSION_SCRIPT_LDFLAGS = -Wl,-version-script,$(srcprefix)htslib.map LIBS = $(htslib_default_libs) @@ -123,7 +133,7 @@ htscodecs.mk: $(srcdir)/hts_probe_cc.sh '$(CC)' '$(CFLAGS) $(CPPFLAGS)' '$(LDFLAGS)' >> $@ srcdir = . 
-srcprefix = +srcprefix = $(SOURCE_DIR)/ HTSPREFIX = # Flags for SIMD code @@ -138,7 +148,7 @@ HTS_BUILD_SSSE3 = HTS_BUILD_POPCNT = HTS_BUILD_SSE4_1 = -include htslib_vars.mk +include $(SOURCE_DIR)/htslib_vars.mk include htscodecs.mk # If not using GNU make, you need to copy the version number from version.sh @@ -189,10 +199,10 @@ config_vars.h: .SUFFIXES: .bundle .c .cygdll .dll .o .pico .so .c.o: - $(CC) $(CFLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $< + $(CC) $(CFLAGS) -I$(SOURCE_DIR) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) -c -o $@ $< .c.pico: - $(CC) $(CFLAGS) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< + $(CC) $(CFLAGS) -I$(SOURCE_DIR) $(TARGET_CFLAGS) $(ALL_CPPFLAGS) $(EXTRA_CFLAGS_PIC) -c -o $@ $< LIBHTS_OBJS = \ diff --git a/faidx.c b/faidx.c index 5dd4bf1c0..8ac149485 100644 --- a/faidx.c +++ b/faidx.c @@ -914,6 +914,58 @@ int fai_adjust_region(const faidx_t *fai, int tid, return ((orig_beg != *beg ? 1 : 0) | (orig_end != *end && orig_end < HTS_POS_MAX ? 2 : 0)); } +static void fai_retrieve_into_buffer(const faidx_t *fai, const faidx1_t *val, + const uint64_t offset, const hts_pos_t beg, const hts_pos_t end, + char* s, hts_pos_t *len) { + size_t l; + int c = 0; + int ret; + + if ((uint64_t) end - (uint64_t) beg >= SIZE_MAX - 2) { + hts_log_error("Range %"PRId64"..%"PRId64" too big", beg, end); + *len = -1; + return; + } + + ret = bgzf_useek(fai->bgzf, + offset + + beg / val->line_blen * val->line_len + + beg % val->line_blen, SEEK_SET); + + if (ret < 0) { + *len = -1; + hts_log_error("Failed to retrieve block. (Seeking in a compressed, .gzi unindexed, file?)"); + return; + } + + l = 0; + + while ( l < end - beg && (c=bgzf_getc(fai->bgzf))>=0 ) + if (isgraph(c)) s[l++] = c; + if (c < 0) { + hts_log_error("Failed to retrieve block: %s", + c == -1 ? "unexpected end of file" : "error reading file"); + *len = -1; + return; + } + + s[l] = '\0'; + *len = l < INT_MAX ? 
l : INT_MAX; +} + +void faidx_fetch_seq_into_buffer(const faidx_t *fai, + const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, char* s, hts_pos_t *len) +{ + faidx1_t val; + + // Adjust position + if (faidx_adjust_position(fai, 1,&val, c_name, &p_beg_i, &p_end_i, len)) { + *len = 0; + return; + } + + fai_retrieve_into_buffer(fai, &val, val.seq_offset, p_beg_i, p_end_i + 1, s, len); +} char *faidx_fetch_seq64(const faidx_t *fai, const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, hts_pos_t *len) { diff --git a/hfile.c b/hfile.c index f8d42e49a..b8143928b 100644 --- a/hfile.c +++ b/hfile.c @@ -1123,7 +1123,7 @@ static hFILE *hopen_unknown_scheme(const char *fname, const char *mode) } /* Returns the appropriate handler, or NULL if the string isn't an URL. */ -static const struct hFILE_scheme_handler *find_scheme_handler(const char *s) +const struct hFILE_scheme_handler *find_scheme_handler(const char *s) { static const struct hFILE_scheme_handler unknown_scheme = { hopen_unknown_scheme, hfile_always_local, "built-in", 0 }; diff --git a/htscodecs b/htscodecs index 11b5007ff..dcb331678 160000 --- a/htscodecs +++ b/htscodecs @@ -1 +1 @@ -Subproject commit 11b5007ffb68bea9f6c777874a215e4187ce659a +Subproject commit dcb33167839622903897fc985a8cccf89b3358e2 diff --git a/htslib/faidx.h b/htslib/faidx.h index 4351b3fbe..12f3f4b40 100644 --- a/htslib/faidx.h +++ b/htslib/faidx.h @@ -237,6 +237,10 @@ by end users by calling `free()` on it. 
HTSLIB_EXPORT char *faidx_fetch_seq(const faidx_t *fai, const char *c_name, int p_beg_i, int p_end_i, int *len); +void faidx_fetch_seq_into_buffer(const faidx_t *fai, + const char *c_name, hts_pos_t p_beg_i, hts_pos_t p_end_i, + char* s, hts_pos_t *len); + /// Fetch the sequence in a region /** @param fai Pointer to the faidx_t struct @param c_name Region name diff --git a/htslib/synced_bcf_reader.h b/htslib/synced_bcf_reader.h index 9a6b48438..58d3d9389 100644 --- a/htslib/synced_bcf_reader.h +++ b/htslib/synced_bcf_reader.h @@ -141,6 +141,7 @@ typedef struct bcf_sr_t { htsFile *file; tbx_t *tbx_idx; + unsigned char read_one_record_only; hts_idx_t *bcf_idx; bcf_hdr_t *header; hts_itr_t *itr; diff --git a/htslib/vcf.h b/htslib/vcf.h index 70cf95372..d2cb6e6a8 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -61,16 +61,24 @@ extern "C" { #define BCF_HT_FLAG 0 // header type #define BCF_HT_INT 1 -#define BCF_HT_REAL 2 -#define BCF_HT_STR 3 -#define BCF_HT_LONG (BCF_HT_INT | 0x100) // BCF_HT_INT, but for int64_t values; VCF only! +#define BCF_HT_REAL 7 +#define BCF_HT_STR 8 +#define BCF_HT_CHAR 9 +#define BCF_HT_INT64 10 +#define BCF_HT_LONG BCF_HT_INT64 // BCF_HT_INT, but for int64_t values; VCF only! +#define BCF_HT_VOID 12 +#define BCF_NUM_HT_TYPES 14 +#define BCF_HT_UINT 2 +#define BCF_HT_UINT64 11 +#define BCF_HT_DOUBLE 13 #define BCF_VL_FIXED 0 // variable length #define BCF_VL_VAR 1 #define BCF_VL_A 2 #define BCF_VL_G 3 #define BCF_VL_R 4 - +#define BCF_VL_P 5 //ploidy +#define BCF_VL_Phased_Ploidy 6 //ploidy with phase /* === Dictionary === The header keeps three dictionaries. 
The first keeps IDs in the @@ -87,6 +95,10 @@ extern "C" { #define BCF_DT_CTG 1 #define BCF_DT_SAMPLE 2 +#define BCF_V_2_1_HEADER_MAGIC_STRING "BCF\2\1" +#define BCF_V_2_2_HEADER_MAGIC_STRING "BCF\2\2" +#define BCF_HEADER_MAGIC_STRING_LENGTH 5 + // Complete textual representation of a header line typedef struct bcf_hrec_t { int type; // One of the BCF_HL_* type @@ -142,12 +154,14 @@ extern uint8_t bcf_type_shift[]; #define VCF_SNP (1<<0) #define VCF_MNP (1<<1) #define VCF_INDEL (1<<2) -#define VCF_OTHER (1<<3) -#define VCF_BND (1<<4) // breakend -#define VCF_OVERLAP (1<<5) // overlapping deletion, ALT=* +#define VCF_OTHER 32 +#define VCF_BND 64 // breakend +#define VCF_OVERLAP 16 // overlapping deletion, ALT=* #define VCF_INS (1<<6) // implies VCF_INDEL #define VCF_DEL (1<<7) // implies VCF_INDEL #define VCF_ANY (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_INS|VCF_DEL) // any variant type (but not VCF_REF) +#define VCF_NON_REF 8 +#define VCF_SPANNING_DELETION 16 typedef struct bcf_variant_t { int type, n; // variant type and the number of bases affected, negative for deletions @@ -237,6 +251,7 @@ typedef struct bcf1_t { hts_pos_t rlen; // length of REF int32_t rid; // CHROM float qual; // QUAL + hts_pos_t m_end_point; //END - must be after QUAL due to a memcpy() in vcf.c uint32_t n_info:16, n_allele:16; uint32_t n_fmt:8, n_sample:24; kstring_t shared, indiv; @@ -338,6 +353,7 @@ typedef struct bcf1_t { */ HTSLIB_EXPORT bcf_hdr_t *bcf_hdr_read(htsFile *fp) HTS_RESULT_USED; + bcf_hdr_t *bcf_hdr_read_required_sample_line(htsFile *hfp, const uint8_t is_sample_line_required); /** * bcf_hdr_set_samples() - for more efficient VCF parsing when only one/few samples are needed @@ -375,6 +391,16 @@ typedef struct bcf1_t { HTSLIB_EXPORT int bcf_hdr_write(htsFile *fp, bcf_hdr_t *h) HTS_RESULT_USED; + + /* + * Serialize BCF header into buffer + * + * Returns new offset value in buffer if the new data fits within the buffer capacity, + * else returns the same 
offset value without modifying the buffer + */ + size_t bcf_hdr_serialize(bcf_hdr_t* h, uint8_t* buffer, size_t offset, const size_t capacity, const uint8_t is_bcf, const uint8_t keep_idx_fields); + + size_t bcf_hdr_deserialize(bcf_hdr_t* h, const uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf); /** * Parse VCF line contained in kstring and populate the bcf1_t struct * The line must not end with \n or \r characters. @@ -396,6 +422,27 @@ typedef struct bcf1_t { HTSLIB_EXPORT int vcf_format(const bcf_hdr_t *h, const bcf1_t *v, kstring_t *s); + /* + * Same as vcf_format, but for bcfs + * + * Returns new offset value in buffer if the new data fits within the buffer capacity, + * else returns the same offset value without modifying the buffer + * + * If vcf, then the hdr and tmp pointers must be valid. For bcfs, they might be null + */ + size_t bcf_serialize(bcf1_t* v, uint8_t* buffer, size_t offset, const size_t capacity, const uint8_t is_bcf, const bcf_hdr_t* hdr, kstring_t* tmp); + /* + * Same as vcf_parse, but for bcfs + * + * Returns new offset value in buffer if a full vcf record is read, + * else returns the same offset value + * + * If vcf, then the hdr and tmp pointers must be valid. For bcfs, they might be null + * + * Note that vcf parsing modifies the buffer (tokenize function) + */ + size_t bcf_deserialize(bcf1_t* v, uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf, const bcf_hdr_t* hdr); + /// Read next VCF or BCF record /** @param fp The file to read the record from @param h The header for the vcf/bcf file @@ -468,7 +515,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). 
*/ HTSLIB_EXPORT bcf_hdr_t *vcf_hdr_read(htsFile *fp) HTS_RESULT_USED; - + bcf_hdr_t *vcf_hdr_read_required_sample_line(htsFile *fp, const uint8_t is_sample_line_required); /// Write a VCF format header /** @param fp Output file @param h The header to write @@ -651,6 +698,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). /** The following functions are for internal use and should rarely be called directly */ HTSLIB_EXPORT int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt); + int bcf_hdr_parse_required_sample_line(bcf_hdr_t *hdr, char *htxt, size_t* hdr_length, + const uint8_t is_sample_line_required); /// Synchronize internal header structures /** @param h Header @@ -988,6 +1037,8 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). return bcf_update_info(hdr, line, key, values, n, BCF_HT_LONG); } + void bcf_set_end_point_from_info(const bcf_hdr_t* hdr, bcf1_t* line); + /* * bcf_update_format_*() - functions for updating FORMAT fields * @values: pointer to the array of values, the same number of elements @@ -1248,6 +1299,7 @@ set to one of BCF_ERR* codes and must be checked before calling bcf_write(). 
#define bcf_hdr_id2coltype(hdr,type,int_id) (uint32_t)((hdr)->id[BCF_DT_ID][int_id].val->info[type] & 0xf) #define bcf_hdr_idinfo_exists(hdr,type,int_id) ((int_id)>=0 && (int_id)<(hdr)->n[BCF_DT_ID] && (hdr)->id[BCF_DT_ID][int_id].val && bcf_hdr_id2coltype((hdr),(type),(int_id))!=0xf) #define bcf_hdr_id2hrec(hdr,dict_type,col_type,int_id) ((hdr)->id[(dict_type)==BCF_DT_CTG?BCF_DT_CTG:BCF_DT_ID][int_id].val->hrec[(dict_type)==BCF_DT_CTG?0:(col_type)]) + uint64_t bcf_hdr_id2contig_length(const bcf_hdr_t* hdr, const int id); /// Convert BCF FORMAT data to string form /** * @param s kstring to write into diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index a43ab15ae..a86aebc18 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -420,8 +420,12 @@ static void bcf_sr_destroy1(bcf_sr_t *reader) free(reader->fname); if ( reader->tbx_idx ) tbx_destroy(reader->tbx_idx); if ( reader->bcf_idx ) hts_idx_destroy(reader->bcf_idx); - bcf_hdr_destroy(reader->header); - hts_close(reader->file); + if (reader->header) { + bcf_hdr_destroy(reader->header); + } + if (reader->file) { + hts_close(reader->file); + } if ( reader->itr ) tbx_itr_destroy(reader->itr); int j; for (j=0; jmbuffer; j++) @@ -693,7 +697,7 @@ static int _reader_fill_buffer(bcf_srs_t *files, bcf_sr_t *reader) reader->nbuffer++; if ( reader->buffer[reader->nbuffer]->rid != reader->buffer[1]->rid ) break; - if ( reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full + if ( reader->read_one_record_only || reader->buffer[reader->nbuffer]->pos != reader->buffer[1]->pos ) break; // the buffer is full } if ( ret<0 ) { @@ -1044,6 +1048,7 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) kstring_t tmp = {0,0,0}; const char *sp = str, *ep = str; hts_pos_t from, to; + unsigned char inside_quotes = 0; while ( 1 ) { tmp.l = 0; @@ -1060,8 +1065,28 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) } else { - while ( *ep && *ep!=',' && *ep!=':' ) 
ep++; - kputsn(sp,ep-sp,&tmp); + //A quote is seen, flip flag inside_quotes + if(*ep == '"') + { + inside_quotes = 1 ^ inside_quotes; + sp = ++ep; + } + while ( *ep && ((inside_quotes && *ep!='"') || (!inside_quotes && *ep!=',' && *ep!=':')) ) ep++; + tmp.l = 0; + kputsn(sp,ep-sp,&tmp); + if(inside_quotes) + { + if(*ep == '"') + { + inside_quotes = 0; + ++ep; + } + else + { + fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s - terminating \" missing\n", __FILE__,__LINE__,__FUNCTION__,str); + free(reg); free(tmp.s); return NULL; + } + } } if ( *ep==':' ) { diff --git a/vcf.c b/vcf.c index c126f7354..4f87db5ff 100644 --- a/vcf.c +++ b/vcf.c @@ -1175,6 +1175,7 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) { int len, done = 0; char *p = htxt; + int return_val = 0; // Check sanity: "fileformat" string must come as first bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len); @@ -1195,6 +1196,12 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) // Parse the whole header do { while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) { + if(len < 0) + { + return_val = -1; + done = -1; + break; + } if (bcf_hdr_add_hrec(hdr, hrec) < 0) { bcf_hrec_destroy(hrec); return -1; @@ -1212,7 +1219,11 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) // of the next one). p += len; continue; - } + } + if(done < 0) + break; + + // Next should be the sample line. 
If not, it was a malformed // header, in which case print a warning and skip (many VCF @@ -2140,6 +2151,7 @@ static int bcf1_sync(bcf1_t *line) return 0; } + bcf1_t *bcf_copy(bcf1_t *dst, bcf1_t *src) { bcf1_sync(src); @@ -4353,6 +4365,432 @@ bcf_hdr_t *bcf_hdr_merge(bcf_hdr_t *dst, const bcf_hdr_t *src) } return dst; } +typedef union { + uint32_t i; + float f; +} if_pair; + +bcf_hdr_t *vcf_hdr_read_required_sample_line(htsFile *fp, const uint8_t is_sample_line_required) +{ + kstring_t txt, *s = &fp->line; + int ret; + bcf_hdr_t *h; + tbx_t *idx = NULL; + const char **names = NULL; + h = bcf_hdr_init("r"); + if (!h) { + hts_log_error("Failed to allocate bcf header"); + return NULL; + } + txt.l = txt.m = 0; txt.s = 0; + while ((ret = hts_getline(fp, KS_SEP_LINE, s)) >= 0) { + int e = 0; + if (s->l == 0) continue; + if (s->s[0] != '#') { + hts_log_error("No sample line"); + goto error; + } + if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here + kstring_t tmp = { 0, 0, NULL }; + hFILE *f = hopen(fp->fn_aux, "r"); + if (f == NULL) { + hts_log_error("Couldn't open \"%s\"", fp->fn_aux); + goto error; + } + while (tmp.l = 0, kgetline(&tmp, (kgets_func *) hgets, f) >= 0) { + char *tab = strchr(tmp.s, '\t'); + if (tab == NULL) continue; + e |= (kputs("##contig=\n", 2, &txt) < 0); + } + free(tmp.s); + if (hclose(f) != 0) { + hts_log_error("Error on closing %s", fp->fn_aux); + goto error; + } + if (e) goto error; + } + if (kputsn(s->s, s->l, &txt) < 0) goto error; + if (kputc('\n', &txt) < 0) goto error; + if (s->s[1] != '#') break; + } + if ( ret < -1 ) goto error; + if ( !txt.s ) + { + hts_log_error("Could not read the header"); + goto error; + } + size_t hdr_length = 0ull; + if ( bcf_hdr_parse_required_sample_line(h, txt.s, &hdr_length, is_sample_line_required) < 0 ) goto error; + + // check tabix index, are all contigs listed in the header? 
add the missing ones + idx = tbx_index_load3(fp->fn, NULL, HTS_IDX_SAVE_REMOTE|HTS_IDX_SILENT_FAIL); + if ( idx ) + { + int i, n, need_sync = 0; + names = tbx_seqnames(idx, &n); + if (!names) goto error; + for (i=0; ierrcode ) + { + // vcf_parse1() encountered a new contig or tag, undeclared in the + // header. At this point, the header must have been printed, + // proceeding would lead to a broken BCF file. Errors must be checked + // and cleared by the caller before we can proceed. + hts_log_error("Unchecked error (%d)", v->errcode); + return -1; + } + bcf1_sync(v); // check if the BCF record was modified + if(is_bcf) + { + if((offset+8*sizeof(int)+v->shared.l+v->indiv.l) <= capacity) + { + //First 8 integers represent various lengths + if_pair* x = (if_pair*)(buffer+offset); + x[0].i = v->shared.l + 24; // to include six 32-bit integers + x[1].i = v->indiv.l; + x[2].i = v->rid; + x[3].i = v->pos; + x[4].i = v->rlen; + x[5].f = v->qual; + x[6].i = (uint32_t)v->n_allele<<16 | v->n_info; + x[7].i = (uint32_t)v->n_fmt<<24 | v->n_sample; + offset += 8*sizeof(int); + memcpy(buffer+offset, v->shared.s, v->shared.l); + offset += v->shared.l; + memcpy(buffer+offset, v->indiv.s, v->indiv.l); + offset += v->indiv.l; + } + } + else + { + tmp->l = 0; + int status = vcf_format(hdr, v, tmp); + assert(status == 0); + if((offset+tmp->l) <= capacity) + { + memcpy(buffer+offset, tmp->s, tmp->l); + offset += tmp->l; + } + } + return offset; +} + +bcf_hdr_t *bcf_hdr_read_required_sample_line(htsFile *hfp, const uint8_t is_sample_line_required) +{ + if (hfp->format.format == vcf) + return vcf_hdr_read_required_sample_line(hfp, is_sample_line_required); + if (hfp->format.format != bcf) { + hts_log_error("Input is not detected as bcf or vcf format"); + return NULL; + } + + assert(hfp->is_bgzf); + + BGZF *fp = hfp->fp.bgzf; + uint8_t magic[5]; + bcf_hdr_t *h; + h = bcf_hdr_init("r"); + if (!h) { + hts_log_error("Failed to allocate bcf header"); + return NULL; + } + if (bgzf_read(fp, 
magic, 5) != 5) + { + hts_log_error("Failed to read the header (reading BCF in text mode?)"); + bcf_hdr_destroy(h); + return NULL; + } + if (strncmp((char*)magic, "BCF\2\2", 5) != 0) + { + if (!strncmp((char*)magic, "BCF", 3)) + hts_log_error("Invalid BCF2 magic string: only BCFv2.2 is supported"); + else + hts_log_error("Invalid BCF2 magic string"); + bcf_hdr_destroy(h); + return NULL; + } + uint8_t buf[4]; + size_t hlen; + char *htxt = NULL; + if (bgzf_read(fp, buf, 4) != 4) goto fail; + hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24); + if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; } + htxt = (char*)malloc(hlen + 1); + if (!htxt) goto fail; + if (bgzf_read(fp, htxt, hlen) != hlen) goto fail; + htxt[hlen] = '\0'; // Ensure htxt is terminated + size_t hdr_length = 0ull; + bcf_hdr_parse_required_sample_line(h, htxt, &hdr_length, is_sample_line_required); // FIXME: Does this return anything meaningful? + free(htxt); + return h; + fail: + hts_log_error("Failed to read BCF header"); + free(htxt); + bcf_hdr_destroy(h); + return NULL; +} + + +int bcf_hdr_parse_required_sample_line(bcf_hdr_t *hdr, char *htxt, size_t* hdr_length, + const uint8_t is_sample_line_required) +{ + int len, done = 0; + char *p = htxt; + int return_val = 0; + + // Check sanity: "fileformat" string must come as first + bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len); + if ( !hrec || !hrec->key || strcasecmp(hrec->key,"fileformat") ) + hts_log_warning("The first line should be ##fileformat; is the VCF/BCF header broken?"); + + if (bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } + + // The filter PASS must appear first in the dictionary + hrec = bcf_hdr_parse_line(hdr,"##FILTER=",&len); + if (!hrec || bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } + + // Parse the whole header + do { + while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) { + if(len < 0) + { + return_val = -1; + done = -1; + break; + } + 
if (bcf_hdr_add_hrec(hdr, hrec) < 0) { + bcf_hrec_destroy(hrec); + return -1; + } + p += len; + } + assert(hrec == NULL); + if (len < 0) { + // len < 0 indicates out-of-memory, or similar error + hts_log_error("Could not parse header line: %s", strerror(errno)); + return -1; + } else if (len > 0) { + // Bad header line. bcf_hdr_parse_line() will have logged it. + // Skip and try again on the next line (p + len will be the start + // of the next one). + p += len; + continue; + } + + if(done < 0) + break; + + // Next should be the sample line. If not, it was a malformed + // header, in which case print a warning and skip (many VCF + // operations do not really care about a few malformed lines). + // In the future we may want to add a strict mode that errors in + // this case. + if ( strncmp("#CHROM\tPOS",p,10) != 0 ) { + char *eol = strchr(p, '\n'); + if (*p != '\0') { + char buffer[320]; + hts_log_warning("Could not parse header line: %s", + hts_strprint(buffer, sizeof(buffer), + '"', p, + eol ? (eol - p) : SIZE_MAX)); + } + if (eol) { + p = eol + 1; // Try from the next line. + } else { + done = -1; // No more lines left, give up. + } + } else { + done = 1; // Sample line found + } + } while (!done); + + size_t sample_line_length = 0; + if (done < 0) { + if(is_sample_line_required) + { + // No sample line is fatal. 
+ hts_log_error("Could not parse the header, sample line not found"); + return -1; + } + } + else + { + if(return_val >= 0) + return_val = bcf_hdr_parse_sample_line(hdr,p); + } + (*hdr_length) = ((size_t)(p - htxt)) + sample_line_length; + if(return_val >= 0) + return_val = bcf_hdr_sync(hdr); + if(return_val >= 0) + bcf_hdr_check_sanity(hdr); + return return_val; +} + + + + + +size_t bcf_hdr_serialize(bcf_hdr_t* h, uint8_t* buffer, size_t offset, const size_t capacity, const uint8_t is_bcf, const uint8_t keep_idx_fields) +{ + if (!h) { + errno = EINVAL; + return offset; + } + if ( h->dirty ) { + if (bcf_hdr_sync(h) < 0) return offset; + } + + kstring_t htxt = {0,0,0}; + bcf_hdr_format(h, (is_bcf & keep_idx_fields), &htxt); + uint32_t hlen = htxt.l; + if(is_bcf) + { + kputc('\0', &htxt); // include the \0 byte + ++hlen; + + if((offset+5+sizeof(int)+hlen) <= capacity) + { + if(!keep_idx_fields) //htsjdk cannot deal with 2.2 header + memcpy(buffer+offset, "BCF\2\1", 5); + else + memcpy(buffer+offset, "BCF\2\2", 5); + offset += 5; + memcpy(buffer+offset, &hlen, sizeof(int)); + offset += sizeof(int); + memcpy(buffer+offset, htxt.s, hlen); + offset += hlen; + } + } + else + { + if(offset+hlen <= capacity) + { + memcpy(buffer+offset, htxt.s, hlen); + offset += hlen; + } + } + free(htxt.s); + return offset; +} + +size_t bcf_hdr_deserialize(bcf_hdr_t* h, const uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf) +{ + size_t hdr_length = 0ull; + size_t curr_offset = offset; + if(is_bcf) + { + //magic string + hdr length + if(curr_offset+BCF_HEADER_MAGIC_STRING_LENGTH+sizeof(int) > capacity) + return offset; + const char* buffer_magic_string = (const char*)(buffer+curr_offset); + if(strncmp(buffer_magic_string, BCF_V_2_2_HEADER_MAGIC_STRING, BCF_HEADER_MAGIC_STRING_LENGTH) != 0 + && strncmp(buffer_magic_string, BCF_V_2_1_HEADER_MAGIC_STRING, BCF_HEADER_MAGIC_STRING_LENGTH) != 0) + { + fprintf(stderr,"[%s:%d %s] invalid BCF2 magic string: only 
BCFv2.2 and BCFv2.1 are supported.\n", __FILE__,__LINE__,__FUNCTION__); + return offset; + } + curr_offset += BCF_HEADER_MAGIC_STRING_LENGTH; + //Header length + memcpy(&hdr_length, buffer+curr_offset, sizeof(int)); + curr_offset += sizeof(int); + if(curr_offset+hdr_length > capacity) + return offset; + } + bcf_hdr_parse(h, (char*)(buffer+curr_offset)); + return curr_offset+hdr_length; +} + +size_t bcf_deserialize(bcf1_t* v, uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf, const bcf_hdr_t* hdr) +{ + if(is_bcf) + { + bcf_clear(v); + size_t curr_offset = offset; + if(curr_offset+8*sizeof(uint32_t) >= capacity) + return offset; + const if_pair* x = (if_pair*)(buffer+curr_offset); + size_t shared_length = x[0].i-6*sizeof(int); + size_t indiv_length = x[1].i; + if(curr_offset+8*sizeof(uint32_t)+shared_length+indiv_length > capacity) + return offset; + ks_resize(&v->shared, shared_length); + ks_resize(&v->indiv, indiv_length); + v->rid = x[2].i; + v->pos = x[3].i; + v->rlen = x[4].i; + v->qual = x[5].f; + v->n_allele = (x[6].i)>>16; v->n_info = (x[6].i)&0xffff; + v->n_fmt = (x[7].i)>>24; v->n_sample = (x[7].i)&0xffffff; + v->shared.l = shared_length, v->indiv.l = indiv_length; + // silent fix of broken BCFs produced by earlier versions of bcf_subset, prior to and including bd6ed8b4 + if ( (!v->indiv.l || !v->n_sample) && v->n_fmt ) v->n_fmt = 0; + curr_offset += 8*sizeof(uint32_t); + + memcpy(v->shared.s, buffer+curr_offset, shared_length); + curr_offset += shared_length; + + memcpy(v->indiv.s, buffer+curr_offset, indiv_length); + curr_offset += indiv_length; + return curr_offset; + } + else + { + kstring_t tmp; + assert(offset < capacity); + tmp.s = (char*)(buffer+offset); + size_t max_length = capacity-offset; + size_t line_length = max_length; + //See if newline exists + char* line_end_ptr = (char*)(memchr(tmp.s, '\n', max_length)); + if(line_end_ptr) + { + line_length = ((size_t)(line_end_ptr - tmp.s)); + *line_end_ptr = 0; //replace 
'\n' with null byte, vcf_parse doesn't like '\n' + } + tmp.l = line_length; + tmp.m = max_length; + int status = vcf_parse(&tmp, hdr, v); + //vcf parsed succesfully + if(status == 0) + return offset + line_length + (line_end_ptr ? 1u : 0u); //for the \n character + else + return offset; + } +} int bcf_translate(const bcf_hdr_t *dst_hdr, bcf_hdr_t *src_hdr, bcf1_t *line) { @@ -4721,6 +5159,7 @@ static void bcf_set_variant_type(const char *ref, const char *alt, bcf_variant_t if ( alt[0]=='<' ) { if ( alt[1]=='X' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } // mpileup's X allele shouldn't be treated as variant + if( strncmp(alt, "", 9) == 0) { var->n = 0; var->type = VCF_NON_REF; return; } if ( alt[1]=='*' && alt[2]=='>' ) { var->n = 0; var->type = VCF_REF; return; } if ( !strcmp("NON_REF>",alt+1) ) { var->n = 0; var->type = VCF_REF; return; } var->type = VCF_OTHER; @@ -4830,7 +5269,7 @@ int bcf_get_variant_type(bcf1_t *rec, int ith_allele) hts_log_error("Requested allele outside valid range"); exit(1); } - return rec->d.var[ith_allele].type & ORIG_VAR_TYPES; + return rec->d.var[ith_allele].type; } #undef ORIG_VAR_TYPES @@ -5664,3 +6103,23 @@ const char *bcf_strerror(int errorcode, char *buffer, size_t maxbuffer) { return buffer; } +uint64_t bcf_hdr_id2contig_length(const bcf_hdr_t* hdr, const int id) +{ + bcf_hrec_t* hrec = bcf_hdr_id2hrec(hdr, BCF_DT_CTG, 0, id); + int i = 0; + for(i=0;inkeys;++i) + if(strcmp(hrec->keys[i], "length") == 0) + return strtoull(hrec->vals[i], 0, 10); + return 0; +} + +void bcf_set_end_point_from_info(const bcf_hdr_t* hdr, bcf1_t* line) +{ + bcf_unpack(line, BCF_UN_INFO); + bcf_info_t* info = bcf_get_info(hdr, line, "END"); + if(info) + line->m_end_point = info->v1.i - 1; //END value is 1 based, line->pos is 0 based, change to 0 based + else //no END tag, end is same as pos if not deletion, else depends on rlen + line->m_end_point = line->pos + line->rlen - 1; +} + From 952ce612c7b1ef5afcd55cf19fde2da9f6f7eb87 Mon Sep 17 
00:00:00 2001 From: eneskuluk <54481799+eneskuluk@users.noreply.github.com> Date: Mon, 28 Aug 2023 14:05:50 -0700 Subject: [PATCH 476/488] rebase, merged with latest develop, includes speed ups --- htslib/vcf.h | 17 +++-- kstring.c | 10 +-- vcf.c | 181 ++++++++++++++++++++++++++++++++------------------- 3 files changed, 132 insertions(+), 76 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index d2cb6e6a8..67c8cb5db 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -157,8 +157,8 @@ extern uint8_t bcf_type_shift[]; #define VCF_OTHER 32 #define VCF_BND 64 // breakend #define VCF_OVERLAP 16 // overlapping deletion, ALT=* -#define VCF_INS (1<<6) // implies VCF_INDEL -#define VCF_DEL (1<<7) // implies VCF_INDEL +#define VCF_INS VCF_INDEL // implies VCF_INDEL +#define VCF_DEL VCF_INDEL // implies VCF_INDEL #define VCF_ANY (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_INS|VCF_DEL) // any variant type (but not VCF_REF) #define VCF_NON_REF 8 #define VCF_SPANNING_DELETION 16 @@ -1572,7 +1572,7 @@ static inline int bcf_format_gt(bcf_fmt_t *fmt, int isample, kstring_t *str) return e == 0 ? 
0 : -1; } -static inline int bcf_enc_size(kstring_t *s, int size, int type) +static inline int bcf_enc_size(kstring_t *s, size_t size, int type) { // Most common case is first if (size < 15) { @@ -1598,11 +1598,17 @@ static inline int bcf_enc_size(kstring_t *s, int size, int type) *p++ = 1<<4|BCF_BT_INT16; i16_to_le(size, p); s->l += 4; - } else { + } + else if(size <= INT32_MAX){ *p++ = 1<<4|BCF_BT_INT32; i32_to_le(size, p); s->l += 6; } + else{ + *p++ = 1<<4|BCF_BT_INT64; + s->l += 10; + return -1; + } } return 0; } @@ -1611,7 +1617,8 @@ static inline int bcf_enc_inttype(long x) { if (x <= BCF_MAX_BT_INT8 && x >= BCF_MIN_BT_INT8) return BCF_BT_INT8; if (x <= BCF_MAX_BT_INT16 && x >= BCF_MIN_BT_INT16) return BCF_BT_INT16; - return BCF_BT_INT32; + if (x <= BCF_MAX_BT_INT32 && x >= BCF_MIN_BT_INT32) return BCF_BT_INT32; + return BCF_BT_INT64; } static inline int bcf_enc_int1(kstring_t *s, int32_t x) diff --git a/kstring.c b/kstring.c index f8e0f9f3d..5284d53d0 100644 --- a/kstring.c +++ b/kstring.c @@ -57,7 +57,7 @@ int kputd(double d, kstring_t *s) { if (ks_resize(s, s->l + 50) < 0) return EOF; // We let stdio handle the exponent cases - int s2 = snprintf(s->s + s->l, s->m - s->l, "%g", d); + int s2 = snprintf(s->s + s->l, s->m - s->l, "%#g", d); len += s2; s->l += s2; return len; @@ -116,7 +116,7 @@ int kputd(double d, kstring_t *s) { } xp[0] = '.'; cp[7] = 0; ep=cp+6; - if (cp[6] == '.') cp[6] = 0; + //if (cp[6] == '.') cp[6] = 0; } // Cull trailing zeros @@ -125,8 +125,10 @@ int kputd(double d, kstring_t *s) { char *z = ep+1; while (ep > cp) { if (*ep == '.') { - if (z[-1] == '.') - z[-1] = 0; + if (z[-1] == '.'){ + z[0] = '0'; + z[1] = 0; + } else z[0] = 0; break; diff --git a/vcf.c b/vcf.c index 4f87db5ff..db7a01745 100644 --- a/vcf.c +++ b/vcf.c @@ -2621,6 +2621,76 @@ static int bcf_enc_long1(kstring_t *s, int64_t x) { } #endif +int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize) +{ + int64_t max = INT64_MIN, min = INT64_MAX; + int i; + 
if (n <= 0) return bcf_enc_size(s, 0, BCF_BT_NULL); + else if (n == 1) return bcf_enc_long1(s, a[0]); + else { + if (wsize <= 0) wsize = n; + for (i = 0; i < n; ++i) { + if (a[i] == bcf_int64_missing || a[i] == bcf_int64_vector_end ) continue; + if (max < a[i]) max = a[i]; + if (min > a[i]) min = a[i]; + } + if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) { + bcf_enc_size(s, wsize, BCF_BT_INT8); + for (i = 0; i < n; ++i) + if ( a[i]==bcf_int64_vector_end ) kputc(bcf_int8_vector_end, s); + else if ( a[i]==bcf_int64_missing ) kputc(bcf_int8_missing, s); + else kputc(a[i], s); + } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) { + uint8_t *p; + bcf_enc_size(s, wsize, BCF_BT_INT16); + ks_resize(s, s->l + n * sizeof(int16_t)); + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) + { + int16_t x; + if ( a[i]==bcf_int64_vector_end ) x = bcf_int16_vector_end; + else if ( a[i]==bcf_int64_missing ) x = bcf_int16_missing; + else x = a[i]; + i16_to_le(x, p); + p += sizeof(int16_t); + } + s->l += n * sizeof(int16_t); + } else if(max <= BCF_MAX_BT_INT32 && min >= BCF_MIN_BT_INT32){ + uint8_t *p; + bcf_enc_size(s, wsize, BCF_BT_INT32); + ks_resize(s, s->l + n * sizeof(int32_t)); + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) { + int32_t x; + if ( a[i]==bcf_int64_vector_end ) x = bcf_int32_vector_end; + else if ( a[i]==bcf_int64_missing ) x = bcf_int32_missing; + else x = a[i]; + i32_to_le(x, p); + p += sizeof(int32_t); + } + s->l += n * sizeof(int32_t); + } +#ifdef VCF_ALLOW_INT64 + else { + uint8_t *p; + bcf_enc_size(s, wsize, BCF_BT_INT64); + ks_resize(s, s->l + n * sizeof(int64_t)); + p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i) { + int64_t x = a[i]; + i64_to_le(x, p); + p += sizeof(int64_t); + } + s->l += n * sizeof(int64_t); + } +#else + return -1; +#endif + } + + return 0; // FIXME: check for errs in this function +} + static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) { uint8_t *p; size_t i; @@ 
-2687,6 +2757,7 @@ int bcf_fmt_array(kstring_t *s, int n, int type, void *data) case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, v==bcf_int8_missing, v==bcf_int8_vector_end, kputw(v, s)); break; case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, v==bcf_int16_missing, v==bcf_int16_vector_end, kputw(v, s)); break; case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, v==bcf_int32_missing, v==bcf_int32_vector_end, kputw(v, s)); break; + case BCF_BT_INT64: BRANCH(int64_t, le_to_i64, v==bcf_int64_missing, v==bcf_int64_vector_end, kputll(v, s)); break; case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, v==bcf_float_missing, v==bcf_float_vector_end, kputd(le_to_float(p), s)); break; default: hts_log_error("Unexpected type %d", type); exit(1); break; } @@ -3360,8 +3431,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p char *r, *key; khint_t k; vdict_t *d = (vdict_t*)h->dict[BCF_DT_ID]; - int32_t *a_val = NULL; - + int64_t *a_val = NULL; v->n_info = 0; if (*(q-1) == ';') *(q-1) = 0; for (r = key = p;; ++r) { @@ -3417,7 +3487,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p if (*t == ',') ++n_val; // Check both int and float size in one step for simplicity if (n_val > max_n_val) { - int32_t *a_tmp = (int32_t *)realloc(a_val, n_val * sizeof(*a_val)); + int64_t *a_tmp = (int64_t *)realloc(a_val, n_val * sizeof(*a_val)); if (!a_tmp) { hts_log_error("Could not allocate memory at %s:%"PRIhts_pos, bcf_seqname_safe(h,v), v->pos+1); v->errcode |= BCF_ERR_LIMITS; // No appropriate code? 
@@ -3426,67 +3496,35 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p a_val = a_tmp; max_n_val = n_val; } - if ((y>>4&0xf) == BCF_HT_INT) { + if (((y >> 4 & 0xf) == BCF_HT_INT) || + ((y >> 4 & 0xf) == BCF_HT_LONG)) { i = 0, t = val; int64_t val1; - int is_int64 = 0; -#ifdef VCF_ALLOW_INT64 - if ( n_val==1 ) - { - overflow = 0; - long long int tmp_val = hts_str2int(val, &te, sizeof(tmp_val)*CHAR_BIT, &overflow); - if ( te==val ) tmp_val = bcf_int32_missing; - else if ( overflow || tmp_valBCF_MAX_BT_INT64 ) - { - if ( !extreme_int_warned ) - { - hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1); - extreme_int_warned = 1; - } - tmp_val = bcf_int32_missing; - } - else - is_int64 = 1; - val1 = tmp_val; - t = te; - i = 1; // this is just to avoid adding another nested block... - } -#endif - for (; i < n_val; ++i, ++t) - { + for (; i < n_val; ++i, ++t) { overflow = 0; - long int tmp_val = hts_str2int(t, &te, sizeof(tmp_val)*CHAR_BIT, &overflow); - if ( te==t ) tmp_val = bcf_int32_missing; - else if ( overflow || tmp_valBCF_MAX_BT_INT32 ) - { - if ( !extreme_int_warned ) - { - hts_log_warning("Extreme INFO/%s value encountered and set to missing at %s:%"PRIhts_pos,key,bcf_seqname_safe(h,v), v->pos+1); + long long int tmp_val = hts_str2int( + t, &te, sizeof(tmp_val) * CHAR_BIT, &overflow); + if (te == t) + tmp_val = bcf_int64_missing; + else if (overflow || tmp_val < BCF_MIN_BT_INT64 || + tmp_val > BCF_MAX_BT_INT64) { + if (!extreme_int_warned) { + hts_log_warning( + "Extreme INFO/%s value encountered and set to " + "missing at %s:%" PRIhts_pos, + key, bcf_seqname_safe(h, v), v->pos + 1); extreme_int_warned = 1; } - tmp_val = bcf_int32_missing; + tmp_val = bcf_int64_missing; } a_val[i] = tmp_val; - for (t = te; *t && *t != ','; t++); - } - if (n_val == 1) { -#ifdef VCF_ALLOW_INT64 - if ( is_int64 ) - { - v->unpacked |= BCF_IS_64BIT; - bcf_enc_long1(str, val1); - } 
- else - bcf_enc_int1(str, (int32_t)val1); -#else - val1 = a_val[0]; - bcf_enc_int1(str, (int32_t)val1); -#endif - } else { - bcf_enc_vint(str, n_val, a_val, -1); + for (t = te; *t && *t != ','; t++) + ; } - if (n_val==1 && (val1!=bcf_int32_missing || is_int64) - && memcmp(key, "END", 4) == 0) + v->unpacked |= BCF_IS_64BIT; + bcf_enc_vlong(str, n_val, a_val, -1); + val1 = a_val[0]; + if (n_val==1 && strcmp(key, "END") == 0) { if ( val1 <= v->pos ) { @@ -3499,7 +3537,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p else v->rlen = val1 - v->pos; } - } else if ((y>>4&0xf) == BCF_HT_REAL) { + } else if ((y >> 4 & 0xf) == BCF_HT_REAL) { float *val_f = (float *)a_val; for (i = 0, t = val; i < n_val; ++i, ++t) { @@ -4730,8 +4768,7 @@ size_t bcf_hdr_deserialize(bcf_hdr_t* h, const uint8_t* buffer, const size_t off if(curr_offset+hdr_length > capacity) return offset; } - bcf_hdr_parse(h, (char*)(buffer+curr_offset)); - return curr_offset+hdr_length; + return bcf_hdr_parse(h, (char*)(buffer+curr_offset)); } size_t bcf_deserialize(bcf1_t* v, uint8_t* buffer, const size_t offset, const size_t capacity, const uint8_t is_bcf, const bcf_hdr_t* hdr) @@ -5245,7 +5282,7 @@ static int bcf_set_variant_types(bcf1_t *b) // to be compatible with callers that are not expecting newer values // like VCF_INS, VCF_DEL. The full set is available from the newer // vcf_has_variant_type* interfaces. 
-#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP) +#define ORIG_VAR_TYPES (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_NON_REF) int bcf_get_variant_types(bcf1_t *rec) { if ( rec->d.var_type==-1 ) { @@ -5269,7 +5306,7 @@ int bcf_get_variant_type(bcf1_t *rec, int ith_allele) hts_log_error("Requested allele outside valid range"); exit(1); } - return rec->d.var[ith_allele].type; + return rec->d.var[ith_allele].type & ORIG_VAR_TYPES; } #undef ORIG_VAR_TYPES @@ -5385,11 +5422,7 @@ int bcf_update_info(const bcf_hdr_t *hdr, bcf1_t *line, const char *key, const v #ifdef VCF_ALLOW_INT64 else if ( type==BCF_HT_LONG ) { - if (n != 1) { - hts_log_error("Only storing a single BCF_HT_LONG value is supported at %s:%"PRIhts_pos, bcf_seqname_safe(hdr,line), line->pos+1); - abort(); - } - bcf_enc_long1(&str, *(int64_t *) values); + bcf_enc_vlong(&str, n, (const int64_t*)values, -1); } #endif else @@ -5843,7 +5876,12 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi { int i, ret = -4, tag_id = bcf_hdr_id2int(hdr, BCF_DT_ID, tag); if ( !bcf_hdr_idinfo_exists(hdr,BCF_HL_INFO,tag_id) ) return -1; // no such INFO field in the header - if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2; // expected different type + if((type & 0xff) == BCF_HT_LONG) { + const int ht_type_in_hdr = bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id); + if(ht_type_in_hdr != BCF_HT_INT && ht_type_in_hdr != BCF_HT_LONG) return -2; // expected different type + } + else + if ( bcf_hdr_id2type(hdr,BCF_HL_INFO,tag_id)!=(type & 0xff) ) return -2; // expected different type if ( !(line->unpacked & BCF_UN_INFO) ) bcf_unpack(line, BCF_UN_INFO); @@ -5916,6 +5954,14 @@ int bcf_get_info_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, voi } else { BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=p, int32_t); break; } + case BCF_BT_INT64: + if (type == BCF_HT_LONG) { + 
BRANCH(int64_t, le_to_i64, p==bcf_int64_missing, p==bcf_int64_vector_end, *tmp=bcf_int64_missing, *tmp=p, int64_t); + } else { + hts_log_error("Trying to get 32-bit int data from a field which contains 64 bit values"); + return -2; + } + break; case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set(tmp, p), float); break; default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, info->type, bcf_seqname_safe(hdr,line), line->pos+1); return -2; } @@ -6025,6 +6071,7 @@ int bcf_get_format_values(const bcf_hdr_t *hdr, bcf1_t *line, const char *tag, v case BCF_BT_INT8: BRANCH(int8_t, le_to_i8, p==bcf_int8_missing, p==bcf_int8_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; case BCF_BT_INT16: BRANCH(int16_t, le_to_i16, p==bcf_int16_missing, p==bcf_int16_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; case BCF_BT_INT32: BRANCH(int32_t, le_to_i32, p==bcf_int32_missing, p==bcf_int32_vector_end, *tmp=bcf_int32_missing, *tmp=bcf_int32_vector_end, *tmp=p, int32_t); break; + case BCF_BT_INT64: BRANCH(int64_t, le_to_i64, p==bcf_int64_missing, p==bcf_int64_vector_end, *tmp=bcf_int64_missing, *tmp=bcf_int64_vector_end, *tmp=p, int64_t); break; case BCF_BT_FLOAT: BRANCH(uint32_t, le_to_u32, p==bcf_float_missing, p==bcf_float_vector_end, bcf_float_set_missing(*tmp), bcf_float_set_vector_end(*tmp), bcf_float_set(tmp, p), float); break; default: hts_log_error("Unexpected type %d at %s:%"PRIhts_pos, fmt->type, bcf_seqname_safe(hdr,line), line->pos+1); exit(1); } From a9026fb7bd84aee50db06c16e633bb5fa5f767e2 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 17 Aug 2023 20:59:07 +1200 Subject: [PATCH 477/488] Add hclose()-doesn't-close-fd option and use it for hopen("-") Enables repeated hopen("-") / hclose() / hopen("-") where previously the underlying STDIN/OUT_FILENO would have been prematurely closed. 
This means that stdout is never really closed and hclose()'s return value does not reflect closing the underlying fd. Hence particularly paranoid programs that have written significant data to stdout will want to close and check it themselves just before the end of main(): if (fclose(stdout) != 0 && errno != EBADF) perror("closing stdout") (Ignore EBADF as stdout may already have been closed and checked, e.g. if the program has been linked to an earlier HTSlib where hclose() still closes STDOUT_FILENO.) --- hfile.c | 12 ++++++++++-- htslib/hfile.h | 4 ++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/hfile.c b/hfile.c index f8d42e49a..78533dd56 100644 --- a/hfile.c +++ b/hfile.c @@ -524,7 +524,7 @@ void hclose_abruptly(hFILE *fp) typedef struct { hFILE base; int fd; - unsigned is_socket:1; + unsigned is_socket:1, is_shared:1; } hFILE_fd; static ssize_t fd_read(hFILE *fpv, void *buffer, size_t nbytes) @@ -599,6 +599,10 @@ static int fd_close(hFILE *fpv) { hFILE_fd *fp = (hFILE_fd *) fpv; int ret; + + // If we don't own the fd, return successfully without actually closing it + if (fp->is_shared) return 0; + do { #ifdef HAVE_CLOSESOCKET ret = fp->is_socket? closesocket(fp->fd) : close(fp->fd); @@ -636,6 +640,7 @@ static hFILE *hopen_fd(const char *filename, const char *mode) fp->fd = fd; fp->is_socket = 0; + fp->is_shared = 0; fp->base.backend = &fd_backend; return &fp->base; @@ -702,6 +707,7 @@ hFILE *hdopen(int fd, const char *mode) fp->fd = fd; fp->is_socket = (strchr(mode, 's') != NULL); + fp->is_shared = (strchr(mode, 'S') != NULL); fp->base.backend = &fd_backend; return &fp->base; } @@ -723,10 +729,12 @@ static hFILE *hopen_fd_fileuri(const char *url, const char *mode) static hFILE *hopen_fd_stdinout(const char *mode) { int fd = (strchr(mode, 'r') != NULL)? 
STDIN_FILENO : STDOUT_FILENO; + char mode_shared[101]; + snprintf(mode_shared, sizeof mode_shared, "S%s", mode); #if defined HAVE_SETMODE && defined O_BINARY if (setmode(fd, O_BINARY) < 0) return NULL; #endif - return hdopen(fd, mode); + return hdopen(fd, mode_shared); } HTSLIB_EXPORT diff --git a/htslib/hfile.h b/htslib/hfile.h index 6e3a2a22a..bc86757a2 100644 --- a/htslib/hfile.h +++ b/htslib/hfile.h @@ -80,6 +80,10 @@ Note that the file must be opened in binary mode, or else there will be problems on platforms that make a difference between text and binary mode. +By default, the returned hFILE "takes ownership" of the file descriptor +and _fd_ will be closed by hclose(). When _mode_ contains `S` (shared fd), +hclose() will destroy the hFILE but not close the underlying _fd_. + For socket descriptors (on Windows), _mode_ should contain `s`. */ HTSLIB_EXPORT From 2429986f3e55e3194016494c7acf3785c5c215ea Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sun, 20 Aug 2023 23:06:29 +1200 Subject: [PATCH 478/488] Take advantage of shared hopen("-") in htsfile.c We don't need to dup(STDOUT_FILENO) now that hclose()/hts_close() no longer irretrievably close stdout. Instead fclose(stdout) explicitly just before the end of main() as it is a last chance to observe I/O errors. --- htsfile.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/htsfile.c b/htsfile.c index 9f7bf4531..229af6656 100644 --- a/htsfile.c +++ b/htsfile.c @@ -31,7 +31,6 @@ DEALINGS IN THE SOFTWARE. */ #include #include #include -#include #include "htslib/hfile.h" #include "htslib/hts.h" @@ -62,13 +61,6 @@ void error(const char *format, ...) status = EXIT_FAILURE; } -static htsFile *dup_stdout(const char *mode) -{ - int fd = dup(STDOUT_FILENO); - hFILE *hfp = (fd >= 0)? hdopen(fd, mode) : NULL; - return hfp? 
hts_hopen(hfp, "-", mode) : NULL; -} - static void view_sam(samFile *in, const char *filename) { bam1_t *b = NULL; @@ -81,7 +73,7 @@ static void view_sam(samFile *in, const char *filename) goto clean; } - out = dup_stdout("w"); + out = hts_open("-", "w"); if (out == NULL) { error("reopening standard output failed"); goto clean; } if (show_headers) { @@ -125,7 +117,7 @@ static void view_vcf(vcfFile *in, const char *filename) goto clean; } - out = dup_stdout("w"); + out = hts_open("-", "w"); if (out == NULL) { error("reopening standard output failed"); goto clean; } if (show_headers) { @@ -325,5 +317,8 @@ int main(int argc, char **argv) if (fp && hclose(fp) < 0) error("closing \"%s\" failed", argv[i]); } + if (fclose(stdout) != 0 && errno != EBADF) + error("closing standard output failed"); + return status; } From 946f291cd057c4c1702b96db4393a986dbf976d7 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Fri, 25 Aug 2023 23:09:55 +1200 Subject: [PATCH 479/488] Explicitly fclose(stdout) in test/test_view.c too --- test/test_view.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/test/test_view.c b/test/test_view.c index 02d109297..c899ff995 100644 --- a/test/test_view.c +++ b/test/test_view.c @@ -25,6 +25,7 @@ DEALINGS IN THE SOFTWARE. 
*/ #include +#include #include #include #include @@ -430,5 +431,10 @@ int main(int argc, char *argv[]) if (p.pool) hts_tpool_destroy(p.pool); + if (fclose(stdout) != 0 && errno != EBADF) { + fprintf(stderr, "Error closing standard output.\n"); + exit_code = EXIT_FAILURE; + } + return exit_code; } From 5acbc150b2c12ea77111d845c06179507df976ae Mon Sep 17 00:00:00 2001 From: vasudeva8 Date: Fri, 25 Aug 2023 16:50:04 +0100 Subject: [PATCH 480/488] bgzf_useek fails when offset is above block limits --- bgzf.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/bgzf.c b/bgzf.c index a775b1b83..b3f760f08 100644 --- a/bgzf.c +++ b/bgzf.c @@ -2566,6 +2566,7 @@ int bgzf_useek(BGZF *fp, off_t uoffset, int where) else break; } int i = ilo-1; + off_t offset = 0; if (bgzf_seek_common(fp, fp->idx->offs[i].caddr, 0) < 0) return -1; @@ -2573,9 +2574,14 @@ int bgzf_useek(BGZF *fp, off_t uoffset, int where) fp->errcode |= BGZF_ERR_IO; return -1; } - if ( uoffset - fp->idx->offs[i].uaddr > 0 ) + offset = uoffset - fp->idx->offs[i].uaddr; + if ( offset > 0 ) { - fp->block_offset = uoffset - fp->idx->offs[i].uaddr; + if (offset > fp->block_length) { + fp->errcode |= BGZF_ERR_IO; + return -1; //offset outside the available data + } + fp->block_offset = offset; assert( fp->block_offset <= fp->block_length ); // todo: skipped, unindexed, blocks } fp->uncompressed_address = uoffset; From 12759e4478acd527981a4974211a8e0a76d750dc Mon Sep 17 00:00:00 2001 From: eneskuluk <54481799+eneskuluk@users.noreply.github.com> Date: Wed, 6 Sep 2023 11:55:49 -0700 Subject: [PATCH 481/488] tests are passing with speed-up improvements, a few improvements are left out to make test pass for now --- htslib/vcf.h | 5 +-- synced_bcf_reader.c | 25 ++----------- vcf.c | 87 +++++++++++++++++++++++++++++++++------------ 3 files changed, 69 insertions(+), 48 deletions(-) diff --git a/htslib/vcf.h b/htslib/vcf.h index 67c8cb5db..de830dda5 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -1605,8 
+1605,9 @@ static inline int bcf_enc_size(kstring_t *s, size_t size, int type) s->l += 6; } else{ - *p++ = 1<<4|BCF_BT_INT64; - s->l += 10; + *p++ = 1<<4|BCF_BT_INT64;// + i64_to_le(size,p); + s->l += 10; // not so sure about +10 here, whether it is accurate or changes anything. return -1; } } diff --git a/synced_bcf_reader.c b/synced_bcf_reader.c index a86aebc18..acb208488 100644 --- a/synced_bcf_reader.c +++ b/synced_bcf_reader.c @@ -1048,7 +1048,6 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) kstring_t tmp = {0,0,0}; const char *sp = str, *ep = str; hts_pos_t from, to; - unsigned char inside_quotes = 0; while ( 1 ) { tmp.l = 0; @@ -1065,28 +1064,8 @@ static bcf_sr_regions_t *_regions_init_string(const char *str) } else { - //A quote is seen, flip flag inside_quotes - if(*ep == '"') - { - inside_quotes = 1 ^ inside_quotes; - sp = ++ep; - } - while ( *ep && ((inside_quotes && *ep!='"') || (!inside_quotes && *ep!=',' && *ep!=':')) ) ep++; - tmp.l = 0; - kputsn(sp,ep-sp,&tmp); - if(inside_quotes) - { - if(*ep == '"') - { - inside_quotes = 0; - ++ep; - } - else - { - fprintf(stderr,"[%s:%d %s] Could not parse the region(s): %s - terminating \" missing\n", __FILE__,__LINE__,__FUNCTION__,str); - free(reg); free(tmp.s); return NULL; - } - } + while ( *ep && *ep!=',' && *ep!=':' ) ep++; + kputsn(sp,ep-sp,&tmp); } if ( *ep==':' ) { diff --git a/vcf.c b/vcf.c index db7a01745..f5e00b675 100644 --- a/vcf.c +++ b/vcf.c @@ -2621,29 +2621,68 @@ static int bcf_enc_long1(kstring_t *s, int64_t x) { } #endif -int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize) +int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize)//supposed to be optimized, not working yet. 
{ int64_t max = INT64_MIN, min = INT64_MAX; int i; - if (n <= 0) return bcf_enc_size(s, 0, BCF_BT_NULL); - else if (n == 1) return bcf_enc_long1(s, a[0]); - else { + if (n <= 0) { + return bcf_enc_size(s, 0, BCF_BT_NULL); + } else if (n == 1) { + return bcf_enc_long1(s, a[0]); + } else { if (wsize <= 0) wsize = n; - for (i = 0; i < n; ++i) { - if (a[i] == bcf_int64_missing || a[i] == bcf_int64_vector_end ) continue; + + // Equivalent to: + // for (i = 0; i < n; ++i) { + // if (a[i] == bcf_int32_missing || a[i] == bcf_int32_vector_end ) + // continue; + // if (max < a[i]) max = a[i]; + // if (min > a[i]) min = a[i]; + // } + int64_t max4[4] = {INT64_MIN, INT64_MIN, INT64_MIN, INT64_MIN}; + int64_t min4[4] = {INT64_MAX, INT64_MAX, INT64_MAX, INT64_MAX}; + for (i = 0; i < (n&~3); i+=4) { + // bcf_int32_missing == INT32_MIN and + // bcf_int32_vector_end == INT32_MIN+1. + // We skip these, but can mostly avoid explicit checking + if (max4[0] < a[i+0]) max4[0] = a[i+0]; + if (max4[1] < a[i+1]) max4[1] = a[i+1]; + if (max4[2] < a[i+2]) max4[2] = a[i+2]; + if (max4[3] < a[i+3]) max4[3] = a[i+3]; + if (min4[0] > a[i+0] && a[i+0] > INT64_MIN+1) min4[0] = a[i+0]; + if (min4[1] > a[i+1] && a[i+1] > INT64_MIN+1) min4[1] = a[i+1]; + if (min4[2] > a[i+2] && a[i+2] > INT64_MIN+1) min4[2] = a[i+2]; + if (min4[3] > a[i+3] && a[i+3] > INT64_MIN+1) min4[3] = a[i+3]; + } + min = min4[0]; + if (min > min4[1]) min = min4[1]; + if (min > min4[2]) min = min4[2]; + if (min > min4[3]) min = min4[3]; + max = max4[0]; + if (max < max4[1]) max = max4[1]; + if (max < max4[2]) max = max4[2]; + if (max < max4[3]) max = max4[3]; + for (; i < n; ++i) { if (max < a[i]) max = a[i]; - if (min > a[i]) min = a[i]; + if (min > a[i] && a[i] > INT64_MIN+1) min = a[i]; } + if (max <= BCF_MAX_BT_INT8 && min >= BCF_MIN_BT_INT8) { - bcf_enc_size(s, wsize, BCF_BT_INT8); - for (i = 0; i < n; ++i) - if ( a[i]==bcf_int64_vector_end ) kputc(bcf_int8_vector_end, s); - else if ( a[i]==bcf_int64_missing ) 
kputc(bcf_int8_missing, s); - else kputc(a[i], s); + if (bcf_enc_size(s, wsize, BCF_BT_INT8) < 0 || + ks_resize(s, s->l + n) < 0) + return -1; + uint8_t *p = (uint8_t *) s->s + s->l; + for (i = 0; i < n; ++i, p++) { + if ( a[i]==bcf_int64_vector_end ) *p = bcf_int8_vector_end; + else if ( a[i]==bcf_int64_missing ) *p = bcf_int8_missing; + else *p = a[i]; + } + s->l += n; } else if (max <= BCF_MAX_BT_INT16 && min >= BCF_MIN_BT_INT16) { uint8_t *p; - bcf_enc_size(s, wsize, BCF_BT_INT16); - ks_resize(s, s->l + n * sizeof(int16_t)); + if (bcf_enc_size(s, wsize, BCF_BT_INT16) < 0 || + ks_resize(s, s->l + n * sizeof(int16_t)) < 0) + return -1; p = (uint8_t *) s->s + s->l; for (i = 0; i < n; ++i) { @@ -2657,8 +2696,9 @@ int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize) s->l += n * sizeof(int16_t); } else if(max <= BCF_MAX_BT_INT32 && min >= BCF_MIN_BT_INT32){ uint8_t *p; - bcf_enc_size(s, wsize, BCF_BT_INT32); - ks_resize(s, s->l + n * sizeof(int32_t)); + if (bcf_enc_size(s, wsize, BCF_BT_INT32) < 0 || + ks_resize(s, s->l + n * sizeof(int32_t)) < 0) + return -1; p = (uint8_t *) s->s + s->l; for (i = 0; i < n; ++i) { int32_t x; @@ -2670,11 +2710,11 @@ int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize) } s->l += n * sizeof(int32_t); } -#ifdef VCF_ALLOW_INT64 + #ifdef VCF_ALLOW_INT64 else { uint8_t *p; - bcf_enc_size(s, wsize, BCF_BT_INT64); - ks_resize(s, s->l + n * sizeof(int64_t)); + if(bcf_enc_size(s, wsize, BCF_BT_INT64) < 0 || ks_resize(s, s->l + n * sizeof(int64_t)) < 0) + return -1; p = (uint8_t *) s->s + s->l; for (i = 0; i < n; ++i) { int64_t x = a[i]; @@ -2688,7 +2728,7 @@ int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize) #endif } - return 0; // FIXME: check for errs in this function + return 0; } static inline int serialize_float_array(kstring_t *s, size_t n, const float *a) { @@ -3524,7 +3564,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p v->unpacked |= 
BCF_IS_64BIT; bcf_enc_vlong(str, n_val, a_val, -1); val1 = a_val[0]; - if (n_val==1 && strcmp(key, "END") == 0) + if (n_val==1 && strcmp(key, "END") == 0)//memset instead of strcmp { if ( val1 <= v->pos ) { @@ -3586,6 +3626,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) // Ensure string we parse has space to permit some over-flow when during // parsing. Eg to do memcmp(key, "END", 4) in vcf_parse_info over // the more straight forward looking strcmp, giving a speed advantage. + /* if (ks_resize(s, s->l+4) < 0) return -1; @@ -3598,7 +3639,7 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) s->s[s->l+1] = 0; s->s[s->l+2] = 0; s->s[s->l+3] = 0; - + */ // commented out the part that was required for optimization in vcf_parse_info function, will take a look later. bcf_clear1(v); str = &v->shared; memset(&aux, 0, sizeof(ks_tokaux_t)); @@ -4657,7 +4698,7 @@ int bcf_hdr_parse_required_sample_line(bcf_hdr_t *hdr, char *htxt, size_t* hdr_l // operations do not really care about a few malformed lines). // In the future we may want to add a strict mode that errors in // this case. - if ( strncmp("#CHROM\tPOS",p,10) != 0 ) { + if ( strncmp("#CHROM\t",p,7) && strncmp("#CHROM ",p,7) ) { char *eol = strchr(p, '\n'); if (*p != '\0') { char buffer[320]; From ce36a3daa2100e8c00cd71c4baf266b35ec8afe7 Mon Sep 17 00:00:00 2001 From: eneskuluk <54481799+eneskuluk@users.noreply.github.com> Date: Wed, 6 Sep 2023 12:26:13 -0700 Subject: [PATCH 482/488] fix comment --- vcf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vcf.c b/vcf.c index f5e00b675..e64326263 100644 --- a/vcf.c +++ b/vcf.c @@ -2621,7 +2621,7 @@ static int bcf_enc_long1(kstring_t *s, int64_t x) { } #endif -int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize)//supposed to be optimized, not working yet. 
+int bcf_enc_vlong(kstring_t *s, const int n, const int64_t *a, int wsize) { int64_t max = INT64_MIN, min = INT64_MAX; int i; @@ -3564,7 +3564,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p v->unpacked |= BCF_IS_64BIT; bcf_enc_vlong(str, n_val, a_val, -1); val1 = a_val[0]; - if (n_val==1 && strcmp(key, "END") == 0)//memset instead of strcmp + if (n_val==1 && strcmp(key, "END") == 0)//memcmp instead of strcmp { if ( val1 <= v->pos ) { From 9913d3750c1164d548456366b9a2ab180ccbcbce Mon Sep 17 00:00:00 2001 From: eneskuluk <54481799+eneskuluk@users.noreply.github.com> Date: Wed, 6 Sep 2023 15:23:06 -0700 Subject: [PATCH 483/488] add back the optimization --- vcf.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/vcf.c b/vcf.c index e64326263..d23d84b7f 100644 --- a/vcf.c +++ b/vcf.c @@ -3564,7 +3564,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p v->unpacked |= BCF_IS_64BIT; bcf_enc_vlong(str, n_val, a_val, -1); val1 = a_val[0]; - if (n_val==1 && strcmp(key, "END") == 0)//memcmp instead of strcmp + if (n_val==1 && val1!=bcf_int64_missing && memcmp(key, "END", 4) == 0)//memcmp instead of strcmp { if ( val1 <= v->pos ) { @@ -3635,11 +3635,10 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) // almost certainly is never detected by the compiler so has no impact, // but equally so this code has minimal (often beneficial) impact on // performance too.) - s->s[s->l+0] = 0; - s->s[s->l+1] = 0; - s->s[s->l+2] = 0; - s->s[s->l+3] = 0; - */ // commented out the part that was required for optimization in vcf_parse_info function, will take a look later. + for(int i = 0; i < 4; i++) + s->s[s->l+i] = 0; + */ + // commented out the part that was required for optimization in vcf_parse_info function, will take a look later. 
bcf_clear1(v); str = &v->shared; memset(&aux, 0, sizeof(ks_tokaux_t)); @@ -4622,6 +4621,9 @@ bcf_hdr_t *bcf_hdr_read_required_sample_line(htsFile *hfp, const uint8_t is_samp if (bgzf_read(fp, buf, 4) != 4) goto fail; hlen = buf[0] | (buf[1] << 8) | (buf[2] << 16) | ((size_t) buf[3] << 24); if (hlen >= SIZE_MAX) { errno = ENOMEM; goto fail; } +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (hlen > FUZZ_ALLOC_LIMIT) { errno = ENOMEM; goto fail; } +#endif htxt = (char*)malloc(hlen + 1); if (!htxt) goto fail; if (bgzf_read(fp, htxt, hlen) != hlen) goto fail; From 96834310daf8285cf640432e31b7bbae05eb1e69 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 13 Sep 2023 10:08:49 +0100 Subject: [PATCH 484/488] Fix a minor memory leak in malformed CRAM EXTERNAL blocks. (#1671) If we have an unofficial CRAM v4.0 encoded file and are attempting to use non-byte based data in an EXTERNAL encoded block, then we didn't free the cram_codecs structure when failing to parse and initialise. Credit to OSS-Fuzz Fixes oss-fuzz 62144 --- cram/cram_codecs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cram/cram_codecs.c b/cram/cram_codecs.c index 683badf93..7d9cf128d 100644 --- a/cram/cram_codecs.c +++ b/cram/cram_codecs.c @@ -482,10 +482,10 @@ cram_codec *cram_external_decode_init(cram_block_compression_hdr *hdr, else if (option == E_BYTE || option == E_BYTE_ARRAY) c->decode = cram_external_decode_char; else - return NULL; + goto malformed; break; default: - return NULL; + goto malformed; } } else { // CRAM 3 and earlier encodes integers as EXTERNAL. We need From 1f0375c2b001f2a06c025950fcbc66f4ea5bd625 Mon Sep 17 00:00:00 2001 From: vasudeva8 Date: Wed, 2 Aug 2023 16:04:30 +0100 Subject: [PATCH 485/488] Add support for multiple files to bgzip Reimplemented to reduce complexities. 
Update to avoid test failure in windows Updated to work with shared stdin/out hfile --- bgzip.1 | 7 +- bgzip.c | 613 ++++++++++++++++++++++++++++----------------------- test/test.pl | 46 ++++ 3 files changed, 382 insertions(+), 284 deletions(-) diff --git a/bgzip.1 b/bgzip.1 index b1950d246..b5786c190 100644 --- a/bgzip.1 +++ b/bgzip.1 @@ -4,7 +4,7 @@ bgzip \- Block compression/decompression utility .\" .\" Copyright (C) 2009-2011 Broad Institute. -.\" Copyright (C) 2018, 2021-2022 Genome Research Limited. +.\" Copyright (C) 2018, 2021-2023 Genome Research Limited. .\" .\" Author: Heng Li .\" @@ -54,7 +54,7 @@ bgzip \- Block compression/decompression utility .IR size ] .RB [ -@ .IR threads ] -.RI [ file ] +.RI [ file " ...]" .PP .SH DESCRIPTION .PP @@ -70,7 +70,8 @@ If the -c option is used, the result will be written to standard output, otherwise when compressing bgzip will write to a new file with a .gz suffix and remove the original. When decompressing the input file must have a .gz suffix, which will be removed to make the output name. Again -after decompression completes the input file will be removed. +after decompression completes the input file will be removed. When multiple +files are given as input, the operation is performed on all of them. .SH OPTIONS .TP 10 diff --git a/bgzip.c b/bgzip.c index 589f79f66..5f577af3a 100644 --- a/bgzip.c +++ b/bgzip.c @@ -1,7 +1,7 @@ /* bgzip.c -- Block compression/decompression utility. Copyright (C) 2008, 2009 Broad Institute / Massachusetts Institute of Technology - Copyright (C) 2010, 2013-2019, 2021-2022 Genome Research Ltd. + Copyright (C) 2010, 2013-2019, 2021-2023 Genome Research Ltd. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -134,7 +134,7 @@ int main(int argc, char **argv) char *buffer; long start, end, size; char *index_fname = NULL; - int threads = 1; + int threads = 1, isstdin = 0, usedstdout = 0, ret = 0; static const struct option loptions[] = { @@ -188,329 +188,380 @@ int main(int argc, char **argv) fprintf(stderr, "[bgzip] Illegal region: [%ld, %ld]\n", start, end); return 1; } - if (compress == 1) { - hFILE* f_src = NULL; - char out_mode[3] = "w\0"; - char out_mode_exclusive[4] = "wx\0"; - - if (compress_level < -1 || compress_level > 9) { - fprintf(stderr, "[bgzip] Invalid compress-level: %d\n", compress_level); - return 1; - } - if (compress_level >= 0) { - out_mode[1] = compress_level + '0'; - out_mode_exclusive[2] = compress_level + '0'; - } - if (!(f_src = hopen(argc > optind ? argv[optind] : "-", "r"))) { - fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), argv[optind]); - return 1; - } + if ( (index || reindex) && rebgzip ) + { + fprintf(stderr, "[bgzip] Can't produce a index and rebgzip simultaneously\n"); + return 1; + } + if ( rebgzip && !index_fname ) + { + fprintf(stderr, "[bgzip] Index file name expected with rebgzip. See -I option.\n"); + return 1; + } + /* avoid -I / indexfile with multiple inputs while index/reindex. these wont be set during + read/decompress and are not considered even if set */ + if ( (index || reindex) && index_fname && argc - optind > 1) { + fprintf(stderr, "[bgzip] Cannot specify index filename with multiple data file on index, reindex.\n"); + return 1; + } - if ( argc>optind ) - { - if (pstdout) - fp = bgzf_open("-", out_mode); - else + do { + isstdin = optind >= argc ? 1 : !strcmp("-", argv[optind]); //using stdin or not? 
+ /*stdout is in use when explicitly selected or when stdin in is in use, it need to be closed + explicitly to get all io errors*/ + usedstdout |= isstdin || pstdout || test; + + if (compress == 1) { + hFILE* f_src = NULL; + char out_mode[3] = "w\0"; + char out_mode_exclusive[4] = "wx\0"; + + if (compress_level < -1 || compress_level > 9) { + fprintf(stderr, "[bgzip] Invalid compress-level: %d\n", compress_level); + return 1; + } + if (compress_level >= 0) { + out_mode[1] = compress_level + '0'; + out_mode_exclusive[2] = compress_level + '0'; + } + if (!(f_src = hopen(!isstdin ? argv[optind] : "-", "r"))) { + fprintf(stderr, "[bgzip] %s: %s\n", strerror(errno), isstdin ? "stdin" : argv[optind]); + return 1; + } + + if ( argc>optind && !isstdin ) //named input file that isn't an explicit "-" { - char *name = malloc(strlen(argv[optind]) + 5); - strcpy(name, argv[optind]); - strcat(name, ".gz"); - fp = bgzf_open(name, is_forced? out_mode : out_mode_exclusive); - if (fp == NULL && errno == EEXIST && confirm_overwrite(name)) - fp = bgzf_open(name, out_mode); - if (fp == NULL) { - fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno)); + if (pstdout) + fp = bgzf_open("-", out_mode); + else + { + char *name = malloc(strlen(argv[optind]) + 5); + strcpy(name, argv[optind]); + strcat(name, ".gz"); + fp = bgzf_open(name, is_forced? 
out_mode : out_mode_exclusive); + if (fp == NULL && errno == EEXIST) { + if (confirm_overwrite(name)) { + fp = bgzf_open(name, out_mode); + } + else { + ret = 2; //explicit N - no overwrite, continue and return 2 + if (hclose(f_src) < 0) + ; //ignoring return value + free(name); + continue; + } + } + if (fp == NULL) { + fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno)); + free(name); + return 1; + } free(name); - return 1; } - free(name); } - } - else if (!pstdout && isatty(fileno((FILE *)stdout)) ) - return bgzip_main_usage(stderr, EXIT_FAILURE); - else if ( index && !index_fname ) - { - fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); - return 1; - } - else - fp = bgzf_open("-", out_mode); - - if ( index && rebgzip ) - { - fprintf(stderr, "[bgzip] Can't produce a index and rebgzip simultaneously\n"); - return 1; - } - - if ( rebgzip && !index_fname ) - { - fprintf(stderr, "[bgzip] Index file name expected when writing to stdout. See -I option.\n"); - return 1; - } + else if (!pstdout && isatty(fileno((FILE *)stdout)) ) + return bgzip_main_usage(stderr, EXIT_FAILURE); + else if ( index && !index_fname ) + { + fprintf(stderr, "[bgzip] Index file name expected when writing to stdout\n"); + return 1; + } + else + fp = bgzf_open("-", out_mode); - if ( index ) bgzf_index_build_init(fp); - if (threads > 1) - bgzf_mt(fp, threads, 256); + if ( index ) bgzf_index_build_init(fp); + if (threads > 1) + bgzf_mt(fp, threads, 256); - buffer = malloc(WINDOW_SIZE); - if (!buffer) - return 1; - if (rebgzip){ - if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) error("Could not load index: %s.gzi\n", argv[optind]); + buffer = malloc(WINDOW_SIZE); + if (!buffer) + return 1; + if (rebgzip){ + if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) error("Could not load index: %s.%s\n", !isstdin ? argv[optind] : index_fname, !isstdin ? 
"gzi" : ""); - while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0) - if (bgzf_block_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); - } - else { - htsFormat fmt; - int textual = 0; - if (!binary - && hts_detect_format(f_src, &fmt) == 0 - && fmt.compression == no_compression) { - switch(fmt.format) { - case text_format: - case sam: - case vcf: - case bed: - case fasta_format: - case fastq_format: - case fai_format: - case fqi_format: - textual = 1; - break; - default: break; // silence clang warnings - } + while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0) + if (bgzf_block_write(fp, buffer, c) < 0) error("Could not write %d bytes: Error %d\n", c, fp->errcode); } + else { + htsFormat fmt; + int textual = 0; + if (!binary + && hts_detect_format(f_src, &fmt) == 0 + && fmt.compression == no_compression) { + switch(fmt.format) { + case text_format: + case sam: + case vcf: + case bed: + case fasta_format: + case fastq_format: + case fai_format: + case fqi_format: + textual = 1; + break; + default: break; // silence clang warnings + } + } - if (binary || !textual) { - // Binary data, either detected or explicit - while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0) - if (bgzf_write(fp, buffer, c) < 0) - error("Could not write %d bytes: Error %d\n", - c, fp->errcode); - } else { - /* Text mode, try a flush after a newline */ - int in_header = 1, n = 0, long_line = 0; - while ((c = hread(f_src, buffer+n, WINDOW_SIZE-n)) > 0) { - int c2 = c+n; - int flush = 0; - if (in_header && - (long_line || buffer[0] == '@' || buffer[0] == '#')) { - // Scan forward to find the last header line. 
- int last_start = 0; - n = 0; - while (n < c2) { - if (buffer[n++] != '\n') - continue; - - last_start = n; - if (n < c2 && - !(buffer[n] == '@' || buffer[n] == '#')) { - in_header = 0; - break; + if (binary || !textual) { + // Binary data, either detected or explicit + while ((c = hread(f_src, buffer, WINDOW_SIZE)) > 0) + if (bgzf_write(fp, buffer, c) < 0) + error("Could not write %d bytes: Error %d\n", + c, fp->errcode); + } else { + /* Text mode, try a flush after a newline */ + int in_header = 1, n = 0, long_line = 0; + while ((c = hread(f_src, buffer+n, WINDOW_SIZE-n)) > 0) { + int c2 = c+n; + int flush = 0; + if (in_header && + (long_line || buffer[0] == '@' || buffer[0] == '#')) { + // Scan forward to find the last header line. + int last_start = 0; + n = 0; + while (n < c2) { + if (buffer[n++] != '\n') + continue; + + last_start = n; + if (n < c2 && + !(buffer[n] == '@' || buffer[n] == '#')) { + in_header = 0; + break; + } + } + if (!last_start) { + n = c2; + long_line = 1; + } else { + n = last_start; + flush = 1; + long_line = 0; } - } - if (!last_start) { - n = c2; - long_line = 1; - } else { - n = last_start; - flush = 1; - long_line = 0; - } - } else { - // Scan backwards to find the last newline. - n += c; // c read plus previous n overflow - while (--n >= 0 && ((char *)buffer)[n] != '\n') - ; - - if (n >= 0) { - flush = 1; - n++; } else { - n = c2; + // Scan backwards to find the last newline. + n += c; // c read plus previous n overflow + while (--n >= 0 && ((char *)buffer)[n] != '\n') + ; + + if (n >= 0) { + flush = 1; + n++; + } else { + n = c2; + } } + + // Pos n is either at the end of the buffer with flush==0, + // or the first byte after a newline and a flush point. 
+ if (bgzf_write(fp, buffer, n) < 0) + error("Could not write %d bytes: Error %d\n", + n, fp->errcode); + if (flush) + if (bgzf_flush_try(fp, 65536) < 0) // force + return -1; + + memmove(buffer, buffer+n, c2-n); + n = c2-n; } - // Pos n is either at the end of the buffer with flush==0, - // or the first byte after a newline and a flush point. + // Trailing data. if (bgzf_write(fp, buffer, n) < 0) error("Could not write %d bytes: Error %d\n", - n, fp->errcode); - if (flush) - if (bgzf_flush_try(fp, 65536) < 0) // force - return -1; - - memmove(buffer, buffer+n, c2-n); - n = c2-n; + n, fp->errcode); } - - // Trailing data. - if (bgzf_write(fp, buffer, n) < 0) - error("Could not write %d bytes: Error %d\n", - n, fp->errcode); } + if ( index ) + { + if (index_fname) { + if (bgzf_index_dump(fp, index_fname, NULL) < 0) + error("Could not write index to '%s'\n", index_fname); + } else if (!isstdin) { + if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0) + error("Could not write index to '%s.gz.gzi'\n", argv[optind]); + } + else { + //stdin, cant create index file as name is not present "-.gz.gzi" not a valid one! 
+ error("Can not write index for stdin data without index filename, use -I option to set index file.\n"); + } + } + if (bgzf_close(fp) < 0) + error("Output close failed: Error %d\n", fp->errcode); + if (hclose(f_src) < 0) + error("Input close failed\n"); + if (argc > optind && !pstdout && !keep && !isstdin) unlink(argv[optind]); + free(buffer); } - if ( index ) + else if ( reindex ) { - if (index_fname) { + if ( argc>optind && !isstdin ) + { + fp = bgzf_open(argv[optind], "r"); + if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]); + } + else + { + if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n"); + fp = bgzf_open("-", "r"); + if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno)); + } + + buffer = malloc(BGZF_BLOCK_SIZE); + bgzf_index_build_init(fp); + int ret; + while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ; + free(buffer); + if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n"); + + if ( index_fname ) { if (bgzf_index_dump(fp, index_fname, NULL) < 0) error("Could not write index to '%s'\n", index_fname); - } else { - if (bgzf_index_dump(fp, argv[optind], ".gz.gzi") < 0) - error("Could not write index to '%s.gz.gzi'\n", - argv[optind]); + } else if (!isstdin) { + if (bgzf_index_dump(fp, argv[optind], ".gzi") < 0) + error("Could not write index to '%s.gzi'\n", argv[optind]); + } + else { + //stdin, cant create index file as name is not present "-.gzi" not a valid one! 
+ error("Can not write index for stdin data without index filename, use -I option to set index file.\n"); } - } - if (bgzf_close(fp) < 0) - error("Output close failed: Error %d\n", fp->errcode); - if (hclose(f_src) < 0) - error("Input close failed\n"); - if (argc > optind && !pstdout && !keep) unlink(argv[optind]); - free(buffer); - return 0; - } - else if ( reindex ) - { - if ( argc>optind ) - { - fp = bgzf_open(argv[optind], "r"); - if ( !fp ) error("[bgzip] Could not open file: %s\n", argv[optind]); - } - else - { - if ( !index_fname ) error("[bgzip] Index file name expected when reading from stdin\n"); - fp = bgzf_open("-", "r"); - if ( !fp ) error("[bgzip] Could not read from stdin: %s\n", strerror(errno)); - } - buffer = malloc(BGZF_BLOCK_SIZE); - bgzf_index_build_init(fp); - int ret; - while ( (ret=bgzf_read(fp, buffer, BGZF_BLOCK_SIZE))>0 ) ; - free(buffer); - if ( ret<0 ) error("Is the file gzipped or bgzipped? The latter is required for indexing.\n"); - - if ( index_fname ) { - if (bgzf_index_dump(fp, index_fname, NULL) < 0) - error("Could not write index to '%s'\n", index_fname); - } else { - if (bgzf_index_dump(fp, argv[optind], ".gzi") < 0) - error("Could not write index to '%s.gzi'\n", argv[optind]); + if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode); } - - if ( bgzf_close(fp)<0 ) error("Close failed: Error %d\n",fp->errcode); - return 0; - } - else - { - int f_dst; - - if ( argc>optind ) + else { - fp = bgzf_open(argv[optind], "r"); - if (fp == NULL) { - fprintf(stderr, "[bgzip] Could not open %s: %s\n", argv[optind], strerror(errno)); - return 1; - } - if (bgzf_compression(fp) == no_compression) { - fprintf(stderr, "[bgzip] %s: not a compressed file -- ignored\n", argv[optind]); - bgzf_close(fp); - return 1; - } + int f_dst, is_forced_tmp = is_forced; - if (pstdout || test) { - f_dst = fileno(stdout); - } - else { - const int wrflags = O_WRONLY | O_CREAT | O_TRUNC; - char *name = argv[optind], *ext; - size_t pos; - for (pos = 
strlen(name); pos > 0; --pos) - if (name[pos] == '.' || name[pos] == '/') break; - if (pos == 0 || name[pos] != '.') { - fprintf(stderr, "[bgzip] can't remove an extension from %s -- please rename\n", argv[optind]); - bgzf_close(fp); + if ( argc>optind && !isstdin ) + { + fp = bgzf_open(argv[optind], "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not open %s: %s\n", argv[optind], strerror(errno)); return 1; } - name = strdup(argv[optind]); - name[pos] = '\0'; - ext = &name[pos+1]; - if (! (known_extension(ext) || confirm_filename(&is_forced, name, ext))) { - fprintf(stderr, "[bgzip] unknown extension .%s -- declining to decompress to %s\n", ext, name); + if (bgzf_compression(fp) == no_compression) { + fprintf(stderr, "[bgzip] %s: not a compressed file -- ignored\n", argv[optind]); bgzf_close(fp); - free(name); return 1; } - f_dst = open(name, is_forced? wrflags : wrflags|O_EXCL, 0666); - if (f_dst < 0 && errno == EEXIST && confirm_overwrite(name)) - f_dst = open(name, wrflags, 0666); - if (f_dst < 0) { - fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno)); + + if (pstdout || test) { + f_dst = fileno(stdout); + } + else { + const int wrflags = O_WRONLY | O_CREAT | O_TRUNC; + char *name = argv[optind], *ext; + size_t pos; + for (pos = strlen(name); pos > 0; --pos) + if (name[pos] == '.' || name[pos] == '/') break; + if (pos == 0 || name[pos] != '.') { + fprintf(stderr, "[bgzip] can't remove an extension from %s -- please rename\n", argv[optind]); + bgzf_close(fp); + return 1; + } + name = strdup(argv[optind]); + name[pos] = '\0'; + ext = &name[pos+1]; + if (! (known_extension(ext) || confirm_filename(&is_forced_tmp, name, ext))) { + fprintf(stderr, "[bgzip] unknown extension .%s -- declining to decompress to %s\n", ext, name); + bgzf_close(fp); + free(name); + ret = 2; //explicit N, continue and return 2 + continue; + } + f_dst = open(name, is_forced_tmp? 
wrflags : wrflags|O_EXCL, 0666); + if (f_dst < 0 && errno == EEXIST) { + if (confirm_overwrite(name)) { + f_dst = open(name, wrflags, 0666); + } + else { + ret = 2; //explicit N - no overwrite, continue and return 2 + free(name); + bgzf_close(fp); + continue; + } + } + if (f_dst < 0) { + fprintf(stderr, "[bgzip] can't create %s: %s\n", name, strerror(errno)); + free(name); + return 1; + } free(name); - return 1; } - free(name); } - } - else if (!pstdout && isatty(fileno((FILE *)stdin)) ) - return bgzip_main_usage(stderr, EXIT_FAILURE); - else - { - f_dst = fileno(stdout); - fp = bgzf_open("-", "r"); - if (fp == NULL) { - fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); - return 1; + else if (!pstdout && isatty(fileno((FILE *)stdin)) ) + return bgzip_main_usage(stderr, EXIT_FAILURE); + else + { + f_dst = fileno(stdout); + fp = bgzf_open("-", "r"); + if (fp == NULL) { + fprintf(stderr, "[bgzip] Could not read from stdin: %s\n", strerror(errno)); + return 1; + } + if (bgzf_compression(fp) == no_compression) { + fprintf(stderr, "[bgzip] stdin is not compressed -- ignored\n"); + bgzf_close(fp); + return 1; + } } - if (bgzf_compression(fp) == no_compression) { - fprintf(stderr, "[bgzip] stdin is not compressed -- ignored\n"); - bgzf_close(fp); - return 1; + + buffer = malloc(WINDOW_SIZE); + if ( start>0 ) + { + if (index_fname) { + if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) + error("Could not load index: %s\n", index_fname); + } else { + if (optind >= argc || isstdin) { + error("The -b option requires -I when reading from stdin " + "(and stdin must be seekable)\n"); + } + if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) + error("Could not load index: %s.gzi\n", argv[optind]); + } + if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start); } - } - buffer = malloc(WINDOW_SIZE); - if ( start>0 ) - { - if (index_fname) { - if ( bgzf_index_load(fp, index_fname, NULL) < 0 ) - error("Could not 
load index: %s\n", index_fname); - } else { - if (optind >= argc) { - error("The -b option requires -I when reading from stdin " - "(and stdin must be seekable)\n"); + if (threads > 1) + bgzf_mt(fp, threads, 256); + + #ifdef _WIN32 + _setmode(f_dst, O_BINARY); + #endif + long start_reg = start, end_reg = end; + while (1) { + if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); + else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? WINDOW_SIZE:(end - start)); + if (c == 0) break; + if (c < 0) error("Error %d in block starting at offset %" PRId64 "(%" PRIX64 ")\n", fp->errcode, fp->block_address, fp->block_address); + start += c; + if ( !test && write(f_dst, buffer, c) != c ) { + #ifdef _WIN32 + if (GetLastError() != ERROR_NO_DATA) + #endif + error("Could not write %d bytes\n", c); } - if ( bgzf_index_load(fp, argv[optind], ".gzi") < 0 ) - error("Could not load index: %s.gzi\n", argv[optind]); + if (end >= 0 && start >= end) break; + } + start = start_reg; + end = end_reg; + free(buffer); + if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode); + if (argc > optind && !pstdout && !test && !keep && !isstdin) unlink(argv[optind]); + if (!isstdin && !pstdout && !test) { + close(f_dst); //close output file when it is not stdout } - if ( bgzf_useek(fp, start, SEEK_SET) < 0 ) error("Could not seek to %d-th (uncompressd) byte\n", start); } + } while (++optind < argc); - if (threads > 1) - bgzf_mt(fp, threads, 256); - -#ifdef _WIN32 - _setmode(f_dst, O_BINARY); -#endif - while (1) { - if (end < 0) c = bgzf_read(fp, buffer, WINDOW_SIZE); - else c = bgzf_read(fp, buffer, (end - start > WINDOW_SIZE)? 
WINDOW_SIZE:(end - start)); - if (c == 0) break; - if (c < 0) error("Error %d in block starting at offset %" PRId64 "(%" PRIX64 ")\n", fp->errcode, fp->block_address, fp->block_address); - start += c; - if ( !test && write(f_dst, buffer, c) != c ) { -#ifdef _WIN32 - if (GetLastError() != ERROR_NO_DATA) -#endif - error("Could not write %d bytes\n", c); - } - if (end >= 0 && start >= end) break; + if (usedstdout && !reindex) { + //stdout in use, have to close explicitly to get any pending write errors + if (fclose(stdout) != 0 && errno != EBADF) { + fprintf(stderr, "[bgzip] Failed to close stdout, errno %d", errno); + ret = 1; } - free(buffer); - if (bgzf_close(fp) < 0) error("Close failed: Error %d\n",fp->errcode); - if (argc > optind && !pstdout && !test && !keep) unlink(argv[optind]); - return 0; } + return ret; } diff --git a/test/test.pl b/test/test.pl index 566e7cfce..a8af05b47 100755 --- a/test/test.pl +++ b/test/test.pl @@ -377,6 +377,8 @@ sub test_bgzip { my $index = "${compressed}.gzi"; my $test = sprintf('%s %2s threads', 'bgzip round-trip', $threads ? $threads : 'no'); + my $uncompressed1 = "$$opts{tmp}/ce.fa.$threads"; + my $uncompressed1_copy = "$$opts{tmp}/ce.fa.$threads.copy"; # Round-trip test print "$test: "; @@ -473,6 +475,50 @@ sub test_bgzip { return; } passed($opts,$test); + + # multi file test, expects compressed files from previous tests + # bgzip should return failure if both inputs not present + $test = sprintf('%s %2s threads', 'bgzip multifile', + $threads ? $threads : 'no'); + print "$test: "; + + #decompress and remove + $c = "$$opts{bin}/bgzip $at -d '$compressed' '$compressed_copy'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + #check both files present and matches or not + $c = "cmp '$data' '$uncompressed1'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, + $out ? 
$out : "'$data' '$uncompressed1' differ"); + return; + } + $c = "cmp '$data' '$uncompressed1_copy'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, + $out ? $out : "'$data' '$uncompressed1_copy' differ"); + return; + } + #compress and remove + $c = "$$opts{bin}/bgzip $at '$uncompressed1' '$uncompressed1_copy'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + #decompress again to ensure successful compression + $c = "$$opts{bin}/bgzip $at -d '$compressed' '$compressed_copy'"; + ($ret, $out) = _cmd($c); + if ($ret) { + failed($opts, $test, "non-zero exit from $c"); + return; + } + passed($opts,$test); } my $test_view_failures; From 30211d88e8f7d30cba82ab12a77ed7786b0c0918 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 6 Sep 2023 10:07:28 +0100 Subject: [PATCH 486/488] Enable auto-vectorisation in CRAM 3.1 codecs. I suspect this was initially hard as on, but later made something we explicitly enable but forgetting to add that code into htslib. On Illumina it made little difference, but wasn't detrimental. I need bigger data sets, but they're mostly on unavailable systems right now. Small tests demonstrate utility though, specifically on decode speeds. For other platforms: Ultima Genomics =============== Orig real 0m25.784s user 0m24.506s sys 0m1.189s real 0m9.155s user 0m7.775s sys 0m1.379s RANS_ORDER_SIMD_AUTO real 0m24.987s user 0m23.699s sys 0m1.219s real 0m8.097s user 0m6.635s sys 0m1.461s That's 13% quicker decode and 3% quicker encode. 
It's mostly QS and tags: $ ~/samtools/samtools cram-size -v _.cram|grep 32x16 BLOCK 10 617823 77895 12.61% r32x16-o1 BLOCK 12 911236491 188134803 20.65% r32x16-o1R QS BLOCK 27 232221 38816 16.72% r32x16-o0 FC BLOCK 31 54067 10718 19.82% r32x16-o0 BS BLOCK 7614554 917596491 50148593 5.47% r32x16-o1 t0Z BLOCK 7630914 931877007 108982153 11.69% r32x16-o1R tpB ONT === Orig real 0m3.018s user 0m2.854s sys 0m0.130s real 0m0.578s user 0m0.538s sys 0m0.040s RANS_ORDER_SIMD_AUTO real 0m2.912s user 0m2.740s sys 0m0.120s real 0m0.500s user 0m0.430s sys 0m0.070s That's 16% quicker decode and 4% quicker encode, but sample size is admittedly tiny for both tests. File size changes are under 0.1% growth, mainly due to 32 rANS states instead of 4. The RANS_ORDER_SIMD_AUTO flag basically enables the 32-way rANS if the block is sufficiently large (>50kb), so the extra 112 byte state overhead isn't significant. --- cram/cram_io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cram/cram_io.c b/cram/cram_io.c index 92a31583f..20e0a6555 100644 --- a/cram/cram_io.c +++ b/cram/cram_io.c @@ -1852,8 +1852,9 @@ static char *cram_compress_by_method(cram_slice *s, char *in, size_t in_size, // see enum cram_block. We map RANS_* methods to order bit-fields static int methmap[] = { 1, 64,9, 128,129, 192,193 }; + int m = method == RANS_PR0 ? 0 : methmap[method - RANS_PR1]; cp = rans_compress_4x16((unsigned char *)in, in_size, &out_size_i, - method == RANS_PR0 ? 
0 : methmap[method - RANS_PR1]); + m | RANS_ORDER_SIMD_AUTO); *out_size = out_size_i; return (char *)cp; } From ba14d0d9f2c6c217481d44bcb72821684a254962 Mon Sep 17 00:00:00 2001 From: eneskuluk <54481799+eneskuluk@users.noreply.github.com> Date: Fri, 22 Sep 2023 14:05:20 -0700 Subject: [PATCH 487/488] revert precision change and minor fix in faidx.c --- faidx.c | 2 +- kstring.c | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/faidx.c b/faidx.c index 8ac149485..133a43177 100644 --- a/faidx.c +++ b/faidx.c @@ -950,7 +950,7 @@ static void fai_retrieve_into_buffer(const faidx_t *fai, const faidx1_t *val, } s[l] = '\0'; - *len = l < INT_MAX ? l : INT_MAX; + *len = l; } void faidx_fetch_seq_into_buffer(const faidx_t *fai, diff --git a/kstring.c b/kstring.c index 5284d53d0..f8e0f9f3d 100644 --- a/kstring.c +++ b/kstring.c @@ -57,7 +57,7 @@ int kputd(double d, kstring_t *s) { if (ks_resize(s, s->l + 50) < 0) return EOF; // We let stdio handle the exponent cases - int s2 = snprintf(s->s + s->l, s->m - s->l, "%#g", d); + int s2 = snprintf(s->s + s->l, s->m - s->l, "%g", d); len += s2; s->l += s2; return len; @@ -116,7 +116,7 @@ int kputd(double d, kstring_t *s) { } xp[0] = '.'; cp[7] = 0; ep=cp+6; - //if (cp[6] == '.') cp[6] = 0; + if (cp[6] == '.') cp[6] = 0; } // Cull trailing zeros @@ -125,10 +125,8 @@ int kputd(double d, kstring_t *s) { char *z = ep+1; while (ep > cp) { if (*ep == '.') { - if (z[-1] == '.'){ - z[0] = '0'; - z[1] = 0; - } + if (z[-1] == '.') + z[-1] = 0; else z[0] = 0; break; From 5b60e74b7529940a582838e53fcd00b1168e5e5e Mon Sep 17 00:00:00 2001 From: eneskuluk <54481799+eneskuluk@users.noreply.github.com> Date: Mon, 25 Sep 2023 16:11:10 -0700 Subject: [PATCH 488/488] address some of the comments --- faidx.c | 6 ++++++ htslib/vcf.h | 36 ++++++++++++++++++------------------ vcf.c | 20 +------------------- 3 files changed, 25 insertions(+), 37 deletions(-) diff --git a/faidx.c b/faidx.c index 133a43177..089001dd3 100644 --- 
a/faidx.c +++ b/faidx.c @@ -926,6 +926,12 @@ static void fai_retrieve_into_buffer(const faidx_t *fai, const faidx1_t *val, *len = -1; return; } + + if (val->line_blen <= 0) { + hts_log_error("Invalid line length in index: %d", val->line_blen); + *len = -1; + return; + } ret = bgzf_useek(fai->bgzf, offset diff --git a/htslib/vcf.h b/htslib/vcf.h index de830dda5..908f15b65 100644 --- a/htslib/vcf.h +++ b/htslib/vcf.h @@ -61,16 +61,16 @@ extern "C" { #define BCF_HT_FLAG 0 // header type #define BCF_HT_INT 1 -#define BCF_HT_REAL 7 -#define BCF_HT_STR 8 -#define BCF_HT_CHAR 9 -#define BCF_HT_INT64 10 +#define BCF_HT_REAL 2 +#define BCF_HT_STR 3 +#define BCF_HT_UINT 4 +#define BCF_HT_CHAR 5 +#define BCF_HT_INT64 6 #define BCF_HT_LONG BCF_HT_INT64 // BCF_HT_INT, but for int64_t values; VCF only! -#define BCF_HT_VOID 12 -#define BCF_NUM_HT_TYPES 14 -#define BCF_HT_UINT 2 -#define BCF_HT_UINT64 11 -#define BCF_HT_DOUBLE 13 +#define BCF_HT_UINT64 7 +#define BCF_HT_VOID 8 +#define BCF_HT_DOUBLE 9 +#define BCF_NUM_HT_TYPES 10 #define BCF_VL_FIXED 0 // variable length #define BCF_VL_VAR 1 @@ -154,14 +154,14 @@ extern uint8_t bcf_type_shift[]; #define VCF_SNP (1<<0) #define VCF_MNP (1<<1) #define VCF_INDEL (1<<2) -#define VCF_OTHER 32 -#define VCF_BND 64 // breakend -#define VCF_OVERLAP 16 // overlapping deletion, ALT=* -#define VCF_INS VCF_INDEL // implies VCF_INDEL -#define VCF_DEL VCF_INDEL // implies VCF_INDEL +#define VCF_OTHER (1<<3) +#define VCF_BND (1<<4) // breakend +#define VCF_OVERLAP (1<<5) // overlapping deletion, ALT=* +#define VCF_SPANNING_DELETION VCF_OVERLAP +#define VCF_INS (1<<6) // implies VCF_INDEL +#define VCF_DEL (1<<7) // implies VCF_INDEL #define VCF_ANY (VCF_SNP|VCF_MNP|VCF_INDEL|VCF_OTHER|VCF_BND|VCF_OVERLAP|VCF_INS|VCF_DEL) // any variant type (but not VCF_REF) -#define VCF_NON_REF 8 -#define VCF_SPANNING_DELETION 16 +#define VCF_NON_REF (1<<8) typedef struct bcf_variant_t { int type, n; // variant type and the number of bases affected, negative for 
deletions @@ -1605,9 +1605,9 @@ static inline int bcf_enc_size(kstring_t *s, size_t size, int type) s->l += 6; } else{ - *p++ = 1<<4|BCF_BT_INT64;// + *p++ = 1<<4|BCF_BT_INT64; i64_to_le(size,p); - s->l += 10; // not so sure about +10 here, whether it is accurate or changes anything. + s->l += 10; return -1; } } diff --git a/vcf.c b/vcf.c index d23d84b7f..e6275ddad 100644 --- a/vcf.c +++ b/vcf.c @@ -1175,7 +1175,6 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) { int len, done = 0; char *p = htxt; - int return_val = 0; // Check sanity: "fileformat" string must come as first bcf_hrec_t *hrec = bcf_hdr_parse_line(hdr,p,&len); @@ -1198,7 +1197,6 @@ int bcf_hdr_parse(bcf_hdr_t *hdr, char *htxt) while (NULL != (hrec = bcf_hdr_parse_line(hdr, p, &len))) { if(len < 0) { - return_val = -1; done = -1; break; } @@ -3577,7 +3575,7 @@ static int vcf_parse_info(kstring_t *str, const bcf_hdr_t *h, bcf1_t *v, char *p else v->rlen = val1 - v->pos; } - } else if ((y >> 4 & 0xf) == BCF_HT_REAL) { + } else if ((y>>4&0xf) == BCF_HT_REAL) { float *val_f = (float *)a_val; for (i = 0, t = val; i < n_val; ++i, ++t) { @@ -3623,22 +3621,6 @@ int vcf_parse(kstring_t *s, const bcf_hdr_t *h, bcf1_t *v) // Assumed in lots of places, but we may as well spot this early assert(sizeof(float) == sizeof(int32_t)); - // Ensure string we parse has space to permit some over-flow when during - // parsing. Eg to do memcmp(key, "END", 4) in vcf_parse_info over - // the more straight forward looking strcmp, giving a speed advantage. - /* - if (ks_resize(s, s->l+4) < 0) - return -1; - - // Force our memory to be initialised so we avoid the technicality of - // undefined behaviour in using a 4-byte memcmp. (The reality is this - // almost certainly is never detected by the compiler so has no impact, - // but equally so this code has minimal (often beneficial) impact on - // performance too.) 
- for(int i = 0; i < 4; i++) - s->s[s->l+i] = 0; - */ - // commented out the part that was required for optimization in vcf_parse_info function, will take a look later. bcf_clear1(v); str = &v->shared; memset(&aux, 0, sizeof(ks_tokaux_t));