From 765f67e801e9460c46cf5625e939b38d3250dc99 Mon Sep 17 00:00:00 2001 From: Dan Gealow Date: Fri, 7 Nov 2025 02:53:18 +0000 Subject: [PATCH 1/3] Update pgen lib to v2.0.0-a.6.26 and add pgenlib_write.h / .cc --- external_libs/pgenlib/include/pgenlib_misc.cc | 83 +- external_libs/pgenlib/include/pgenlib_misc.h | 86 +- external_libs/pgenlib/include/pgenlib_read.cc | 264 +- external_libs/pgenlib/include/pgenlib_read.h | 63 +- .../pgenlib/include/pgenlib_write.cc | 2624 +++++++++++++++++ external_libs/pgenlib/include/pgenlib_write.h | 393 +++ external_libs/pgenlib/include/plink2_base.cc | 53 +- external_libs/pgenlib/include/plink2_base.h | 316 +- external_libs/pgenlib/include/plink2_bits.cc | 124 +- external_libs/pgenlib/include/plink2_bits.h | 46 +- 10 files changed, 3834 insertions(+), 218 deletions(-) create mode 100644 external_libs/pgenlib/include/pgenlib_write.cc create mode 100644 external_libs/pgenlib/include/pgenlib_write.h diff --git a/external_libs/pgenlib/include/pgenlib_misc.cc b/external_libs/pgenlib/include/pgenlib_misc.cc index 332ec399..1d3a75a4 100644 --- a/external_libs/pgenlib/include/pgenlib_misc.cc +++ b/external_libs/pgenlib/include/pgenlib_misc.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.0, copyright (C) 2005-2024 Shaun Purcell, +// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it @@ -14,13 +14,20 @@ // You should have received a copy of the GNU Lesser General Public License // along with this library. If not, see . - #include "pgenlib_misc.h" +#include + #ifdef __cplusplus namespace plink2 { #endif +#ifndef NDEBUG +char* g_pgl_errbuf = nullptr; +char* g_pgl_errbuf_write_iter = nullptr; +char* g_pgl_errbuf_end = nullptr; +#endif + #ifdef USE_AVX2 void CopyNyparrNonemptySubset(const uintptr_t* __restrict raw_nyparr, const uintptr_t* __restrict subset_mask, uint32_t raw_nyparr_entry_ct, uint32_t subset_entry_ct, uintptr_t* __restrict output_nyparr) { if (subset_entry_ct == raw_nyparr_entry_ct) { @@ -1244,6 +1251,7 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui // and target_4567 to // _ (4, 0) _ (5, 0) _ (6, 0) _ (7, 0) _ (4, 1) _ (5, 1) _ (6, 1) ... // This is perfectly arranged for movemask. + // todo: better ARM implementation VecW target_4567 = vecw_blendv(loader, vecw_srli(loader, 7), m8); target_iter7[vidx] = vecw_movemask(target_4567); target_4567 = vecw_slli(target_4567, 2); @@ -3261,77 +3269,6 @@ void ClearGenoarrMissing1bit8Unsafe(const uintptr_t* __restrict genoarr, uint32_ } } -void ClearGenoarrMissing1bit16Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals) { - const uint32_t orig_subset_size = *subset_sizep; - Halfword* subset_alias = DowncastWToHW(subset); - uint32_t read_idx = 0; - // deliberate overflow - for (uint32_t read_widx = UINT32_MAX; ; ) { - uint32_t subset_bits; - do { - subset_bits = subset_alias[++read_widx]; - } while (!subset_bits); - uintptr_t detect_11 = genoarr[read_widx]; - detect_11 = detect_11 & (detect_11 >> 1) & kMask5555; - if (detect_11) { - uint32_t detect_11_hw = PackWordToHalfword(detect_11); - const uint32_t joint_u32 = subset_bits & detect_11_hw; - if (joint_u32) { - uintptr_t lowbit = joint_u32 & (-joint_u32); - uint32_t write_idx = read_idx + PopcountWord(subset_bits & (lowbit - 1)); - read_idx = write_idx + 1; - uint32_t subset_bits_write = subset_bits ^ lowbit; - uint16_t* sparse_vals_u16 = S_CAST(uint16_t*, sparse_vals); - subset_bits &= -(2 * lowbit); - for (; read_idx != orig_subset_size; ++read_idx) { -#ifdef USE_AVX2 - if (!subset_bits) { - subset_alias[read_widx] = subset_bits_write; - do { - subset_bits = subset_alias[++read_widx]; - } while (!subset_bits); - subset_bits_write = subset_bits; - detect_11 = genoarr[read_widx]; - detect_11 = detect_11 & (detect_11 >> 1); - detect_11_hw = PackWordToHalfwordMask5555(detect_11); - } - lowbit = subset_bits & (-subset_bits); - subset_bits ^= lowbit; - if (lowbit & detect_11_hw) { - subset_bits_write ^= lowbit; - continue; - } -#else - if (!subset_bits) { - subset_alias[read_widx] = subset_bits_write; - do { - subset_bits = subset_alias[++read_widx]; - } while (!subset_bits); - subset_bits_write = subset_bits; - detect_11 = genoarr[read_widx]; - detect_11 = detect_11 & (detect_11 >> 1); - } - lowbit = subset_bits & (-subset_bits); - subset_bits ^= lowbit; - if ((lowbit * lowbit) & detect_11) { - subset_bits_write ^= lowbit; - continue; - } -#endif - sparse_vals_u16[write_idx++] = sparse_vals_u16[read_idx]; - } - subset_alias[read_widx] = subset_bits_write; - *subset_sizep = write_idx; - return; - } - } - read_idx += PopcountWord(subset_bits); - if (read_idx == orig_subset_size) { - return; - } - } -} - double u127prod_diff_d(uint64_t plus_term0, uint64_t plus_term1, uint64_t minus_term0, uint64_t minus_term1) { uint64_t plus_hi; const uint64_t plus_lo = multiply64to128(plus_term0, plus_term1, &plus_hi); diff --git a/external_libs/pgenlib/include/pgenlib_misc.h b/external_libs/pgenlib/include/pgenlib_misc.h index 0f616d5c..d05a4248 100644 --- a/external_libs/pgenlib/include/pgenlib_misc.h +++ b/external_libs/pgenlib/include/pgenlib_misc.h @@ -1,7 +1,7 @@ #ifndef __PGENLIB_MISC_H__ #define __PGENLIB_MISC_H__ -// This library is part of PLINK 2.0, copyright (C) 2005-2024 Shaun Purcell, +// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it @@ -74,17 +74,97 @@ // on the fly, since that tends to be faster than having to access twice as // much memory. +#include +#ifndef NDEBUG +# include // va_start() +#endif +#include +#include + +#include "plink2_base.h" #include "plink2_bits.h" // 10000 * major + 100 * minor + patch // Exception to CONSTI32, since we want the preprocessor to have access to this // value. Named with all caps as a consequence. -#define PGENLIB_INTERNAL_VERNUM 2003 +#define PGENLIB_INTERNAL_VERNUM 2100 #ifdef __cplusplus namespace plink2 { #endif +#ifdef NDEBUG +HEADER_INLINE BoolErr PglInitLog(__attribute__((unused)) uintptr_t log_capacity) { + return 0; +} + +HEADER_INLINE void PglLogprintf(__attribute__((unused)) const char* fmt, ...) { +} + +HEADER_INLINE char* PglReturnLog() { + return nullptr; +} + +HEADER_INLINE void PglResetLog() { +} +#else +extern char* g_pgl_errbuf; +extern char* g_pgl_errbuf_write_iter; +extern char* g_pgl_errbuf_end; + +HEADER_INLINE BoolErr PglInitLog(uintptr_t log_capacity) { + if (g_pgl_errbuf) { + if (log_capacity <= S_CAST(uintptr_t, g_pgl_errbuf_end - g_pgl_errbuf)) { + // existing allocation is fine. no need to shrink + g_pgl_errbuf_write_iter = g_pgl_errbuf; + *g_pgl_errbuf_write_iter = '\0'; + return 0; + } + free(g_pgl_errbuf); + } + // 128 extra bytes to make WordWrapMultiline() safe. + g_pgl_errbuf = S_CAST(char*, malloc(log_capacity + 128)); + if (!g_pgl_errbuf) { + g_pgl_errbuf_write_iter = nullptr; + g_pgl_errbuf_end = nullptr; + // in the unlikely event this comes up, caller is free to ignore the error, + // logging just won't happen. though, if malloc is failing, something else + // is likely to fail... + return 1; + } + g_pgl_errbuf_write_iter = g_pgl_errbuf; + *g_pgl_errbuf_write_iter = '\0'; + g_pgl_errbuf_end = &(g_pgl_errbuf[log_capacity]); + return 0; +} + +HEADER_INLINE void PglLogprintf(const char* fmt, ...) { + // possible todo: log levels + if (g_pgl_errbuf_write_iter != nullptr) { + va_list args; + va_start(args, fmt); + const uintptr_t remaining_space = g_pgl_errbuf_end - g_pgl_errbuf_write_iter; + const uintptr_t intended_slen = vsnprintf(g_pgl_errbuf_write_iter, remaining_space, fmt, args); + if (intended_slen < remaining_space) { + g_pgl_errbuf_write_iter = &(g_pgl_errbuf_write_iter[intended_slen]); + } else { + g_pgl_errbuf_write_iter = g_pgl_errbuf_end; + } + } +} + +HEADER_INLINE char* PglReturnLog() { + return g_pgl_errbuf; +} + +HEADER_INLINE void PglResetLog() { + if (g_pgl_errbuf) { + g_pgl_errbuf_write_iter = g_pgl_errbuf; + g_pgl_errbuf_write_iter[0] = '\0'; + } +} +#endif + // other configuration-ish values needed by plink2_common subset typedef unsigned char AlleleCode; typedef uint16_t DoubleAlleleCode; @@ -574,8 +654,6 @@ HEADER_INLINE uint32_t GenoIter1x(const uintptr_t* __restrict genoarr, uintptr_t // sparse_vals entries. void ClearGenoarrMissing1bit8Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals); -void ClearGenoarrMissing1bit16Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals); - // See EasyasPi's answer to // https://stackoverflow.com/questions/25095741/how-can-i-multiply-64-bit-operands-and-get-128-bit-result-portably HEADER_INLINE uint64_t multiply64to128(uint64_t lhs, uint64_t rhs, uint64_t* high) { diff --git a/external_libs/pgenlib/include/pgenlib_read.cc b/external_libs/pgenlib/include/pgenlib_read.cc index 8c2f7a06..b139b746 100644 --- a/external_libs/pgenlib/include/pgenlib_read.cc +++ b/external_libs/pgenlib/include/pgenlib_read.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.0, copyright (C) 2005-2024 Shaun Purcell, +// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it @@ -18,6 +18,11 @@ #include "pgenlib_read.h" #include +#include +#include + +// Uncomment this during e.g. pgenlibr development to enable error-throwing. +// #include #ifdef __cplusplus namespace plink2 { @@ -621,15 +626,15 @@ uintptr_t CountPgrAllocCachelinesRequired(uint32_t raw_sample_ct, PgenGlobalFlag const uint32_t ld_compression_present = (gflags / kfPgenGlobalLdCompressionPresent) & 1; const uint32_t max_difflist_entry_ct_base = (raw_sample_ct / kPglMaxDifflistLenDivisor); + const uint32_t max_difflist_entry_ct = max_difflist_entry_ct_base * (1 + ld_compression_present); if ((gflags & kfPgenGlobalDifflistOrLdPresent) || (max_allele_ct > 2)) { // workspace_difflist_sample_ids // bugfix: must add 1 since several routines add a terminator element - cachelines_required += 1 + (max_difflist_entry_ct_base / kInt32PerCacheline); + cachelines_required += 1 + (max_difflist_entry_ct / kInt32PerCacheline); } if (gflags & kfPgenGlobalDifflistOrLdPresent) { - // const uint32_t max_difflist_entry_ct = max_difflist_entry_ct_base * (1 + ld_compression_present); // workspace_raregeno_vec - cachelines_required += NypCtToCachelineCt(max_difflist_entry_ct_base); + cachelines_required += NypCtToCachelineCt(max_difflist_entry_ct); // workspace_raregeno_tmp_loadbuf cachelines_required += NypCtToCachelineCt(max_difflist_entry_ct_base); @@ -1570,6 +1575,7 @@ PglErr PgfiInitPhase2Ex(PgenHeaderCtrl header_ctrl, uint32_t allele_cts_already_ for (; vrtypes_alias_iter != vrtypes_alias_end; ++vrtypes_alias_iter) { const VecW cur_vvec = *vrtypes_alias_iter; #ifdef __LP64__ + // todo: better ARM implementation const VecW cur_vvec_bit2 = vecw_slli(cur_vvec, 5); const VecW cur_vvec_bit1 = vecw_slli(cur_vvec, 6); // check if any vrtype has bit 1 set and bit 2 clear @@ -1846,6 +1852,7 @@ uint32_t GetLdbaseVidx(const unsigned char* vrtypes, uint32_t cur_vidx) { if (cur_vidx_orig_remainder) { const VecW cur_vvec = vrtypes_valias[vidx_vec_idx]; // non-ld: ((bit 2) OR (NOT bit 1)) + // todo: better ARM implementation const VecW cur_vvec_bit2 = vecw_slli(cur_vvec, 5); const VecW inv_cur_vvec_bit1 = ~vecw_slli(cur_vvec, 6); v8ui = vecw_movemask(cur_vvec_bit2 | inv_cur_vvec_bit1); @@ -2094,22 +2101,22 @@ PglErr PgrInit(const char* fname, uint32_t max_vrec_width, PgenFileInfo* pgfip, const uint32_t bitvec_bytes_req = BitCtToCachelineCt(raw_sample_ct) * kCacheline; const uint32_t ld_compression_present = (gflags / kfPgenGlobalLdCompressionPresent) & 1; const uint32_t max_difflist_entry_ct_base = (raw_sample_ct / kPglMaxDifflistLenDivisor); + const uint32_t max_difflist_entry_ct = max_difflist_entry_ct_base * (1 + ld_compression_present); const uint32_t max_allele_ct = pgrp->fi.max_allele_ct; pgrp->workspace_difflist_sample_ids = nullptr; if ((gflags & kfPgenGlobalDifflistOrLdPresent) || (max_allele_ct > 2)) { - pgrp->workspace_difflist_sample_ids = S_CAST(uint32_t*, arena_alloc_raw_rd((max_difflist_entry_ct_base + 1) * sizeof(int32_t), &pgr_alloc_iter)); + pgrp->workspace_difflist_sample_ids = S_CAST(uint32_t*, arena_alloc_raw_rd((max_difflist_entry_ct + 1) * sizeof(int32_t), &pgr_alloc_iter)); } if (gflags & kfPgenGlobalDifflistOrLdPresent) { - // const uint32_t max_difflist_entry_ct = max_difflist_entry_ct_base * (1 + ld_compression_present); - - const uintptr_t raregeno_bytes_req = NypCtToCachelineCt(max_difflist_entry_ct_base) * kCacheline; - pgrp->workspace_raregeno_vec = S_CAST(uintptr_t*, arena_alloc_raw(raregeno_bytes_req, &pgr_alloc_iter)); - pgrp->workspace_raregeno_tmp_loadbuf = S_CAST(uintptr_t*, arena_alloc_raw(raregeno_bytes_req, &pgr_alloc_iter)); + const uintptr_t raregeno_ldmerge_bytes_req = NypCtToCachelineCt(max_difflist_entry_ct) * kCacheline; + pgrp->workspace_raregeno_vec = S_CAST(uintptr_t*, arena_alloc_raw(raregeno_ldmerge_bytes_req, &pgr_alloc_iter)); + const uintptr_t raregeno_no_ld_bytes_req = NypCtToCachelineCt(max_difflist_entry_ct_base) * kCacheline; + pgrp->workspace_raregeno_tmp_loadbuf = S_CAST(uintptr_t*, arena_alloc_raw(raregeno_no_ld_bytes_req, &pgr_alloc_iter)); if (ld_compression_present) { pgrp->ldbase_genovec = S_CAST(uintptr_t*, arena_alloc_raw(genovec_bytes_req, &pgr_alloc_iter)); - pgrp->ldbase_raregeno = S_CAST(uintptr_t*, arena_alloc_raw(raregeno_bytes_req, &pgr_alloc_iter)); + pgrp->ldbase_raregeno = S_CAST(uintptr_t*, arena_alloc_raw(raregeno_no_ld_bytes_req, &pgr_alloc_iter)); pgrp->ldbase_difflist_sample_ids = S_CAST(uint32_t*, arena_alloc_raw_rd((max_difflist_entry_ct_base + 1) * sizeof(int32_t), &pgr_alloc_iter)); } @@ -2267,8 +2274,9 @@ PglErr ParseAndSaveDifflist(const unsigned char* fread_end, uint32_t raw_sample_ } PglErr ParseAndSaveDifflistProperSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* __restrict raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr, uintptr_t* __restrict raregeno_workspace) { - // Requires a PROPER subset. Might want to just merge this with - // ParseAndSaveDifflist() and rename appropriately. + // Requires a PROPER subset, since it assumes sample_include is non-null. + // Might want to just merge this with ParseAndSaveDifflist() and rename + // appropriately. // Trailing bits of raregeno are zeroed out. uint32_t raw_difflist_len; const unsigned char* group_info_iter; @@ -2344,6 +2352,8 @@ PglErr ParseLdAndMergeDifflistSubset(const unsigned char* fread_end, const uintp // Used when the ldbase variant was saved as a difflist, and it's useful to // process the current variant as a difflist. // * Assumes ldbase_difflist_sample_ids[ldbase_difflist_len]==sample_ct. + // * However, it is not necessary for merged_difflist_sample_ids to have + // space for an extra element. // * Assumes sample_include == nullptr if no subsetting needed. (Otherwise, // it'll still work, but performance will be worse.) // Trailing bits of merged_raregeno may not be zeroed out. @@ -3134,23 +3144,31 @@ PglErr LdLoadMinimalSubsetIfNecessary(const uintptr_t* __restrict sample_include return kPglRetSuccess; } -PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) { +PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_difflist_len, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) { assert(vidx < pgrp->fi.raw_variant_ct); assert(sample_ct); - assert(max_simple_difflist_len < sample_ct); // Side effects: // may use pgr.workspace_raregeno_tmp_loadbuf // Trailing bits of genovec/main_raregeno may not be zeroed out. + // Always sets difflist_common_geno; may not set difflist_len. const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx); const uint32_t maintrack_vrtype = vrtype & 7; const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct; + assert(max_difflist_len < raw_sample_ct); const uint32_t subsetting_required = (sample_ct != raw_sample_ct); // const uint32_t multiallelic_hc_present = fread_pp && VrtypeMultiallelic(vrtype); if (VrtypeLdCompressed(maintrack_vrtype)) { - // LD compression - - // note that this can currently load a difflist longer than - // max_simple_difflist_len + // Variant of interest is LD-compressed. + // Before 13 Feb 2025, this branch ignored the max_difflist_len parameter, + // forcing callers to allocate main_raregeno and difflist_sample_ids up to + // 2 * (raw_sample_ct / kPglMaxDifflistLenDivisor) entries. Now we + // confirm that + // len(possibly-subsetted ldbase difflist) + + // len(raw differences-from-ldbase list) + // is no longer than max_difflist_len. This misses some cases where we can + // give a sparse return (since the raw differences-from-ldbase list can + // contain entries outside the current subset or overlap with the first + // list), but should be good enough in practice. PglErr reterr = LdLoadMinimalSubsetIfNecessary(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp); if (unlikely(reterr)) { return reterr; @@ -3162,19 +3180,25 @@ PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_incl } const uint32_t ld_invert = (maintrack_vrtype == 3); if (pgrp->ldbase_stypes & kfPgrLdcacheDifflist) { - const uint32_t ldbase_common_geno = pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3; - // unnecessary for this to branch on LD difflist length, since that's - // limited to 3/4 of the ldbase difflist length. - *difflist_common_geno_ptr = ldbase_common_geno; - reterr = ParseLdAndMergeDifflistSubset(fread_end, subsetting_required? sample_include : nullptr, sample_include_cumulative_popcounts, pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->ldbase_difflist_len, ldbase_common_geno, raw_sample_ct, sample_ct, &fread_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr, pgrp->workspace_raregeno_tmp_loadbuf); - if (unlikely(reterr)) { - return reterr; + const uint32_t cur_difflist_len = PeekVint31(fread_ptr, fread_end); + if (pgrp->ldbase_difflist_len + cur_difflist_len <= max_difflist_len) { + const uint32_t ldbase_common_geno = pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3; + *difflist_common_geno_ptr = ldbase_common_geno; + reterr = ParseLdAndMergeDifflistSubset(fread_end, subsetting_required? sample_include : nullptr, sample_include_cumulative_popcounts, pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->ldbase_difflist_len, ldbase_common_geno, raw_sample_ct, sample_ct, &fread_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr, pgrp->workspace_raregeno_tmp_loadbuf); + if (unlikely(reterr)) { + return reterr; + } + if (ld_invert) { + *difflist_common_geno_ptr = (6 - ldbase_common_geno) & 3; + GenovecInvertUnsafe(*difflist_len_ptr, main_raregeno); + } + return kPglRetSuccess; } - if (ld_invert) { - *difflist_common_geno_ptr = (6 - ldbase_common_geno) & 3; - GenovecInvertUnsafe(*difflist_len_ptr, main_raregeno); + if (!(pgrp->ldbase_stypes & kfPgrLdcacheNyp)) { + const uint32_t ldbase_common_geno = pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3; + PgrDifflistToGenovecUnsafe(pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, ldbase_common_geno, sample_ct, pgrp->ldbase_difflist_len, pgrp->ldbase_genovec); + pgrp->ldbase_stypes |= kfPgrLdcacheNyp; } - return kPglRetSuccess; } if (pgrp->ldbase_stypes & kfPgrLdcacheNyp) { CopyNyparr(pgrp->ldbase_genovec, sample_ct, genovec); @@ -3209,7 +3233,7 @@ PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_incl // no limit is slightly better than /16 but substantially worse than /32 on // the large test dataset (/64 is slightly worse than /32) // no limit is best on the small test dataset - if (saved_difflist_len > max_simple_difflist_len) { + if (saved_difflist_len > max_difflist_len) { *difflist_common_geno_ptr = UINT32_MAX; PglErr reterr = ParseNonLdGenovecSubsetUnsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, vrtype, &fread_ptr, pgrp, genovec); if (unlikely(reterr)) { @@ -3257,14 +3281,14 @@ PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_incl return kPglRetSuccess; } -PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) { +PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) { if (!sample_ct) { *difflist_common_geno_ptr = UINT32_MAX; return kPglRetSuccess; } PgenReaderMain* pgrp = GetPgrp(pgr_ptr); assert(vidx < pgrp->fi.raw_variant_ct); - return ReadDifflistOrGenovecSubsetUnsafe(sample_include, GetSicp(pssi), sample_ct, max_simple_difflist_len, vidx, pgrp, nullptr, nullptr, genovec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr); + return ReadDifflistOrGenovecSubsetUnsafe(sample_include, GetSicp(pssi), sample_ct, max_difflist_len, vidx, pgrp, nullptr, nullptr, genovec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr); } PglErr LdSubsetAdjustGenocounts(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict ldbase_genovec, uint32_t raw_sample_ct, const unsigned char** fread_pp, STD_ARRAY_REF(uint32_t, 4) genocounts, uintptr_t* __restrict raregeno_workspace) { @@ -3387,6 +3411,7 @@ PglErr SkipDeltalistIds(const unsigned char* fread_end, const unsigned char* gro } const VecW vv = vecw_loadu(fread_ptr); fread_ptr = &(fread_ptr[kBytesPerVec]); + // todo: better ARM implementation const uint32_t highbits = vecw_movemask(vv); remaining_id_ct -= kBytesPerVec - PopcountVec8thUint(highbits); } @@ -5537,7 +5562,7 @@ PglErr IMPLPgrGetInv1(const uintptr_t* __restrict sample_include, const uint32_t return reterr; } -PglErr IMPLPgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) { +PglErr IMPLPgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) { if (!sample_ct) { *difflist_common_geno_ptr = UINT32_MAX; return kPglRetSuccess; @@ -5545,7 +5570,7 @@ PglErr IMPLPgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_includ const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx); const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype); if ((!allele_idx) || ((allele_idx == 1) && (!multiallelic_hc_present))) { - PglErr reterr = ReadDifflistOrGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, max_simple_difflist_len, vidx, pgrp, nullptr, nullptr, allele_invcountvec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr); + PglErr reterr = ReadDifflistOrGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, max_difflist_len, vidx, pgrp, nullptr, nullptr, allele_invcountvec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr); if (unlikely(reterr)) { return reterr; } @@ -7479,6 +7504,175 @@ PglErr IMPLPgrGetD(const uintptr_t* __restrict sample_include, const uint32_t* _ return ParseDosage16(fread_ptr, fread_end, sample_include, sample_ct, vidx, allele_ct, pgrp, dosage_ct_ptr, nullptr, nullptr, nullptr, dosage_present, dosage_main); } +static const uint16_t kGenoToDosage16[4] = {0, 16384, 32768, 65535}; + +// These tables should be renamed in a way that distinguishes them from the +// likes of kGenoToDosage16. +const uint16_t kHcToDosage16[1024] = QUAD_TABLE256(0, 16384, 32768, 65535); + +PglErr IMPLPgrGetDMaybeSparse(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t max_sparse_dosage_ct, PgenReaderMain* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr, uint16_t* difflist_common_dosage_ptr, uint32_t* difflist_sample_ids) { + assert(vidx < pgrp->fi.raw_variant_ct); + *difflist_common_dosage_ptr = 1; + if (!sample_ct) { + return kPglRetSuccess; + } + const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx); + const uint32_t dosage_matters = dosage_present && VrtypeDosage(vrtype); + const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct; + const uint32_t subsetting_required = (sample_ct != raw_sample_ct); + if (dosage_matters && (((vrtype & 0x60) != 0x20) || (subsetting_required && VrtypeHphase(vrtype)))) { + // If dense dosages are present, we can immediately bail to PgrGetD(). + // + // If a dosage-list is present, we punt the hardcall-phase + subsetting + // subcase for now. We can fill it in by calling + // ReadDifflistOrGenovecSubsetUnsafe() in no-subsetting mode (so we know + // the correct het_ct), calling SkipAux2(), and then subsetting. (Or + // ReadDifflistOrGenovecSubsetUnsafe() could optionally return raw_het_ct, + // but I expect that approach to degrade hotter-path performance by too + // much to be worth it.) Eventually, that code path will also need to + // support multiallelic variants. + return IMPLPgrGetD(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, genovec, dosage_present, dosage_main, dosage_ct_ptr); + } + *dosage_ct_ptr = 0; + uintptr_t* subsetted_raregeno = pgrp->workspace_raregeno_vec; + // If a dosage-list is present, we may need to merge a (sample ID, genotype) + // list returned by ReadDifflistOrGenovecSubsetUnsafe() with the (sample ID, + // dosage) list. So we save the IDs in the first list to a workspace buffer. + // However, if dosages aren't present (or we're ignoring them), we can write + // the sample IDs directly to the final destination. + uint32_t* subsetted_raregeno_sample_ids = dosage_matters? pgrp->workspace_difflist_sample_ids : difflist_sample_ids; + const unsigned char* fread_ptr = nullptr; + const unsigned char* fread_end = nullptr; + uint32_t difflist_common_geno; + uint32_t subsetted_raregeno_ct; + PglErr reterr = ReadDifflistOrGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, max_sparse_dosage_ct, vidx, pgrp, &fread_ptr, &fread_end, genovec, &difflist_common_geno, subsetted_raregeno, subsetted_raregeno_sample_ids, &subsetted_raregeno_ct); + if (unlikely(reterr)) { + return reterr; + } + if (!dosage_matters) { + *dosage_ct_ptr = subsetted_raregeno_ct; + if (difflist_common_geno != UINT32_MAX) { + *difflist_common_dosage_ptr = kGenoToDosage16[difflist_common_geno]; + GenoarrLookup256x2bx4(subsetted_raregeno, kHcToDosage16, subsetted_raregeno_ct, dosage_main); + } + return kPglRetSuccess; + } + const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets; + const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2; + if (allele_ct != 2) { + // Don't need aux1-skipping logic yet. +#ifndef PGENLIB_NOPRINT + fputs("multiallelic dosages not yet supported by PgrGetDMaybeSparse()\n", stderr); +#endif + return kPglRetNotYetSupported; + } + if (VrtypeHphase(vrtype)) { + assert(!subsetting_required); + const uint32_t raw_het_ct = CountNyp(subsetted_raregeno, kMask5555, raw_sample_ct); + reterr = SkipAux2(fread_end, raw_het_ct, &fread_ptr, nullptr); + if (unlikely(reterr)) { + return reterr; + } + } + // Only invoke sparse-return logic iff a sufficiently-short dosage-list is + // stored. + const uint32_t peek_raw_dosage_ct = PeekVint31(fread_ptr, fread_end); + if (peek_raw_dosage_ct + subsetted_raregeno_ct > max_sparse_dosage_ct) { + if (difflist_common_geno != UINT32_MAX) { + PgrDifflistToGenovecUnsafe(subsetted_raregeno, subsetted_raregeno_sample_ids, difflist_common_geno, sample_ct, subsetted_raregeno_ct, genovec); + } + return ParseDosage16(fread_ptr, fread_end, sample_include, sample_ct, vidx, allele_ct, pgrp, dosage_ct_ptr, nullptr, nullptr, nullptr, dosage_present, dosage_main); + } + *difflist_common_dosage_ptr = kGenoToDosage16[difflist_common_geno]; + if (!subsetting_required) { + sample_include = nullptr; + } + subsetted_raregeno_sample_ids[subsetted_raregeno_ct] = sample_ct; + // Rest of this function is similar to ParseLdAndMergeDifflistSubset(). + uint32_t raw_dosage_ct; + const unsigned char* group_info_iter; + reterr = ParseDifflistHeader(fread_end, raw_sample_ct, &fread_ptr, nullptr, &group_info_iter, &raw_dosage_ct); + if (unlikely(reterr)) { + return reterr; + } + const unsigned char* deltalist_iter = fread_ptr; + const unsigned char* cur_raw_dosage_main_start = fread_ptr; + reterr = SkipDeltalistIds(fread_end, group_info_iter, raw_dosage_ct, raw_sample_ct, 0, &cur_raw_dosage_main_start); + if (unlikely(reterr)) { + return reterr; + } + // raw_dosage_ct == 0 is technically permitted, but shouldn't happen in + // practice. So we handle it correctly but don't give it a fast-path. + const int32_t subgroup_idx_last = ((raw_dosage_ct + kBitsPerWordD2 - 1) / kBitsPerWordD2) - 1; + const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct); + uint32_t difflist_write_idx = 0; + uintptr_t subsetted_raregeno_word = 0; + uint32_t subsetted_raregeno_sample_idx = subsetted_raregeno_sample_ids[0]; + uintptr_t raw_sample_idx = 0; + uint32_t sample_idx = 0; + uint32_t subsetted_raregeno_difflist_idx = 0; + uint32_t done = 0; + uint32_t subgroup_len_m1 = kBitsPerWordD2 - 1; + for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) { + uint32_t dosage_idx_lowbits = 0; + if (S_CAST(int32_t, subgroup_idx) >= subgroup_idx_last) { + if (S_CAST(int32_t, subgroup_idx) > subgroup_idx_last) { + done = 1; + sample_idx = sample_ct; + goto IMPLPgrGetDMaybeSparse_finish; + } + subgroup_len_m1 &= raw_dosage_ct - 1; + } + if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) { + raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct); + group_info_iter = &(group_info_iter[sample_id_byte_ct]); + } else { + // May as well use cur_raw_dosage_main_start over fread_end here. + raw_sample_idx += GetVint31(cur_raw_dosage_main_start, &deltalist_iter); + } + for (; ; ++dosage_idx_lowbits) { + if (unlikely(raw_sample_idx >= raw_sample_ct)) { + return kPglRetMalformedInput; + } + if ((!sample_include) || IsSet(sample_include, raw_sample_idx)) { + sample_idx = sample_include? RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, raw_sample_idx) : raw_sample_idx; + IMPLPgrGetDMaybeSparse_finish: + while (subsetted_raregeno_sample_idx < sample_idx) { + if (!(subsetted_raregeno_difflist_idx % kBitsPerWordD2)) { + subsetted_raregeno_word = subsetted_raregeno[subsetted_raregeno_difflist_idx / kBitsPerWordD2]; + } + difflist_sample_ids[difflist_write_idx] = subsetted_raregeno_sample_idx; + dosage_main[difflist_write_idx] = kGenoToDosage16[subsetted_raregeno_word & 3]; + ++difflist_write_idx; + ++subsetted_raregeno_difflist_idx; + subsetted_raregeno_word >>= 2; + subsetted_raregeno_sample_idx = subsetted_raregeno_sample_ids[subsetted_raregeno_difflist_idx]; + } + if (subsetted_raregeno_sample_idx == sample_idx) { + if (done) { + *dosage_ct_ptr = difflist_write_idx; + return kPglRetSuccess; + } + if (!(subsetted_raregeno_difflist_idx % kBitsPerWordD2)) { + subsetted_raregeno_word = subsetted_raregeno[subsetted_raregeno_difflist_idx / kBitsPerWordD2]; + } + ++subsetted_raregeno_difflist_idx; + subsetted_raregeno_word >>= 2; + subsetted_raregeno_sample_idx = subsetted_raregeno_sample_ids[subsetted_raregeno_difflist_idx]; + } + difflist_sample_ids[difflist_write_idx] = sample_idx; + CopyFromUnalignedOffsetU16(&(dosage_main[difflist_write_idx]), cur_raw_dosage_main_start, dosage_idx_lowbits); + ++difflist_write_idx; + } + if (dosage_idx_lowbits == subgroup_len_m1) { + break; + } + raw_sample_idx += GetVint31(cur_raw_dosage_main_start, &deltalist_iter); + } + cur_raw_dosage_main_start = &(cur_raw_dosage_main_start[kBitsPerWordD2 * sizeof(int16_t)]); + } +} + PglErr PgrGet1D(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_countvec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr) { PgenReaderMain* pgrp = GetPgrp(pgr_ptr); const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi); diff --git a/external_libs/pgenlib/include/pgenlib_read.h b/external_libs/pgenlib/include/pgenlib_read.h index 734e81a5..1fcedbd5 100644 --- a/external_libs/pgenlib/include/pgenlib_read.h +++ b/external_libs/pgenlib/include/pgenlib_read.h @@ -1,7 +1,7 @@ #ifndef __PGENLIB_READ_H__ #define __PGENLIB_READ_H__ -// This library is part of PLINK 2.0, copyright (C) 2005-2024 Shaun Purcell, +// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it @@ -20,7 +20,11 @@ // pgenlib_read contains reader-specific code. +#include + #include "pgenlib_misc.h" +#include "plink2_base.h" +#include "plink2_bits.h" #ifdef __cplusplus namespace plink2 { @@ -149,10 +153,14 @@ typedef struct PgenReaderMainStruct { // can also be used for other purposes after we're done processing aux2. uintptr_t* workspace_vec; - // currently must hold (raw_sample_ct / kPglMaxDifflistLenDivisor) - // entries; may need to double the sizes later - // some top-level interface functions use these, so several lower-level - // functions cannot + // workspace_raregeno_vec must hold 2 * (raw_sample_ct / + // kPglMaxDifflistLenDivisor) entries, due to PgrGetDMaybeSparse(). + // workspace_difflist_sample_ids requires space for one more entry than that, + // due to a potential terminator element. + // + // Several other top-level interface functions use + // workspace_difflist_sample_ids. Lower-level functions should never refer + // directly to either of these two buffers. uintptr_t* workspace_raregeno_vec; uint32_t* workspace_difflist_sample_ids; @@ -556,17 +564,10 @@ PglErr PgrGet(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex p // difflist_common_geno to the common genotype value in that case. Otherwise, // genovec is populated and difflist_common_geno is set to UINT32_MAX. // -// max_simple_difflist_len must be smaller than sample_ct. -// -// Note that the returned difflist_len can be much larger than -// max_simple_difflist_len when the variant is LD-encoded; it's bounded by -// 2 * (raw_sample_ct / kPglMaxDifflistLenDivisor). -// (probable todo: this interface has... rather sharp edges, even relative to -// the rest of this low-level library. Maybe it shouldn't be deleted, but it -// would be better if there was a function that took a max_difflist_len -// parameter, and it was safe for difflist_sample_ids to only be allocated up -// to that length.) -PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr); +// max_difflist_len must be smaller than raw_sample_ct. The returned difflist +// will never be longer than this. (Note that this was not true before 13 Feb +// 2025.) +PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr); // genocounts[0] = # hom ref, [1] = # het ref, [2] = two alts, [3] = missing PglErr PgrGetCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts); @@ -599,12 +600,12 @@ HEADER_INLINE PglErr PgrGetInv1(const uintptr_t* __restrict sample_include, PgrS return IMPLPgrGetInv1(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_invcountvec); } -PglErr IMPLPgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr); +PglErr IMPLPgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr); -HEADER_INLINE PglErr PgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) { +HEADER_INLINE PglErr PgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) { PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m); const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts); - return IMPLPgrGetInv1DifflistOrGenovec(sample_include, sample_include_cumulative_popcounts, sample_ct, max_simple_difflist_len, vidx, allele_idx, pgrp, allele_invcountvec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr); + return IMPLPgrGetInv1DifflistOrGenovec(sample_include, sample_include_cumulative_popcounts, sample_ct, max_difflist_len, vidx, allele_idx, pgrp, allele_invcountvec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr); } PglErr IMPLPgrGet2(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReaderMain* pgrp, uintptr_t* __restrict genovec); @@ -667,6 +668,30 @@ HEADER_INLINE PglErr PgrGetD(const uintptr_t* __restrict sample_include, PgrSamp return IMPLPgrGetD(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, genovec, dosage_present, dosage_main, dosage_ct_ptr); } +extern const uint16_t kHcToDosage16[1024]; + +PglErr IMPLPgrGetDMaybeSparse(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t max_sparse_dosage_ct, PgenReaderMain* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr, uint16_t* difflist_common_dosage_ptr, uint32_t* difflist_sample_ids); + +// If the following conditions hold: +// 1. Hardcalls and dosages are stored sparsely. +// 2. Either no subsetting is requested, or neither multiallelic-hardcall nor +// hardcall-phase data is present. (This condition may be removed in the +// future.) +// 3. PgrGetDifflistOrGenovec(..., max_difflist_len=max_sparse_dosage_ct, ...) +// returns a sparse list, and the length of that sparse list plus the length +// of the explicit-dosage list <= max_sparse_dosage_ct. +// Then difflist_common_dosage is set to the common value (either 0, 32768, or +// 65535), and {difflist_sample_ids, dosage_main, dosage_ct} provide a sparse +// representation of the other values. +// Otherwise, difflist_common_dosage is set to 1, and return values are as with +// PgrGetD(). +// max_sparse_dosage_ct must be less than raw_sample_ct. +HEADER_INLINE PglErr PgrGetDMaybeSparse(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t max_sparse_dosage_ct, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr, uint16_t* difflist_common_dosage_ptr, uint32_t* difflist_sample_ids) { + PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m); + const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts); + return IMPLPgrGetDMaybeSparse(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, max_sparse_dosage_ct, pgrp, genovec, dosage_present, dosage_main, dosage_ct_ptr, difflist_common_dosage_ptr, difflist_sample_ids); +} + PglErr PgrGet1D(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_countvec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr); PglErr PgrGetInv1D(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr); diff --git a/external_libs/pgenlib/include/pgenlib_write.cc b/external_libs/pgenlib/include/pgenlib_write.cc new file mode 100644 index 00000000..db96d566 --- /dev/null +++ b/external_libs/pgenlib/include/pgenlib_write.cc @@ -0,0 +1,2624 @@ +// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell, +// Christopher Chang. +// +// This library is free software: you can redistribute it and/or modify it +// under the terms of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 3 of the License, or (at your +// option) any later version. +// +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library. If not, see . + +#include "pgenlib_write.h" + +#include +#include +#include +#include // unlink() + +#include "plink2_bits.h" + +#ifdef __cplusplus +namespace plink2 { +#endif + +static inline PgenWriterCommon* GetPwcp(STPgenWriter* spgwp) { + return &GET_PRIVATE(*spgwp, pwc); +} + +static inline FILE** GetPgenOutfilep(STPgenWriter* spgwp) { + return &GET_PRIVATE(*spgwp, pgen_outfile); +} + +static inline FILE** GetPgiOrFinalPgenOutfilep(STPgenWriter* spgwp) { + return &GET_PRIVATE(*spgwp, pgi_or_final_pgen_outfile); +} + +static inline char** GetFnameBufp(STPgenWriter* spgwp) { + return &GET_PRIVATE(*spgwp, fname_buf); +} + +void GenovecInvertCopyUnsafe(const uintptr_t* __restrict genovec, uint32_t sample_ct, uintptr_t* __restrict genovec_inverted_copy) { + // flips 0 to 2 and vice versa. + // "unsafe" because trailing bits are not zeroed out. + const uint32_t vec_ct = NypCtToVecCt(sample_ct); + assert(IsVecAligned(genovec)); + const VecW not_m1 = VCONST_W(kMaskAAAA); + const VecW* vin_ptr = R_CAST(const VecW*, genovec); + VecW* vout_ptr = R_CAST(VecW*, genovec_inverted_copy); + for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) { + const VecW cur_vec = vin_ptr[vidx]; + // flip high bit iff low bit is unset + vout_ptr[vidx] = cur_vec ^ vecw_and_notfirst(vecw_slli(cur_vec, 1), not_m1); + } +} + +void PreinitSpgw(STPgenWriter* spgwp) { + *GetPgenOutfilep(spgwp) = nullptr; + *GetPgiOrFinalPgenOutfilep(spgwp) = nullptr; + *GetFnameBufp(spgwp) = nullptr; +} + +void PreinitMpgw(MTPgenWriter* mpgwp) { + mpgwp->pgen_outfile = nullptr; + mpgwp->pgi_or_final_pgen_outfile = nullptr; + mpgwp->fname_buf = nullptr; +} + +// OpenFail and WriteFail errors can now refer to either the .pgen, .pgen.tmp, +// or .pgen.pgi file. +PglErr PwcInitPhase1(const char* __restrict fname, uintptr_t* explicit_nonref_flags, PgenExtensionLl* header_exts, PgenExtensionLl* footer_exts, uint32_t variant_ct_limit, uint32_t sample_ct, PgenWriteMode write_mode, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, uintptr_t vrec_len_byte_ct, PgenWriterCommon* pwcp, FILE** pgen_outfile_ptr, FILE** pgi_or_final_pgen_outfile_ptr, char** fname_buf_ptr) { + pwcp->explicit_nonref_flags = nullptr; + if (nonref_flags_storage == 3) { + if (unlikely(!explicit_nonref_flags)) { + return kPglRetImproperFunctionCall; + } + pwcp->explicit_nonref_flags = explicit_nonref_flags; + } + pwcp->header_exts = header_exts; + pwcp->footer_exts = footer_exts; + pwcp->variant_ct_limit = variant_ct_limit; + pwcp->sample_ct = sample_ct; + pwcp->phase_dosage_gflags = phase_dosage_gflags; + pwcp->nonref_flags_storage = nonref_flags_storage; + pwcp->vrec_len_byte_ct = vrec_len_byte_ct; +#ifndef NDEBUG + pwcp->vblock_fpos = nullptr; + pwcp->vrec_len_buf = nullptr; + pwcp->vrtype_buf = nullptr; + pwcp->fwrite_buf = nullptr; + pwcp->fwrite_bufp = nullptr; + pwcp->genovec_hets_buf = nullptr; + pwcp->genovec_invert_buf = nullptr; + pwcp->ldbase_genovec = nullptr; + pwcp->ldbase_raregeno = nullptr; + pwcp->ldbase_difflist_sample_ids = nullptr; +#endif + pwcp->vidx = 0; + + *pgen_outfile_ptr = nullptr; + *pgi_or_final_pgen_outfile_ptr = nullptr; + *fname_buf_ptr = nullptr; + const int32_t ext_present = (header_exts != nullptr) || (footer_exts != nullptr); + const int32_t third_byte = ((write_mode == kPgenWriteSeparateIndex)? 0x20 : 0x10) + ext_present; + if (write_mode != kPgenWriteBackwardSeek) { + const uint32_t fname_slen = strlen(fname); + if (fname_slen > kPglFnamesize - 5) { + return kPglRetMalformedInput; + } + pwcp->vblock_fpos_offset = 3; + if (write_mode == kPgenWriteAndCopy) { + if (unlikely(pgl_malloc(fname_slen + 5, fname_buf_ptr))) { + return kPglRetNomem; + } + char* fname_iter = memcpya(*fname_buf_ptr, fname, fname_slen); + // In the kPgenWriteAndCopy case, pgen_outfile is the *temporary* .pgen + // output. + strcpy_k(fname_iter, ".tmp"); + *pgen_outfile_ptr = fopen(*fname_buf_ptr, FOPEN_WB); + if (unlikely(!(*pgen_outfile_ptr))) { + return kPglRetOpenFail; + } + fwrite_unlocked("l\x1b", 2, 1, *pgen_outfile_ptr); + if (unlikely(putc_checked(third_byte + 0x10, *pgen_outfile_ptr))) { + return kPglRetWriteFail; + } + } else { + // kPgenWriteSeparateIndex + char fname_buf[kPglFnamesize]; + char* fname_iter = memcpya(fname_buf, fname, fname_slen); + strcpy_k(fname_iter, ".pgi"); + *pgi_or_final_pgen_outfile_ptr = fopen(fname_buf, FOPEN_WB); + if (unlikely(!(*pgi_or_final_pgen_outfile_ptr))) { + return kPglRetOpenFail; + } + fwrite_unlocked("l\x1b", 2, 1, *pgi_or_final_pgen_outfile_ptr); + if (unlikely(putc_checked(third_byte + 0x10, *pgi_or_final_pgen_outfile_ptr))) { + return kPglRetWriteFail; + } + } + } + FILE* header_ff = fopen(fname, FOPEN_WB); + if (unlikely(!header_ff)) { + return kPglRetOpenFail; + } + if (write_mode != kPgenWriteAndCopy) { + *pgen_outfile_ptr = header_ff; + } else { + *pgi_or_final_pgen_outfile_ptr = header_ff; + } + fwrite_unlocked("l\x1b", 2, 1, header_ff); + if (unlikely(putc_checked(third_byte, header_ff))) { + return kPglRetWriteFail; + } + if (write_mode != kPgenWriteBackwardSeek) { + return kPglRetSuccess; + } + + uint64_t header_bytes_left = PglHeaderBaseEndOffset(variant_ct_limit, vrec_len_byte_ct, phase_dosage_gflags != kfPgenGlobal0, nonref_flags_storage == 3) - 3; + if (ext_present) { + // Additional header contents: + // 1. Flag varint describing which header extensions are present. + // 2. Flag varint describing which footer extensions are present. + // 3. If at least one footer extension present, uint64 byte offset of + // footer start. + // 4. For each present header extension, a varint indicating its byte + // length. + // 5. Bodies of header extensions, in order. + + // Fixed minimum size of (1) + (2). + header_bytes_left += 2; + if (header_exts) { + PgenExtensionLl* header_exts_iter = header_exts; + int32_t prev_type_idx = -1; + do { + const int32_t cur_type_idx = header_exts_iter->type_idx; + if (cur_type_idx <= prev_type_idx) { + return kPglRetImproperFunctionCall; + } + const uint64_t cur_size = header_exts_iter->size; + if (unlikely(cur_size >= (1LLU << (kBitsPerWord - 1)))) { + return kPglRetImproperFunctionCall; + } + header_bytes_left += cur_size + VintBytect(cur_size); + + prev_type_idx = cur_type_idx; + header_exts_iter = header_exts_iter->next; + } while (header_exts_iter); + header_bytes_left += S_CAST(uint32_t, prev_type_idx) / 7; + } + if (footer_exts) { + PgenExtensionLl* footer_exts_iter = footer_exts; + int32_t prev_type_idx = -1; + do { + const int32_t cur_type_idx = footer_exts_iter->type_idx; + if (cur_type_idx <= prev_type_idx) { + return kPglRetImproperFunctionCall; + } + prev_type_idx = cur_type_idx; + footer_exts_iter = footer_exts_iter->next; + } while (footer_exts_iter); + // 8 more fixed bytes for (3). + header_bytes_left += sizeof(int64_t) + S_CAST(uint32_t, prev_type_idx) / 7; + } + } + + // this should be the position of the first variant + pwcp->vblock_fpos_offset = 3 + header_bytes_left; + + uintptr_t zeroed_cachelines_needed = DivUp(header_bytes_left, kCacheline); + if (zeroed_cachelines_needed > (kPglFwriteBlockSize / kCacheline)) { + zeroed_cachelines_needed = kPglFwriteBlockSize / kCacheline; + } + // could wait until fwrite_buf is allocated, and make sure it's aligned? + unsigned char zerobuf[kPglFwriteBlockSize]; + memset(zerobuf, 0, zeroed_cachelines_needed * kCacheline); + while (header_bytes_left > kPglFwriteBlockSize) { + fwrite_unlocked(zerobuf, kPglFwriteBlockSize, 1, header_ff); + header_bytes_left -= kPglFwriteBlockSize; + } + if (unlikely(fwrite_checked(zerobuf, header_bytes_left, header_ff))) { + return kPglRetWriteFail; + } + return kPglRetSuccess; +} + +uintptr_t CountSpgwAllocCachelinesRequired(uint32_t variant_ct_limit, uint32_t sample_ct, PgenGlobalFlags phase_dosage_gflags, uint32_t max_vrec_len) { + // vblock_fpos + const uint32_t vblock_ct_limit = DivUp(variant_ct_limit, kPglVblockSize); + uintptr_t cachelines_required = Int64CtToCachelineCt(vblock_ct_limit); + + // vrec_len_buf + const uintptr_t vrec_len_byte_ct = BytesToRepresentNzU32(max_vrec_len); + cachelines_required += DivUp(variant_ct_limit * vrec_len_byte_ct, kCacheline); + + // vrtype_buf + if (phase_dosage_gflags) { + cachelines_required += DivUp(variant_ct_limit, kCacheline); + } else { + cachelines_required += DivUp(variant_ct_limit, kCacheline * 2); + } + + // genovec_hets_buf, genovec_invert_buf, ldbase_genovec + cachelines_required += 3 * NypCtToCachelineCt(sample_ct); + + const uint32_t max_difflist_len = 2 * (sample_ct / kPglMaxDifflistLenDivisor); + // ldbase_raregeno + cachelines_required += NypCtToCachelineCt(max_difflist_len); + + // ldbase_difflist_sample_ids + cachelines_required += 1 + (max_difflist_len / kInt32PerCacheline); + + // fwrite_buf + // + (5 + sizeof(AlleleCode)) * kPglDifflistGroupSize to avoid buffer + // overflow in middle of difflist writing + // TODO: document which code approaches this limit. sizeof(AlleleCode) + // implies it's one of the multiallelic subcases? + cachelines_required += DivUp(max_vrec_len + kPglFwriteBlockSize + (5 + sizeof(AlleleCode)) * kPglDifflistGroupSize, kCacheline); + // possible todo: dosage (doesn't currently need an allocation, but that's + // unlikely to remain true--e.g. get_ref_nonref_genotype_counts_and_dosages + // tends to use workspace_vec when a function it calls doesn't use it...) + + // probable todo: multiallelic (phased) dosage + return cachelines_required; +} + +static_assert(kPglMaxAlleleCt == 255, "Need to update SpgwInitPhase1Ex()."); +PglErr SpgwInitPhase1Ex(const char* __restrict fname, const uintptr_t* __restrict allele_idx_offsets, uintptr_t* __restrict explicit_nonref_flags, PgenExtensionLl* header_exts, PgenExtensionLl* footer_exts, uint32_t variant_ct_limit, uint32_t sample_ct, uint32_t allele_ct_upper_bound, PgenWriteMode write_mode, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, STPgenWriter* spgwp, uintptr_t* alloc_cacheline_ct_ptr, uint32_t* max_vrec_len_ptr) { + assert(variant_ct_limit); + assert(sample_ct); + + // separate from MpgwInitPhase1's version of this computation since the + // latter wants a better bound on the compressed size of an entire vblock + // than max_vrec_len * kPglVblockSize... + uint64_t max_vrec_len = NypCtToByteCt(sample_ct); + uintptr_t max_alt_ct_p1 = 2; + if (allele_idx_offsets) { + const uint32_t variant_ct = variant_ct_limit; + if (allele_idx_offsets[variant_ct] != 2 * variant_ct) { + assert(allele_idx_offsets[0] == 0); + assert(allele_idx_offsets[variant_ct] > 2 * variant_ct); + max_alt_ct_p1 = 3; + uintptr_t prev_offset = 0; + for (uint32_t vidx = 1; vidx <= variant_ct; ++vidx) { + const uintptr_t cur_offset = allele_idx_offsets[vidx]; + if (cur_offset - prev_offset > max_alt_ct_p1) { + max_alt_ct_p1 = cur_offset - prev_offset; + } + prev_offset = cur_offset; + } + } + } else if (allele_ct_upper_bound) { + max_alt_ct_p1 = allele_ct_upper_bound; + } + if (max_alt_ct_p1 > 2) { + // see comments in middle of MpgwInitPhase1() + // bugfix (29 Mar 2023): forgot (sample_ct + 6) / 8 term + max_vrec_len += 2 + sizeof(AlleleCode) + (sample_ct + 6) / 8 + GetAux1bAlleleEntryByteCt(max_alt_ct_p1, sample_ct - 1); + // try to permit uncompressed records to be larger than this, only error + // out when trying to write a larger compressed record. + } + if (phase_dosage_gflags & kfPgenGlobalHardcallPhasePresent) { + // phasepresent, phaseinfo + max_vrec_len += 2 * DivUp(sample_ct, CHAR_BIT); + } + if (phase_dosage_gflags & kfPgenGlobalDosagePresent) { + const uint32_t dphase_gflag = (phase_dosage_gflags / kfPgenGlobalDosagePhasePresent) & 1; + // aux3, aux7 + max_vrec_len += (1 + dphase_gflag) * DivUp(sample_ct, 8); + // aux4, aux8 + max_vrec_len += (2 + 2 * dphase_gflag) * S_CAST(uint64_t, sample_ct); + // todo: multiallelic dosage + } +#ifdef __LP64__ + if (max_vrec_len >= kPglMaxBytesPerVariant) { + max_vrec_len = kPglMaxBytesPerVariant; + } +#else + if (unlikely(max_vrec_len > kMaxBytesPerIO + 1 - kPglFwriteBlockSize)) { + return kPglRetNomem; + } +#endif + *max_vrec_len_ptr = max_vrec_len; + const uintptr_t vrec_len_byte_ct = BytesToRepresentNzU32(max_vrec_len); + + PgenWriterCommon* pwcp = GetPwcp(spgwp); + FILE** pgen_outfilep = GetPgenOutfilep(spgwp); + FILE** pgi_or_final_pgen_outfilep = GetPgiOrFinalPgenOutfilep(spgwp); + char** fname_bufp = GetFnameBufp(spgwp); + PglErr reterr = PwcInitPhase1(fname, explicit_nonref_flags, header_exts, footer_exts, variant_ct_limit, sample_ct, write_mode, phase_dosage_gflags, nonref_flags_storage, vrec_len_byte_ct, pwcp, pgen_outfilep, pgi_or_final_pgen_outfilep, fname_bufp); + if (likely(!reterr)) { + *alloc_cacheline_ct_ptr = CountSpgwAllocCachelinesRequired(variant_ct_limit, sample_ct, phase_dosage_gflags, max_vrec_len); + } + return reterr; +} + +static_assert(kPglMaxAlleleCt == 255, "Need to update MpgwInitPhase1()."); +void MpgwInitPhase1(const uintptr_t* __restrict allele_idx_offsets, uint32_t variant_ct, uint32_t sample_ct, PgenGlobalFlags phase_dosage_gflags, uintptr_t* alloc_base_cacheline_ct_ptr, uint64_t* alloc_per_thread_cacheline_ct_ptr, uint32_t* vrec_len_byte_ct_ptr, uint64_t* vblock_cacheline_ct_ptr) { + // variant_ct must be exact due to how MpgwFlush() works. + assert(variant_ct); + assert(sample_ct); + // vblock_fpos + const uint32_t vblock_ct = DivUp(variant_ct, kPglVblockSize); + uint32_t alloc_base_cacheline_ct = Int64CtToCachelineCt(vblock_ct); + + // vrtype_buf + if (phase_dosage_gflags) { + alloc_base_cacheline_ct += DivUp(variant_ct, kCacheline); + } else { + alloc_base_cacheline_ct += DivUp(variant_ct, kCacheline * 2); + } + + // pwcs + uint64_t alloc_per_thread_cacheline_ct = DivUp(sizeof(PgenWriterCommon), kCacheline); + + // genovec_hets_buf, genovec_invert_buf, ldbase_genovec + // (could make genovec_hets_buf allocation conditional, but not worth the + // additional load this puts on import functions) + alloc_per_thread_cacheline_ct += 3 * NypCtToCachelineCt(sample_ct); + + const uint32_t max_difflist_len = 2 * (sample_ct / kPglMaxDifflistLenDivisor); + // ldbase_raregeno + alloc_per_thread_cacheline_ct += NypCtToCachelineCt(max_difflist_len); + + // ldbase_difflist_sample_ids + alloc_per_thread_cacheline_ct += 1 + (max_difflist_len / kInt32PerCacheline); + + uint64_t max_vrec_len = NypCtToByteCt(sample_ct); + if (phase_dosage_gflags & kfPgenGlobalHardcallPhasePresent) { + max_vrec_len += 2 * DivUp(sample_ct, CHAR_BIT); + } + const uint32_t dosage_gflag = (phase_dosage_gflags / kfPgenGlobalDosagePresent) & 1; + const uint32_t dosage_phase_gflag = (phase_dosage_gflags / kfPgenGlobalDosagePhasePresent) & 1; + if (dosage_gflag) { + max_vrec_len += ((1 + dosage_phase_gflag) * DivUp(sample_ct, CHAR_BIT)) + (2 + 2 * dosage_phase_gflag) * S_CAST(uint64_t, sample_ct); + } + const uint32_t max_vblock_size = MINV(variant_ct, kPglVblockSize); + // max_vrec_len is a uint64_t + uint64_t max_vblock_byte_ct = max_vrec_len * max_vblock_size; + if (allele_idx_offsets && (allele_idx_offsets[variant_ct] != 2 * variant_ct)) { + assert(allele_idx_offsets[0] == 0); + assert(allele_idx_offsets[variant_ct] > 2 * variant_ct); + // When multiallelic variants are present, larger write buffers are needed. + // we compute the largest possible size here. + // + // One way to hit the maximum size is to have exactly 1 ref/altx genotype, + // all other genotypes are altx/alty, and all genotypes include a rarealt. + // Then, we need 2 + sizeof(AlleleCode) bytes for the format byte and + // aux1a, (sample_ct + 6) / 8 bytes for the bitarray at the front of aux1b, + // and + // alt ct aux1_last_quarter bytes required + // ------ -------------------------------- + // 2 (sample_ct - 1 + 7) / 8 + // 3-4 (sample_ct - 1 + 1) / 2 + // 5-16 (sample_ct - 1) + // 17-256 2 * (sample_ct - 1) + // + // (todo: also take multiallelic dosage into account) + assert(!dosage_gflag); + uintptr_t prev_offset = 0; + uint32_t vidx = 0; + const uint32_t extra_bytes_base = 2 + sizeof(AlleleCode) + (sample_ct + 6) / 8; + const uint64_t extra_bytes_max = kPglMaxBytesPerVariant - max_vrec_len; + uint64_t extra_byte_cts[4]; + uint32_t extra_alt_ceil = kPglMaxAlleleCt; + + // alt_ct == 2 + uint64_t cur_extra_byte_ct = extra_bytes_base + (sample_ct + 6) / 8; + extra_byte_cts[0] = cur_extra_byte_ct; + extra_byte_cts[1] = 0; // force initialization + extra_byte_cts[2] = 0; + extra_byte_cts[3] = 0; + if (cur_extra_byte_ct >= extra_bytes_max) { + extra_alt_ceil = 2; + } else { + // alt_ct in [3, 4] + cur_extra_byte_ct = extra_bytes_base + (sample_ct / 2); + extra_byte_cts[1] = cur_extra_byte_ct; + if (cur_extra_byte_ct >= extra_bytes_max) { + extra_alt_ceil = 3; + } else { + // alt_ct in [5, 16] + cur_extra_byte_ct = extra_bytes_base + sample_ct - 1; + extra_byte_cts[2] = cur_extra_byte_ct; + if (cur_extra_byte_ct >= extra_bytes_max) { + extra_alt_ceil = 5; + } else { + // alt_ct in [17, 256] + cur_extra_byte_ct = extra_bytes_base + 2 * (sample_ct - 1); + extra_byte_cts[3] = cur_extra_byte_ct; + if (cur_extra_byte_ct >= extra_bytes_max) { + extra_alt_ceil = 16; + } + } + } + } + uint32_t extra_alt_ceil_ct = 0; + const uint64_t uncompressed_biallelic_vrec_len = max_vrec_len; + uint32_t altx_seen_mask = 0; + uint32_t max_alt_ct_p1 = 3; + for (uint32_t vblock_end = vidx + kPglVblockSize; ; vblock_end += kPglVblockSize) { + if (vblock_end > variant_ct) { + if (vidx == variant_ct) { + break; + } + vblock_end = variant_ct; + } + uint64_t cur_vblock_byte_ct = uncompressed_biallelic_vrec_len * (vblock_end - vidx); + uint32_t altx_seen[4]; + ZeroU32Arr(4, altx_seen); + for (; vidx < vblock_end; ) { + const uintptr_t cur_offset = allele_idx_offsets[++vidx]; + const uint32_t alt_ct_p1 = cur_offset - prev_offset; + if (alt_ct_p1 > 2) { + if (alt_ct_p1 >= extra_alt_ceil) { + ++extra_alt_ceil_ct; + } else { + // don't need to track this when we hit the ceiling + if (alt_ct_p1 > max_alt_ct_p1) { + max_alt_ct_p1 = alt_ct_p1; + } + + if (alt_ct_p1 < 6) { + altx_seen[(alt_ct_p1 != 3)] += 1; + } else { + altx_seen[2 + (alt_ct_p1 >= 17)] += 1; + } + } + } + prev_offset = cur_offset; + } + cur_vblock_byte_ct += extra_alt_ceil_ct * extra_bytes_max; + for (uint32_t uii = 0; uii != 4; ++uii) { + if (altx_seen[uii]) { + const uint32_t cur_seen_ct = altx_seen[uii]; + altx_seen_mask |= 1 << uii; + cur_vblock_byte_ct += cur_seen_ct * extra_byte_cts[uii]; + } + } + if (cur_vblock_byte_ct > max_vblock_byte_ct) { + max_vblock_byte_ct = cur_vblock_byte_ct; + } + } + if (extra_alt_ceil_ct) { + max_vrec_len = kPglMaxBytesPerVariant; + } else { + max_vrec_len = uncompressed_biallelic_vrec_len + extra_byte_cts[bsru32(altx_seen_mask)]; + } + } + if (max_vrec_len >= kPglMaxBytesPerVariant) { + max_vrec_len = kPglMaxBytesPerVariant; + max_vblock_byte_ct = kPglMaxBytesPerVariant * S_CAST(uint64_t, max_vblock_size); + } + + // vrec_len_buf + // previously used overlapping uint32_t writes-to-memory, but that was + // incompatible with multithreaded compression + *vrec_len_byte_ct_ptr = BytesToRepresentNzU32(max_vrec_len); + *alloc_base_cacheline_ct_ptr = alloc_base_cacheline_ct + DivUp(S_CAST(uintptr_t, variant_ct) * (*vrec_len_byte_ct_ptr), kCacheline); + + // main write buffer + *vblock_cacheline_ct_ptr = DivUpU64(max_vblock_byte_ct, kCacheline); + *alloc_per_thread_cacheline_ct_ptr = alloc_per_thread_cacheline_ct + (*vblock_cacheline_ct_ptr); +} + + +void PwcInitPhase2(uintptr_t fwrite_cacheline_ct, uint32_t thread_ct, PgenWriterCommon** pwcs, unsigned char* pwc_alloc) { + const uint32_t variant_ct_limit = pwcs[0]->variant_ct_limit; + + unsigned char* alloc_iter = pwc_alloc; + const uint32_t vblock_ct = DivUp(variant_ct_limit, kPglVblockSize); + const PgenGlobalFlags phase_dosage_gflags = pwcs[0]->phase_dosage_gflags; + uint32_t vrtype_buf_byte_ct; + if (phase_dosage_gflags) { + vrtype_buf_byte_ct = RoundUpPow2(variant_ct_limit, kCacheline); + } else { + vrtype_buf_byte_ct = DivUp(variant_ct_limit, kCacheline * 2) * kCacheline; + } + pwcs[0]->vblock_fpos = S_CAST(uint64_t*, arena_alloc_raw_rd(vblock_ct * sizeof(int64_t), &alloc_iter)); + + pwcs[0]->vrec_len_buf = S_CAST(unsigned char*, arena_alloc_raw_rd(variant_ct_limit * pwcs[0]->vrec_len_byte_ct, &alloc_iter)); + + pwcs[0]->vrtype_buf = S_CAST(uintptr_t*, arena_alloc_raw(vrtype_buf_byte_ct, &alloc_iter)); + // the PwcAppend... functions assume these bytes are zeroed out + memset(pwcs[0]->vrtype_buf, 0, vrtype_buf_byte_ct); + + const uint32_t sample_ct = pwcs[0]->sample_ct; + const uint32_t genovec_byte_alloc = NypCtToCachelineCt(sample_ct) * kCacheline; + const uint32_t max_difflist_len = 2 * (sample_ct / kPglMaxDifflistLenDivisor); + for (uint32_t tidx = 0; tidx != thread_ct; ++tidx) { + if (tidx) { + pwcs[tidx]->vblock_fpos = pwcs[0]->vblock_fpos; + pwcs[tidx]->vrec_len_buf = pwcs[0]->vrec_len_buf; + pwcs[tidx]->vrtype_buf = pwcs[0]->vrtype_buf; + } + pwcs[tidx]->genovec_hets_buf = S_CAST(uintptr_t*, arena_alloc_raw(genovec_byte_alloc, &alloc_iter)); + pwcs[tidx]->genovec_invert_buf = S_CAST(uintptr_t*, arena_alloc_raw(genovec_byte_alloc, &alloc_iter)); + pwcs[tidx]->ldbase_genovec = S_CAST(uintptr_t*, arena_alloc_raw(genovec_byte_alloc, &alloc_iter)); + + pwcs[tidx]->ldbase_raregeno = S_CAST(uintptr_t*, arena_alloc_raw(NypCtToCachelineCt(max_difflist_len) * kCacheline, &alloc_iter)); + pwcs[tidx]->ldbase_difflist_sample_ids = S_CAST(uint32_t*, arena_alloc_raw_rd((max_difflist_len + 1) * sizeof(int32_t), &alloc_iter)); + + pwcs[tidx]->fwrite_buf = alloc_iter; + pwcs[tidx]->fwrite_bufp = alloc_iter; + alloc_iter = &(alloc_iter[fwrite_cacheline_ct * kCacheline]); + } +} + +void SpgwInitPhase2(uint32_t max_vrec_len, STPgenWriter* spgwp, unsigned char* spgw_alloc) { + const uintptr_t fwrite_cacheline_ct = DivUp(max_vrec_len + kPglFwriteBlockSize + (5 + sizeof(AlleleCode)) * kPglDifflistGroupSize, kCacheline); + PgenWriterCommon* pwcp = GetPwcp(spgwp); + PwcInitPhase2(fwrite_cacheline_ct, 1, &pwcp, spgw_alloc); +} + +PglErr MpgwInitPhase2Ex(const char* __restrict fname, uintptr_t* __restrict explicit_nonref_flags, PgenExtensionLl* header_exts, PgenExtensionLl* footer_exts, uint32_t variant_ct, uint32_t sample_ct, PgenWriteMode write_mode, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, uint32_t vrec_len_byte_ct, uintptr_t vblock_cacheline_ct, uint32_t thread_ct, unsigned char* mpgw_alloc, MTPgenWriter* mpgwp) { + assert(thread_ct); + const uintptr_t pwc_byte_ct = RoundUpPow2(sizeof(PgenWriterCommon), kCacheline); + for (uint32_t tidx = 0; tidx != thread_ct; ++tidx) { + mpgwp->pwcs[tidx] = R_CAST(PgenWriterCommon*, &(mpgw_alloc[tidx * pwc_byte_ct])); + } + PglErr reterr = PwcInitPhase1(fname, explicit_nonref_flags, header_exts, footer_exts, variant_ct, sample_ct, write_mode, phase_dosage_gflags, nonref_flags_storage, vrec_len_byte_ct, mpgwp->pwcs[0], &(mpgwp->pgen_outfile), &(mpgwp->pgi_or_final_pgen_outfile), &(mpgwp->fname_buf)); + if (unlikely(reterr)) { + return reterr; + } + mpgwp->thread_ct = thread_ct; + for (uint32_t tidx = 1; tidx != thread_ct; ++tidx) { + *(mpgwp->pwcs[tidx]) = *(mpgwp->pwcs[0]); + mpgwp->pwcs[tidx]->vidx = tidx * kPglVblockSize; + } + PwcInitPhase2(vblock_cacheline_ct, thread_ct, mpgwp->pwcs, &(mpgw_alloc[thread_ct * pwc_byte_ct])); + return kPglRetSuccess; +} + + +void CountLdAndInvertedLdDiffs(const uintptr_t* __restrict ldbase_genovec, const uintptr_t* __restrict genovec, uint32_t sample_ct, uint32_t* ld_diff_ctp, uint32_t* ld_inv_diff_ctp) { + // Requires trailing bits to be zeroed out. + const uint32_t word_ct = NypCtToWordCt(sample_ct); + const uintptr_t* genovec_end = &(genovec[word_ct]); + uint32_t ld_diff_ct = 0; + uint32_t ld_inv_diff_ct = 0; + // construct the words we want to popcount_nypvec_01 on the fly + const VecW m1 = VCONST_W(kMask5555); + const VecW m2 = VCONST_W(kMask3333); + const VecW m4 = VCONST_W(kMask0F0F); + const VecW* ldbase_vvec_iter = R_CAST(const VecW*, ldbase_genovec); + const VecW* geno_vvec_iter = R_CAST(const VecW*, genovec); + VecW acc_ld = vecw_setzero(); + VecW acc_ld_inv = vecw_setzero(); + uintptr_t cur_incr = 60; + for (uintptr_t full_vecs_left = 3 * (word_ct / (3 * kWordsPerVec)); ; full_vecs_left -= cur_incr) { + if (full_vecs_left < 60) { + if (!full_vecs_left) { + ld_diff_ct = HsumW(acc_ld); + ld_inv_diff_ct = HsumW(acc_ld_inv); + break; + } + cur_incr = full_vecs_left; + } + VecW inner_acc_ld = vecw_setzero(); + VecW inner_acc_ld_inv = vecw_setzero(); + const VecW* geno_vvec_stop = &(geno_vvec_iter[cur_incr]); + do { + VecW loader_ldbase1 = *ldbase_vvec_iter++; + VecW loader_geno1 = *geno_vvec_iter++; + VecW loader_ldbase2 = *ldbase_vvec_iter++; + VecW loader_geno2 = *geno_vvec_iter++; + VecW xor1 = loader_ldbase1 ^ loader_geno1; + VecW xor2 = loader_ldbase2 ^ loader_geno2; + VecW xor_shifted1 = vecw_srli(xor1, 1); + VecW xor_shifted2 = vecw_srli(xor2, 1); + // xor(_low) xor_shifted loader_geno result + // 1 1 + // 0 0 0 1 + // 0 0 1 0 + // 0 1 0 0 + // 0 1 1 1 + // gah, don't see a way to avoid throwing in an extra xor for + // loader_geno... + VecW count_ld_inv = (xor1 | (xor_shifted1 ^ loader_geno1 ^ m1)) & m1; + loader_ldbase1 = *ldbase_vvec_iter++; + VecW count_ld = (xor1 | xor_shifted1) & m1; + loader_geno1 = *geno_vvec_iter++; + count_ld_inv = count_ld_inv + ((xor2 | (xor_shifted2 ^ loader_geno2 ^ m1)) & m1); + xor1 = loader_ldbase1 ^ loader_geno1; + count_ld = count_ld + ((xor2 | xor_shifted2) & m1); + xor_shifted1 = vecw_srli(xor1, 1); + count_ld_inv = count_ld_inv + ((xor1 | (xor_shifted1 ^ loader_geno1 ^ m1)) & m1); + count_ld = count_ld + ((xor1 | xor_shifted1) & m1); + // now count_ld and count_ld_inv each have 64 2-bit values from 0-3 + + count_ld_inv = (count_ld_inv & m2) + (vecw_srli(count_ld_inv, 2) & m2); + count_ld = (count_ld & m2) + (vecw_srli(count_ld, 2) & m2); + // now they have 32 4-bit values from 0-6 + + inner_acc_ld_inv = inner_acc_ld_inv + ((count_ld_inv + vecw_srli(count_ld_inv, 4)) & m4); + inner_acc_ld = inner_acc_ld + ((count_ld + vecw_srli(count_ld, 4)) & m4); + } while (geno_vvec_iter < geno_vvec_stop); + const VecW m0 = vecw_setzero(); + acc_ld_inv = acc_ld_inv + vecw_bytesum(inner_acc_ld_inv, m0); + acc_ld = acc_ld + vecw_bytesum(inner_acc_ld, m0); + } + const uintptr_t* ldbase_iter = DowncastKVecWToW(ldbase_vvec_iter); + const uintptr_t* genovec_iter = DowncastKVecWToW(geno_vvec_iter); + while (genovec_iter < genovec_end) { + uintptr_t ldbase_word = *ldbase_iter++; + uintptr_t geno_word = *genovec_iter++; + uintptr_t xor_result = ldbase_word ^ geno_word; + uintptr_t xor_result_shifted = xor_result >> 1; + ld_diff_ct += Popcount01Word((xor_result | xor_result_shifted) & kMask5555); + ld_inv_diff_ct += Popcount01Word((xor_result | (xor_result_shifted ^ (~geno_word))) & kMask5555); + } + *ld_diff_ctp = ld_diff_ct; + // trailing entries in last word are always "different" + *ld_inv_diff_ctp = ld_inv_diff_ct - ((-sample_ct) & (kBitsPerWordD2 - 1)); +} + +void CountLdAndInvertedLdDiffsList(const uintptr_t* __restrict ldbase_raregeno, const uint32_t* __restrict ldbase_difflist_sample_ids, const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t ldbase_difflist_len, uint32_t difflist_len, uint32_t* ld_diff_ctp, uint32_t* ld_inv_diff_ctp) { + // assumes ldbase_difflist_sample_ids[ldbase_difflist_len] == sample_ct + + // only the count(s) with aligned common_geno values are valid. e.g. if + // ldbase_common_geno and difflist_common_geno are both zero, the ld_inv_diff + // return value can be anything, while if they're both three, ld_diff and + // ld_inv_diff are both accurate. + + // some similarities to ParseLdAndMergeDifflistSubset(), but much simpler. + // noticeably slower than CountLdAndInvertedLdDiffs() when the lists aren't + // tiny. + // possible todo: take threshold into account? + + uint32_t collision_ct = 0; + uint32_t ld_diff_ct = 0; + uint32_t ld_inv_diff_ct = 0; + uint32_t ldbase_sample_idx = ldbase_difflist_sample_ids[0]; + uint32_t ldbase_difflist_idx = 1; + // this loop is a bit slow. attempt to bail halfway through? + for (uint32_t difflist_idx = 0; difflist_idx != difflist_len; ++difflist_idx) { + const uint32_t raw_sample_idx = difflist_sample_ids[difflist_idx]; + while (ldbase_sample_idx < raw_sample_idx) { + ldbase_sample_idx = ldbase_difflist_sample_ids[ldbase_difflist_idx++]; + } + if (ldbase_sample_idx > raw_sample_idx) { + continue; + } + const uint32_t cur_raregeno = GetNyparrEntry(raregeno, difflist_idx); + const uint32_t cur_ldbase_raregeno = GetNyparrEntry(ldbase_raregeno, ldbase_difflist_idx - 1); + const uint32_t cur_inv_raregeno = (6 - cur_raregeno) & 3; + ld_diff_ct += (cur_ldbase_raregeno != cur_raregeno); + ldbase_sample_idx = ldbase_difflist_sample_ids[ldbase_difflist_idx++]; + ++collision_ct; + ld_inv_diff_ct += (cur_ldbase_raregeno != cur_inv_raregeno); + } + // no more collisions, don't actually need to look at rest of + // ldbase_difflist + const uint32_t base_diff_ct = ldbase_difflist_len + difflist_len - 2 * collision_ct; + *ld_diff_ctp = base_diff_ct + ld_diff_ct; + *ld_inv_diff_ctp = base_diff_ct + ld_inv_diff_ct; +} + +uint32_t SaveLdDifflist(const uintptr_t* __restrict genovec, const uintptr_t* __restrict ldbase_genovec, uintptr_t common_geno, uint32_t difflist_len, PgenWriterCommon* pwcp) { + unsigned char* fwrite_bufp = pwcp->fwrite_bufp; + if (!difflist_len) { + *fwrite_bufp = 0; + pwcp->fwrite_bufp = &(fwrite_bufp[1]); + return 1; + } + unsigned char* fwrite_bufp_start = fwrite_bufp; + fwrite_bufp = Vint32Append(difflist_len, fwrite_bufp); + const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(pwcp->sample_ct); + const uintptr_t common_geno_word = common_geno * kMask5555; + const uint32_t group_ct = DivUp(difflist_len, kPglDifflistGroupSize); + unsigned char* group_first_sample_ids_iter = fwrite_bufp; + unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]); + unsigned char* raregeno_biter = &(extra_byte_cts_iter[group_ct - 1]); + fwrite_bufp = &(extra_byte_cts_iter[group_ct + (difflist_len - 1) / 4]); + unsigned char* last_group_vint_start = fwrite_bufp; + uintptr_t raregeno_word = 0; + uint32_t last_sample_idx = 0; + uint32_t difflist_idx = 0; + for (uint32_t widx = 0; ; ++widx) { + const uintptr_t cur_geno_word = genovec[widx]; + uintptr_t xor_word = ldbase_genovec? ldbase_genovec[widx] : common_geno_word; + xor_word ^= cur_geno_word; + if (xor_word) { + const uint32_t sample_idx_base = widx * kBitsPerWordD2; + do { + const uint32_t sample_idx_lowbits = ctzw(xor_word) / 2; + const uint32_t new_sample_idx = sample_idx_base + sample_idx_lowbits; + raregeno_word |= ((cur_geno_word >> (2 * sample_idx_lowbits)) & 3) << (2 * (difflist_idx % kBitsPerWordD2)); + if (!(difflist_idx % kPglDifflistGroupSize)) { + group_first_sample_ids_iter = memcpyua(group_first_sample_ids_iter, &new_sample_idx, sample_id_byte_ct); + if (difflist_idx) { + *extra_byte_cts_iter++ = S_CAST(uintptr_t, fwrite_bufp - last_group_vint_start) - (kPglDifflistGroupSize - 1); + } + last_group_vint_start = fwrite_bufp; + } else { + assert(new_sample_idx >= last_sample_idx + 1); + fwrite_bufp = Vint32Append(new_sample_idx - last_sample_idx, fwrite_bufp); + } + ++difflist_idx; + last_sample_idx = new_sample_idx; + if (difflist_idx == difflist_len) { + SubwordStore(raregeno_word, 1 + (((difflist_len - 1) / 4) % sizeof(intptr_t)), raregeno_biter); + pwcp->fwrite_bufp = fwrite_bufp; + return fwrite_bufp - fwrite_bufp_start; + } + if (!(difflist_idx % kBitsPerWordD2)) { + AppendW(raregeno_word, &raregeno_biter); + raregeno_word = 0; + } + xor_word &= (~(3 * k1LU)) << (2 * sample_idx_lowbits); + } while (xor_word); + } + } +} + +void OnebitPreprocessBuf(const uintptr_t* __restrict genovec, uint32_t sample_ct, uint32_t common2_code, uintptr_t* __restrict genovec_buf) { + assert(sample_ct); + const uint32_t vec_ct = NypCtToVecCt(sample_ct); + // todo: look for better ways to perform some of these operations + const VecW* geno_vvec_iter = R_CAST(const VecW*, genovec); + const VecW* geno_vvec_end = &(geno_vvec_iter[vec_ct]); + VecW* write_iter = R_CAST(VecW*, genovec_buf); + const VecW m1 = VCONST_W(kMask5555); + if (common2_code < 5) { + if (common2_code == 1) { + // 11 -> 10, everything else unchanged + // todo: check if these loops are actually faster as simple while loops + // todo: check if it's better to unroll these loops to process 2 __m128is + // at a time + do { + const VecW cur_geno = *geno_vvec_iter++; + *write_iter++ = vecw_and_notfirst(m1 & vecw_srli(cur_geno, 1), cur_geno); + } while (geno_vvec_iter < geno_vvec_end); + } else if (common2_code == 3) { + // 00 -> 00, 01 -> 10, 10 -> 10, 11 -> 01 + do { + const VecW cur_geno = *geno_vvec_iter++; + const VecW cur_geno_rshift = vecw_srli(cur_geno, 1); + const VecW cur_geno_xor_masked = (cur_geno ^ cur_geno_rshift) & m1; + const VecW cur_geno_or_masked = (cur_geno | cur_geno_rshift) & m1; + *write_iter++ = cur_geno_xor_masked + cur_geno_or_masked; + } while (geno_vvec_iter < geno_vvec_end); + } else { + assert(common2_code == 2); + // 00 -> 00, 01 -> 10, 10 -> 01, 11 -> 10 + do { + const VecW cur_geno = *geno_vvec_iter++; + const VecW cur_geno_or_masked = (cur_geno | vecw_srli(cur_geno, 1)) & m1; + const VecW cur_geno_lowbits = cur_geno & m1; + *write_iter++ = cur_geno_lowbits + cur_geno_or_masked; + } while (geno_vvec_iter < geno_vvec_end); + } + } else { + if (common2_code == 5) { + // 00 -> 10, 01 -> 00, 10 -> 01, 11 -> 10 + do { + const VecW cur_geno = *geno_vvec_iter++; + const VecW cur_geno_rshift = vecw_srli(cur_geno, 1); + const VecW cur_geno_not_xor_masked = vecw_and_notfirst(cur_geno ^ cur_geno_rshift, m1); + const VecW cur_geno_rshift_masked = cur_geno_rshift & m1; + *write_iter++ = cur_geno_not_xor_masked + (cur_geno_not_xor_masked | cur_geno_rshift_masked); + } while (geno_vvec_iter < geno_vvec_end); + } else if (common2_code == 9) { + // 00 -> 10, 01 -> 10, 10 -> 00, 11 -> 01 + const VecW not_m1 = VCONST_W(kMaskAAAA); + do { + const VecW cur_geno = *geno_vvec_iter++; + *write_iter++ = (cur_geno ^ not_m1) - vecw_and_notfirst(not_m1, vecw_and_notfirst(vecw_srli(cur_geno, 1), cur_geno)); + } while (geno_vvec_iter < geno_vvec_end); + } else { + assert(common2_code == 6); + // 00 -> 10, 01 -> 00, 10 -> 10, 11 -> 01 + do { + const VecW cur_geno = *geno_vvec_iter++; + const VecW cur_geno_not_lowbits = vecw_and_notfirst(cur_geno, m1); + const VecW cur_geno_rshift_masked = vecw_srli(cur_geno, 1) & m1; + *write_iter++ = cur_geno_not_lowbits + (cur_geno_not_lowbits | cur_geno_rshift_masked); + } while (geno_vvec_iter < geno_vvec_end); + } + } +} + +uint32_t SaveOnebit(const uintptr_t* __restrict genovec, uint32_t common2_code, uint32_t onebit_difflist_len, PgenWriterCommon* pwcp) { + // Uses ldbase_genovec as a temporary buffer. + + // common2_code is expected to have the difference between the common + // genotype values in bits 0-1, and the smaller common genotype value in bits + // 2-3. + unsigned char* fwrite_bufp_start = pwcp->fwrite_bufp; + *fwrite_bufp_start = common2_code; + const uint32_t sample_ct = pwcp->sample_ct; + uintptr_t* __restrict genovec_buf = pwcp->ldbase_genovec; + // There's a 4-byte-interleaved format which is slightly more efficient for + // unsubsetted handling (~10 fewer compression/decompression operations per + // 32 genotypes), but that's only a 1-2% speedup, which probably isn't worth + // the more annoying subsetting. + // + // Any 10s and 11s are saved as 00 in this part. + // Similar logic is used to handle the other five possibilities (00/10, + // 00/11, 01/10, 01/11, 10/11); all of them should be expected to actually + // happen. (E.g. 01/11 can happen at a high MAF variant when there's lots of + // missing data.) To reduce branching, we preprocess genovec_buf to have + // even bit set iff the corresponding genotype is equal to the high common + // genotype value, and odd bit set iff the corresponding genotype is one of + // the two uncommon values. (There may be a better way to do this, analogous + // to the simpler decompression algorithm.) + OnebitPreprocessBuf(genovec, sample_ct, common2_code, genovec_buf); + ZeroTrailingNyps(sample_ct, genovec_buf); + const uint32_t word_read_ct = NypCtToWordCt(sample_ct); + PackWordsToHalfwordsMask(genovec_buf, word_read_ct, &(fwrite_bufp_start[1])); + const uint32_t onebit_block_len = (sample_ct + 15) / CHAR_BIT; + unsigned char* fwrite_bufp = Vint32Append(onebit_difflist_len, &(fwrite_bufp_start[onebit_block_len])); + // the rest is almost identical to SaveLdDifflist() + if (!onebit_difflist_len) { + pwcp->fwrite_bufp = fwrite_bufp; + return (onebit_block_len + 1); + } + const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(pwcp->sample_ct); + const uint32_t group_ct = DivUp(onebit_difflist_len, kPglDifflistGroupSize); + unsigned char* group_first_sample_ids_iter = fwrite_bufp; + unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]); + unsigned char* raregeno_biter = &(extra_byte_cts_iter[group_ct - 1]); + fwrite_bufp = &(extra_byte_cts_iter[group_ct + (onebit_difflist_len - 1) / 4]); + unsigned char* last_group_vint_start = fwrite_bufp; + uintptr_t raregeno_word = 0; + uint32_t last_sample_idx = 0; + uint32_t difflist_idx = 0; + for (uint32_t widx = 0; ; ++widx) { + uintptr_t xor_word = genovec_buf[widx] & kMaskAAAA; + if (xor_word) { + const uintptr_t cur_geno_word = genovec[widx]; + const uint32_t sample_idx_base = widx * kBitsPerWordD2; + + do { + const uint32_t sample_idx_lowbits = ctzw(xor_word) / 2; + const uint32_t new_sample_idx = sample_idx_base + sample_idx_lowbits; + if (!(difflist_idx % kBitsPerWordD2)) { + if (!(difflist_idx % kPglDifflistGroupSize)) { + SubU32StoreMov(new_sample_idx, sample_id_byte_ct, &group_first_sample_ids_iter); + if (difflist_idx) { + *extra_byte_cts_iter++ = S_CAST(uintptr_t, fwrite_bufp - last_group_vint_start) - (kPglDifflistGroupSize - 1); + AppendW(raregeno_word, &raregeno_biter); + raregeno_word = 0; + } + last_group_vint_start = fwrite_bufp; + goto SaveOnebit_skip_delta_write; + } + AppendW(raregeno_word, &raregeno_biter); + raregeno_word = 0; + } + assert(new_sample_idx >= last_sample_idx + 1); + fwrite_bufp = Vint32Append(new_sample_idx - last_sample_idx, fwrite_bufp); + SaveOnebit_skip_delta_write: + raregeno_word |= ((cur_geno_word >> (2 * sample_idx_lowbits)) & 3) << (2 * (difflist_idx % kBitsPerWordD2)); + ++difflist_idx; + last_sample_idx = new_sample_idx; + xor_word &= xor_word - 1; + } while (xor_word); + // trailing bits of genovec_buf guaranteed to be zeroed out + if (difflist_idx == onebit_difflist_len) { + SubwordStore(raregeno_word, 1 + (((onebit_difflist_len - 1) / 4) % sizeof(intptr_t)), raregeno_biter); + pwcp->fwrite_bufp = fwrite_bufp; + return fwrite_bufp - fwrite_bufp_start; + } + } + } +} + +// returns vrec_len +uint32_t PwcAppendBiallelicGenovecMain(const uintptr_t* __restrict genovec, uint32_t vidx, PgenWriterCommon* pwcp, uint32_t* het_ct_ptr, uint32_t* altxy_ct_ptr, unsigned char* vrtype_ptr) { + const uint32_t sample_ct = pwcp->sample_ct; + assert((!(sample_ct % kBitsPerWordD2)) || (!(genovec[sample_ct / kBitsPerWordD2] >> (2 * (sample_ct % kBitsPerWordD2))))); + STD_ARRAY_DECL(uint32_t, 4, genocounts); + GenoarrCountFreqsUnsafe(genovec, sample_ct, genocounts); + if (het_ct_ptr) { + *het_ct_ptr = genocounts[1]; + if (altxy_ct_ptr) { + *altxy_ct_ptr = genocounts[2]; + } + } + uint32_t most_common_geno = (genocounts[1] > genocounts[0]); + uint32_t second_most_common_geno = 1 - most_common_geno; + uint32_t largest_geno_ct = genocounts[most_common_geno]; + uint32_t second_largest_geno_ct = genocounts[second_most_common_geno]; + for (uint32_t cur_geno = 2; cur_geno != 4; ++cur_geno) { + const uint32_t cur_geno_ct = genocounts[cur_geno]; + if (cur_geno_ct > second_largest_geno_ct) { + if (cur_geno_ct > largest_geno_ct) { + second_largest_geno_ct = largest_geno_ct; + second_most_common_geno = most_common_geno; + largest_geno_ct = cur_geno_ct; + most_common_geno = cur_geno; + } else { + second_largest_geno_ct = cur_geno_ct; + second_most_common_geno = cur_geno; + } + } + } + const uint32_t difflist_len = sample_ct - largest_geno_ct; + const uint32_t rare_2_geno_ct_sum = difflist_len - second_largest_geno_ct; + // average of 10-11 bits per difflist entry + const uint32_t sample_ctd8 = sample_ct / 8; + const uint32_t sample_ctd64 = sample_ct / 64; + uint32_t max_difflist_len = sample_ctd8 - 2 * sample_ctd64 + rare_2_geno_ct_sum; + if (max_difflist_len > sample_ctd8) { + max_difflist_len = sample_ctd8; + } + const uint32_t difflist_viable = (most_common_geno != 1) && (difflist_len <= max_difflist_len); + + uintptr_t* ldbase_genovec = pwcp->ldbase_genovec; + STD_ARRAY_REF(uint32_t, 4) ldbase_genocounts = pwcp->ldbase_genocounts; + if (!(vidx % kPglVblockSize)) { + // beginning of a variant block. save raw fpos in header; LD compression + // prohibited. + + // er, need to use a relative offset in the multithreaded case, absolute + // position isn't known + pwcp->vblock_fpos[vidx / kPglVblockSize] = pwcp->vblock_fpos_offset + S_CAST(uintptr_t, pwcp->fwrite_bufp - pwcp->fwrite_buf); + } else if (difflist_len > sample_ctd64) { + // do not use LD compression if there are at least this many differences. + // tune this threshold in the future. + const uint32_t ld_diff_threshold = difflist_viable? (difflist_len - sample_ctd64) : max_difflist_len; + // number of changes between current genovec and LD reference is bounded + // below by sum(genocounts[x] - ldbase_genocounts[x]) / 2 + const int32_t count02_limit = 2 * ld_diff_threshold - abs_i32(genocounts[1] - ldbase_genocounts[1]) + abs_i32(genocounts[3] - ldbase_genocounts[3]); + if ((S_CAST(int32_t, abs_i32(genocounts[0] - ldbase_genocounts[0]) + abs_i32(genocounts[2] - ldbase_genocounts[2])) < count02_limit) || (S_CAST(int32_t, abs_i32(genocounts[0] - ldbase_genocounts[2]) + abs_i32(genocounts[2] - ldbase_genocounts[0])) < count02_limit)) { + uint32_t ld_diff_ct; + uint32_t ld_inv_diff_ct; + // okay, perform a brute-force diff + // (could check LD vs. inverted LD separately?) + if (pwcp->ldbase_common_geno < 4) { + // unpack to ldbase_genovec + PgrDifflistToGenovecUnsafe(pwcp->ldbase_raregeno, pwcp->ldbase_difflist_sample_ids, pwcp->ldbase_common_geno, sample_ct, pwcp->ldbase_difflist_len, ldbase_genovec); + ZeroTrailingNyps(sample_ct, ldbase_genovec); + pwcp->ldbase_common_geno = UINT32_MAX; + } + CountLdAndInvertedLdDiffs(ldbase_genovec, genovec, sample_ct, &ld_diff_ct, &ld_inv_diff_ct); + if ((ld_diff_ct < ld_diff_threshold) || (ld_inv_diff_ct < ld_diff_threshold)) { + const uintptr_t invert_before_compressing = (ld_inv_diff_ct < ld_diff_ct); + *vrtype_ptr = 2 + invert_before_compressing; + if (invert_before_compressing) { + GenovecInvertCopyUnsafe(genovec, sample_ct, pwcp->genovec_invert_buf); + ld_diff_ct = ld_inv_diff_ct; + } + return SaveLdDifflist(invert_before_compressing? pwcp->genovec_invert_buf : genovec, ldbase_genovec, 0, ld_diff_ct, pwcp); + } + } + } + const uint32_t genovec_word_ct = NypCtToWordCt(sample_ct); + STD_ARRAY_COPY(genocounts, 4, ldbase_genocounts); + pwcp->ldbase_common_geno = UINT32_MAX; + if ((!difflist_viable) && (rare_2_geno_ct_sum < sample_ct / (2 * kPglMaxDifflistLenDivisor))) { + *vrtype_ptr = 1; + uint32_t larger_common_geno = second_most_common_geno; + uint32_t smaller_common_geno = most_common_geno; + if (most_common_geno > second_most_common_geno) { + larger_common_geno = most_common_geno; + smaller_common_geno = second_most_common_geno; + } + const uint32_t vrec_len = SaveOnebit(genovec, larger_common_geno + (smaller_common_geno * 3), rare_2_geno_ct_sum, pwcp); + memcpy(ldbase_genovec, genovec, genovec_word_ct * sizeof(intptr_t)); + return vrec_len; + } + memcpy(ldbase_genovec, genovec, genovec_word_ct * sizeof(intptr_t)); + if (difflist_viable) { +#ifdef FUTURE_ENCODER + if ((!difflist_len) && (!most_common_geno)) { + *vrtype_ptr = 5; + return 0; + } +#endif + *vrtype_ptr = 4 + most_common_geno; + return SaveLdDifflist(genovec, nullptr, most_common_geno, difflist_len, pwcp); + } + *vrtype_ptr = 0; + const uint32_t vrec_len = NypCtToByteCt(sample_ct); + pwcp->fwrite_bufp = memcpyua(pwcp->fwrite_bufp, genovec, vrec_len); + return vrec_len; +} + +void PwcAppendBiallelicGenovec(const uintptr_t* __restrict genovec, PgenWriterCommon* pwcp) { + const uint32_t vidx = pwcp->vidx; + unsigned char vrtype; + const uint32_t vrec_len = PwcAppendBiallelicGenovecMain(genovec, vidx, pwcp, nullptr, nullptr, &vrtype); + const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct; + pwcp->vidx += 1; + SubU32Store(vrec_len, vrec_len_byte_ct, &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct])); + // could have a single expression which branchlessly handles both cases, but + // doubt that's worthwhile + if (!pwcp->phase_dosage_gflags) { + pwcp->vrtype_buf[vidx / kBitsPerWordD4] |= S_CAST(uintptr_t, vrtype) << (4 * (vidx % kBitsPerWordD4)); + } else { + DowncastToUc(pwcp->vrtype_buf)[vidx] = vrtype; + } +} + +BoolErr SpgwFlush(STPgenWriter* spgwp) { + PgenWriterCommon* pwcp = GetPwcp(spgwp); + if (pwcp->fwrite_bufp >= &(pwcp->fwrite_buf[kPglFwriteBlockSize])) { + const uintptr_t cur_byte_ct = pwcp->fwrite_bufp - pwcp->fwrite_buf; + FILE** pgen_outfilep = GetPgenOutfilep(spgwp); + if (unlikely(fwrite_checked(pwcp->fwrite_buf, cur_byte_ct, *pgen_outfilep))) { + return 1; + } + pwcp->vblock_fpos_offset += cur_byte_ct; + pwcp->fwrite_bufp = pwcp->fwrite_buf; + } + return 0; +} + + +uint32_t SaveLdTwoListDelta(const uintptr_t* __restrict difflist_raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t ld_diff_ct, PgenWriterCommon* pwcp) { + // assumes ldbase_difflist_sample_ids[ldbase_difflist_len] == sample_ct, and + // difflist_sample_ids[ldbase_difflist_len] == sample_ct. + // assumes biallelic data. + + // similar to SaveLdDifflist() and, to a lesser degree, + // ParseLdAndMergeDifflistSubset() + unsigned char* fwrite_bufp = pwcp->fwrite_bufp; + if (!ld_diff_ct) { + *fwrite_bufp = 0; + pwcp->fwrite_bufp = &(fwrite_bufp[1]); + return 1; + } + unsigned char* fwrite_bufp_start = fwrite_bufp; + fwrite_bufp = Vint32Append(ld_diff_ct, fwrite_bufp); + const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(pwcp->sample_ct); + const uint32_t group_ct = DivUp(ld_diff_ct, kPglDifflistGroupSize); + const uint32_t ldbase_common_geno = pwcp->ldbase_common_geno; + assert(ldbase_common_geno < 4); + const uintptr_t* __restrict ldbase_raregeno = pwcp->ldbase_raregeno; + const uint32_t* __restrict ldbase_sample_ids = pwcp->ldbase_difflist_sample_ids; + unsigned char* group_first_sample_ids_iter = fwrite_bufp; + unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]); + unsigned char* raregeno_write_biter = &(extra_byte_cts_iter[group_ct - 1]); + fwrite_bufp = &(extra_byte_cts_iter[group_ct + (ld_diff_ct - 1) / 4]); + unsigned char* last_group_vint_start = fwrite_bufp; + uintptr_t ldbase_raregeno_word = 0; + uintptr_t difflist_raregeno_word = 0; + uintptr_t raregeno_write_word = 0; + uint32_t last_sample_idx = 0; + + uint32_t next_ldbase_sample_idx = ldbase_sample_ids[0]; + uint32_t next_difflist_sample_idx = difflist_sample_ids[0]; + uint32_t ldbase_idx = 0; + uint32_t difflist_idx = 0; + uint32_t diff_written_ct = 0; + while (diff_written_ct < ld_diff_ct) { + uintptr_t cur_geno; + uint32_t new_sample_idx; + if (next_ldbase_sample_idx <= next_difflist_sample_idx) { + ldbase_raregeno_word >>= 2; + if (!(ldbase_idx % kBitsPerWordD2)) { + ldbase_raregeno_word = ldbase_raregeno[ldbase_idx / kBitsPerWordD2]; + } + ++ldbase_idx; + } + if (next_difflist_sample_idx <= next_ldbase_sample_idx) { + difflist_raregeno_word >>= 2; + if (!(difflist_idx % kBitsPerWordD2)) { + difflist_raregeno_word = difflist_raregeno[difflist_idx / kBitsPerWordD2]; + } + new_sample_idx = next_difflist_sample_idx; + ++difflist_idx; + cur_geno = difflist_raregeno_word & 3; + next_difflist_sample_idx = difflist_sample_ids[difflist_idx]; + if (next_ldbase_sample_idx == new_sample_idx) { + next_ldbase_sample_idx = ldbase_sample_ids[ldbase_idx]; + if (cur_geno == (ldbase_raregeno_word & 3)) { + continue; + } + } + } else { + cur_geno = ldbase_common_geno; + new_sample_idx = next_ldbase_sample_idx; + next_ldbase_sample_idx = ldbase_sample_ids[ldbase_idx]; + } + raregeno_write_word |= cur_geno << (2 * (diff_written_ct % kBitsPerWordD2)); + if (!(diff_written_ct % kPglDifflistGroupSize)) { + SubU32StoreMov(new_sample_idx, sample_id_byte_ct, &group_first_sample_ids_iter); + if (diff_written_ct) { + *extra_byte_cts_iter++ = S_CAST(uintptr_t, fwrite_bufp - last_group_vint_start) - (kPglDifflistGroupSize - 1); + } + last_group_vint_start = fwrite_bufp; + } else { + fwrite_bufp = Vint32Append(new_sample_idx - last_sample_idx, fwrite_bufp); + } + last_sample_idx = new_sample_idx; + ++diff_written_ct; + if (!(diff_written_ct % kBitsPerWordD2)) { + AppendW(raregeno_write_word, &raregeno_write_biter); + raregeno_write_word = 0; + } + } + if (diff_written_ct % kBitsPerWordD2) { + SubwordStore(raregeno_write_word, 1 + (((ld_diff_ct - 1) / 4) % kBytesPerWord), raregeno_write_biter); + } + pwcp->fwrite_bufp = fwrite_bufp; + return fwrite_bufp - fwrite_bufp_start; +} + +uint32_t SaveLdInputList(PgenWriterCommon* pwcp) { + // simply "copies" ldbase_{raregeno,difflist_sample_ids,difflist_len} to the + // write buffer. + unsigned char* fwrite_bufp = pwcp->fwrite_bufp; + const uint32_t difflist_len = pwcp->ldbase_difflist_len; + if (!difflist_len) { + *fwrite_bufp = 0; + pwcp->fwrite_bufp = &(fwrite_bufp[1]); + return 1; + } + unsigned char* fwrite_bufp_start = fwrite_bufp; + fwrite_bufp = Vint32Append(difflist_len, fwrite_bufp); + const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(pwcp->sample_ct); + const uint32_t group_ct = DivUp(difflist_len, kPglDifflistGroupSize); + const uint32_t* __restrict difflist_sample_ids = pwcp->ldbase_difflist_sample_ids; + unsigned char* group_first_sample_ids_iter = fwrite_bufp; + unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]); + fwrite_bufp = memcpyua(&(extra_byte_cts_iter[group_ct - 1]), pwcp->ldbase_raregeno, NypCtToByteCt(difflist_len)); + unsigned char* last_group_vint_start = nullptr; + uint32_t last_sample_idx = 0; + for (uint32_t difflist_idx = 0; difflist_idx != difflist_len; ++difflist_idx) { + const uint32_t new_sample_idx = difflist_sample_ids[difflist_idx]; + if (!(difflist_idx % kPglDifflistGroupSize)) { + SubU32StoreMov(new_sample_idx, sample_id_byte_ct, &group_first_sample_ids_iter); + if (difflist_idx) { + *extra_byte_cts_iter++ = S_CAST(uintptr_t, fwrite_bufp - last_group_vint_start) - (kPglDifflistGroupSize - 1); + } + last_group_vint_start = fwrite_bufp; + } else { + // assert(new_sample_idx >= last_sample_idx + 1); + fwrite_bufp = Vint32Append(new_sample_idx - last_sample_idx, fwrite_bufp); + } + last_sample_idx = new_sample_idx; + } + pwcp->fwrite_bufp = fwrite_bufp; + return fwrite_bufp - fwrite_bufp_start; +} + +uint32_t PwcAppendBiallelicDifflistLimitedMain(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t vidx, uint32_t difflist_common_geno, uint32_t difflist_len, PgenWriterCommon* pwcp, unsigned char* vrtype_ptr) { + const uint32_t sample_ct = pwcp->sample_ct; + // caller's responsibility not to exceed this limit + assert(difflist_len <= 2 * (sample_ct / kPglMaxDifflistLenDivisor)); + + // trailing bits of raregeno must be zeroed out + + assert(difflist_common_geno < 4); + assert((!(difflist_len % kBitsPerWordD2)) || (!(raregeno[difflist_len / kBitsPerWordD2] >> (2 * (difflist_len % kBitsPerWordD2))))); + assert(difflist_sample_ids[difflist_len] == sample_ct); + STD_ARRAY_DECL(uint32_t, 4, genocounts); + GenoarrCountFreqsUnsafe(raregeno, difflist_len, genocounts); + assert(!genocounts[difflist_common_geno]); + genocounts[difflist_common_geno] = sample_ct - difflist_len; + uint32_t second_most_common_geno = difflist_common_geno? 0 : 1; + uint32_t second_largest_geno_ct = genocounts[second_most_common_geno]; + for (uint32_t cur_geno = second_most_common_geno + 1; cur_geno != 4; ++cur_geno) { + if (cur_geno == difflist_common_geno) { + continue; + } + const uint32_t cur_geno_ct = genocounts[cur_geno]; + if (cur_geno_ct > second_largest_geno_ct) { + second_most_common_geno = cur_geno; + second_largest_geno_ct = cur_geno_ct; + } + } + const uint32_t rare_2_geno_ct_sum = difflist_len - second_largest_geno_ct; + const uint32_t sample_ctd8 = sample_ct / 8; + const uint32_t sample_ctd64 = sample_ct / 64; + uint32_t max_difflist_len = sample_ctd8 - 2 * sample_ctd64 + rare_2_geno_ct_sum; + if (max_difflist_len > sample_ctd8) { + max_difflist_len = sample_ctd8; + } + const uint32_t difflist_viable = (difflist_common_geno != 1) && (difflist_len <= max_difflist_len); + STD_ARRAY_REF(uint32_t, 4) ldbase_genocounts = pwcp->ldbase_genocounts; + if (!(vidx % kPglVblockSize)) { + pwcp->vblock_fpos[vidx / kPglVblockSize] = pwcp->vblock_fpos_offset + S_CAST(uintptr_t, pwcp->fwrite_bufp - pwcp->fwrite_buf); + } else if (difflist_len > sample_ctd64) { + const uint32_t ld_diff_threshold = difflist_viable? (difflist_len - sample_ctd64) : max_difflist_len; + // number of changes between current genovec and LD reference is bounded + // below by sum(genocounts[x] - ldbase_genocounts[x]) / 2 + const int32_t count02_limit = 2 * ld_diff_threshold - abs_i32(genocounts[1] - ldbase_genocounts[1]) + abs_i32(genocounts[3] - ldbase_genocounts[3]); + if ((S_CAST(int32_t, abs_i32(genocounts[0] - ldbase_genocounts[0]) + abs_i32(genocounts[2] - ldbase_genocounts[2])) < count02_limit) || (S_CAST(int32_t, abs_i32(genocounts[0] - ldbase_genocounts[2]) + abs_i32(genocounts[2] - ldbase_genocounts[0])) < count02_limit)) { + uint32_t ld_diff_ct; + uint32_t ld_inv_diff_ct; + if (pwcp->ldbase_common_geno < 4) { + pwcp->ldbase_difflist_sample_ids[pwcp->ldbase_difflist_len] = sample_ct; + CountLdAndInvertedLdDiffsList(pwcp->ldbase_raregeno, pwcp->ldbase_difflist_sample_ids, raregeno, difflist_sample_ids, pwcp->ldbase_difflist_len, difflist_len, &ld_diff_ct, &ld_inv_diff_ct); + const uint32_t difflist_common_geno_inv = (6 - difflist_common_geno) & 3; + if (pwcp->ldbase_common_geno != difflist_common_geno) { + ld_diff_ct = ld_diff_threshold; + } + if (pwcp->ldbase_common_geno != difflist_common_geno_inv) { + ld_inv_diff_ct = ld_diff_threshold; + } + if ((ld_diff_ct < ld_diff_threshold) || (ld_inv_diff_ct < ld_diff_threshold)) { + const uintptr_t invert_before_compressing = (ld_inv_diff_ct < ld_diff_ct); + *vrtype_ptr = 2 + invert_before_compressing; + if (invert_before_compressing) { + GenovecInvertCopyUnsafe(raregeno, difflist_len, pwcp->genovec_invert_buf); + // difflist_common_geno = difflist_common_geno_inv; + ld_diff_ct = ld_inv_diff_ct; + } + return SaveLdTwoListDelta(invert_before_compressing? pwcp->genovec_invert_buf : raregeno, difflist_sample_ids, ld_diff_ct, pwcp); + } + } else { + uintptr_t* __restrict genobuf = pwcp->genovec_invert_buf; + PgrDifflistToGenovecUnsafe(raregeno, difflist_sample_ids, difflist_common_geno, sample_ct, difflist_len, genobuf); + ZeroTrailingNyps(sample_ct, genobuf); + CountLdAndInvertedLdDiffs(pwcp->ldbase_genovec, genobuf, sample_ct, &ld_diff_ct, &ld_inv_diff_ct); + if ((ld_diff_ct < ld_diff_threshold) || (ld_inv_diff_ct < ld_diff_threshold)) { + const uintptr_t invert_before_compressing = (ld_inv_diff_ct < ld_diff_ct); + *vrtype_ptr = 2 + invert_before_compressing; + if (invert_before_compressing) { + GenovecInvertUnsafe(sample_ct, genobuf); + ld_diff_ct = ld_inv_diff_ct; + } + return SaveLdDifflist(genobuf, pwcp->ldbase_genovec, 0, ld_diff_ct, pwcp); + } + } + } + } + STD_ARRAY_COPY(genocounts, 4, ldbase_genocounts); + if (difflist_viable) { + *vrtype_ptr = 4 + difflist_common_geno; + memcpy(pwcp->ldbase_raregeno, raregeno, NypCtToByteCt(difflist_len)); + memcpy(pwcp->ldbase_difflist_sample_ids, difflist_sample_ids, difflist_len * sizeof(int32_t)); + // memcpy(pwcp->ldbase_difflist_sample_ids, difflist_sample_ids, (difflist_len + 1) * sizeof(int32_t)); + pwcp->ldbase_common_geno = difflist_common_geno; + pwcp->ldbase_difflist_len = difflist_len; + return SaveLdInputList(pwcp); + } + pwcp->ldbase_common_geno = UINT32_MAX; + const uint32_t use_onebit = (rare_2_geno_ct_sum < sample_ct / (2 * kPglMaxDifflistLenDivisor)); + uintptr_t* genobuf = use_onebit? pwcp->genovec_invert_buf : pwcp->ldbase_genovec; + PgrDifflistToGenovecUnsafe(raregeno, difflist_sample_ids, difflist_common_geno, sample_ct, difflist_len, genobuf); + ZeroTrailingNyps(sample_ct, genobuf); + *vrtype_ptr = use_onebit; + if (use_onebit) { + uint32_t larger_common_geno = second_most_common_geno; + uint32_t smaller_common_geno = difflist_common_geno; + if (difflist_common_geno > second_most_common_geno) { + larger_common_geno = difflist_common_geno; + smaller_common_geno = second_most_common_geno; + } + const uint32_t vrec_len = SaveOnebit(genobuf, larger_common_geno + (smaller_common_geno * 3), rare_2_geno_ct_sum, pwcp); + memcpy(pwcp->ldbase_genovec, genobuf, NypCtToWordCt(sample_ct) * sizeof(uintptr_t)); + return vrec_len; + } + const uint32_t vrec_len = NypCtToByteCt(sample_ct); + pwcp->fwrite_bufp = memcpyua(pwcp->fwrite_bufp, genobuf, vrec_len); + return vrec_len; +} + +void PwcAppendBiallelicDifflistLimited(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t difflist_common_geno, uint32_t difflist_len, PgenWriterCommon* pwcp) { + const uint32_t vidx = pwcp->vidx; + unsigned char vrtype; + const uint32_t vrec_len = PwcAppendBiallelicDifflistLimitedMain(raregeno, difflist_sample_ids, vidx, difflist_common_geno, difflist_len, pwcp, &vrtype); + const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct; + pwcp->vidx += 1; + SubU32Store(vrec_len, vrec_len_byte_ct, &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct])); + if (!pwcp->phase_dosage_gflags) { + pwcp->vrtype_buf[vidx / kBitsPerWordD4] |= S_CAST(uintptr_t, vrtype) << (4 * (vidx % kBitsPerWordD4)); + } else { + DowncastToUc(pwcp->vrtype_buf)[vidx] = vrtype; + } +} + + +static inline BoolErr CheckedVrecLenIncr(uintptr_t incr, uint32_t* vrec_len_ptr) { + // maybe track vrec_left instead of vrec_len... +#ifdef __LP64__ + const uintptr_t new_vrec_len = *vrec_len_ptr + incr; + if (unlikely(new_vrec_len > kPglMaxBytesPerVariant)) { + return 1; + } + *vrec_len_ptr = new_vrec_len; + return 0; +#else + *vrec_len_ptr += incr; + return 0; +#endif +} + +// ok if delta_bitarr trailing bits set +BoolErr PwcAppendDeltalist(const uintptr_t* delta_bitarr, uint32_t deltalist_len, PgenWriterCommon* pwcp, uint32_t* vrec_len_ptr) { + assert(deltalist_len); + unsigned char* fwrite_bufp = pwcp->fwrite_bufp; + unsigned char* fwrite_bufp_start = fwrite_bufp; + fwrite_bufp = Vint32Append(deltalist_len, fwrite_bufp); + const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(pwcp->sample_ct); + const uint32_t group_ct = DivUp(deltalist_len, kPglDifflistGroupSize); + unsigned char* group_first_sample_ids_iter = fwrite_bufp; + if (unlikely(CheckedVrecLenIncr(S_CAST(uintptr_t, fwrite_bufp - fwrite_bufp_start) + group_ct * sample_id_byte_ct + (group_ct - 1), vrec_len_ptr))) { + return 1; + } + unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]); + fwrite_bufp_start = &(extra_byte_cts_iter[group_ct - 1]); + fwrite_bufp = fwrite_bufp_start; +#ifdef __LP64__ + const intptr_t vrec_left = kPglMaxBytesPerVariant - (*vrec_len_ptr); +#endif + unsigned char* last_group_vint_start = nullptr; + uintptr_t last_sample_idx = 0; + uintptr_t new_sample_idx_base = 0; + uintptr_t delta_bits = delta_bitarr[0]; + for (uint32_t deltalist_idx = 0; deltalist_idx != deltalist_len; ++deltalist_idx) { + const uintptr_t new_sample_idx = BitIter1(delta_bitarr, &new_sample_idx_base, &delta_bits); + if (!(deltalist_idx % kPglDifflistGroupSize)) { + SubU32StoreMov(new_sample_idx, sample_id_byte_ct, &group_first_sample_ids_iter); + if (deltalist_idx) { + *extra_byte_cts_iter++ = S_CAST(uintptr_t, fwrite_bufp - last_group_vint_start) - (kPglDifflistGroupSize - 1); + } +#ifdef __LP64__ + // Note that this inequality goes in the opposite direction from the + // usual PtrCheck. + if (unlikely(S_CAST(intptr_t, fwrite_bufp - fwrite_bufp_start) > vrec_left)) { + return 1; + } +#endif + last_group_vint_start = fwrite_bufp; + } else { + assert(new_sample_idx >= last_sample_idx + 1); + fwrite_bufp = Vint32Append(new_sample_idx - last_sample_idx, fwrite_bufp); + } + last_sample_idx = new_sample_idx; + } + pwcp->fwrite_bufp = fwrite_bufp; + return CheckedVrecLenIncr(fwrite_bufp - fwrite_bufp_start, vrec_len_ptr); +} + +static_assert(sizeof(AlleleCode) == 1, "PwcAppendMultiallelicMain() needs to be updated."); +BoolErr PwcAppendMultiallelicMain(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t allele_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, uint32_t vidx, PgenWriterCommon* pwcp, const uintptr_t** genovec_hets_ptr, uint32_t* het_ct_ptr, unsigned char* vrtype_ptr, uint32_t* vrec_len_ptr) { + uint32_t genovec_het_ct; + uint32_t genovec_altxy_ct; + uint32_t vrec_len = PwcAppendBiallelicGenovecMain(genovec, vidx, pwcp, &genovec_het_ct, &genovec_altxy_ct, vrtype_ptr); + if ((!patch_01_ct) && (!patch_10_ct)) { + if (genovec_hets_ptr) { + *genovec_hets_ptr = genovec; + *het_ct_ptr = genovec_het_ct; + } + *vrec_len_ptr = vrec_len; + return 0; + } + assert(allele_ct > 2); + unsigned char* format_byte_ptr = pwcp->fwrite_bufp; + ++vrec_len; + pwcp->fwrite_bufp += 1; + unsigned char format_byte = 0; + if (!patch_01_ct) { + format_byte = 15; + } else { + // We could use a different bitarray vs. deltalist threshold, since the + // usual threshold was chosen under the assumption that + // (maximum allowed deltalist index + 1) == bitarray length. + // However, the subsetting-speed advantage of the deltalist representation + // can be expected to make up for its larger size, so we may actually want + // to increase instead of decrease the current 1/9 fraction. For now, I'll + // punt on this decision. + const uint32_t max_deltalist_entry_ct = genovec_het_ct / kPglMaxDeltalistLenDivisor; + if (patch_01_ct < max_deltalist_entry_ct) { + // impossible for this to run out of space + PwcAppendDeltalist(patch_01_set, patch_01_ct, pwcp, &vrec_len); + format_byte = 1; + } else { + const uint32_t genovec_het_ctb = DivUp(genovec_het_ct, CHAR_BIT); + CopyGenomatchSubset(patch_01_set, genovec, kMask5555, 0, genovec_het_ct, pwcp->fwrite_bufp); + pwcp->fwrite_bufp = &(pwcp->fwrite_bufp[genovec_het_ctb]); + vrec_len += genovec_het_ctb; + } + if (allele_ct > 3) { + if (allele_ct <= 18) { + const AlleleCode* patch_01_vals_iter = patch_01_vals; + + // todo: revisit this logic, we may not always want to write a word at + // a time + unsigned char* write_fixed_width_start = pwcp->fwrite_bufp; + uint32_t written_byte_ct; + if (allele_ct == 4) { + // 1 bit per entry, clear = ref/alt2, set = ref/alt3 +#ifdef USE_SSE2 + const uint32_t fullvec_ct = patch_01_ct / kBytesPerVec; + if (fullvec_ct) { + // parallel-equality-to-3 check, movemask + // (+125 or <<7 followed by movemask also works) + // todo: better ARM implementation + for (uint32_t vec_idx = 0; vec_idx != fullvec_ct; ++vec_idx) { + VecUc cur_vec = vecuc_loadu(patch_01_vals_iter); + patch_01_vals_iter = &(patch_01_vals_iter[kBytesPerVec]); + const Vec8thUint v8ui = vecuc_movemask(VecWToUc(vecw_slli(VecUcToW(cur_vec), 7))); + CopyToUnalignedOffsetV8(write_fixed_width_start, &v8ui, vec_idx); + } + } + const uint32_t remainder = patch_01_ct % kBytesPerVec; + if (remainder) { + const uintptr_t* cur_read_iter = R_CAST(const uintptr_t*, patch_01_vals_iter); + unsigned char* write_uc = &(write_fixed_width_start[fullvec_ct * sizeof(Vec8thUint)]); + const uint32_t word_ct_m1 = (remainder - 1) / kBytesPerWord; + for (uint32_t widx = 0; ; ++widx) { + uintptr_t cur_word; + if (widx >= word_ct_m1) { + if (widx > word_ct_m1) { + break; + } + cur_word = SubwordLoad(cur_read_iter, ModNz(remainder, kBytesPerWord)); + } else { + cur_word = *cur_read_iter++; + } +# ifdef USE_AVX2 + cur_word = _pext_u64(cur_word, kMask0101); +# else + // extract the low bit from each byte + cur_word = cur_word & kMask0101; + cur_word = (cur_word * 0x102040810204080LLU) >> 56; +# endif + write_uc[widx] = cur_word; + } + } +#else // !USE_SSE2 + const uint32_t word_ct_m1 = (patch_01_ct - 1) / kBitsPerWord; + uint32_t loop_len = kBitsPerWord; + for (uint32_t widx = 0; ; ++widx) { + if (widx >= word_ct_m1) { + if (widx > word_ct_m1) { + break; + } + loop_len = ModNz(patch_01_ct, kBitsPerWord); + } + uintptr_t cur_word = 0; + for (uint32_t uii = loop_len - 1; ; --uii) { + cur_word |= patch_01_vals_iter[uii] & 1; + if (!uii) { + break; + } + cur_word <<= 1; + } + CopyToUnalignedOffsetW(write_fixed_width_start, &cur_word, widx); + patch_01_vals_iter = &(patch_01_vals_iter[loop_len]); + } +#endif // !USE_SSE2 + written_byte_ct = DivUp(patch_01_ct, 8); + } else if (allele_ct <= 6) { + // 2 bits per entry + const uint32_t word_ct_m1 = (patch_01_ct - 1) / kBitsPerWordD2; + uint32_t loop_len = kBitsPerWordD2; + for (uint32_t widx = 0; ; ++widx) { + if (widx >= word_ct_m1) { + if (widx > word_ct_m1) { + break; + } + loop_len = ModNz(patch_01_ct, kBitsPerWordD2); + } + uintptr_t cur_word = 0; + for (uint32_t uii = loop_len - 1; ; --uii) { + // todo: + // 1. parallel-subtract (now all bytes are in 0..3) + // 2. bitwise or with right-shift-6 + // 3. bitwise or with right-shift-12 + // 4. _mm{256}_shuffle_epi8() gather from bytes 0, 4, 8, ... + // SSE4.2: + // 5. store result of _mm_extract_epi32(, 0) + // AVX2: + // 5. store results of _mm256_extract_epi32(, 0) and (, 4) + cur_word |= patch_01_vals_iter[uii] - 2; + if (!uii) { + break; + } + cur_word <<= 2; + } + CopyToUnalignedOffsetW(write_fixed_width_start, &cur_word, widx); + patch_01_vals_iter = &(patch_01_vals_iter[loop_len]); + } + written_byte_ct = DivUp(patch_01_ct, 4); + } else { + // 4 bits per entry + const uint32_t word_ct_m1 = (patch_01_ct - 1) / kBitsPerWordD4; + uint32_t loop_len = kBitsPerWordD4; + for (uint32_t widx = 0; ; ++widx) { + if (widx >= word_ct_m1) { + if (widx > word_ct_m1) { + break; + } + loop_len = ModNz(patch_01_ct, kBitsPerWordD4); + } + uintptr_t cur_word = 0; + for (uint32_t uii = loop_len - 1; ; --uii) { + // todo: replace this with parallel-subtract, followed by bitwise + // or with right-shift-4, followed by _mm{256}_shuffle_epi8() + // gather from even bytes, followed by _mm{256}_extract_epi64() + // (or _mm256_permute4x64_epi64() followed by + // _mm256_extracti128_si256()? test this.) + // Or in non-SSE4.2 case, use end of PackWordToHalfword logic. + cur_word |= patch_01_vals_iter[uii] - 2; + if (!uii) { + break; + } + cur_word <<= 4; + } + CopyToUnalignedOffsetW(write_fixed_width_start, &cur_word, widx); + patch_01_vals_iter = &(patch_01_vals_iter[loop_len]); + } + written_byte_ct = DivUp(patch_01_ct, 2); + } + pwcp->fwrite_bufp = &(pwcp->fwrite_bufp[written_byte_ct]); + vrec_len += written_byte_ct; + } else { + // 1 byte per entry + // need 2-byte and 3-byte cases here for larger AlleleCode, those also + // need to check for vrec_len overflow + unsigned char* payload = pwcp->fwrite_bufp; + for (uint32_t patch_idx = 0; patch_idx != patch_01_ct; ++patch_idx) { + // todo: check if the compiler vectorizes this + payload[patch_idx] = patch_01_vals[patch_idx] - 2; + } + pwcp->fwrite_bufp = &(pwcp->fwrite_bufp[patch_01_ct]); + vrec_len += patch_01_ct; + } + } + } + if (!patch_10_ct) { + format_byte |= 240; + if (genovec_hets_ptr) { + *genovec_hets_ptr = genovec; + *het_ct_ptr = genovec_het_ct; + } + } else { + const uint32_t max_deltalist_entry_ct = genovec_altxy_ct / kPglMaxDeltalistLenDivisor; + if (patch_10_ct < max_deltalist_entry_ct) { + // can't actually fail for now, but cannot ignore overflow check if + // sizeof(AlleleCode) > 1 + if (unlikely(PwcAppendDeltalist(patch_10_set, patch_10_ct, pwcp, &vrec_len))) { + return 1; + } + format_byte |= 16; + } else { + const uintptr_t genovec_altxy_ctb = DivUp(genovec_altxy_ct, CHAR_BIT); + if (unlikely(CheckedVrecLenIncr(genovec_altxy_ctb, &vrec_len))) { + return 1; + } + CopyGenomatchSubset(patch_10_set, genovec, kMaskAAAA, 0, genovec_altxy_ct, pwcp->fwrite_bufp); + pwcp->fwrite_bufp = &(pwcp->fwrite_bufp[genovec_altxy_ctb]); + } + if (allele_ct <= 17) { + const AlleleCode* patch_10_vals_iter = patch_10_vals; + unsigned char* write_fixed_width_start = pwcp->fwrite_bufp; + uint32_t bytes_to_write; + if (allele_ct == 3) { + // 1 bit per entry, clear = alt1/alt2, set = alt2/alt2 + bytes_to_write = DivUp(patch_10_ct, 8); + if (unlikely(CheckedVrecLenIncr(bytes_to_write, &vrec_len))) { + return 1; + } +#ifdef USE_SSE2 + const uint32_t fullvec_ct = patch_10_ct / (kBytesPerVec / 2); + if (fullvec_ct) { +# if defined(USE_SHUFFLE8) && !defined(USE_AVX2) + // SSE4.2: _mm_shuffle_epi8() to gather even bytes, parallel + // equality-to-2 check, movemask + // (+126 or <<6 followed by movemask also works) + // todo: better ARM implementation + const VecUc gather_even = vecuc_setr8(0, 2, 4, 6, 8, 10, 12, 14, + -1, -1, -1, -1, -1, -1, -1, -1); + for (uint32_t vec_idx = 0; vec_idx != fullvec_ct; ++vec_idx) { + const VecUc cur_vec = vecuc_loadu(patch_10_vals_iter); + patch_10_vals_iter = &(patch_10_vals_iter[kBytesPerVec]); + const VecUc even_vec = vecuc_shuffle8(cur_vec, gather_even); + write_fixed_width_start[vec_idx] = vecuc_movemask(VecWToUc(vecw_slli(VecUcToW(even_vec), 6))); + } +# else + // SSE2/AVX2: parallel equality-to-{2, 2} check, movemask, + // PackWordToHalfword + // (uint16_t +126 followed by movemask also works, and avoids an + // extra masking step in SSE2 case) + // (<<6 doesn't work here) + const VecU16 all126 = vecu16_set1(126); + for (uint32_t vec_idx = 0; vec_idx != fullvec_ct; ++vec_idx) { + VecU16 cur_vec = vecu16_loadu(patch_10_vals_iter); + patch_10_vals_iter = &(patch_10_vals_iter[kBytesPerVec]); + const uint32_t cur_u32 = vecu16_movemask(cur_vec + all126); + const Vec16thUint v16ui = PackVec8thUintTo16th(cur_u32); + CopyToUnalignedOffsetV16(write_fixed_width_start, &v16ui, vec_idx); + } +# endif + } + const uint32_t remainder = patch_10_ct % (kBytesPerVec / 2); + if (remainder) { + uint32_t cur_u32 = 0; + for (uint32_t uii = remainder - 1; ; --uii) { + cur_u32 += patch_10_vals_iter[2 * uii]; + if (!uii) { + break; + } + cur_u32 <<= 1; + } + // this effectively subtracts 1 from each entry + const Vec16thUint v16ui = cur_u32 + 1 - (1U << remainder); + CopyToUnalignedOffsetV16(write_fixed_width_start, &v16ui, fullvec_ct); + patch_10_vals_iter = &(patch_10_vals_iter[remainder * 2]); + } +#else // !USE_SSE2 + const uint32_t word_ct_m1 = (patch_10_ct - 1) / kBitsPerWord; + uint32_t loop_len = kBitsPerWord; + for (uint32_t widx = 0; ; ++widx) { + if (widx >= word_ct_m1) { + if (widx > word_ct_m1) { + break; + } + loop_len = ModNz(patch_10_ct, kBitsPerWord); + } + uintptr_t cur_word = 0; + for (uint32_t uii = loop_len - 1; ; --uii) { + cur_word |= patch_10_vals_iter[2 * uii] - 1; + if (!uii) { + break; + } + cur_word <<= 1; + } + CopyToUnalignedOffsetW(write_fixed_width_start, &cur_word, widx); + patch_10_vals_iter = &(patch_10_vals_iter[loop_len * 2]); + } +#endif // !USE_SSE2 + } else if (allele_ct <= 5) { + // 2 bits per half-entry + bytes_to_write = DivUp(patch_10_ct, 2); + if (unlikely(CheckedVrecLenIncr(bytes_to_write, &vrec_len))) { + return 1; + } + // this may be worse than simple loop, check this later + const uint32_t word_ct_m1 = (patch_10_ct - 1) / kBitsPerWordD4; + uint32_t loop_len = kBitsPerWordD2; + for (uint32_t widx = 0; ; ++widx) { + if (widx >= word_ct_m1) { + if (widx > word_ct_m1) { + break; + } + loop_len = 2 * ModNz(patch_10_ct, kBitsPerWordD4); + } + uintptr_t cur_word = 0; + for (uint32_t uii = loop_len - 1; ; --uii) { + // todo: same as 2-bit patch_01 case, except starting with + // parallel-subtract 1 instead of 2 + cur_word |= patch_10_vals_iter[uii] - 1; + if (!uii) { + break; + } + cur_word <<= 2; + } + CopyToUnalignedOffsetW(write_fixed_width_start, &cur_word, widx); + patch_10_vals_iter = &(patch_10_vals_iter[loop_len]); + } + } else { + // 4 bits per half-entry + bytes_to_write = patch_10_ct; + if (unlikely(CheckedVrecLenIncr(bytes_to_write, &vrec_len))) { + return 1; + } + const uint32_t word_ct_m1 = (patch_10_ct - 1) / kBytesPerWord; + uint32_t loop_len = kBitsPerWordD4; + for (uint32_t widx = 0; ; ++widx) { + if (widx >= word_ct_m1) { + if (widx > word_ct_m1) { + break; + } + loop_len = 2 * ModNz(patch_10_ct, kBytesPerWord); + } + uintptr_t cur_word = 0; + for (uint32_t uii = loop_len - 1; ; --uii) { + cur_word |= patch_10_vals_iter[uii] - 1; + if (!uii) { + break; + } + cur_word <<= 4; + } + CopyToUnalignedOffsetW(write_fixed_width_start, &cur_word, widx); + patch_10_vals_iter = &(patch_10_vals_iter[loop_len]); + } + } + pwcp->fwrite_bufp = &(pwcp->fwrite_bufp[bytes_to_write]); + } else { + // 1 byte per half-entry + // need 2- and 3-byte cases for larger AlleleCode + unsigned char* payload = pwcp->fwrite_bufp; + if (unlikely(CheckedVrecLenIncr(patch_10_ct * (2 * k1LU), &vrec_len))) { + return 1; + } + const uint32_t patch_10_ct_x2 = patch_10_ct * 2; + // hopefully the compiler automatically vectorizes this when + // sizeof(AlleleCode) == 1 + for (uint32_t uii = 0; uii != patch_10_ct_x2; ++uii) { + payload[uii] = patch_10_vals[uii] - 1; + } + pwcp->fwrite_bufp = &(pwcp->fwrite_bufp[patch_10_ct * 2]); + } + if (genovec_hets_ptr) { + // This is somewhat redundant with the earlier steps, but let's get + // everything working before doing more optimization. + const uint32_t sample_ct = pwcp->sample_ct; + const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct); + const Halfword* patch_10_set_alias = DowncastKWToHW(patch_10_set); + const AlleleCode* patch_10_vals_iter = patch_10_vals; + const AlleleCode* patch_10_vals_end = &(patch_10_vals[patch_10_ct * 2]); + uintptr_t* genovec_hets = nullptr; + uint32_t het_ct = genovec_het_ct; + // todo: try inverting this loop, at least in 64-bit case. perform + // movemask-based scan for heterozygous patch_10_vals entries, then + // advance widx/detect_10 as needed. + for (uint32_t widx = 0; widx != sample_ctl2; ++widx) { + uint32_t patch_10_hw = patch_10_set_alias[widx]; + if (patch_10_hw) { + do { + const AlleleCode val1 = *patch_10_vals_iter++; + const AlleleCode val2 = *patch_10_vals_iter++; + const uintptr_t lowbit = patch_10_hw & (-patch_10_hw); + if (val1 != val2) { + if (!genovec_hets) { + genovec_hets = pwcp->genovec_hets_buf; + memcpy(genovec_hets, genovec, sample_ctl2 * sizeof(intptr_t)); + } + // clear high bit, set low bit + genovec_hets[widx] ^= lowbit * lowbit * 3; + ++het_ct; + } + patch_10_hw ^= lowbit; + } while (patch_10_hw); + if (patch_10_vals_iter == patch_10_vals_end) { + break; + } + } + } + *het_ct_ptr = het_ct; + if (genovec_hets) { + *genovec_hets_ptr = genovec_hets; + } else { + *genovec_hets_ptr = genovec; + } + } + } + *format_byte_ptr = format_byte; + *vrtype_ptr |= 8; + *vrec_len_ptr = vrec_len; + return 0; +} + +BoolErr PwcAppendMultiallelicSparse(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t allele_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, PgenWriterCommon* pwcp) { + const uint32_t vidx = pwcp->vidx; + unsigned char vrtype; + uint32_t vrec_len; + if (unlikely(PwcAppendMultiallelicMain(genovec, patch_01_set, patch_01_vals, patch_10_set, patch_10_vals, allele_ct, patch_01_ct, patch_10_ct, vidx, pwcp, nullptr, nullptr, &vrtype, &vrec_len))) { + return 1; + } + pwcp->vidx += 1; + const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct; + SubU32Store(vrec_len, vrec_len_byte_ct, &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct])); + if (!pwcp->phase_dosage_gflags) { + pwcp->vrtype_buf[vidx / kBitsPerWordD4] |= S_CAST(uintptr_t, vrtype) << (4 * (vidx % kBitsPerWordD4)); + } else { + DowncastToUc(pwcp->vrtype_buf)[vidx] = vrtype; + } + return 0; +} + +void PglMultiallelicDenseToSparse(const AlleleCode* __restrict wide_codes, uint32_t sample_ct, uintptr_t* __restrict genoarr, uintptr_t* __restrict patch_01_set, AlleleCode* __restrict patch_01_vals, uintptr_t* __restrict patch_10_set, AlleleCode* __restrict patch_10_vals, uint32_t* __restrict patch_01_ct_ptr, uint32_t* __restrict patch_10_ct_ptr) { + const uint32_t word_ct_m1 = (sample_ct - 1) / kBitsPerWordD2; + const AlleleCode* wide_codes_iter = wide_codes; + Halfword* patch_01_set_alias = DowncastWToHW(patch_01_set); + Halfword* patch_10_set_alias = DowncastWToHW(patch_10_set); + AlleleCode* patch_01_vals_iter = patch_01_vals; + AlleleCode* patch_10_vals_iter = patch_10_vals; + uint32_t loop_len = kBitsPerWordD2; + for (uint32_t widx = 0; ; ++widx) { + if (widx >= word_ct_m1) { + if (widx > word_ct_m1) { + *patch_01_ct_ptr = patch_01_vals_iter - patch_01_vals; + *patch_10_ct_ptr = S_CAST(uintptr_t, patch_10_vals_iter - patch_10_vals) / 2; + return; + } + loop_len = ModNz(sample_ct, kBitsPerWordD2); + } + uintptr_t geno_word = 0; + uint32_t patch_01_hw = 0; + uint32_t patch_10_hw = 0; + // ascending loop may be worse than descending here + for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits != loop_len; ++sample_idx_lowbits) { + // todo: try making this movemask-based in 64-bit case + // parallel checks for equality-to-255, equality-to-0, inequality-to-1 + // rarealts are inequality-to-1 and (not 0 or 255); can use ctzu32() and + // &= x - 1 to iterate + // hom-ref iff double-equality to 0 + // het-ref iff low equality-to-0 bit set, high clear + // missing iff equality-to-255 (better be double) + // altx-alty otherwise + const AlleleCode first_code = *wide_codes_iter++; + const AlleleCode second_code = *wide_codes_iter++; + uintptr_t cur_geno; + if (first_code) { + if (first_code == kMissingAlleleCode) { + cur_geno = 3; + } else { + cur_geno = 2; + if (second_code > 1) { + patch_10_hw |= 1U << sample_idx_lowbits; + *patch_10_vals_iter++ = first_code; + *patch_10_vals_iter++ = second_code; + } + } + } else if (!second_code) { + continue; + } else { + cur_geno = 1; + if (second_code > 1) { + patch_01_hw |= 1U << sample_idx_lowbits; + *patch_01_vals_iter++ = second_code; + } + } + geno_word |= cur_geno << (2 * sample_idx_lowbits); + } + genoarr[widx] = geno_word; + patch_01_set_alias[widx] = patch_01_hw; + patch_10_set_alias[widx] = patch_10_hw; + } +} + +static_assert(sizeof(AlleleCode) == 1, "PglMultiallelicSparseToDense() needs to be updated."); +void PglMultiallelicSparseToDense(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, const AlleleCode* __restrict remap, uint32_t sample_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, uintptr_t* __restrict flipped, AlleleCode* __restrict wide_codes) { + if (flipped) { + ZeroWArr(BitCtToWordCt(sample_ct), flipped); + } + if ((!remap) || ((!remap[0]) && (remap[1] == 1))) { + GenoarrLookup256x2bx4(genoarr, kHcToAlleleCodes, sample_ct, wide_codes); + if (!remap) { + if (patch_01_ct) { + uintptr_t sample_idx_base = 0; + uintptr_t cur_bits = patch_01_set[0]; + AlleleCode* wide_codes1 = &(wide_codes[1]); + for (uint32_t uii = 0; uii != patch_01_ct; ++uii) { + const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits); + wide_codes1[2 * sample_idx] = patch_01_vals[uii]; + } + } + if (patch_10_ct) { + uintptr_t sample_idx_base = 0; + uintptr_t cur_bits = patch_10_set[0]; + const DoubleAlleleCode* patch_10_vals_alias = R_CAST(const DoubleAlleleCode*, patch_10_vals); + DoubleAlleleCode* wide_codes_alias = R_CAST(DoubleAlleleCode*, wide_codes); + for (uint32_t uii = 0; uii != patch_10_ct; ++uii) { + const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_idx_base, &cur_bits); + wide_codes_alias[sample_idx] = patch_10_vals_alias[uii]; + } + } + return; + } + } else { + // 0 -> (remap[0], remap[0]) + // 1 -> (remap[0], remap[1]) + // 2 -> (remap[1], remap[1]) + // 3 -> (255, 255) + // 256x4 lookup table usually too expensive to reinitialize for every + // single variant. 16x2 should be a good compromise? + // todo: try Expand1bitTo8 strategy in SSE4.2+ case, see if that's fast + // enough to remove 0/1 special case + const uint32_t remap0 = remap[0]; + const uint32_t remap1 = remap[1]; + uint32_t table4[4]; + table4[0] = remap0 * 257; + table4[1] = remap0 + remap1 * 256; + table4[2] = remap1 * 257; + table4[3] = 0xffff; + if (remap1 < remap0) { + table4[1] = remap1 + remap0 * 256; + if (flipped) { + const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct); + PackWordsToHalfwordsInvmatch(genoarr, kMaskAAAA, sample_ctl2, flipped); + } + } + // could have an InitLookup16x2bx2 function for this + uint32_t table16[16]; + for (uint32_t uii = 0; uii != 4; ++uii) { + const uint32_t high_bits = table4[uii] << 16; + uint32_t* table_row = &(table16[uii * 4]); + for (uint32_t ujj = 0; ujj != 4; ++ujj) { + table_row[ujj] = high_bits | table4[ujj]; + } + } + const uint32_t sample_ctd4 = sample_ct / 4; + const unsigned char* genoarr_alias = DowncastKToUc(genoarr); + uint32_t* wide_codes_alias_iter = R_CAST(uint32_t*, wide_codes); + for (uint32_t byte_idx = 0; byte_idx != sample_ctd4; ++byte_idx) { + const uint32_t geno_byte = genoarr_alias[byte_idx]; + *wide_codes_alias_iter++ = table16[geno_byte & 15]; + *wide_codes_alias_iter++ = table16[geno_byte >> 4]; + } + const uint32_t remainder = sample_ct % 4; + if (remainder) { + uint32_t geno_byte = genoarr_alias[sample_ctd4]; + if (remainder & 2) { + *wide_codes_alias_iter++ = table16[geno_byte & 15]; + geno_byte = geno_byte >> 4; + } + if (remainder & 1) { + uint16_t* last_code_pair_ptr = DowncastU32ToU16(wide_codes_alias_iter); + // guess it's easy enough to defend against dirty trailing bits for now + *last_code_pair_ptr = table4[geno_byte & 3]; + } + } + } + if (patch_01_vals) { + uintptr_t sample_idx_base = 0; + uintptr_t cur_bits = patch_01_set[0]; + const AlleleCode remap0 = remap[0]; + const AlleleCode remap1 = remap[1]; + if ((!remap0) || ((!flipped) && (remap0 == 1) && (!remap1))) { + // no flips possible + AlleleCode* wide_codes1 = &(wide_codes[1]); + for (uint32_t uii = 0; uii != patch_01_ct; ++uii) { + const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits); + wide_codes1[2 * sample_idx] = remap[patch_01_vals[uii]]; + } + } else if (!flipped) { + for (uint32_t uii = 0; uii != patch_01_ct; ++uii) { + const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits); + const AlleleCode ac = remap[patch_01_vals[uii]]; + if (ac > remap0) { + // bugfix (4 Jul 2024): cannot assume wide_codes[2 * sample_idx] == + // remap0. It's remap1 when remap1 < remap0. + wide_codes[2 * sample_idx] = remap0; + wide_codes[2 * sample_idx + 1] = ac; + } else { + wide_codes[2 * sample_idx] = ac; + wide_codes[2 * sample_idx + 1] = remap0; + } + } + } else if (remap1 >= remap0) { + for (uint32_t uii = 0; uii != patch_01_ct; ++uii) { + const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits); + const AlleleCode ac = remap[patch_01_vals[uii]]; + if (ac > remap0) { + wide_codes[2 * sample_idx + 1] = ac; + } else { + wide_codes[2 * sample_idx] = ac; + wide_codes[2 * sample_idx + 1] = remap0; + SetBit(sample_idx, flipped); + } + } + } else { + for (uint32_t uii = 0; uii != patch_01_ct; ++uii) { + const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits); + const AlleleCode ac = remap[patch_01_vals[uii]]; + if (ac > remap0) { + wide_codes[2 * sample_idx] = remap0; + wide_codes[2 * sample_idx + 1] = ac; + ClearBit(sample_idx, flipped); + } else { + wide_codes[2 * sample_idx] = ac; + wide_codes[2 * sample_idx + 1] = remap0; + } + } + } + } + if (patch_10_ct) { + uintptr_t sample_idx_base = 0; + uintptr_t cur_bits = patch_10_set[0]; + if (!flipped) { + for (uint32_t uii = 0; uii != patch_10_ct; ++uii) { + const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_idx_base, &cur_bits); + const AlleleCode ac0 = remap[patch_10_vals[2 * uii]]; + const AlleleCode ac1 = remap[patch_10_vals[2 * uii + 1]]; + if (ac0 <= ac1) { + wide_codes[2 * sample_idx] = ac0; + wide_codes[2 * sample_idx + 1] = ac1; + } else { + wide_codes[2 * sample_idx] = ac1; + wide_codes[2 * sample_idx + 1] = ac0; + } + } + } else { + for (uint32_t uii = 0; uii != patch_10_ct; ++uii) { + const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_idx_base, &cur_bits); + const AlleleCode ac0 = remap[patch_10_vals[2 * uii]]; + const AlleleCode ac1 = remap[patch_10_vals[2 * uii + 1]]; + if (ac0 <= ac1) { + wide_codes[2 * sample_idx] = ac0; + wide_codes[2 * sample_idx + 1] = ac1; + } else { + wide_codes[2 * sample_idx] = ac1; + wide_codes[2 * sample_idx + 1] = ac0; + SetBit(sample_idx, flipped); + } + } + } + } +} + +// tolerates extraneous phaseinfo bits +BoolErr AppendHphase(const uintptr_t* __restrict genoarr_hets, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, uint32_t het_ct, uint32_t phasepresent_ct, PgenWriterCommon* pwcp, unsigned char* vrtype_ptr, uint32_t* vrec_len_ptr) { + assert(phasepresent_ct); + const uint32_t sample_ct = pwcp->sample_ct; + *vrtype_ptr += 16; + const uint32_t het_ctp1_8 = 1 + (het_ct / CHAR_BIT); + unsigned char* fwrite_bufp_iter = pwcp->fwrite_bufp; + uintptr_t phaseinfo_write_word = 0; + uint32_t phaseinfo_write_idx_lowbits; + if (het_ct == phasepresent_ct) { + // no need to write phasepresent; just write phaseinfo directly to output + // buffer + if (unlikely(CheckedVrecLenIncr(het_ctp1_8, vrec_len_ptr))) { + return 1; + } + CopyGenomatchSubset(phaseinfo, genoarr_hets, kMask5555, 1, het_ct, pwcp->fwrite_bufp); + fwrite_bufp_iter = &(pwcp->fwrite_bufp[het_ctp1_8]); + } else { + // this is a minor variant of ExpandThenSubsetBytearr() + if (unlikely(CheckedVrecLenIncr(het_ctp1_8 + DivUp(phasepresent_ct, 8), vrec_len_ptr))) { + return 1; + } + const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct); + uintptr_t* phaseinfo_tmp = pwcp->genovec_invert_buf; + uintptr_t* phaseinfo_tmp_iter = phaseinfo_tmp; + uint32_t phasepresent_write_idx_lowbits = 1; + phaseinfo_write_idx_lowbits = 0; + uintptr_t phasepresent_write_word = 1; // first bit set + for (uint32_t widx = 0; widx != sample_ctl2; ++widx) { + const uintptr_t geno_word = genoarr_hets[widx]; + uintptr_t geno_hets = Word01(geno_word); + if (geno_hets) { + const uint32_t phasepresent_halfword = DowncastKWToHW(phasepresent)[widx]; + if (phasepresent_halfword) { + const uint32_t phaseinfo_halfword = DowncastKWToHW(phaseinfo)[widx]; + do { + const uint32_t sample_idx_lowbits = ctzw(geno_hets) / 2; + if ((phasepresent_halfword >> sample_idx_lowbits) & 1) { + phasepresent_write_word |= k1LU << phasepresent_write_idx_lowbits; + phaseinfo_write_word |= S_CAST(uintptr_t, (phaseinfo_halfword >> sample_idx_lowbits) & k1LU) << phaseinfo_write_idx_lowbits; + if (++phaseinfo_write_idx_lowbits == kBitsPerWord) { + *phaseinfo_tmp_iter++ = phaseinfo_write_word; + phaseinfo_write_word = 0; + phaseinfo_write_idx_lowbits = 0; + } + } + if (++phasepresent_write_idx_lowbits == kBitsPerWord) { + AppendW(phasepresent_write_word, &fwrite_bufp_iter); + phasepresent_write_word = 0; + phasepresent_write_idx_lowbits = 0; + } + geno_hets &= geno_hets - 1; + } while (geno_hets); + } else { + phasepresent_write_idx_lowbits += PopcountWord(geno_hets); + if (phasepresent_write_idx_lowbits >= kBitsPerWord) { + AppendW(phasepresent_write_word, &fwrite_bufp_iter); + phasepresent_write_word = 0; + phasepresent_write_idx_lowbits -= kBitsPerWord; + } + } + } + } + if (phasepresent_write_idx_lowbits) { + const uint32_t cur_byte_ct = DivUp(phasepresent_write_idx_lowbits, CHAR_BIT); + // er, safe to write the entire word... + SubwordStoreMov(phasepresent_write_word, cur_byte_ct, &fwrite_bufp_iter); + } + fwrite_bufp_iter = memcpyua(fwrite_bufp_iter, phaseinfo_tmp, sizeof(intptr_t) * (phaseinfo_tmp_iter - phaseinfo_tmp)); + if (phaseinfo_write_idx_lowbits) { + const uint32_t cur_byte_ct = DivUp(phaseinfo_write_idx_lowbits, CHAR_BIT); + SubwordStoreMov(phaseinfo_write_word, cur_byte_ct, &fwrite_bufp_iter); + } + assert(S_CAST(uintptr_t, fwrite_bufp_iter - pwcp->fwrite_bufp) == het_ctp1_8 + DivUp(phasepresent_ct, 8)); + } + pwcp->fwrite_bufp = fwrite_bufp_iter; + return 0; +} + +void PwcAppendBiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, PgenWriterCommon* pwcp) { + // assumes phase_dosage_gflags is nonzero + const uint32_t vidx = pwcp->vidx; + unsigned char* vrtype_dest = &(DowncastToUc(pwcp->vrtype_buf)[vidx]); + uint32_t het_ct; + uint32_t vrec_len = PwcAppendBiallelicGenovecMain(genovec, vidx, pwcp, &het_ct, nullptr, vrtype_dest); + const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct; + const uint32_t sample_ct = pwcp->sample_ct; + const uint32_t sample_ctl = BitCtToWordCt(sample_ct); + pwcp->vidx += 1; + unsigned char* vrec_len_dest = &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]); + const uint32_t phasepresent_ct = phasepresent? PopcountWords(phasepresent, sample_ctl) : het_ct; + if (phasepresent_ct) { + AppendHphase(genovec, phasepresent, phaseinfo, het_ct, phasepresent_ct, pwcp, vrtype_dest, &vrec_len); + } + SubU32Store(vrec_len, vrec_len_byte_ct, vrec_len_dest); +} + +BoolErr PwcAppendMultiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, uint32_t allele_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, PgenWriterCommon* pwcp) { + const uint32_t vidx = pwcp->vidx; + const uintptr_t* genovec_hets; + unsigned char vrtype; + uint32_t het_ct; + uint32_t vrec_len; + if (unlikely(PwcAppendMultiallelicMain(genovec, patch_01_set, patch_01_vals, patch_10_set, patch_10_vals, allele_ct, patch_01_ct, patch_10_ct, vidx, pwcp, &genovec_hets, &het_ct, &vrtype, &vrec_len))) { + return 1; + } + const uint32_t sample_ct = pwcp->sample_ct; + const uint32_t sample_ctl = BitCtToWordCt(sample_ct); + const uint32_t phasepresent_ct = phasepresent? PopcountWords(phasepresent, sample_ctl) : het_ct; + unsigned char* vrtype_dest = &(DowncastToUc(pwcp->vrtype_buf)[vidx]); + *vrtype_dest = vrtype; + if (phasepresent_ct) { + if (unlikely(AppendHphase(genovec_hets, phasepresent, phaseinfo, het_ct, phasepresent_ct, pwcp, vrtype_dest, &vrec_len))) { + return 1; + } + } + pwcp->vidx += 1; + const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct; + SubU32Store(vrec_len, vrec_len_byte_ct, &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct])); + return 0; +} + +// ok if dosage_present trailing bits set +BoolErr AppendDosage16(const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, uint32_t dphase_ct, PgenWriterCommon* pwcp, unsigned char* vrtype_ptr, uint32_t* vrec_len_ptr) { + const uint32_t sample_ct = pwcp->sample_ct; + const uint32_t max_deltalist_entry_ct = sample_ct / kPglMaxDeltalistLenDivisor; + if (dosage_ct < max_deltalist_entry_ct) { + // case 1: store dosage IDs as deltalist. + if (unlikely(PwcAppendDeltalist(dosage_present, dosage_ct, pwcp, vrec_len_ptr))) { + return 1; + } + *vrtype_ptr += 0x20; + } else if ((dosage_ct == sample_ct) && ((!dphase_ct) || (dphase_ct == sample_ct))) { + // case 2: fixed-width, no need to store dosage IDs at all. + // dosage_main permitted to have 65535 = missing + // bugfix (9 Dec 2018): since this forces fixed-width dosage-phase storage, + // also require dphase_ct == 0 or dphase_ct == sample_ct. + *vrtype_ptr += 0x40; + } else { + // case 3: save dosage_present bitarray directly. + const uint32_t sample_ctb = DivUp(sample_ct, CHAR_BIT); + if (unlikely(CheckedVrecLenIncr(sample_ctb, vrec_len_ptr))) { + return 1; + } + pwcp->fwrite_bufp = memcpyua(pwcp->fwrite_bufp, dosage_present, sample_ctb); + *vrtype_ptr += 0x60; + } + if (unlikely(CheckedVrecLenIncr(dosage_ct * sizeof(int16_t), vrec_len_ptr))) { + return 1; + } + pwcp->fwrite_bufp = memcpyua(pwcp->fwrite_bufp, dosage_main, dosage_ct * sizeof(int16_t)); + return 0; +} + +// ok if dosage_present trailing bits set +BoolErr PwcAppendBiallelicGenovecDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, PgenWriterCommon* pwcp) { + // safe to call this even when entire file has no phase/dosage info + const uint32_t vidx = pwcp->vidx; + unsigned char vrtype; + uint32_t vrec_len = PwcAppendBiallelicGenovecMain(genovec, vidx, pwcp, nullptr, nullptr, &vrtype); + const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct; + pwcp->vidx += 1; + unsigned char* vrec_len_dest = &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]); + if (dosage_ct) { + if (unlikely(AppendDosage16(dosage_present, dosage_main, dosage_ct, 0, pwcp, &vrtype, &vrec_len))) { + return 1; + } + } + SubU32Store(vrec_len, vrec_len_byte_ct, vrec_len_dest); + if (!pwcp->phase_dosage_gflags) { + pwcp->vrtype_buf[vidx / kBitsPerWordD4] |= S_CAST(uintptr_t, vrtype) << (4 * (vidx % kBitsPerWordD4)); + } else { + DowncastToUc(pwcp->vrtype_buf)[vidx] = vrtype; + } + return 0; +} + +// NOT ok if phasepresent trailing bits set, but ok for dosage_present +BoolErr PwcAppendBiallelicGenovecHphaseDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, PgenWriterCommon* pwcp) { + // assumes there is phase and/or dosage data in output file, otherwise + // vrtype_dest needs to be replaced + + // this mostly overlaps with PwcAppendBiallelicGenovecHphase(); probably + // get rid of the latter + const uint32_t vidx = pwcp->vidx; + unsigned char* vrtype_dest = &(DowncastToUc(pwcp->vrtype_buf)[vidx]); + uint32_t het_ct; + uint32_t vrec_len = PwcAppendBiallelicGenovecMain(genovec, vidx, pwcp, &het_ct, nullptr, vrtype_dest); + const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct; + const uint32_t sample_ct = pwcp->sample_ct; + const uint32_t sample_ctl = BitCtToWordCt(sample_ct); + pwcp->vidx += 1; + unsigned char* vrec_len_dest = &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]); + const uint32_t phasepresent_ct = phasepresent? PopcountWords(phasepresent, sample_ctl) : het_ct; + if (phasepresent_ct) { + AppendHphase(genovec, phasepresent, phaseinfo, het_ct, phasepresent_ct, pwcp, vrtype_dest, &vrec_len); + } + if (dosage_ct) { + if (unlikely(AppendDosage16(dosage_present, dosage_main, dosage_ct, 0, pwcp, vrtype_dest, &vrec_len))) { + return 1; + } + } + SubU32Store(vrec_len, vrec_len_byte_ct, vrec_len_dest); + return 0; +} + +BoolErr AppendDphase16(const uintptr_t* __restrict dosage_present, const uintptr_t* __restrict dphase_present, const int16_t* dphase_delta, uint32_t dosage_ct, uint32_t dphase_ct, PgenWriterCommon* pwcp, unsigned char* vrtype_ptr, uint32_t* vrec_len_ptr) { + assert(dphase_ct); + *vrtype_ptr += 0x80; + if (dphase_ct != pwcp->sample_ct) { + // bugfix (9 Dec 2018): forgot to separate fixed-width phased-dosage case + // bugfix (12 Feb 2019): double-added dphase_ct * sizeof(int16_t) + const uint32_t dphase_present_byte_ct = DivUp(dosage_ct, CHAR_BIT); + if (unlikely(CheckedVrecLenIncr(dphase_present_byte_ct, vrec_len_ptr))) { + return 1; + } + CopyBitarrSubsetToUnaligned(dphase_present, dosage_present, dosage_ct, pwcp->fwrite_bufp); + pwcp->fwrite_bufp = &(pwcp->fwrite_bufp[dphase_present_byte_ct]); + } + // bugfix (2 Jan 2019): forgot to update vrec_len here + if (unlikely(CheckedVrecLenIncr(dphase_ct * sizeof(int16_t), vrec_len_ptr))) { + return 1; + } + pwcp->fwrite_bufp = memcpyua(pwcp->fwrite_bufp, dphase_delta, dphase_ct * sizeof(int16_t)); + return 0; +} + +BoolErr PwcAppendBiallelicGenovecDphase16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uintptr_t* __restrict dphase_present, const uint16_t* dosage_main, const int16_t* dphase_delta, uint32_t dosage_ct, uint32_t dphase_ct, PgenWriterCommon* pwcp) { + // assumes there is phase and/or dosage data in output file, otherwise + // vrtype_dest needs to be replaced + const uint32_t vidx = pwcp->vidx; + unsigned char* vrtype_dest = &(DowncastToUc(pwcp->vrtype_buf)[vidx]); + uint32_t het_ct; + uint32_t vrec_len = PwcAppendBiallelicGenovecMain(genovec, vidx, pwcp, &het_ct, nullptr, vrtype_dest); + const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct; + const uint32_t sample_ct = pwcp->sample_ct; + const uint32_t sample_ctl = BitCtToWordCt(sample_ct); + pwcp->vidx += 1; + unsigned char* vrec_len_dest = &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]); + const uint32_t phasepresent_ct = phasepresent? PopcountWords(phasepresent, sample_ctl) : het_ct; + if (phasepresent_ct) { + AppendHphase(genovec, phasepresent, phaseinfo, het_ct, phasepresent_ct, pwcp, vrtype_dest, &vrec_len); + } + if (dosage_ct) { + if (unlikely(AppendDosage16(dosage_present, dosage_main, dosage_ct, dphase_ct, pwcp, vrtype_dest, &vrec_len))) { + return 1; + } + if (dphase_ct) { + if (unlikely(AppendDphase16(dosage_present, dphase_present, dphase_delta, dosage_ct, dphase_ct, pwcp, vrtype_dest, &vrec_len))) { + return 1; + } + } + } + SubU32Store(vrec_len, vrec_len_byte_ct, vrec_len_dest); + return 0; +} + +uint32_t PhaseOrDosagePresent(const uintptr_t* vrtype_buf, uint32_t variant_ct) { + // Assumes trailing bits of vrtype_buf are zeroed out. (This happens during + // vrtype_buf initialization.) + const uint32_t word_ct = DivUp(variant_ct, kBytesPerWord); + for (uint32_t widx = 0; widx != word_ct; ++widx) { + if (vrtype_buf[widx] & (kMask0101 * 0xf0)) { + return 1; + } + } + return 0; +} + +// ext_iter == nullptr ok +IntErr AppendExtVarint(const PgenExtensionLl* ext_iter, FILE* pgen) { + int32_t cur_outbyte = 0; + uint32_t flushed_byte_ct = 0; + while (ext_iter) { + const uint32_t cur_type_idx = ext_iter->type_idx; + const uint32_t cur_type_idx_div7 = cur_type_idx / 7; + const uint32_t bytes_to_flush = cur_type_idx_div7 - flushed_byte_ct; + if (bytes_to_flush) { + putc_unlocked(cur_outbyte | 128, pgen); + for (uint32_t uii = 1; uii != bytes_to_flush; ++uii) { + putc_unlocked(128, pgen); + } + cur_outbyte = 0; + flushed_byte_ct = cur_type_idx_div7; + } + cur_outbyte |= 1 << (cur_type_idx % 7); + + ext_iter = ext_iter->next; + } + return putc_checked(cur_outbyte, pgen); +} + +PglErr AppendExtSizesAndContents(const PgenExtensionLl* exts, FILE* pgen) { + const PgenExtensionLl* exts_iter = exts; + do { + const uint64_t size = exts_iter->size; + if (unlikely(size >= (1LLU << (kBitsPerWord - 1)))) { + return kPglRetImproperFunctionCall; + } + FPutVint64(size, pgen); + exts_iter = exts_iter->next; + } while (exts_iter); + exts_iter = exts; + do { + if (unlikely(fwrite_checked(exts_iter->contents, exts_iter->size, pgen))) { + return kPglRetWriteFail; + } + exts_iter = exts_iter->next; + } while (exts_iter); + return kPglRetSuccess; +} + +PglErr PwcFinish(PgenWriterCommon* pwcp, FILE** pgen_outfile_ptr, FILE** pgi_or_final_pgen_outfile_ptr, char** fname_buf_ptr) { + uint64_t footer_fpos = 0; + if (pwcp->footer_exts) { + footer_fpos = ftello(*pgen_outfile_ptr); + if (!(*fname_buf_ptr)) { + PglErr reterr = AppendExtSizesAndContents(pwcp->footer_exts, *pgen_outfile_ptr); + if (unlikely(reterr)) { + return reterr; + } + } + } + const uint32_t variant_ct = pwcp->vidx; + FILE** header_ff_ptr; + if (*pgi_or_final_pgen_outfile_ptr) { + // kPgenWriteSeparateIndex, kPgenWriteAndCopy + assert(variant_ct); + assert(pwcp->variant_ct_limit >= variant_ct); + if (pwcp->phase_dosage_gflags) { + // Check if there is at least one variant with a phase or dosage data + // track. + if (!PhaseOrDosagePresent(pwcp->vrtype_buf, variant_ct)) { + // If not, shrink the index. + Reduce8to4bitInplaceUnsafe(variant_ct, pwcp->vrtype_buf); + pwcp->phase_dosage_gflags = kfPgenGlobal0; + } + } + if (unlikely(fclose_null(pgen_outfile_ptr))) { + return kPglRetWriteFail; + } + header_ff_ptr = pgi_or_final_pgen_outfile_ptr; + if (*fname_buf_ptr) { + *pgen_outfile_ptr = fopen(*fname_buf_ptr, FOPEN_RB); + if (unlikely(!(*pgen_outfile_ptr))) { + return kPglRetOpenFail; + } + } + } else { + // kPgenWriteBackwardSeek + assert(pwcp->variant_ct_limit == variant_ct); + header_ff_ptr = pgen_outfile_ptr; + if (unlikely(fseeko(*header_ff_ptr, 3, SEEK_SET))) { + return kPglRetWriteFail; + } + } + FILE* header_ff = *header_ff_ptr; + const PgenGlobalFlags phase_dosage_gflags = pwcp->phase_dosage_gflags; + fwrite_unlocked(&variant_ct, sizeof(int32_t), 1, header_ff); + fwrite_unlocked(&(pwcp->sample_ct), sizeof(int32_t), 1, header_ff); + + const uint32_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct; + const uint32_t control_byte = (vrec_len_byte_ct - 1) + (4 * (phase_dosage_gflags != 0)) + (pwcp->nonref_flags_storage << 6); + putc_unlocked(control_byte, header_ff); + + const uint32_t vblock_ct = DivUp(variant_ct, kPglVblockSize); + if (*fname_buf_ptr) { + // Current vblock_fpos values are with no index. Determine the size of the + // index and add it to all array entries. + + // variant_ct, sample_ct, control_byte, vblock_fpos itself + uintptr_t index_size = 9 + vblock_ct * sizeof(int64_t); + // vrtypes + if (phase_dosage_gflags != 0) { + index_size += variant_ct; + } else { + index_size += DivUp(variant_ct, 2); + } + // vrec_lens + index_size += vrec_len_byte_ct * S_CAST(uintptr_t, variant_ct); + // nonref_flags + if (pwcp->explicit_nonref_flags) { + index_size += DivUp(variant_ct, CHAR_BIT); + } + for (uint32_t vblock_idx = 0; vblock_idx != vblock_ct; ++vblock_idx) { + pwcp->vblock_fpos[vblock_idx] += index_size; + } + } + fwrite_unlocked(pwcp->vblock_fpos, vblock_ct * sizeof(int64_t), 1, header_ff); + const unsigned char* vrtype_buf_iter = DowncastToUc(pwcp->vrtype_buf); + const unsigned char* vrec_len_buf_iter = pwcp->vrec_len_buf; + uint32_t vrec_iter_incr = kPglVblockSize * vrec_len_byte_ct; + uint32_t vrtype_buf_iter_incr = phase_dosage_gflags? kPglVblockSize : (kPglVblockSize / 2); + uint32_t nonref_flags_write_byte_ct = kPglVblockSize / CHAR_BIT; + const unsigned char* vrec_len_buf_last = &(vrec_len_buf_iter[S_CAST(uintptr_t, vblock_ct - 1) * vrec_iter_incr]); + uintptr_t* explicit_nonref_flags = pwcp->explicit_nonref_flags; + uintptr_t* explicit_nonref_flags_iter = explicit_nonref_flags; + for (; ; vrec_len_buf_iter = &(vrec_len_buf_iter[vrec_iter_incr])) { + if (vrec_len_buf_iter >= vrec_len_buf_last) { + if (vrec_len_buf_iter > vrec_len_buf_last) { + break; + } + const uint32_t vblock_size = ModNz(variant_ct, kPglVblockSize); + vrtype_buf_iter_incr = phase_dosage_gflags? vblock_size : DivUp(vblock_size, 2); + vrec_iter_incr = vblock_size * vrec_len_byte_ct; + nonref_flags_write_byte_ct = DivUp(vblock_size, CHAR_BIT); + } + // 4b(i): array of 4-bit or 1-byte vrtypes + fwrite_unlocked(vrtype_buf_iter, vrtype_buf_iter_incr, 1, header_ff); + vrtype_buf_iter = &(vrtype_buf_iter[vrtype_buf_iter_incr]); + + // 4b(ii): array of variant record lengths + if (unlikely(fwrite_checked(vrec_len_buf_iter, vrec_iter_incr, header_ff))) { + return kPglRetWriteFail; + } + + // If we were writing allele_idx_offsets, that would happen here. + + // 4b(iii): explicit nonref flags + if (explicit_nonref_flags) { + if (unlikely(fwrite_checked(explicit_nonref_flags_iter, nonref_flags_write_byte_ct, header_ff))) { + return kPglRetWriteFail; + } + explicit_nonref_flags_iter = &(explicit_nonref_flags_iter[kPglVblockSize / kBitsPerWord]); + } + } + if (pwcp->header_exts || pwcp->footer_exts) { + if (unlikely(AppendExtVarint(pwcp->header_exts, header_ff) || + AppendExtVarint(pwcp->footer_exts, header_ff))) { + return kPglRetWriteFail; + } + if (pwcp->footer_exts) { + if (*fname_buf_ptr) { + // In kPgenWriteAndCopy mode, initial footer_fpos value is relative to + // the start of the .pgen body. We now need to add the header size. + footer_fpos += ftello(header_ff) + sizeof(int64_t); + const PgenExtensionLl* header_exts_iter = pwcp->header_exts; + while (header_exts_iter) { + const uintptr_t cur_size = header_exts_iter->size; + footer_fpos += cur_size + VintBytect(cur_size); + header_exts_iter = header_exts_iter->next; + } + } + if (unlikely(!fwrite_unlocked(&footer_fpos, sizeof(int64_t), 1, header_ff))) { + return kPglRetWriteFail; + } + } + if (pwcp->header_exts) { + PglErr reterr = AppendExtSizesAndContents(pwcp->header_exts, header_ff); + if (unlikely(reterr)) { + return reterr; + } + } + } + if (!(*fname_buf_ptr)) { + return fclose_null(header_ff_ptr)? kPglRetWriteFail : kPglRetSuccess; + } + // Append temporary-.pgen body (excluding the 3 bytes containing the leading + // magic number) to final output file. + unsigned char copybuf[kPglFwriteBlockSize + 3]; + uintptr_t nbyte = fread(copybuf, 1, kPglFwriteBlockSize + 3, *pgen_outfile_ptr); + if (unlikely((nbyte <= 3) || (!memequal_k(copybuf, "l\x1b", 2)))) { + return kPglRetRewindFail; + } + nbyte -= 3; + if (unlikely(!fwrite_unlocked(&(copybuf[3]), nbyte, 1, header_ff))) { + return kPglRetWriteFail; + } + while (1) { + nbyte = fread(copybuf, 1, kPglFwriteBlockSize, *pgen_outfile_ptr); + if (!nbyte) { + break; + } + if (unlikely(!fwrite_unlocked(copybuf, nbyte, 1, header_ff))) { + return kPglRetWriteFail; + } + } + if (pwcp->footer_exts) { + PglErr reterr = AppendExtSizesAndContents(pwcp->footer_exts, header_ff); + if (unlikely(reterr)) { + return kPglRetWriteFail; + } + } + if (unlikely(fclose_null(pgen_outfile_ptr) || + unlink(*fname_buf_ptr) || + fclose_null(header_ff_ptr))) { + return kPglRetWriteFail; + } + free(*fname_buf_ptr); + *fname_buf_ptr = nullptr; + return kPglRetSuccess; +} + +PglErr SpgwFinish(STPgenWriter* spgwp) { + PgenWriterCommon* pwcp = GetPwcp(spgwp); + FILE** pgen_outfilep = GetPgenOutfilep(spgwp); + if (unlikely(fwrite_checked(pwcp->fwrite_buf, pwcp->fwrite_bufp - pwcp->fwrite_buf, *pgen_outfilep))) { + return kPglRetWriteFail; + } + FILE** pgi_or_final_pgen_outfilep = GetPgiOrFinalPgenOutfilep(spgwp); + char** fname_bufp = GetFnameBufp(spgwp); + return PwcFinish(pwcp, pgen_outfilep, pgi_or_final_pgen_outfilep, fname_bufp); +} + +PglErr MpgwFlush(MTPgenWriter* mpgwp) { + PgenWriterCommon* pwcp = mpgwp->pwcs[0]; + uint32_t vidx = RoundDownPow2(pwcp->vidx - 1, kPglVblockSize); + uint32_t thread_ct = mpgwp->thread_ct; + const uint32_t variant_ct = pwcp->variant_ct_limit; + const uint32_t is_last_flush = ((vidx + thread_ct * kPglVblockSize) >= variant_ct); + if (is_last_flush) { + thread_ct = DivUp(variant_ct - vidx, kPglVblockSize); + } + uint64_t* vblock_fpos = pwcp->vblock_fpos; + FILE* pgen_outfile = mpgwp->pgen_outfile; + const uint32_t vidx_incr = (thread_ct - 1) * kPglVblockSize; + uint64_t cur_vblock_fpos = ftello(pgen_outfile); + for (uint32_t tidx = 0; tidx != thread_ct; ++tidx) { + vblock_fpos[(vidx / kPglVblockSize) + tidx] = cur_vblock_fpos; + PgenWriterCommon* cur_pwcp = mpgwp->pwcs[tidx]; + uintptr_t cur_vblock_byte_ct = cur_pwcp->fwrite_bufp - cur_pwcp->fwrite_buf; + if (unlikely(fwrite_checked(cur_pwcp->fwrite_buf, cur_vblock_byte_ct, pgen_outfile))) { + return kPglRetWriteFail; + } + cur_pwcp->vidx += vidx_incr; + cur_pwcp->fwrite_bufp = cur_pwcp->fwrite_buf; + cur_vblock_fpos += cur_vblock_byte_ct; + } + if (!is_last_flush) { + return kPglRetSuccess; + } + pwcp->vidx = variant_ct; + return PwcFinish(pwcp, &(mpgwp->pgen_outfile), &(mpgwp->pgi_or_final_pgen_outfile), &(mpgwp->fname_buf)); +} + +BoolErr CleanupSpgw(STPgenWriter* spgwp, PglErr* reterrp) { + // assume file is open if spgw.pgen_outfile is not null + // memory is the responsibility of the caller for now + FILE** pgen_outfilep = GetPgenOutfilep(spgwp); + FILE** pgi_or_final_pgen_outfilep = GetPgiOrFinalPgenOutfilep(spgwp); + char** fname_bufp = GetFnameBufp(spgwp); + BoolErr fclose_err = 0; + if (*pgi_or_final_pgen_outfilep) { + fclose_err = fclose_null(pgi_or_final_pgen_outfilep); + } + if (*pgen_outfilep) { + if (fclose_null(pgen_outfilep)) { + fclose_err = 1; + } + } + if (*fname_bufp) { + free(*fname_bufp); + *fname_bufp = nullptr; + } + if (unlikely(fclose_err && (*reterrp == kPglRetSuccess))) { + *reterrp = kPglRetWriteFail; + } + return fclose_err; +} + +BoolErr CleanupMpgw(MTPgenWriter* mpgwp, PglErr* reterrp) { + if (!mpgwp) { + return 0; + } + FILE** pgen_outfilep = &(mpgwp->pgen_outfile); + FILE** pgi_or_final_pgen_outfilep = &(mpgwp->pgi_or_final_pgen_outfile); + char** fname_bufp = &(mpgwp->fname_buf); + BoolErr fclose_err = 0; + if (*pgi_or_final_pgen_outfilep) { + fclose_err = fclose_null(pgi_or_final_pgen_outfilep); + } + if (*pgen_outfilep) { + if (fclose_null(pgen_outfilep)) { + fclose_err = 1; + } + } + if (*fname_bufp) { + free(*fname_bufp); + *fname_bufp = nullptr; + } + if (unlikely(fclose_err && (*reterrp == kPglRetSuccess))) { + *reterrp = kPglRetWriteFail; + } + return fclose_err; +} + +#ifdef __cplusplus +} // namespace plink2 +#endif diff --git a/external_libs/pgenlib/include/pgenlib_write.h b/external_libs/pgenlib/include/pgenlib_write.h new file mode 100644 index 00000000..801b85b5 --- /dev/null +++ b/external_libs/pgenlib/include/pgenlib_write.h @@ -0,0 +1,393 @@ +#ifndef __PGENLIB_WRITE_H__ +#define __PGENLIB_WRITE_H__ + +// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell, +// Christopher Chang. +// +// This library is free software: you can redistribute it and/or modify it +// under the terms of the GNU Lesser General Public License as published by the +// Free Software Foundation; either version 3 of the License, or (at your +// option) any later version. +// +// This library is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License +// for more details. +// +// You should have received a copy of the GNU Lesser General Public License +// along with this library. If not, see . + + +// pgenlib_write contains writer-specific code. + +#include "pgenlib_misc.h" +#include "plink2_base.h" + +#ifdef __cplusplus +namespace plink2 { +#endif + +typedef struct PgenWriterCommonStruct { + // was marked noncopyable, but, well, gcc 9 caught me cheating (memcpying the + // whole struct) in the multithreaded writer implementation. So, copyable + // now. + + // This must be the true variant count when writing a regular .pgen; but if + // the index is being written to a separate .pgen.pgi file, it is okay for + // this to just be an upper bound. + uint32_t variant_ct_limit; + + uint32_t sample_ct; + PgenGlobalFlags phase_dosage_gflags; // subset of gflags + unsigned char nonref_flags_storage; + + // I'll cache this for now + uintptr_t vrec_len_byte_ct; + + // There should be a single copy of these arrays shared by all threads. + uint64_t* vblock_fpos; + unsigned char* vrec_len_buf; + uintptr_t* vrtype_buf; + uintptr_t* explicit_nonref_flags; // usually nullptr + + // needed for multiallelic-phased case + uintptr_t* genovec_hets_buf; + + PgenExtensionLl* header_exts; + PgenExtensionLl* footer_exts; + + // should match ftello() return value in singlethreaded case, but be set to + // zero in multithreaded case + uint64_t vblock_fpos_offset; + + + // The remainder of this data structure is unshared. + STD_ARRAY_DECL(uint32_t, 4, ldbase_genocounts); + + // these must hold sample_ct entries + // genovec_invert_buf also used as phaseinfo and dphase_present temporary + // storage + uintptr_t* genovec_invert_buf; + uintptr_t* ldbase_genovec; + + // these must hold 2 * (sample_ct / kPglMaxDifflistLenDivisor) entries + uintptr_t* ldbase_raregeno; + uint32_t* ldbase_difflist_sample_ids; // 1 extra entry, == sample_ct + + // this must fit 64k variants in multithreaded case + unsigned char* fwrite_buf; + unsigned char* fwrite_bufp; + + uint32_t ldbase_common_geno; // UINT32_MAX if ldbase_genovec present + uint32_t ldbase_difflist_len; + + uint32_t vidx; +} PgenWriterCommon; + +// Given packed arrays of unphased biallelic genotypes in uncompressed plink2 +// binary format (00 = hom ref, 01 = het ref/alt1, 10 = hom alt1, 11 = +// missing), {S,M}TPgenWriter performs difflist (sparse variant), one bit +// (mostly-two-value), and LD compression before writing to disk, and flushes +// the header at the end. +// +// The flush requires a backward seek if write_mode is set to +// kPgenWriteBackwardSeek during initialization. Otherwise, the (possibly +// temporary) .pgen is written sequentially; in the kPgenWriteSeparateIndex +// mode, the index is then saved to a separate .pgen.pgi file, while in the +// kPgenWriteAndCopy mode, the real .pgen is only created at the end, by +// writing the index and then appending the body of the first .pgen (which is +// then deleted). +// +// Slightly over 128 KiB (kPglFwriteBlockSize) of stack space is currently +// required. We use plain fwrite() instead of write() to reduce the amount of +// necessary platform-specific logic; a 128 KiB write-block size keeps the +// associated performance penalty to a minimum. +// +// MTPgenWriter has additional restrictions: +// * You must process large blocks of variants at a time (64k per thread). +// * You must know the true variant_ct (and, in the multiallelic case, the true +// allele_idx_offsets) during initialization. +// Thus, STPgenWriter is still worth using in many cases. + +ENUM_U31_DEF_START() + kPgenWriteBackwardSeek, + kPgenWriteSeparateIndex, + kPgenWriteAndCopy +ENUM_U31_DEF_END(PgenWriteMode); + +typedef struct STPgenWriterStruct { + MOVABLE_BUT_NONCOPYABLE(STPgenWriterStruct); +#ifdef __cplusplus + PgenWriterCommon& GET_PRIVATE_pwc() { return pwc; } + PgenWriterCommon const& GET_PRIVATE_pwc() const { return pwc; } + FILE*& GET_PRIVATE_pgen_outfile() { return pgen_outfile; } + FILE* const& GET_PRIVATE_pgen_outfile() const { return pgen_outfile; } + FILE*& GET_PRIVATE_pgi_or_final_pgen_outfile() { return pgi_or_final_pgen_outfile; } + FILE* const& GET_PRIVATE_pgi_or_final_pgen_outfile() const { return pgi_or_final_pgen_outfile; } + char*& GET_PRIVATE_fname_buf() { return fname_buf; } + char* const& GET_PRIVATE_fname_buf() const { return fname_buf; } + private: +#endif + PgenWriterCommon pwc; + FILE* pgen_outfile; + FILE* pgi_or_final_pgen_outfile; + // Initialized in kPgenWriteAndCopy mode. It is necessary on at least macOS + // to close and reopen the initially-written .pgen; freopen() with + // filename=nullptr can't be used to change from write-mode to read-mode. + char* fname_buf; +} STPgenWriter; + +typedef struct MTPgenWriterStruct { + MOVABLE_BUT_NONCOPYABLE(MTPgenWriterStruct); + FILE* pgen_outfile; + FILE* pgi_or_final_pgen_outfile; + char* fname_buf; + uint32_t thread_ct; + PgenWriterCommon* pwcs[]; +} MTPgenWriter; + +HEADER_INLINE uint32_t SpgwGetVariantCt(STPgenWriter* spgwp) { + PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc); + return pwcp->variant_ct_limit; +} + +HEADER_INLINE uint32_t SpgwGetSampleCt(STPgenWriter* spgwp) { + PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc); + return pwcp->sample_ct; +} + +HEADER_INLINE uint32_t SpgwGetVidx(STPgenWriter* spgwp) { + PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc); + return pwcp->vidx; +} + +void PreinitSpgw(STPgenWriter* spgwp); + +void PreinitMpgw(MTPgenWriter* mpgwp); + +// phase_dosage_gflags zero vs. nonzero is most important: this determines size +// of header. Otherwise, setting more flags than necessary just increases +// memory requirements. +// +// nonref_flags_storage values: +// 0 = no info stored +// 1 = always trusted +// 2 = always untrusted +// 3 = use explicit_nonref_flags +// +// Caller is responsible for printing open-fail error message. +// +// In the multiallelic case, allele_idx_offsets can be nullptr if +// allele_ct_upper_bound is provided. +// +// If write_mode is kPgenWriteBackwardSeek, variant_ct_limit must be the actual +// variant count. Otherwise, variant_ct_limit just needs to be an upper bound, +// unless you're writing multiallelic variants and you don't specify +// allele_ct_upper_bound. If you're using a plink2-style memory workspace, +// variant_ct_limit := min(kPglMaxVariantCt, bigstack_left() / 8) is a +// reasonable default: this tends to consume less than half of your remaining +// workspace, without falling below the number of variants you can +// realistically load on the system. +// +// The body of explicit_nonref_flags doesn't need to be filled until final +// flush. +// For header extensions, type_idx and size must be correct when this function +// is called, but contents doesn't need to be filled until final flush. +// For footer extensions, type_idx must be correct when this function is +// called, but size and contents don't need to be filled until final flush. +// Extension linked-lists must be ordered with lower type_idx first. +PglErr SpgwInitPhase1Ex(const char* __restrict fname, const uintptr_t* __restrict allele_idx_offsets, uintptr_t* __restrict explicit_nonref_flags, PgenExtensionLl* header_exts, PgenExtensionLl* footer_exts, uint32_t variant_ct_limit, uint32_t sample_ct, uint32_t allele_ct_upper_bound, PgenWriteMode write_mode, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, STPgenWriter* spgwp, uintptr_t* alloc_cacheline_ct_ptr, uint32_t* max_vrec_len_ptr); + +HEADER_INLINE PglErr SpgwInitPhase1(const char* __restrict fname, const uintptr_t* __restrict allele_idx_offsets, uintptr_t* __restrict explicit_nonref_flags, uint32_t variant_ct_limit, uint32_t sample_ct, uint32_t allele_ct_upper_bound, PgenWriteMode write_mode, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, STPgenWriter* spgwp, uintptr_t* alloc_cacheline_ct_ptr, uint32_t* max_vrec_len_ptr) { + return SpgwInitPhase1Ex(fname, allele_idx_offsets, explicit_nonref_flags, nullptr, nullptr, variant_ct_limit, sample_ct, allele_ct_upper_bound, write_mode, phase_dosage_gflags, nonref_flags_storage, spgwp, alloc_cacheline_ct_ptr, max_vrec_len_ptr); +} + +void SpgwInitPhase2(uint32_t max_vrec_len, STPgenWriter* spgwp, unsigned char* spgw_alloc); + +// moderately likely that there isn't enough memory to use the maximum number +// of threads, so this returns per-thread memory requirements before forcing +// the caller to specify thread count +// (eventually should write code which falls back on STPgenWriter +// when there isn't enough memory for even a single 64k variant block, at least +// for the most commonly used plink 2.0 functions) +void MpgwInitPhase1(const uintptr_t* __restrict allele_idx_offsets, uint32_t variant_ct, uint32_t sample_ct, PgenGlobalFlags phase_dosage_gflags, uintptr_t* alloc_base_cacheline_ct_ptr, uint64_t* alloc_per_thread_cacheline_ct_ptr, uint32_t* vrec_len_byte_ct_ptr, uint64_t* vblock_cacheline_ct_ptr); + +// Caller is responsible for printing open-fail error message. +PglErr MpgwInitPhase2Ex(const char* __restrict fname, uintptr_t* __restrict explicit_nonref_flags, PgenExtensionLl* header_exts, PgenExtensionLl* footer_exts, uint32_t variant_ct, uint32_t sample_ct, PgenWriteMode write_mode, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, uint32_t vrec_len_byte_ct, uintptr_t vblock_cacheline_ct, uint32_t thread_ct, unsigned char* mpgw_alloc, MTPgenWriter* mpgwp); + +HEADER_INLINE PglErr MpgwInitPhase2(const char* __restrict fname, uintptr_t* __restrict explicit_nonref_flags, uint32_t variant_ct, uint32_t sample_ct, PgenWriteMode write_mode, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, uint32_t vrec_len_byte_ct, uintptr_t vblock_cacheline_ct, uint32_t thread_ct, unsigned char* mpgw_alloc, MTPgenWriter* mpgwp) { + return MpgwInitPhase2Ex(fname, explicit_nonref_flags, nullptr, nullptr, variant_ct, sample_ct, write_mode, phase_dosage_gflags, nonref_flags_storage, vrec_len_byte_ct, vblock_cacheline_ct, thread_ct, mpgw_alloc, mpgwp); +} + + +// trailing bits of genovec must be zeroed out +void PwcAppendBiallelicGenovec(const uintptr_t* __restrict genovec, PgenWriterCommon* pwcp); + +BoolErr SpgwFlush(STPgenWriter* spgwp); + +HEADER_INLINE PglErr SpgwAppendBiallelicGenovec(const uintptr_t* __restrict genovec, STPgenWriter* spgwp) { + if (unlikely(SpgwFlush(spgwp))) { + return kPglRetWriteFail; + } + PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc); + PwcAppendBiallelicGenovec(genovec, pwcp); + return kPglRetSuccess; +} + +// trailing bits of raregeno must be zeroed out +// all raregeno entries assumed to be unequal to difflist_common_geno; the +// difflist should be compacted first if this isn't true +// difflist_len must be <= 2 * (sample_ct / kPglMaxDifflistLenDivisor); +// there's an assert checking this +void PwcAppendBiallelicDifflistLimited(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t difflist_common_geno, uint32_t difflist_len, PgenWriterCommon* pwcp); + +HEADER_INLINE PglErr SpgwAppendBiallelicDifflistLimited(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t difflist_common_geno, uint32_t difflist_len, STPgenWriter* spgwp) { + if (unlikely(SpgwFlush(spgwp))) { + return kPglRetWriteFail; + } + PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc); + PwcAppendBiallelicDifflistLimited(raregeno, difflist_sample_ids, difflist_common_geno, difflist_len, pwcp); + return kPglRetSuccess; +} + +// Two interfaces for appending multiallelic hardcalls: +// 1. sparse: genovec, bitarray+values describing ref/altx hardcalls which +// aren't ref/alt1, bitarray+values describing altx/alty hardcalls which +// aren't alt1/alt1. +// patch_01_vals[] contains one entry per relevant sample (only need altx +// index), patch_10_vals[] contains two. +// Ok if patch_01_ct == patch_10_ct == 0; in this case no aux1 track is +// saved and bit 3 of vrtype is not set. (Note that multiallelic dosage may +// still be present when vrtype bit 3 is unset.) +// 2. generic dense: takes a length-2n array of AlleleCode allele codes. +// Assumes [2k] <= [2k+1] for each k. Instead of providing direct API +// functions for this, we just provide a dense -> sparse helper function. +// +// All arrays should be vector-aligned. +BoolErr PwcAppendMultiallelicSparse(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t allele_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, PgenWriterCommon* pwcp); + +HEADER_INLINE PglErr SpgwAppendMultiallelicSparse(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t allele_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, STPgenWriter* spgwp) { + if (unlikely(SpgwFlush(spgwp))) { + return kPglRetWriteFail; + } + PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc); + if (unlikely(PwcAppendMultiallelicSparse(genovec, patch_01_set, patch_01_vals, patch_10_set, patch_10_vals, allele_ct, patch_01_ct, patch_10_ct, pwcp))) { + return kPglRetVarRecordTooLarge; + } + return kPglRetSuccess; +} + +// This may not zero out trailing halfword of patch_{01,10}_set. +void PglMultiallelicDenseToSparse(const AlleleCode* __restrict wide_codes, uint32_t sample_ct, uintptr_t* __restrict genoarr, uintptr_t* __restrict patch_01_set, AlleleCode* __restrict patch_01_vals, uintptr_t* __restrict patch_10_set, AlleleCode* __restrict patch_10_vals, uint32_t* __restrict patch_01_ct_ptr, uint32_t* __restrict patch_10_ct_ptr); + +// If remap is not nullptr, this simultaneously performs a rotation operation: +// wide_codes[2n] and [2n+1] are set to remap[geno[n]] rather than geno[n], and +// bit n of flipped (if flipped non-null) is set iff phase orientation is +// flipped (i.e. wide_codes[2n] was larger than wide_codes[2n+1] before the +// final reordering pass; caller needs to know this to properly update +// phaseinfo, dphase_delta, multidphase_delta). +// It currently assumes no present alleles are being mapped to 'missing'. +void PglMultiallelicSparseToDense(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, const AlleleCode* __restrict remap, uint32_t sample_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, uintptr_t* __restrict flipped, AlleleCode* __restrict wide_codes); + +// phasepresent == nullptr ok, that indicates that ALL heterozygous calls are +// phased. Caller should use e.g. PwcAppendBiallelicGenovec() if it's known +// in advance that no calls are phased. +// Ok for phaseinfo to have bits set at non-het calls, NOT currently okay for +// phasepresent +void PwcAppendBiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, PgenWriterCommon* pwcp); + +// phasepresent == nullptr ok +// ok for trailing bits of phaseinfo to not be zeroed out, NOT currently ok for +// phasepresent +HEADER_INLINE PglErr SpgwAppendBiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, STPgenWriter* spgwp) { + if (unlikely(SpgwFlush(spgwp))) { + return kPglRetWriteFail; + } + PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc); + PwcAppendBiallelicGenovecHphase(genovec, phasepresent, phaseinfo, pwcp); + return kPglRetSuccess; +} + +BoolErr PwcAppendMultiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, uint32_t allele_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, PgenWriterCommon* pwcp); + +HEADER_INLINE PglErr SpgwAppendMultiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, uint32_t allele_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, STPgenWriter* spgwp) { + if (unlikely(SpgwFlush(spgwp))) { + return kPglRetWriteFail; + } + PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc); + if (unlikely(PwcAppendMultiallelicGenovecHphase(genovec, patch_01_set, patch_01_vals, patch_10_set, patch_10_vals, phasepresent, phaseinfo, allele_ct, patch_01_ct, patch_10_ct, pwcp))) { + return kPglRetVarRecordTooLarge; + } + return kPglRetSuccess; +} + + +// dosage_main[] has length dosage_ct, not sample_ct +// ok for trailing bits of dosage_present to not be zeroed out +BoolErr PwcAppendBiallelicGenovecDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, PgenWriterCommon* pwcp); + +HEADER_INLINE PglErr SpgwAppendBiallelicGenovecDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, STPgenWriter* spgwp) { + if (unlikely(SpgwFlush(spgwp))) { + return kPglRetWriteFail; + } + PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc); + if (unlikely(PwcAppendBiallelicGenovecDosage16(genovec, dosage_present, dosage_main, dosage_ct, pwcp))) { + return kPglRetVarRecordTooLarge; + } + return kPglRetSuccess; +} + +BoolErr PwcAppendBiallelicGenovecHphaseDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, PgenWriterCommon* pwcp); + +HEADER_INLINE PglErr SpgwAppendBiallelicGenovecHphaseDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, STPgenWriter* spgwp) { + if (unlikely(SpgwFlush(spgwp))) { + return kPglRetWriteFail; + } + PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc); + if (unlikely(PwcAppendBiallelicGenovecHphaseDosage16(genovec, phasepresent, phaseinfo, dosage_present, dosage_main, dosage_ct, pwcp))) { + return kPglRetVarRecordTooLarge; + } + return kPglRetSuccess; +} + +// dosage_present cannot be null for nonzero dosage_ct +// trailing bits of dosage_present MUST be zeroed out +// could make dosage_main[] has length dosage_ct + dphase_ct instead of having +// separate dphase_delta[]? +BoolErr PwcAppendBiallelicGenovecDphase16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uintptr_t* __restrict dphase_present, const uint16_t* dosage_main, const int16_t* dphase_delta, uint32_t dosage_ct, uint32_t dphase_ct, PgenWriterCommon* pwcp); + +HEADER_INLINE PglErr SpgwAppendBiallelicGenovecDphase16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uintptr_t* dphase_present, const uint16_t* dosage_main, const int16_t* dphase_delta, uint32_t dosage_ct, uint32_t dphase_ct, STPgenWriter* spgwp) { + if (unlikely(SpgwFlush(spgwp))) { + return kPglRetWriteFail; + } + PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc); + if (unlikely(PwcAppendBiallelicGenovecDphase16(genovec, phasepresent, phaseinfo, dosage_present, dphase_present, dosage_main, dphase_delta, dosage_ct, dphase_ct, pwcp))) { + return kPglRetVarRecordTooLarge; + } + return kPglRetSuccess; +} + +// Writes footer if present, backfills header, then closes the file. +PglErr SpgwFinish(STPgenWriter* spgwp); + +// Last flush automatically writes footer if present, backfills header, and +// closes the file. +// (caller should set mpgwp = nullptr after that) +PglErr MpgwFlush(MTPgenWriter* mpgwp); + + +// these close the file if open, but do not free any memory +// MpgwCleanup() handles mpgwp == nullptr, since it shouldn't be allocated on +// the stack +// error-return iff reterr was success and was changed to kPglRetWriteFail +// (i.e. an error message should be printed), though this is not relevant for +// plink2 +BoolErr CleanupSpgw(STPgenWriter* spgwp, PglErr* reterrp); + +BoolErr CleanupMpgw(MTPgenWriter* mpgwp, PglErr* reterrp); + +#ifdef __cplusplus +} // namespace plink2 +#endif + +#endif // __PGENLIB_WRITE_H__ diff --git a/external_libs/pgenlib/include/plink2_base.cc b/external_libs/pgenlib/include/plink2_base.cc index 58f7f4e1..69339e0b 100644 --- a/external_libs/pgenlib/include/plink2_base.cc +++ b/external_libs/pgenlib/include/plink2_base.cc @@ -1,4 +1,4 @@ -// This library is part of PLINK 2.0, copyright (C) 2005-2024 Shaun Purcell, +// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it @@ -21,6 +21,13 @@ namespace plink2 { #endif +const char kErrprintfFopen[] = "Error: Failed to open %s : %s.\n"; +const char kErrprintfFread[] = "Error: %s read failure: %s.\n"; +const char kErrprintfRewind[] = "Error: %s could not be scanned twice. (Process-substitution/named-pipe input is not permitted in this use case.)\n"; +const char kErrstrNomem[] = "Error: Out of memory. The --memory flag may be helpful.\n"; +const char kErrstrWrite[] = "Error: File write failure: %s.\n"; +const char kErrprintfDecompress[] = "Error: %s decompression failure: %s.\n"; + uint64_t g_failed_alloc_attempt_size = 0; #if (((__GNUC__ == 4) && (__GNUC_MINOR__ < 7)) || (__GNUC__ >= 11)) && !defined(__APPLE__) @@ -359,7 +366,7 @@ int32_t memequal(const void* m1, const void* m2, uintptr_t byte_ct) { // tried unrolling this, doesn't make a difference const VecUc v1 = vecuc_loadu(&(m1_alias[vidx])); const VecUc v2 = vecuc_loadu(&(m2_alias[vidx])); - if (vecuc_movemask(v1 == v2) != kVec8thUintMax) { + if (!vecucs_are_equal(v1, v2)) { return 0; } } @@ -369,7 +376,7 @@ int32_t memequal(const void* m1, const void* m2, uintptr_t byte_ct) { const uintptr_t final_offset = byte_ct - kBytesPerVec; const VecUc v1 = vecuc_loadu(&(m1_uc[final_offset])); const VecUc v2 = vecuc_loadu(&(m2_uc[final_offset])); - if (vecuc_movemask(v1 == v2) != kVec8thUintMax) { + if (!vecucs_are_equal(v1, v2)) { return 0; } } @@ -453,22 +460,39 @@ int32_t Memcmp(const void* m1, const void* m2, uintptr_t byte_ct) { for (uintptr_t vidx = 0; vidx != fullvec_ct; ++vidx) { const VecUc v1 = vecuc_loadu(&(m1_alias[vidx])); const VecUc v2 = vecuc_loadu(&(m2_alias[vidx])); +# ifndef SIMDE_ARM_NEON_A32V8_NATIVE // is this even worthwhile now in non-AVX2 case? const uint32_t movemask_result = vecuc_movemask(v1 == v2); if (movemask_result != kVec8thUintMax) { + // todo: check if this is faster to do outside of vector-space const uintptr_t diff_pos = vidx * kBytesPerVec + ctzu32(~movemask_result); return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1; } +# else + const uint64_t eq_nybbles = arm_shrn4_uc(v1 == v2); + if (eq_nybbles != UINT64_MAX) { + const uintptr_t diff_pos = vidx * kBytesPerVec + ctzw(~eq_nybbles) / 4; + return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1; + } +# endif } if (byte_ct % kBytesPerVec) { const uintptr_t final_offset = byte_ct - kBytesPerVec; const VecUc v1 = vecuc_loadu(&(m1_uc[final_offset])); const VecUc v2 = vecuc_loadu(&(m2_uc[final_offset])); +# ifndef SIMDE_ARM_NEON_A32V8_NATIVE const uint32_t movemask_result = vecuc_movemask(v1 == v2); if (movemask_result != kVec8thUintMax) { const uintptr_t diff_pos = final_offset + ctzu32(~movemask_result); return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1; } +# else + const uint64_t eq_nybbles = arm_shrn4_uc(v1 == v2); + if (eq_nybbles != UINT64_MAX) { + const uintptr_t diff_pos = final_offset + ctzw(~eq_nybbles) / 4; + return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1; + } +# endif } return 0; } @@ -633,21 +657,37 @@ uintptr_t FirstUnequal4(const void* arr1, const void* arr2, uintptr_t nbytes) { for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) { const VecUc v1 = vecuc_loadu(&(arr1_alias[vidx])); const VecUc v2 = vecuc_loadu(&(arr2_alias[vidx])); +# ifndef SIMDE_ARM_NEON_A32V8_NATIVE const uint32_t eq_result = vecw_movemask(v1 == v2); if (eq_result != kVec8thUintMax) { return vidx * kBytesPerVec + ctzu32(~eq_result); } +# else + const uint64_t eq_nybbles = arm_shrn4_uc(v1 == v2); + if (eq_nybbles != UINT64_MAX) { + return vidx * kBytesPerVec + ctzw(~eq_nybbles) / 4; + } +# endif } if (nbytes % kBytesPerVec) { const uintptr_t final_offset = nbytes - kBytesPerVec; const char* s1 = S_CAST(const char*, arr1); const char* s2 = S_CAST(const char*, arr2); - const VecW v1 = vecw_loadu(&(s1[final_offset])); - const VecW v2 = vecw_loadu(&(s2[final_offset])); - const uint32_t eq_result = vecw_movemask(v1 == v2); + // bugfix (5 Jul 2025): this must be VecUc on ARM, not VecW + // (disturbing that arm_shrn4_uc() call compiled at all...) + const VecUc v1 = vecuc_loadu(&(s1[final_offset])); + const VecUc v2 = vecuc_loadu(&(s2[final_offset])); +# ifndef SIMDE_ARM_NEON_A32V8_NATIVE + const uint32_t eq_result = vecuc_movemask(v1 == v2); if (eq_result != kVec8thUintMax) { return final_offset + ctzu32(~eq_result); } +# else + const uint64_t eq_nybbles = arm_shrn4_uc(v1 == v2); + if (eq_nybbles != UINT64_MAX) { + return final_offset + ctzw(~eq_nybbles) / 4; + } +# endif } return nbytes; } @@ -688,6 +728,7 @@ uintptr_t CountVintsNonempty(const unsigned char* buf, const unsigned char* buf_ const uintptr_t ending_addr = R_CAST(uintptr_t, buf_end); const VecUc* buf_vlast = R_CAST(const VecUc*, RoundDownPow2(ending_addr - 1, kBytesPerVec)); const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, buf_viter); + // todo: better ARM implementation Vec8thUint vint_ends = (UINT32_MAX << leading_byte_ct) & (~vecuc_movemask(*buf_viter)); uintptr_t total = 0; while (buf_viter != buf_vlast) { diff --git a/external_libs/pgenlib/include/plink2_base.h b/external_libs/pgenlib/include/plink2_base.h index b9d9a29d..e29b9d0d 100644 --- a/external_libs/pgenlib/include/plink2_base.h +++ b/external_libs/pgenlib/include/plink2_base.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_BASE_H__ #define __PLINK2_BASE_H__ -// This library is part of PLINK 2.0, copyright (C) 2005-2024 Shaun Purcell, +// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it @@ -77,10 +77,11 @@ // valid input types, NOT counting VecW*. -// gcc 8.3.0 has been miscompiling the ParseOnebitUnsafe() function in -// pgenlib_read.cc for the last several years. gcc 8.4 does not have this -// problem, and neither does any other gcc major version I've tested to date. -#ifndef __clang__ +// gcc 8.3.0 has been miscompiling the vector code in the ParseOnebitUnsafe() +// function in pgenlib_read.cc for the last several years. gcc 8.4 does not +// have this problem, and neither does any other gcc major version I've tested +// to date. +#if !defined(__clang__) && defined(__LP64__) # if (__GNUC__ == 8) && (__GNUC_MINOR__ < 4) # error "gcc 8.3 is known to have a miscompilation bug that was fixed in 8.4." # endif @@ -106,24 +107,33 @@ // 10000 * major + 100 * minor + patch // Exception to CONSTI32, since we want the preprocessor to have access // to this value. Named with all caps as a consequence. -#define PLINK2_BASE_VERNUM 815 +#define PLINK2_BASE_VERNUM 822 +// We now try to adhere to include-what-you-use in simple cases. However, +// we don't want to repeat either platform-specific ifdefs, or stuff like +// "#define _FILE_OFFSET_BITS 64" before "#include ", so we use +// "IWYU pragma: export" for those cases. +// TODO: unfortunately, there is no straightforward way to tell IWYU to stop +// looking behind the likes of or "../simde/x86/sse2.h". I'm +// manually ignoring the associated false-positives for now, but this should be +// systematized. #define _FILE_OFFSET_BITS 64 +#ifdef __USE_MINGW_ANSI_STDIO +# undef __USE_MINGW_ANSI_STDIO +#endif +#define __USE_MINGW_ANSI_STDIO 1 -#include -#include -#include -#include // offsetof() -#include +#include #ifndef __STDC_FORMAT_MACROS # define __STDC_FORMAT_MACROS 1 #endif -#include +#include // IWYU pragma: export #include // CHAR_BIT, PATH_MAX - -// #define NDEBUG -#include +#include // offsetof() +#include // IWYU pragma: export +#include +#include #ifdef _WIN32 // needed for EnterCriticalSection, etc. @@ -135,19 +145,21 @@ # ifndef WIN32_LEAN_AND_MEAN # define WIN32_LEAN_AND_MEAN # endif -# include +# include // IWYU pragma: export #endif #if __cplusplus >= 201103L -# include +# include // IWYU pragma: export #endif #ifdef __LP64__ -// TODO: working no-SSE2 fallback on 64-bit little-endian platforms unsupported -// by simde. Can perform early test by compiling on M1/M2 without simde. +// Possible todo: working no-SSE2 fallback on 64-bit little-endian platforms +// unsupported by simde. Can perform early test by compiling on arm64 without +// simde. +// (But not yet aware of a use case that matters.) # define USE_SSE2 # ifdef __x86_64__ -# include +# include // IWYU pragma: export # else # define SIMDE_ENABLE_NATIVE_ALIASES // Since e.g. an old zstd system header breaks the build, and plink2 is @@ -156,9 +168,9 @@ // are manually updated as necessary. // To use system headers, define IGNORE_BUNDLED_{ZSTD,LIBDEFLATE.SIMDE}. # ifdef IGNORE_BUNDLED_SIMDE -# include +# include // IWYU pragma: export # else -# include "../simde/x86/sse2.h" +# include "../simde/x86/sse2.h" // IWYU pragma: export # endif # ifdef SIMDE_ARM_NEON_A32V8_NATIVE // For Apple M1, we effectively use SSE2 + constrained _mm_shuffle_epi8(). @@ -218,6 +230,8 @@ namespace plink2 { // RoundUpPow2()...), or (ii) it allows a useful static_assert to be inserted // for a hardcoded constant. # if __cplusplus >= 201103L +# define HAS_CONSTEXPR +# define PREFER_CONSTEXPR constexpr # define HEADER_CINLINE constexpr # define CSINLINE static constexpr # if __cplusplus > 201103L @@ -228,6 +242,7 @@ namespace plink2 { # define CSINLINE2 static inline # endif # else +# define PREFER_CONSTEXPR const # define HEADER_CINLINE inline # define HEADER_CINLINE2 inline # define CSINLINE static inline @@ -241,6 +256,7 @@ namespace plink2 { # endif # endif #else +# define PREFER_CONSTEXPR const # define HEADER_INLINE static inline # define HEADER_CINLINE static inline # define HEADER_CINLINE2 static inline @@ -352,7 +368,13 @@ HEADER_INLINE unsigned char* CToUc(char* pp) { // * IntErr allows implicit conversion from int, but conversion back to // int32_t requires an explicit cast. It mainly serves as a holding pen for // C standard library error return values, which can be negative. -#if __cplusplus >= 201103L +#if __cplusplus >= 201103L && !defined(NO_CPP11_TYPE_ENFORCEMENT) +// Cython doesn't support these, and the previous workaround of forcing C++98 +// is breaking down. So allow this C++11 feature to be selectively disabled. +# define CPP11_TYPE_ENFORCEMENT +#endif + +#ifdef CPP11_TYPE_ENFORCEMENT struct PglErr { enum class ec #else @@ -397,7 +419,7 @@ typedef enum kPglRetHelp = 125, kPglRetLongLine = 126, kPglRetEof = 127} -#if __cplusplus >= 201103L +#ifdef CPP11_TYPE_ENFORCEMENT ; PglErr() {} @@ -466,7 +488,7 @@ const PglErr kPglRetEof = PglErr::ec::kPglRetEof; PglErr; #endif -#if __cplusplus >= 201103L +#ifdef CPP11_TYPE_ENFORCEMENT // allow efficient arithmetic on these, but force them to require explicit // int32_t/uint32_t casts; only permit implicit assignment from // int32_t/uint32_t by default. @@ -512,6 +534,13 @@ typedef int32_t IntErr; typedef uint32_t BoolErr; #endif +extern const char kErrprintfFopen[]; +extern const char kErrprintfFread[]; +extern const char kErrprintfRewind[]; +extern const char kErrstrNomem[]; +extern const char kErrstrWrite[]; +extern const char kErrprintfDecompress[]; + // make this work on 32-bit as well as 64-bit systems, across // Windows/OS X/Linux // (todo: clean this up a bit. it's inherently a baling-wire-and-duct-tape @@ -557,16 +586,9 @@ typedef uint32_t BoolErr; # endif #endif -#ifdef _WIN32 -# undef PRId64 -# undef PRIu64 -# define PRId64 "I64d" -# define PRIu64 "I64u" -#else -# ifdef __cplusplus -# ifndef PRId64 -# define PRId64 "lld" -# endif +#ifdef __cplusplus +# ifndef PRId64 +# define PRId64 "lld" # endif #endif @@ -633,26 +655,13 @@ HEADER_INLINE uint32_t bsrw(unsigned long ulii) { #endif #ifdef __LP64__ -# ifdef _WIN32 // i.e. Win64 - -# undef PRIuPTR -# undef PRIdPTR -# define PRIuPTR PRIu64 -# define PRIdPTR PRId64 -# define PRIxPTR2 "016I64x" - -# else // not _WIN32 - -# ifndef PRIuPTR -# define PRIuPTR "lu" -# endif -# ifndef PRIdPTR -# define PRIdPTR "ld" -# endif -# define PRIxPTR2 "016lx" - -# endif // Win64 - +# ifndef PRIuPTR +# define PRIuPTR "lu" +# endif +# ifndef PRIdPTR +# define PRIdPTR "ld" +# endif +# define PRIxPTR2 "016lx" #else // not __LP64__ // without this, we get ridiculous warning spew... @@ -665,6 +674,15 @@ HEADER_INLINE uint32_t bsrw(unsigned long ulii) { # define PRIdPTR "ld" # endif +# ifdef _WIN32 +# ifndef PRIuPTR +# define PRIuPTR "u" +# endif +# ifndef PRIdPTR +# define PRIdPTR "d" +# endif +# endif + # define PRIxPTR2 "08lx" #endif @@ -943,8 +961,6 @@ HEADER_INLINE VecI8 veci8_set1(char cc) { return VecToI8( _mm256_set1_epi8(cc)); } -// TODO: on ARM, replace most movemask uses: -// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon HEADER_INLINE uint32_t vecw_movemask(VecW vv) { return _mm256_movemask_epi8(WToVec(vv)); } @@ -1160,10 +1176,18 @@ HEADER_INLINE VecW vecw_sad(VecW v1, VecW v2) { return VecToW(_mm256_sad_epu8(WToVec(v1), WToVec(v2))); } +HEADER_INLINE VecUc vecuc_add(VecUc v1, VecUc v2) { + return VecToUc(_mm256_add_epi8(UcToVec(v1), UcToVec(v2))); +} + HEADER_INLINE VecUc vecuc_adds(VecUc v1, VecUc v2) { return VecToUc(_mm256_adds_epu8(UcToVec(v1), UcToVec(v2))); } +HEADER_INLINE VecUc vecuc_signed_cmpgt(VecUc v1, VecUc v2) { + return VecToUc(_mm256_cmpgt_epi8(UcToVec(v1), UcToVec(v2))); +} + HEADER_INLINE VecU16 vecu16_min8(VecU16 v1, VecU16 v2) { return VecToU16(_mm256_min_epu8(U16ToVec(v1), U16ToVec(v2))); } @@ -1554,15 +1578,16 @@ HEADER_INLINE VecUc vecuc_gather_odd(VecUc src_lo, VecUc src_hi) { # ifdef USE_SHUFFLE8 # ifdef SIMDE_ARM_NEON_A64V8_NATIVE // See simde_mm_shuffle_epi8(). -// In the future, this may need to be written more carefully in the -// IGNORE_BUNDLED_SIMDE case. But this is compatible with simde v0.7.x and -// v0.8.x. -SIMDE_FUNCTION_ATTRIBUTES simde__m128i _mm_shuffle_epi8(simde__m128i a, simde__m128i b) { - simde__m128i_private a_ = simde__m128i_to_private(a); - simde__m128i_private b_ = simde__m128i_to_private(b); - simde__m128i_private r_; - r_.neon_i8 = vqtbl1q_s8(a_.neon_i8, b_.neon_u8); - return simde__m128i_from_private(r_); +HEADER_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) { + SIMDE_ALIGN_TO_16 int8x16_t a_; + SIMDE_ALIGN_TO_16 uint8x16_t b_; + SIMDE_ALIGN_TO_16 int8x16_t r_; + memcpy(&a_, &a, sizeof(a_)); + memcpy(&b_, &b, sizeof(b_)); + r_ = vqtbl1q_s8(a_, b_); + __m128i r; + memcpy(&r, &r_, sizeof(r)); + return r; } # endif HEADER_INLINE VecW vecw_shuffle8(VecW table, VecW indexes) { @@ -1643,10 +1668,18 @@ HEADER_INLINE VecW vecw_sad(VecW v1, VecW v2) { return VecToW(_mm_sad_epu8(WToVec(v1), WToVec(v2))); } +HEADER_INLINE VecUc vecuc_add(VecUc v1, VecUc v2) { + return VecToUc(_mm_add_epi8(UcToVec(v1), UcToVec(v2))); +} + HEADER_INLINE VecUc vecuc_adds(VecUc v1, VecUc v2) { return VecToUc(_mm_adds_epu8(UcToVec(v1), UcToVec(v2))); } +HEADER_INLINE VecUc vecuc_signed_cmpgt(VecUc v1, VecUc v2) { + return VecToUc(_mm_cmpgt_epi8(UcToVec(v1), UcToVec(v2))); +} + HEADER_INLINE VecU16 vecu16_min8(VecU16 v1, VecU16 v2) { return VecToU16(_mm_min_epu8(U16ToVec(v1), U16ToVec(v2))); } @@ -1906,7 +1939,6 @@ CONSTI32(kFloatPerFVec, kBytesPerFVec / 4); CONSTI32(kDoublePerDVec, kBytesPerDVec / 8); #if defined(__APPLE__) && defined(__LP64__) && !defined(__x86_64__) -// TODO: make this 128 once that stops breaking code # define CACHELINE128 CONSTI32(kCacheline, 128); #else @@ -1987,7 +2019,7 @@ HEADER_INLINE void PrintVecD(const VecD* vv_ptr, const char* preprint) { fputs("\n", stdout); } -#if __cplusplus >= 201103L +#ifdef CPP11_TYPE_ENFORCEMENT // Main application of std::array in this codebase is enforcing length when // passing references between functions. Conversely, if the array type has // different lengths in different functions (e.g. col_skips[]/col_types[]), we @@ -2806,6 +2838,115 @@ HEADER_INLINE uint32_t PopcountVec8thUint(uint32_t val) { # endif #endif +#ifdef USE_SSE2 +// On ARM, emulated movemask isn't great. But there are alternative +// instructions that efficiently perform what you usually want to do with +// movemasks: +// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon + +# ifndef SIMDE_ARM_NEON_A32V8_NATIVE +// vec0255 refers to a VecUc where all bytes are equal to 0 or 255. +// (possible todo: define this as its own type, with automatic downcast to +// VecUc but not the other way around.) +// +// Return value in nonzero case is architecture-dependent (though always +// nonzero). So, best to only use this in other architecture-specific code; +// hence the leading underscore in the function name. +HEADER_INLINE uint64_t _vec0255_is_nonzero(VecUc vv) { + return vecuc_movemask(vv); +} + +HEADER_INLINE uint32_t vec0255_set_ct(VecUc vv) { + return PopcountVec8thUint(vecuc_movemask(vv)); +} + +HEADER_INLINE uint32_t vec0255_is_all_set(VecUc vv) { + return (vecuc_movemask(vv) == kVec8thUintMax); +} + +HEADER_INLINE uint32_t vec0255u16_is_all_set(VecU16 vv) { + return (vecu16_movemask(vv) == kVec8thUintMax); +} + +HEADER_INLINE uint32_t m128is_are_equal(__m128i v1, __m128i v2) { + return (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 65535); +} +# else +HEADER_INLINE uint64_t arm_shrn4_uc(VecUc vv) { + uint16x8_t vv_; + memcpy(&vv_, &vv, sizeof(vv_)); + return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vv_, 4)), 0); +} + +HEADER_INLINE uint64_t arm_shrn4_i8(VecI8 vv) { + uint16x8_t vv_; + memcpy(&vv_, &vv, sizeof(vv_)); + return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vv_, 4)), 0); +} + +HEADER_INLINE uint64_t arm_shrn4_u16(VecU16 vv) { + uint16x8_t vv_; + memcpy(&vv_, &vv, sizeof(vv_)); + return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vv_, 4)), 0); +} + +HEADER_INLINE uint64_t arm_shrn4_m128i(__m128i vv) { + uint16x8_t vv_; + memcpy(&vv_, &vv, sizeof(vv_)); + return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vv_, 4)), 0); +} + +HEADER_INLINE uint64_t _vec0255_is_nonzero(VecUc vv) { + return arm_shrn4_uc(vv); +} + +// set_bits4 must only have bits in 0, 4, ..., 60 set. +// move this out of ARM-only ifdef if we ever want this elsewhere. +HEADER_INLINE uint32_t popcount_bits4(uint64_t set_bits4) { + // Branchlessly count the number of set bits, taking advantage of the limited + // set of positions they can be in. + // Multiplication by the magic constant kMask1111 usually puts the sum of + // all 16 bits of interest in the high nybble of the result... except that + // the nybble overflows when all 16 bits are set. We work around this by + // (i) multiplying by (kMask1111 >> 4) instead, which excludes the lowest bit + // from the high-nybble sum, and + // (ii) then adding the lowest bit afterward. + const uint32_t set_ct_excluding_lowest = (set_bits4 * (kMask1111 >> 4)) >> 60; + return set_ct_excluding_lowest + (set_bits4 & 1); +} + +// xx must have all nybbles equal to 0 or 15. +HEADER_INLINE uint32_t count_set_nybbles(uint64_t xx) { + return popcount_bits4(xx & kMask1111); +} + +HEADER_INLINE uint32_t vec0255_set_ct(VecUc vv) { + return count_set_nybbles(arm_shrn4_uc(vv)); +} + +HEADER_INLINE uint32_t vec0255_is_all_set(VecUc vv) { + return (arm_shrn4_uc(vv) == UINT64_MAX); +} + +HEADER_INLINE uint32_t vec0255u16_is_all_set(VecU16 vv) { + return (arm_shrn4_u16(vv) == UINT64_MAX); +} + +HEADER_INLINE uint32_t m128is_are_equal(__m128i v1, __m128i v2) { + const uint64_t set_nybbles = arm_shrn4_m128i(_mm_cmpeq_epi8(v1, v2)); + return (set_nybbles == UINT64_MAX); +} +# endif + +HEADER_INLINE uint32_t vecucs_are_equal(VecUc v1, VecUc v2) { + return vec0255_is_all_set(v1 == v2); +} + +HEADER_INLINE uint32_t vecu16s_are_equal(VecU16 v1, VecU16 v2) { + return vec0255u16_is_all_set(v1 == v2); +} +#endif + // Downcasts don't risk alignment issues. HEADER_INLINE unsigned char* DowncastToUc(void* pp) { return S_CAST(unsigned char*, pp); @@ -3167,7 +3308,12 @@ HEADER_INLINE void aligned_free_cond(void* aligned_ptr) { } } -// C spec is slightly broken here +// Common pattern is to initialize a data structure, then use it without +// mutating it, then free it. +// Yes, in principle you should have a non-const pointer to the data structure +// to free it. But my experience is that that makes too many beneficial uses +// of const less ergonomic, without offering enough in return. So we cheat a +// bit, in a way that can be systematically cleaned up later if necessary. HEADER_INLINE void free_const(const void* memptr) { free(K_CAST(void*, memptr)); } @@ -3314,7 +3460,7 @@ template <> struct MemequalKImpl<16> { static int32_t MemequalK(const void* m1, const void* m2) { const __m128i v1 = _mm_loadu_si128(S_CAST(const __m128i*, m1)); const __m128i v2 = _mm_loadu_si128(S_CAST(const __m128i*, m2)); - return (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 65535); + return m128is_are_equal(v1, v2); } }; @@ -3325,7 +3471,7 @@ template struct MemequalKImpl > { const __m128i v1 = _mm_loadu_si128(S_CAST(const __m128i*, m1)); const __m128i v2 = _mm_loadu_si128(S_CAST(const __m128i*, m2)); return - (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 65535) && + m128is_are_equal(v1, v2) && ((*R_CAST(const uint64_t*, &(m1_uc[N - 8]))) == (*R_CAST(const uint64_t*, &(m2_uc[N - 8])))); } }; @@ -3334,14 +3480,14 @@ template struct MemequalKImpl > { static int32_t MemequalK(const void* m1, const void* m2) { __m128i v1 = _mm_loadu_si128(S_CAST(const __m128i*, m1)); __m128i v2 = _mm_loadu_si128(S_CAST(const __m128i*, m2)); - if (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) != 65535) { + if (!m128is_are_equal(v1, v2)) { return 0; } const unsigned char* m1_uc = S_CAST(const unsigned char*, m1); const unsigned char* m2_uc = S_CAST(const unsigned char*, m2); v1 = _mm_loadu_si128(R_CAST(const __m128i*, &(m1_uc[N - 16]))); v2 = _mm_loadu_si128(R_CAST(const __m128i*, &(m2_uc[N - 16]))); - return (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 65535); + return m128is_are_equal(v1, v2); } }; @@ -3578,7 +3724,7 @@ HEADER_INLINE char* strcpya(char* __restrict dst, const void* __restrict src) { return memcpya(dst, src, slen); } -#if defined(__LP64__) && (__cplusplus >= 201103L) +#if __cplusplus >= 201103L constexpr uint32_t CompileTimeSlen(const char* k_str) { return k_str[0]? (1 + CompileTimeSlen(&(k_str[1]))) : 0; } @@ -3900,6 +4046,18 @@ HEADER_INLINE void ZeroHwArr(uintptr_t entry_ct, Halfword* hwarr) { memset(hwarr, 0, entry_ct * sizeof(Halfword)); } +HEADER_INLINE void ZeroFArr(uintptr_t entry_ct, float* farr) { + for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) { + *farr++ = 0.0; + } +} + +HEADER_INLINE void ZeroDArr(uintptr_t entry_ct, double* darr) { + for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) { + *darr++ = 0.0; + } +} + HEADER_INLINE void SetAllWArr(uintptr_t entry_ct, uintptr_t* warr) { // todo: test this against vecset() for (uintptr_t idx = 0; idx != entry_ct; ++idx) { @@ -3907,6 +4065,12 @@ HEADER_INLINE void SetAllWArr(uintptr_t entry_ct, uintptr_t* warr) { } } +HEADER_INLINE void SetAllU32Arr(uintptr_t entry_ct, uint32_t* u32arr) { + for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) { + *u32arr++ = ~0U; + } +} + // tried _bzhi_u64() in AVX2 case, it was actually worse on my Mac (more // opaque to compiler?) @@ -4157,7 +4321,7 @@ HEADER_INLINE uint32_t VintBytect(uintptr_t ulii) { // enum base type to make it safe for the enum to serve as the flagset type. // * Implicit conversion to int is not prevented for now, since I'm trying to // keep PglErr-style code duplication to a minimum. -#if __cplusplus >= 201103L +#ifdef CPP11_TYPE_ENFORCEMENT // could avoid the typedef here, but that leads to a bit more verbosity. # define FLAGSET_DEF_START() typedef enum : uint32_t { @@ -4269,7 +4433,7 @@ private: \ # define ENUM_U31_DEF_START() typedef enum : uint32_t { # define ENUM_U31_DEF_END(tname) } tname -#else // !__cplusplus >= 201103L +#else // !CPP11_TYPE_ENFORCEMENT # define FLAGSET_DEF_START() enum { # define FLAGSET_DEF_END(tname) } ; \ diff --git a/external_libs/pgenlib/include/plink2_bits.cc b/external_libs/pgenlib/include/plink2_bits.cc index 08ef9de5..0b2a5048 100644 --- a/external_libs/pgenlib/include/plink2_bits.cc +++ b/external_libs/pgenlib/include/plink2_bits.cc @@ -14,7 +14,6 @@ // You should have received a copy of the GNU Lesser General Public License // along with this library. If not, see . - #include "plink2_bits.h" #ifdef __cplusplus @@ -387,7 +386,7 @@ uintptr_t NextNonmissingUnsafe(const uintptr_t* genoarr, uintptr_t loc) { */ uint32_t AdvBoundedTo1Bit(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil) { - // safe version. + // Can overread a single word if loc == ceil. const uintptr_t* bitarr_iter = &(bitarr[loc / kBitsPerWord]); uintptr_t ulii = (*bitarr_iter) >> (loc % kBitsPerWord); if (ulii) { @@ -406,7 +405,7 @@ uint32_t AdvBoundedTo1Bit(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil) } uintptr_t AdvBoundedTo0Bit(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil) { - assert(ceil >= 1); + // Can overread a single word if loc == ceil. const uintptr_t* bitarr_ptr = &(bitarr[loc / kBitsPerWord]); uintptr_t ulii = (~(*bitarr_ptr)) >> (loc % kBitsPerWord); if (ulii) { @@ -1832,6 +1831,27 @@ uintptr_t PopcountBytesMasked(const void* bitarr, const uintptr_t* mask_arr, uin #endif } +uintptr_t PopcountBitRange(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) { + uintptr_t start_idxl = start_idx / kBitsPerWord; + const uintptr_t start_idxlr = start_idx & (kBitsPerWord - 1); + const uintptr_t end_idxl = end_idx / kBitsPerWord; + const uintptr_t end_idxlr = end_idx & (kBitsPerWord - 1); + uintptr_t ct = 0; + if (start_idxl == end_idxl) { + return PopcountWord(bitvec[start_idxl] & ((k1LU << end_idxlr) - (k1LU << start_idxlr))); + } + if (start_idxlr) { + ct = PopcountWord(bitvec[start_idxl++] >> start_idxlr); + } + if (end_idxl > start_idxl) { + ct += PopcountWordsNzbase(bitvec, start_idxl, end_idxl); + } + if (end_idxlr) { + ct += PopcountWord(bzhi(bitvec[end_idxl], end_idxlr)); + } + return ct; +} + void FillCumulativePopcounts(const uintptr_t* subset_mask, uint32_t word_ct, uint32_t* cumulative_popcounts) { assert(word_ct); const uint32_t word_ct_m1 = word_ct - 1; @@ -2066,6 +2086,7 @@ void TransposeBitblock64(const uintptr_t* read_iter, uintptr_t read_ul_stride, u // buf0 and buf1 must both be 32KiB vector-aligned buffers when // kCacheline==64, and 128KiB when kCacheline==128. + // todo: better ARM implementation const uint32_t buf0_row_ct = DivUp(write_row_ct, 64); { uintptr_t* buf0_ul = DowncastVecWToW(buf0); @@ -2916,7 +2937,8 @@ void Reduce8to4bitInplaceUnsafe(uintptr_t entry_ct, uintptr_t* mainvec) { vmainvec[write_vidx] = vecw_gather_even(v0, v1, m8); } uintptr_t write_idx = fullvec_ct * kWordsPerVec; - if (write_idx == entry_ct * 2) { + // bugfix (9 Jun 2025): mixed up units in this comparison + if (write_idx * kBitsPerWordD4 == entry_ct) { return; } #else @@ -2940,6 +2962,100 @@ void Reduce8to4bitInplaceUnsafe(uintptr_t entry_ct, uintptr_t* mainvec) { mainvec[write_idx] = bzhi_max(write_word, remaining_entry_ct * 4); } +// advances forward_ct set bits; forward_ct must be positive. (stays put if +// forward_ct == 1 and current bit is set. may want to tweak this interface, +// easy to introduce off-by-one bugs...) +// In usual 64-bit case, also assumes bitvec is 16-byte aligned and the end of +// the trailing 16-byte block can be safely read from. +uintptr_t FindNth1BitFrom(const uintptr_t* bitvec, uintptr_t cur_pos, uintptr_t forward_ct) { + assert(forward_ct); + uintptr_t widx = cur_pos / kBitsPerWord; + uintptr_t ulii = cur_pos % kBitsPerWord; + const uintptr_t* bptr = &(bitvec[widx]); + uintptr_t uljj; + uintptr_t ulkk; +#ifdef __LP64__ + const VecW* vptr; + assert(IsVecAligned(bitvec)); +#endif + if (ulii) { + uljj = (*bptr) >> ulii; + ulkk = PopcountWord(uljj); + if (ulkk >= forward_ct) { + FindNth1BitFrom_finish: + return widx * kBitsPerWord + ulii + WordBitIdxToUidx(uljj, forward_ct - 1); + } + forward_ct -= ulkk; + ++widx; + ++bptr; + } + ulii = 0; +#ifdef __LP64__ + while (widx & (kWordsPerVec - k1LU)) { + uljj = *bptr; + ulkk = PopcountWord(uljj); + if (ulkk >= forward_ct) { + goto FindNth1BitFrom_finish; + } + forward_ct -= ulkk; + ++widx; + ++bptr; + } + vptr = R_CAST(const VecW*, bptr); +#ifdef USE_AVX2 + while (forward_ct > kBitsPerWord * (16 * kWordsPerVec)) { + uljj = (forward_ct - 1) / (kBitsPerWord * kWordsPerVec); + ulkk = PopcountVecsAvx2(vptr, uljj); + vptr = &(vptr[uljj]); + forward_ct -= ulkk; + } +#else + while (forward_ct > kBitsPerWord * (3 * kWordsPerVec)) { + uljj = ((forward_ct - 1) / (kBitsPerWord * (3 * kWordsPerVec))) * 3; + // yeah, yeah, this is suboptimal if we have SSE4.2 + ulkk = PopcountVecsNoAvx2(vptr, uljj); + vptr = &(vptr[uljj]); + forward_ct -= ulkk; + } +#endif + bptr = R_CAST(const uintptr_t*, vptr); + while (forward_ct > kBitsPerWord) { + forward_ct -= PopcountWord(*bptr++); + } +#else + while (forward_ct > kBitsPerWord) { + uljj = (forward_ct - 1) / kBitsPerWord; + ulkk = PopcountWords(bptr, uljj); + bptr = &(bptr[uljj]); + forward_ct -= ulkk; + } +#endif + for (; ; ++bptr) { + uljj = *bptr; + ulkk = PopcountWord(uljj); + if (ulkk >= forward_ct) { + widx = bptr - bitvec; + goto FindNth1BitFrom_finish; + } + forward_ct -= ulkk; + } +} + +void FillU32SubsetStarts(const uintptr_t* subset, uint32_t thread_ct, uint32_t start, uint64_t bit_ct, uint32_t* starts) { + assert(bit_ct); + uint32_t uidx = AdvTo1Bit(subset, start); + uint32_t prev_bit_idx = 0; + starts[0] = uidx; + for (uint32_t tidx = 1; tidx != thread_ct; ++tidx) { + const uint32_t bit_idx = (tidx * bit_ct) / thread_ct; + if (bit_idx != prev_bit_idx) { + uidx = FindNth1BitFrom(subset, uidx + 1, bit_idx - prev_bit_idx); + prev_bit_idx = bit_idx; + } + starts[tidx] = uidx; + } +} + #ifdef __cplusplus } // namespace plink2 #endif diff --git a/external_libs/pgenlib/include/plink2_bits.h b/external_libs/pgenlib/include/plink2_bits.h index c86d2ee6..3026bfc7 100644 --- a/external_libs/pgenlib/include/plink2_bits.h +++ b/external_libs/pgenlib/include/plink2_bits.h @@ -1,7 +1,7 @@ #ifndef __PLINK2_BITS_H__ #define __PLINK2_BITS_H__ -// This library is part of PLINK 2.0, copyright (C) 2005-2024 Shaun Purcell, +// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell, // Christopher Chang. // // This library is free software: you can redistribute it and/or modify it @@ -20,6 +20,10 @@ // Bitarray support. (Inline single-word operations are in plink2_base.h.) +#include +#include +#include + #include "plink2_base.h" #ifdef __cplusplus @@ -106,8 +110,10 @@ uintptr_t AdvTo0Bit(const uintptr_t* bitarr, uintptr_t loc); // uintptr_t NextNonmissingUnsafe(const uintptr_t* genoarr, uintptr_t loc); +// Can overread a single word if loc == ceil. uint32_t AdvBoundedTo1Bit(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil); +// Can overread a single word if loc == ceil. uintptr_t AdvBoundedTo0Bit(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil); uintptr_t FindLast1BitBefore(const uintptr_t* bitarr, uintptr_t loc); @@ -284,6 +290,35 @@ uintptr_t PopcountWordsXor(const uintptr_t* __restrict bitvec1_iter, const uintp // uintptr_t PopcountWordsIntersect3(const uintptr_t* __restrict bitvec1_iter, const uintptr_t* __restrict bitvec2_iter, const uintptr_t* __restrict bitvec3_iter, uintptr_t word_ct); +#ifdef __LP64__ +HEADER_INLINE uintptr_t PopcountWordsNzbase(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) { + uintptr_t prefix_ct = 0; +# ifdef USE_AVX2 + while (start_idx & 3) { + if (end_idx == start_idx) { + return prefix_ct; + } + prefix_ct += PopcountWord(bitvec[start_idx++]); + } +# else + if (start_idx & 1) { + if (end_idx == start_idx) { + return 0; + } + prefix_ct = PopcountWord(bitvec[start_idx++]); + } +# endif // USE_AVX2 + return prefix_ct + PopcountWords(&(bitvec[start_idx]), end_idx - start_idx); +} +#else +HEADER_INLINE uintptr_t PopcountWordsNzbase(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) { + return PopcountWords(&(bitvec[start_idx]), end_idx - start_idx); +} +#endif + +// start_idx == end_idx ok +uintptr_t PopcountBitRange(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx); + // requires positive word_ct // stay agnostic a bit longer re: word_ct := DIV_UP(entry_ct, kBitsPerWord) // vs. word_ct := 1 + (entry_ct / kBitsPerWord) @@ -636,6 +671,15 @@ HEADER_INLINE void AssignNybblearrEntry(uint32_t idx, uintptr_t newval, uintptr_ // positive. void Reduce8to4bitInplaceUnsafe(uintptr_t entry_ct, uintptr_t* mainvec); +// forward_ct must be positive. Stays put if forward_ct == 1 and current bit +// is set. +// In usual 64-bit case, also assumes bitvec is vector aligned. +uintptr_t FindNth1BitFrom(const uintptr_t* bitvec, uintptr_t cur_pos, uintptr_t forward_ct); + +// This function assumes (bit_ct * (thread_ct - 1)) < 2^64. +// bit_ct must be positive, but can be smaller than thread_ct +void FillU32SubsetStarts(const uintptr_t* subset, uint32_t thread_ct, uint32_t start, uint64_t bit_ct, uint32_t* starts); + #ifdef __cplusplus } // namespace plink2 #endif From 8d51dac7864542edfbfd2525ada0e4e4c20e9cd4 Mon Sep 17 00:00:00 2001 From: Dan Gealow Date: Fri, 7 Nov 2025 03:10:02 +0000 Subject: [PATCH 2/3] Implement --mask-format pgen --- src/Data.cpp | 15 ++-- src/Masks.cpp | 197 +++++++++++++++++++++++++++++++++++++++++-- src/Masks.hpp | 13 +++ src/Regenie.cpp | 27 +++++- src/Regenie.hpp | 2 +- src/Step2_Models.cpp | 8 +- 6 files changed, 246 insertions(+), 16 deletions(-) diff --git a/src/Data.cpp b/src/Data.cpp index d5e49d2f..25a69238 100755 --- a/src/Data.cpp +++ b/src/Data.cpp @@ -2040,11 +2040,16 @@ void Data::print_test_info(){ if(params.write_masks) { - bm.write_info(¶ms, &in_filters, sout); - sout << " * user specified to write masks (in PLINK bed format)\n"; - if(params.dosage_mode) - sout << " +dosages will be converted to hardcalls\n"; - if(params.write_setlist) + if(params.mask_format_pgen) { + bm.write_info_pgen(¶ms, &in_filters, sout); + sout << " * user specified to write masks (in PLINK2 pgen format with dosage support)\n"; + } else { + bm.write_info(¶ms, &in_filters, sout); + sout << " * user specified to write masks (in PLINK bed format)\n"; + if(params.dosage_mode) + sout << " +dosages will be converted to hardcalls\n"; + } + if(params.write_setlist) bm.prep_setlists(files.new_sets, files.out_file, sout); } diff --git a/src/Masks.cpp b/src/Masks.cpp index 632f8140..79ecb63e 100644 --- a/src/Masks.cpp +++ b/src/Masks.cpp @@ -58,6 +58,7 @@ void GenoMask::prep_run(struct param& params, struct in_files const& files){ vc_collapse_MAC = params.skat_collapse_MAC; w_vc_cust_weights = params.vc_with_weights; write_masks = params.write_masks; + mask_format_pgen = params.mask_format_pgen; write_snplist = params.write_mask_snplist; force_singleton = params.aaf_file_wSingletons; verbose = params.verbose || params.debug; @@ -701,8 +702,13 @@ void GenoMask::computeMasks(struct param* params, struct filter* filters, const tmpsnp.allele2 = buffer.str(); if(write_masks && in_bed(index_start)) { - write_genovec(index_start); - write_genobim(tmpsnp); + if(mask_format_pgen) { + write_pgen_variant(index_start); + write_pgen_pvar(tmpsnp); + } else { + write_genovec(index_start); + write_genobim(tmpsnp); + } if(write_setlist) append_setlist(index_start, tmpsnp.ID); } @@ -1015,7 +1021,12 @@ void GenoMask::buildMask(int const& isnp, int const& chrom, uint32_t const& phys maskvec(index++) = ds; } //cerr << maskvec.matrix().transpose().array().head(5) << endl << endl << maskvec.mean()<nChrom) mac = total; // use MAC assuming diploid coding @@ -1167,6 +1178,72 @@ void GenoMask::write_famfile(struct param* params, struct filter const* filters, params->FIDvec.clear(); } +void GenoMask::write_info_pgen(struct param* params, struct filter const* filters, mstream& sout){ + using namespace plink2; + + uint32_t sample_ct = filters->ind_in_analysis.count(); + + // Write .psam file (PLINK2 sample file with header) + string psam_fname = gfile_prefix + ".psam"; + Files psam_out; + psam_out.openForWrite(psam_fname, sout); + psam_out << "#FID\tIID\tSEX\n"; + for (int i = 0; i < filters->ind_in_analysis.size(); i++) { + if( filters->ind_in_analysis(i) ){ + psam_out << + params->FIDvec[i][0] << "\t" << + params->FIDvec[i][1] << "\t" << + params->sex(i) << "\n"; + } + } + psam_out.closeFile(); + + // Initialize pgen writer + PreinitSpgw(&spgw); + uintptr_t alloc_cacheline_ct; + uint32_t max_vrec_len; + + PglErr reterr = SpgwInitPhase1( + (gfile_prefix + ".pgen").c_str(), + nullptr, // allele_idx_offsets (biallelic) + nullptr, // explicit_nonref_flags + nmasks_total, // variant_ct_limit + sample_ct, + 2, // allele_ct_upper_bound (biallelic) + kPgenWriteBackwardSeek, // write mode + kfPgenGlobalDosagePresent, // phase_dosage_gflags + 0, // nonref_flags_storage + &spgw, + &alloc_cacheline_ct, + &max_vrec_len + ); + + if(reterr != kPglRetSuccess) { + throw "failed to initialize pgen writer"; + } + + // Allocate memory + spgw_alloc = (unsigned char*) malloc(alloc_cacheline_ct * kCacheline); + SpgwInitPhase2(max_vrec_len, &spgw, spgw_alloc); + + // Initialize dosage storage + dosage_vals.resize(nmasks_total); + dosage_present.resize(nmasks_total); + dosage_counts.resize(nmasks_total); + + size_t dosage_present_size = (sample_ct + kBitsPerWord - 1) / kBitsPerWord; + for(int i = 0; i < nmasks_total; i++) { + dosage_vals[i].resize(sample_ct); + dosage_present[i].resize(dosage_present_size); + std::fill(dosage_present[i].begin(), dosage_present[i].end(), 0); + } + + // Open .pvar file + string fname = gfile_prefix + ".pvar"; + openStream(&outfile_bim, fname, std::ios::out, sout); + outfile_bim << "#CHROM\tPOS\tID\tREF\tALT\n"; +} + void GenoMask::reset_gvec(){ gvec.resize(nmasks_total); for(int i = 0; i < nmasks_total; i++) @@ -1177,6 +1254,7 @@ void GenoMask::reset_gvec(){ void GenoMask::make_genovec(int const& isnp, Ref mask, struct filter const* filters){ int byte, bit_start, hc; + static bool warned_clamping = false; setAllBitsOne(isnp); for(int i = 0, index = 0; i < mask.size(); i++){ @@ -1184,7 +1262,16 @@ void GenoMask::make_genovec(int const& isnp, Ref mask, struct fil if( !filters->ind_in_analysis(i) ) continue; // round to nearest int - hc = (int) (mask(i) + 0.5); + hc = (int) (mask(i) + 0.5); + + // Warn and clamp if value > 2 (bed format limitation) + if(hc > 2) { + if(!warned_clamping) { + cerr << "WARNING: Mask values > 2 detected (weighted masks with weights > 1). Values will be clamped to 2.\n"; + warned_clamping = true; + } + hc = 2; + } // using 'ref-last': // 00 -> hom. alt @@ -1226,6 +1313,94 @@ void GenoMask::write_genovec(int const& isnp){ } +void GenoMask::make_pgen_dosage(int const& isnp, Ref mask, struct filter const* filters){ + using namespace plink2; + + uint32_t dosage_ct = 0; + static bool warned_clamping = false; + + for(int i = 0, index = 0; i < mask.size(); i++){ + if( !filters->ind_in_analysis(i) ) continue; + + double dosage_dbl = mask(i); + + // Clamp to [0, 2] range (pgen format limitation) + if(dosage_dbl < 0 || dosage_dbl > 2) { + if(!warned_clamping) { + cerr << "WARNING: Mask values outside [0,2] range detected (weighted masks with weights > 1). Values will be clamped to [0,2].\n"; + warned_clamping = true; + } + if(dosage_dbl < 0) dosage_dbl = 0; + if(dosage_dbl > 2) dosage_dbl = 2; + } + + // Convert double to uint16_t dosage (0-32768 range, 16384 = 1.0) + uint16_t dosage_val = (uint16_t)(dosage_dbl * 16384.0 + 0.5); + dosage_vals[isnp][index] = dosage_val; + + // Set dosage_present bit if non-zero + if(dosage_val > 0) { + size_t word_idx = index / kBitsPerWord; + size_t bit_idx = index % kBitsPerWord; + dosage_present[isnp][word_idx] |= (1ULL << bit_idx); + dosage_ct++; + } + + index++; + } + + dosage_counts[isnp] = dosage_ct; +} + +void GenoMask::write_pgen_variant(int const& isnp){ + using namespace plink2; + + // Create hardcall genovec (all missing for now - can optimize later) + uint32_t sample_ct = SpgwGetSampleCt(&spgw); + size_t genovec_size = NypCtToVecCt(sample_ct) * kBytesPerVec; + uintptr_t* genovec = (uintptr_t*) calloc(genovec_size, 1); // Initialize to zero + + // Set all sample bits to missing (0b11), but leave trailing bits zero + // Each sample is 2 bits (nybble), so sample_ct samples = sample_ct nybbles + uint32_t full_word_ct = sample_ct / kBitsPerWordD2; + uint32_t remainder_nyp = sample_ct % kBitsPerWordD2; + + // Set full words to all missing (all bits = 1, so 0b11 for each sample) + for(uint32_t i = 0; i < full_word_ct; i++) { + genovec[i] = ~k0LU; // All bits set + } + + // Set remainder samples to missing, but clear trailing bits beyond sample_ct + if(remainder_nyp > 0) { + // Create mask: lower (remainder_nyp * 2) bits set to 1 (0b11 for each sample), rest zero + // This ensures bits beyond sample_ct are zero (required by pgenlib assertion) + uintptr_t mask = (~k0LU) >> (kBitsPerWord - (2 * remainder_nyp)); + genovec[full_word_ct] = mask; + } + + PglErr reterr = SpgwAppendBiallelicGenovecDosage16( + genovec, + dosage_present[isnp].data(), + dosage_vals[isnp].data(), + dosage_counts[isnp], + &spgw + ); + + free(genovec); + + if(reterr != kPglRetSuccess) { + throw "failed to write pgen variant"; + } +} + +void GenoMask::write_pgen_pvar(struct snp const& tsnp){ + // Write variant info to pvar file + // Format: #CHROM POS ID REF ALT + outfile_bim << tsnp.chrom << "\t" << tsnp.physpos << "\t" + << tsnp.ID << "\t" << tsnp.allele1 << "\t" + << tsnp.allele2 << endl; +} + // get list of indices for each mask (across all AAF bins) void GenoMask::build_map(map>& mask_map){ @@ -1367,8 +1542,20 @@ void GenoMask::make_setlist(string const& sname, int const& chr, uint32_t const& } void GenoMask::closeFiles(){ + using namespace plink2; + outfile_bim.close(); - outfile_bed.close(); + if(mask_format_pgen) { + PglErr reterr = kPglRetSuccess; + SpgwFinish(&spgw); + CleanupSpgw(&spgw, &reterr); + if(spgw_alloc) { + free(spgw_alloc); + spgw_alloc = nullptr; + } + } else { + outfile_bed.close(); + } if(write_setlist){ for(size_t i = 0; i < setfiles.size(); i++) setfiles[i]->closeFile(); diff --git a/src/Masks.hpp b/src/Masks.hpp index a11bc59a..378fcb4d 100644 --- a/src/Masks.hpp +++ b/src/Masks.hpp @@ -27,6 +27,7 @@ #ifndef MASK_H #define MASK_H +#include "../external_libs/pgenlib/include/pgenlib_write.h" class GenoMask { @@ -46,6 +47,14 @@ class GenoMask { Files snplist_out; std::vector > list_snps;//contains snplist + // For pgen output + plink2::STPgenWriter spgw; + unsigned char* spgw_alloc; + bool mask_format_pgen = false; + std::vector> dosage_vals; // dosages in pgen format + std::vector> dosage_present; // bitarray for dosages + std::vector dosage_counts; // count per mask + double tol = 1e-6; double minAAF = 1e-7, default_aaf = .01; int n_aaf_bins, max_aaf_bins = 12, nmasks_total; @@ -93,6 +102,10 @@ class GenoMask { void write_genobim(struct snp const&); void setAllBitsZero(const int&); void setAllBitsOne(const int&); + void write_info_pgen(struct param*, struct filter const*, mstream&); + void make_pgen_dosage(const int&, Eigen::Ref, struct filter const*); + void write_pgen_variant(const int&); + void write_pgen_pvar(struct snp const&); std::string build_header(); void build_map(std::map>&); void prep_snplist(const std::string&,mstream& sout); diff --git a/src/Regenie.cpp b/src/Regenie.cpp index ff2c0477..877ec346 100755 --- a/src/Regenie.cpp +++ b/src/Regenie.cpp @@ -243,7 +243,8 @@ void read_params_and_check(int& argc, char *argv[], struct param* params, struct ("weights-col", "column index (1-based) for user-defined weights in annotation file", cxxopts::value(params->vc_weight_col)) ("joint", "comma spearated list of joint tests to perform", cxxopts::value(params->burden),"STRING") ("singleton-carrier", "define singletons as variants with a single carrier in the sample") - ("write-mask", "write masks in PLINK bed/bim/fam format") + ("write-mask", "write masks to file") + ("mask-format", "format for writing masks: 'bed' (PLINK bed/bim/fam, default) or 'pgen' (PLINK2 pgen/pvar/psam with dosage support)", cxxopts::value(), "STRING") ("mask-lovo", "apply Leave-One-Variant-Out (LOVO) scheme when building masks (,,)", cxxopts::value(),"STRING") ("mask-lodo", "apply Leave-One-Domain-Out (LODO) scheme when building masks (,,)", cxxopts::value(),"STRING") ("skip-test", "skip computing association tests after building masks") @@ -500,6 +501,16 @@ void read_params_and_check(int& argc, char *argv[], struct param* params, struct if( vm.count("mask-lovo") ) params->mask_loo = true; if( vm.count("mask-lodo") ) params->mask_lodo = true; if( vm.count("write-mask") ) params->write_masks = true; + if( vm.count("mask-format") ) { + std::string format = vm["mask-format"].as(); + if(format == "pgen") { + params->mask_format_pgen = true; + } else if(format == "bed") { + params->mask_format_pgen = false; + } else { + throw "invalid value for --mask-format: must be 'bed' or 'pgen'"; + } + } if( vm.count("write-setlist") ) params->write_setlist = true; if( vm.count("write-mask-snplist") ) params->write_mask_snplist = true; if( vm.count("skip-test") ) params->skip_test = true; @@ -971,7 +982,10 @@ void read_params_and_check(int& argc, char *argv[], struct param* params, struct } if(params->write_masks){ sout << "WARNING: cannot use --write-mask with --mask-lovo.\n"; - params->write_masks = false; valid_args[ "write-mask" ] = false; + params->write_masks = false; + params->mask_format_pgen = false; + valid_args[ "write-mask" ] = false; + valid_args[ "mask-format" ] = false; } if(params->joint_test) throw "cannot use --joint with --mask-lovo"; @@ -996,7 +1010,10 @@ void read_params_and_check(int& argc, char *argv[], struct param* params, struct } if(params->write_masks){ sout << "WARNING: cannot use --write-mask with --mask-lodo.\n"; - params->write_masks = false; valid_args[ "write-mask" ] = false; + params->write_masks = false; + params->mask_format_pgen = false; + valid_args[ "write-mask" ] = false; + valid_args[ "mask-format" ] = false; } valid_args[ "vc-maxAAF" ] = valid_args[ "aaf-bins" ] = false; } @@ -1014,6 +1031,10 @@ void read_params_and_check(int& argc, char *argv[], struct param* params, struct if(params->build_mask) throw "option --force-mac-filter cannot be used when building masks"; } else valid_args[ "force-mac-filter" ] = false; + if(vm.count("mask-format") && !params->write_masks){ + sout << "ERROR: --mask-format can only be used with --write-mask.\n"; + exit(EXIT_FAILURE); + } if(!params->build_mask && params->write_masks) {params->write_masks = false; valid_args[ "write-mask" ] = false;} if(!params->build_mask && params->check_mask_files) {params->check_mask_files = false; valid_args[ "check-burden-files" ] = false;} if(!params->build_mask && params->strict_check_burden) {params->strict_check_burden = false; valid_args[ "strict-check-burden" ] = false;} diff --git a/src/Regenie.hpp b/src/Regenie.hpp index b0d0c393..c78f6ec2 100644 --- a/src/Regenie.hpp +++ b/src/Regenie.hpp @@ -383,7 +383,7 @@ struct param { bool aaf_file_wSingletons = false;//for choosing snps in singleton masks bool singleton_carriers = false; // carrier count used to define singletons uint64 max_bsize = 0; // number of SNPs per variant set - bool write_masks = false, write_setlist = false, write_mask_snplist = false; //write masks to bed file + bool write_masks = false, mask_format_pgen = false, write_setlist = false, write_mask_snplist = false; //write masks to bed/pgen file bool mask_loo = false, mask_lodo = false; bool use_max_bsize = false; // set bsize to max set size bool p_joint_only = false; diff --git a/src/Step2_Models.cpp b/src/Step2_Models.cpp index f64026ce..f8fc4047 100644 --- a/src/Step2_Models.cpp +++ b/src/Step2_Models.cpp @@ -2683,8 +2683,12 @@ std::string print_summary(Files* ofile, string const& out, std::vector Date: Fri, 7 Nov 2025 06:35:04 +0000 Subject: [PATCH 3/3] Fix bug and nix dead code --- src/Masks.cpp | 48 ++++++++++++++++-------------------------------- src/Masks.hpp | 2 +- 2 files changed, 17 insertions(+), 33 deletions(-) diff --git a/src/Masks.cpp b/src/Masks.cpp index 79ecb63e..2a4988bb 100644 --- a/src/Masks.cpp +++ b/src/Masks.cpp @@ -705,6 +705,7 @@ void GenoMask::computeMasks(struct param* params, struct filter* filters, const if(mask_format_pgen) { write_pgen_variant(index_start); write_pgen_pvar(tmpsnp); + pgen_variant_ct++; } else { write_genovec(index_start); write_genobim(tmpsnp); @@ -965,7 +966,7 @@ void GenoMask::buildMask(int const& isnp, int const& chrom, uint32_t const& phys if(take_comphet) maskvec = maskvec.min(2); // if dosages were given and writing to PLINK bed, convert dosages to hardcalls - if(params->dosage_mode && write_masks) maskvec = maskvec.round(); + if(params->dosage_mode && write_masks && !mask_format_pgen) maskvec = maskvec.round(); // get counts for (int i = 0, index = 0; i < filters->ind_ignore.size(); i++) { @@ -1254,7 +1255,6 @@ void GenoMask::reset_gvec(){ void GenoMask::make_genovec(int const& isnp, Ref mask, struct filter const* filters){ int byte, bit_start, hc; - static bool warned_clamping = false; setAllBitsOne(isnp); for(int i = 0, index = 0; i < mask.size(); i++){ @@ -1264,15 +1264,6 @@ void GenoMask::make_genovec(int const& isnp, Ref mask, struct fil // round to nearest int hc = (int) (mask(i) + 0.5); - // Warn and clamp if value > 2 (bed format limitation) - if(hc > 2) { - if(!warned_clamping) { - cerr << "WARNING: Mask values > 2 detected (weighted masks with weights > 1). Values will be clamped to 2.\n"; - warned_clamping = true; - } - hc = 2; - } - // using 'ref-last': // 00 -> hom. alt // 10 -> missing @@ -1317,34 +1308,24 @@ void GenoMask::make_pgen_dosage(int const& isnp, Ref mask, struct using namespace plink2; uint32_t dosage_ct = 0; - static bool warned_clamping = false; + // Build densely packed dosage array (only non-missing samples) for(int i = 0, index = 0; i < mask.size(); i++){ if( !filters->ind_in_analysis(i) ) continue; double dosage_dbl = mask(i); - // Clamp to [0, 2] range (pgen format limitation) - if(dosage_dbl < 0 || dosage_dbl > 2) { - if(!warned_clamping) { - cerr << "WARNING: Mask values outside [0,2] range detected (weighted masks with weights > 1). Values will be clamped to [0,2].\n"; - warned_clamping = true; - } - if(dosage_dbl < 0) dosage_dbl = 0; - if(dosage_dbl > 2) dosage_dbl = 2; - } - - // Convert double to uint16_t dosage (0-32768 range, 16384 = 1.0) + // Convert double to uint16_t dosage (0-32768 range, 16384 = 1.0, 32768 = 2.0) uint16_t dosage_val = (uint16_t)(dosage_dbl * 16384.0 + 0.5); - dosage_vals[isnp][index] = dosage_val; - - // Set dosage_present bit if non-zero - if(dosage_val > 0) { - size_t word_idx = index / kBitsPerWord; - size_t bit_idx = index % kBitsPerWord; - dosage_present[isnp][word_idx] |= (1ULL << bit_idx); - dosage_ct++; - } + + // Set dosage_present bit for all samples (including 0 dosages) + size_t word_idx = index / kBitsPerWord; + size_t bit_idx = index % kBitsPerWord; + dosage_present[isnp][word_idx] |= (1ULL << bit_idx); + + // Store in packed array at position dosage_ct + dosage_vals[isnp][dosage_ct] = dosage_val; + dosage_ct++; index++; } @@ -1546,6 +1527,9 @@ void GenoMask::closeFiles(){ outfile_bim.close(); if(mask_format_pgen) { + // Update variant count to actual number written (fixes assertion in SpgwFinish) + PgenWriterCommon* pwcp = &GET_PRIVATE(spgw, pwc); + pwcp->variant_ct_limit = pgen_variant_ct; PglErr reterr = kPglRetSuccess; SpgwFinish(&spgw); CleanupSpgw(&spgw, &reterr); diff --git a/src/Masks.hpp b/src/Masks.hpp index 378fcb4d..1430b44d 100644 --- a/src/Masks.hpp +++ b/src/Masks.hpp @@ -54,7 +54,7 @@ class GenoMask { std::vector> dosage_vals; // dosages in pgen format std::vector> dosage_present; // bitarray for dosages std::vector dosage_counts; // count per mask - + uint32_t pgen_variant_ct = 0; // actual number of variants written to pgen double tol = 1e-6; double minAAF = 1e-7, default_aaf = .01; int n_aaf_bins, max_aaf_bins = 12, nmasks_total;