diff --git a/external_libs/pgenlib/include/pgenlib_misc.cc b/external_libs/pgenlib/include/pgenlib_misc.cc
index 332ec399..1d3a75a4 100644
--- a/external_libs/pgenlib/include/pgenlib_misc.cc
+++ b/external_libs/pgenlib/include/pgenlib_misc.cc
@@ -1,4 +1,4 @@
-// This library is part of PLINK 2.0, copyright (C) 2005-2024 Shaun Purcell,
+// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
@@ -14,13 +14,20 @@
// You should have received a copy of the GNU Lesser General Public License
// along with this library. If not, see <http://www.gnu.org/licenses/>.
-
#include "pgenlib_misc.h"
+#include
+
#ifdef __cplusplus
namespace plink2 {
#endif
+#ifndef NDEBUG
+char* g_pgl_errbuf = nullptr;
+char* g_pgl_errbuf_write_iter = nullptr;
+char* g_pgl_errbuf_end = nullptr;
+#endif
+
#ifdef USE_AVX2
void CopyNyparrNonemptySubset(const uintptr_t* __restrict raw_nyparr, const uintptr_t* __restrict subset_mask, uint32_t raw_nyparr_entry_ct, uint32_t subset_entry_ct, uintptr_t* __restrict output_nyparr) {
if (subset_entry_ct == raw_nyparr_entry_ct) {
@@ -1244,6 +1251,7 @@ void TransposeNypblock64(const uintptr_t* read_iter, uint32_t read_ul_stride, ui
// and target_4567 to
// _ (4, 0) _ (5, 0) _ (6, 0) _ (7, 0) _ (4, 1) _ (5, 1) _ (6, 1) ...
// This is perfectly arranged for movemask.
+ // todo: better ARM implementation
VecW target_4567 = vecw_blendv(loader, vecw_srli(loader, 7), m8);
target_iter7[vidx] = vecw_movemask(target_4567);
target_4567 = vecw_slli(target_4567, 2);
@@ -3261,77 +3269,6 @@ void ClearGenoarrMissing1bit8Unsafe(const uintptr_t* __restrict genoarr, uint32_
}
}
-void ClearGenoarrMissing1bit16Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals) {
- const uint32_t orig_subset_size = *subset_sizep;
- Halfword* subset_alias = DowncastWToHW(subset);
- uint32_t read_idx = 0;
- // deliberate overflow
- for (uint32_t read_widx = UINT32_MAX; ; ) {
- uint32_t subset_bits;
- do {
- subset_bits = subset_alias[++read_widx];
- } while (!subset_bits);
- uintptr_t detect_11 = genoarr[read_widx];
- detect_11 = detect_11 & (detect_11 >> 1) & kMask5555;
- if (detect_11) {
- uint32_t detect_11_hw = PackWordToHalfword(detect_11);
- const uint32_t joint_u32 = subset_bits & detect_11_hw;
- if (joint_u32) {
- uintptr_t lowbit = joint_u32 & (-joint_u32);
- uint32_t write_idx = read_idx + PopcountWord(subset_bits & (lowbit - 1));
- read_idx = write_idx + 1;
- uint32_t subset_bits_write = subset_bits ^ lowbit;
- uint16_t* sparse_vals_u16 = S_CAST(uint16_t*, sparse_vals);
- subset_bits &= -(2 * lowbit);
- for (; read_idx != orig_subset_size; ++read_idx) {
-#ifdef USE_AVX2
- if (!subset_bits) {
- subset_alias[read_widx] = subset_bits_write;
- do {
- subset_bits = subset_alias[++read_widx];
- } while (!subset_bits);
- subset_bits_write = subset_bits;
- detect_11 = genoarr[read_widx];
- detect_11 = detect_11 & (detect_11 >> 1);
- detect_11_hw = PackWordToHalfwordMask5555(detect_11);
- }
- lowbit = subset_bits & (-subset_bits);
- subset_bits ^= lowbit;
- if (lowbit & detect_11_hw) {
- subset_bits_write ^= lowbit;
- continue;
- }
-#else
- if (!subset_bits) {
- subset_alias[read_widx] = subset_bits_write;
- do {
- subset_bits = subset_alias[++read_widx];
- } while (!subset_bits);
- subset_bits_write = subset_bits;
- detect_11 = genoarr[read_widx];
- detect_11 = detect_11 & (detect_11 >> 1);
- }
- lowbit = subset_bits & (-subset_bits);
- subset_bits ^= lowbit;
- if ((lowbit * lowbit) & detect_11) {
- subset_bits_write ^= lowbit;
- continue;
- }
-#endif
- sparse_vals_u16[write_idx++] = sparse_vals_u16[read_idx];
- }
- subset_alias[read_widx] = subset_bits_write;
- *subset_sizep = write_idx;
- return;
- }
- }
- read_idx += PopcountWord(subset_bits);
- if (read_idx == orig_subset_size) {
- return;
- }
- }
-}
-
double u127prod_diff_d(uint64_t plus_term0, uint64_t plus_term1, uint64_t minus_term0, uint64_t minus_term1) {
uint64_t plus_hi;
const uint64_t plus_lo = multiply64to128(plus_term0, plus_term1, &plus_hi);
diff --git a/external_libs/pgenlib/include/pgenlib_misc.h b/external_libs/pgenlib/include/pgenlib_misc.h
index 0f616d5c..d05a4248 100644
--- a/external_libs/pgenlib/include/pgenlib_misc.h
+++ b/external_libs/pgenlib/include/pgenlib_misc.h
@@ -1,7 +1,7 @@
#ifndef __PGENLIB_MISC_H__
#define __PGENLIB_MISC_H__
-// This library is part of PLINK 2.0, copyright (C) 2005-2024 Shaun Purcell,
+// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
@@ -74,17 +74,97 @@
// on the fly, since that tends to be faster than having to access twice as
// much memory.
+#include
+#ifndef NDEBUG
+# include <stdarg.h> // va_start()
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "plink2_base.h"
#include "plink2_bits.h"
// 10000 * major + 100 * minor + patch
// Exception to CONSTI32, since we want the preprocessor to have access to this
// value. Named with all caps as a consequence.
-#define PGENLIB_INTERNAL_VERNUM 2003
+#define PGENLIB_INTERNAL_VERNUM 2100
#ifdef __cplusplus
namespace plink2 {
#endif
+#ifdef NDEBUG
+HEADER_INLINE BoolErr PglInitLog(__attribute__((unused)) uintptr_t log_capacity) {
+ return 0;
+}
+
+HEADER_INLINE void PglLogprintf(__attribute__((unused)) const char* fmt, ...) {
+}
+
+HEADER_INLINE char* PglReturnLog() {
+ return nullptr;
+}
+
+HEADER_INLINE void PglResetLog() {
+}
+#else
+extern char* g_pgl_errbuf;
+extern char* g_pgl_errbuf_write_iter;
+extern char* g_pgl_errbuf_end;
+
+HEADER_INLINE BoolErr PglInitLog(uintptr_t log_capacity) {
+ if (g_pgl_errbuf) {
+ if (log_capacity <= S_CAST(uintptr_t, g_pgl_errbuf_end - g_pgl_errbuf)) {
+ // existing allocation is fine. no need to shrink
+ g_pgl_errbuf_write_iter = g_pgl_errbuf;
+ *g_pgl_errbuf_write_iter = '\0';
+ return 0;
+ }
+ free(g_pgl_errbuf);
+ }
+ // 128 extra bytes to make WordWrapMultiline() safe.
+ g_pgl_errbuf = S_CAST(char*, malloc(log_capacity + 128));
+ if (!g_pgl_errbuf) {
+ g_pgl_errbuf_write_iter = nullptr;
+ g_pgl_errbuf_end = nullptr;
+ // in the unlikely event this comes up, caller is free to ignore the error,
+ // logging just won't happen. though, if malloc is failing, something else
+ // is likely to fail...
+ return 1;
+ }
+ g_pgl_errbuf_write_iter = g_pgl_errbuf;
+ *g_pgl_errbuf_write_iter = '\0';
+ g_pgl_errbuf_end = &(g_pgl_errbuf[log_capacity]);
+ return 0;
+}
+
+HEADER_INLINE void PglLogprintf(const char* fmt, ...) {
+ // possible todo: log levels
+ if (g_pgl_errbuf_write_iter != nullptr) {
+ va_list args;
+ va_start(args, fmt);
+ const uintptr_t remaining_space = g_pgl_errbuf_end - g_pgl_errbuf_write_iter;
+ const uintptr_t intended_slen = vsnprintf(g_pgl_errbuf_write_iter, remaining_space, fmt, args);
+ if (intended_slen < remaining_space) {
+ g_pgl_errbuf_write_iter = &(g_pgl_errbuf_write_iter[intended_slen]);
+ } else {
+ g_pgl_errbuf_write_iter = g_pgl_errbuf_end;
+ }
+ }
+}
+
+HEADER_INLINE char* PglReturnLog() {
+ return g_pgl_errbuf;
+}
+
+HEADER_INLINE void PglResetLog() {
+ if (g_pgl_errbuf) {
+ g_pgl_errbuf_write_iter = g_pgl_errbuf;
+ g_pgl_errbuf_write_iter[0] = '\0';
+ }
+}
+#endif
+
// other configuration-ish values needed by plink2_common subset
typedef unsigned char AlleleCode;
typedef uint16_t DoubleAlleleCode;
@@ -574,8 +654,6 @@ HEADER_INLINE uint32_t GenoIter1x(const uintptr_t* __restrict genoarr, uintptr_t
// sparse_vals entries.
void ClearGenoarrMissing1bit8Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals);
-void ClearGenoarrMissing1bit16Unsafe(const uintptr_t* __restrict genoarr, uint32_t* subset_sizep, uintptr_t* __restrict subset, void* __restrict sparse_vals);
-
// See EasyasPi's answer to
// https://stackoverflow.com/questions/25095741/how-can-i-multiply-64-bit-operands-and-get-128-bit-result-portably
HEADER_INLINE uint64_t multiply64to128(uint64_t lhs, uint64_t rhs, uint64_t* high) {
diff --git a/external_libs/pgenlib/include/pgenlib_read.cc b/external_libs/pgenlib/include/pgenlib_read.cc
index 8c2f7a06..b139b746 100644
--- a/external_libs/pgenlib/include/pgenlib_read.cc
+++ b/external_libs/pgenlib/include/pgenlib_read.cc
@@ -1,4 +1,4 @@
-// This library is part of PLINK 2.0, copyright (C) 2005-2024 Shaun Purcell,
+// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
@@ -18,6 +18,11 @@
#include "pgenlib_read.h"
#include
+#include
+#include
+
+// Uncomment this during e.g. pgenlibr development to enable error-throwing.
+// #include
#ifdef __cplusplus
namespace plink2 {
@@ -621,15 +626,15 @@ uintptr_t CountPgrAllocCachelinesRequired(uint32_t raw_sample_ct, PgenGlobalFlag
const uint32_t ld_compression_present = (gflags / kfPgenGlobalLdCompressionPresent) & 1;
const uint32_t max_difflist_entry_ct_base = (raw_sample_ct / kPglMaxDifflistLenDivisor);
+ const uint32_t max_difflist_entry_ct = max_difflist_entry_ct_base * (1 + ld_compression_present);
if ((gflags & kfPgenGlobalDifflistOrLdPresent) || (max_allele_ct > 2)) {
// workspace_difflist_sample_ids
// bugfix: must add 1 since several routines add a terminator element
- cachelines_required += 1 + (max_difflist_entry_ct_base / kInt32PerCacheline);
+ cachelines_required += 1 + (max_difflist_entry_ct / kInt32PerCacheline);
}
if (gflags & kfPgenGlobalDifflistOrLdPresent) {
- // const uint32_t max_difflist_entry_ct = max_difflist_entry_ct_base * (1 + ld_compression_present);
// workspace_raregeno_vec
- cachelines_required += NypCtToCachelineCt(max_difflist_entry_ct_base);
+ cachelines_required += NypCtToCachelineCt(max_difflist_entry_ct);
// workspace_raregeno_tmp_loadbuf
cachelines_required += NypCtToCachelineCt(max_difflist_entry_ct_base);
@@ -1570,6 +1575,7 @@ PglErr PgfiInitPhase2Ex(PgenHeaderCtrl header_ctrl, uint32_t allele_cts_already_
for (; vrtypes_alias_iter != vrtypes_alias_end; ++vrtypes_alias_iter) {
const VecW cur_vvec = *vrtypes_alias_iter;
#ifdef __LP64__
+ // todo: better ARM implementation
const VecW cur_vvec_bit2 = vecw_slli(cur_vvec, 5);
const VecW cur_vvec_bit1 = vecw_slli(cur_vvec, 6);
// check if any vrtype has bit 1 set and bit 2 clear
@@ -1846,6 +1852,7 @@ uint32_t GetLdbaseVidx(const unsigned char* vrtypes, uint32_t cur_vidx) {
if (cur_vidx_orig_remainder) {
const VecW cur_vvec = vrtypes_valias[vidx_vec_idx];
// non-ld: ((bit 2) OR (NOT bit 1))
+ // todo: better ARM implementation
const VecW cur_vvec_bit2 = vecw_slli(cur_vvec, 5);
const VecW inv_cur_vvec_bit1 = ~vecw_slli(cur_vvec, 6);
v8ui = vecw_movemask(cur_vvec_bit2 | inv_cur_vvec_bit1);
@@ -2094,22 +2101,22 @@ PglErr PgrInit(const char* fname, uint32_t max_vrec_width, PgenFileInfo* pgfip,
const uint32_t bitvec_bytes_req = BitCtToCachelineCt(raw_sample_ct) * kCacheline;
const uint32_t ld_compression_present = (gflags / kfPgenGlobalLdCompressionPresent) & 1;
const uint32_t max_difflist_entry_ct_base = (raw_sample_ct / kPglMaxDifflistLenDivisor);
+ const uint32_t max_difflist_entry_ct = max_difflist_entry_ct_base * (1 + ld_compression_present);
const uint32_t max_allele_ct = pgrp->fi.max_allele_ct;
pgrp->workspace_difflist_sample_ids = nullptr;
if ((gflags & kfPgenGlobalDifflistOrLdPresent) || (max_allele_ct > 2)) {
- pgrp->workspace_difflist_sample_ids = S_CAST(uint32_t*, arena_alloc_raw_rd((max_difflist_entry_ct_base + 1) * sizeof(int32_t), &pgr_alloc_iter));
+ pgrp->workspace_difflist_sample_ids = S_CAST(uint32_t*, arena_alloc_raw_rd((max_difflist_entry_ct + 1) * sizeof(int32_t), &pgr_alloc_iter));
}
if (gflags & kfPgenGlobalDifflistOrLdPresent) {
- // const uint32_t max_difflist_entry_ct = max_difflist_entry_ct_base * (1 + ld_compression_present);
-
- const uintptr_t raregeno_bytes_req = NypCtToCachelineCt(max_difflist_entry_ct_base) * kCacheline;
- pgrp->workspace_raregeno_vec = S_CAST(uintptr_t*, arena_alloc_raw(raregeno_bytes_req, &pgr_alloc_iter));
- pgrp->workspace_raregeno_tmp_loadbuf = S_CAST(uintptr_t*, arena_alloc_raw(raregeno_bytes_req, &pgr_alloc_iter));
+ const uintptr_t raregeno_ldmerge_bytes_req = NypCtToCachelineCt(max_difflist_entry_ct) * kCacheline;
+ pgrp->workspace_raregeno_vec = S_CAST(uintptr_t*, arena_alloc_raw(raregeno_ldmerge_bytes_req, &pgr_alloc_iter));
+ const uintptr_t raregeno_no_ld_bytes_req = NypCtToCachelineCt(max_difflist_entry_ct_base) * kCacheline;
+ pgrp->workspace_raregeno_tmp_loadbuf = S_CAST(uintptr_t*, arena_alloc_raw(raregeno_no_ld_bytes_req, &pgr_alloc_iter));
if (ld_compression_present) {
pgrp->ldbase_genovec = S_CAST(uintptr_t*, arena_alloc_raw(genovec_bytes_req, &pgr_alloc_iter));
- pgrp->ldbase_raregeno = S_CAST(uintptr_t*, arena_alloc_raw(raregeno_bytes_req, &pgr_alloc_iter));
+ pgrp->ldbase_raregeno = S_CAST(uintptr_t*, arena_alloc_raw(raregeno_no_ld_bytes_req, &pgr_alloc_iter));
pgrp->ldbase_difflist_sample_ids = S_CAST(uint32_t*, arena_alloc_raw_rd((max_difflist_entry_ct_base + 1) * sizeof(int32_t), &pgr_alloc_iter));
}
@@ -2267,8 +2274,9 @@ PglErr ParseAndSaveDifflist(const unsigned char* fread_end, uint32_t raw_sample_
}
PglErr ParseAndSaveDifflistProperSubset(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t raw_sample_ct, const unsigned char** fread_pp, uintptr_t* __restrict raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr, uintptr_t* __restrict raregeno_workspace) {
- // Requires a PROPER subset. Might want to just merge this with
- // ParseAndSaveDifflist() and rename appropriately.
+ // Requires a PROPER subset, since it assumes sample_include is non-null.
+ // Might want to just merge this with ParseAndSaveDifflist() and rename
+ // appropriately.
// Trailing bits of raregeno are zeroed out.
uint32_t raw_difflist_len;
const unsigned char* group_info_iter;
@@ -2344,6 +2352,8 @@ PglErr ParseLdAndMergeDifflistSubset(const unsigned char* fread_end, const uintp
// Used when the ldbase variant was saved as a difflist, and it's useful to
// process the current variant as a difflist.
// * Assumes ldbase_difflist_sample_ids[ldbase_difflist_len]==sample_ct.
+ // * However, it is not necessary for merged_difflist_sample_ids to have
+ // space for an extra element.
// * Assumes sample_include == nullptr if no subsetting needed. (Otherwise,
// it'll still work, but performance will be worse.)
// Trailing bits of merged_raregeno may not be zeroed out.
@@ -3134,23 +3144,31 @@ PglErr LdLoadMinimalSubsetIfNecessary(const uintptr_t* __restrict sample_include
return kPglRetSuccess;
}
-PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
+PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_difflist_len, uint32_t vidx, PgenReaderMain* pgrp, const unsigned char** fread_pp, const unsigned char** fread_endp, uintptr_t* __restrict genovec, uint32_t* difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
assert(vidx < pgrp->fi.raw_variant_ct);
assert(sample_ct);
- assert(max_simple_difflist_len < sample_ct);
// Side effects:
// may use pgr.workspace_raregeno_tmp_loadbuf
// Trailing bits of genovec/main_raregeno may not be zeroed out.
+ // Always sets difflist_common_geno; may not set difflist_len.
const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
const uint32_t maintrack_vrtype = vrtype & 7;
const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+ assert(max_difflist_len < raw_sample_ct);
const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
// const uint32_t multiallelic_hc_present = fread_pp && VrtypeMultiallelic(vrtype);
if (VrtypeLdCompressed(maintrack_vrtype)) {
- // LD compression
-
- // note that this can currently load a difflist longer than
- // max_simple_difflist_len
+ // Variant of interest is LD-compressed.
+ // Before 13 Feb 2025, this branch ignored the max_difflist_len parameter,
+ // forcing callers to allocate main_raregeno and difflist_sample_ids up to
+ // 2 * (raw_sample_ct / kPglMaxDifflistLenDivisor) entries. Now we
+ // confirm that
+ // len(possibly-subsetted ldbase difflist) +
+ // len(raw differences-from-ldbase list)
+ // is no longer than max_difflist_len. This misses some cases where we can
+ // give a sparse return (since the raw differences-from-ldbase list can
+ // contain entries outside the current subset or overlap with the first
+ // list), but should be good enough in practice.
PglErr reterr = LdLoadMinimalSubsetIfNecessary(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp);
if (unlikely(reterr)) {
return reterr;
@@ -3162,19 +3180,25 @@ PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_incl
}
const uint32_t ld_invert = (maintrack_vrtype == 3);
if (pgrp->ldbase_stypes & kfPgrLdcacheDifflist) {
- const uint32_t ldbase_common_geno = pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3;
- // unnecessary for this to branch on LD difflist length, since that's
- // limited to 3/4 of the ldbase difflist length.
- *difflist_common_geno_ptr = ldbase_common_geno;
- reterr = ParseLdAndMergeDifflistSubset(fread_end, subsetting_required? sample_include : nullptr, sample_include_cumulative_popcounts, pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->ldbase_difflist_len, ldbase_common_geno, raw_sample_ct, sample_ct, &fread_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr, pgrp->workspace_raregeno_tmp_loadbuf);
- if (unlikely(reterr)) {
- return reterr;
+ const uint32_t cur_difflist_len = PeekVint31(fread_ptr, fread_end);
+ if (pgrp->ldbase_difflist_len + cur_difflist_len <= max_difflist_len) {
+ const uint32_t ldbase_common_geno = pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3;
+ *difflist_common_geno_ptr = ldbase_common_geno;
+ reterr = ParseLdAndMergeDifflistSubset(fread_end, subsetting_required? sample_include : nullptr, sample_include_cumulative_popcounts, pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, pgrp->ldbase_difflist_len, ldbase_common_geno, raw_sample_ct, sample_ct, &fread_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr, pgrp->workspace_raregeno_tmp_loadbuf);
+ if (unlikely(reterr)) {
+ return reterr;
+ }
+ if (ld_invert) {
+ *difflist_common_geno_ptr = (6 - ldbase_common_geno) & 3;
+ GenovecInvertUnsafe(*difflist_len_ptr, main_raregeno);
+ }
+ return kPglRetSuccess;
}
- if (ld_invert) {
- *difflist_common_geno_ptr = (6 - ldbase_common_geno) & 3;
- GenovecInvertUnsafe(*difflist_len_ptr, main_raregeno);
+ if (!(pgrp->ldbase_stypes & kfPgrLdcacheNyp)) {
+ const uint32_t ldbase_common_geno = pgrp->fi.vrtypes[pgrp->ldbase_vidx] & 3;
+ PgrDifflistToGenovecUnsafe(pgrp->ldbase_raregeno, pgrp->ldbase_difflist_sample_ids, ldbase_common_geno, sample_ct, pgrp->ldbase_difflist_len, pgrp->ldbase_genovec);
+ pgrp->ldbase_stypes |= kfPgrLdcacheNyp;
}
- return kPglRetSuccess;
}
if (pgrp->ldbase_stypes & kfPgrLdcacheNyp) {
CopyNyparr(pgrp->ldbase_genovec, sample_ct, genovec);
@@ -3209,7 +3233,7 @@ PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_incl
// no limit is slightly better than /16 but substantially worse than /32 on
// the large test dataset (/64 is slightly worse than /32)
// no limit is best on the small test dataset
- if (saved_difflist_len > max_simple_difflist_len) {
+ if (saved_difflist_len > max_difflist_len) {
*difflist_common_geno_ptr = UINT32_MAX;
PglErr reterr = ParseNonLdGenovecSubsetUnsafe(fread_end, sample_include, sample_include_cumulative_popcounts, sample_ct, vrtype, &fread_ptr, pgrp, genovec);
if (unlikely(reterr)) {
@@ -3257,14 +3281,14 @@ PglErr ReadDifflistOrGenovecSubsetUnsafe(const uintptr_t* __restrict sample_incl
return kPglRetSuccess;
}
-PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
+PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
if (!sample_ct) {
*difflist_common_geno_ptr = UINT32_MAX;
return kPglRetSuccess;
}
PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
assert(vidx < pgrp->fi.raw_variant_ct);
- return ReadDifflistOrGenovecSubsetUnsafe(sample_include, GetSicp(pssi), sample_ct, max_simple_difflist_len, vidx, pgrp, nullptr, nullptr, genovec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr);
+ return ReadDifflistOrGenovecSubsetUnsafe(sample_include, GetSicp(pssi), sample_ct, max_difflist_len, vidx, pgrp, nullptr, nullptr, genovec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr);
}
PglErr LdSubsetAdjustGenocounts(const unsigned char* fread_end, const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, const uintptr_t* __restrict ldbase_genovec, uint32_t raw_sample_ct, const unsigned char** fread_pp, STD_ARRAY_REF(uint32_t, 4) genocounts, uintptr_t* __restrict raregeno_workspace) {
@@ -3387,6 +3411,7 @@ PglErr SkipDeltalistIds(const unsigned char* fread_end, const unsigned char* gro
}
const VecW vv = vecw_loadu(fread_ptr);
fread_ptr = &(fread_ptr[kBytesPerVec]);
+ // todo: better ARM implementation
const uint32_t highbits = vecw_movemask(vv);
remaining_id_ct -= kBytesPerVec - PopcountVec8thUint(highbits);
}
@@ -5537,7 +5562,7 @@ PglErr IMPLPgrGetInv1(const uintptr_t* __restrict sample_include, const uint32_t
return reterr;
}
-PglErr IMPLPgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
+PglErr IMPLPgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
if (!sample_ct) {
*difflist_common_geno_ptr = UINT32_MAX;
return kPglRetSuccess;
@@ -5545,7 +5570,7 @@ PglErr IMPLPgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_includ
const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
const uint32_t multiallelic_hc_present = VrtypeMultiallelicHc(vrtype);
if ((!allele_idx) || ((allele_idx == 1) && (!multiallelic_hc_present))) {
- PglErr reterr = ReadDifflistOrGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, max_simple_difflist_len, vidx, pgrp, nullptr, nullptr, allele_invcountvec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr);
+ PglErr reterr = ReadDifflistOrGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, max_difflist_len, vidx, pgrp, nullptr, nullptr, allele_invcountvec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr);
if (unlikely(reterr)) {
return reterr;
}
@@ -7479,6 +7504,175 @@ PglErr IMPLPgrGetD(const uintptr_t* __restrict sample_include, const uint32_t* _
return ParseDosage16(fread_ptr, fread_end, sample_include, sample_ct, vidx, allele_ct, pgrp, dosage_ct_ptr, nullptr, nullptr, nullptr, dosage_present, dosage_main);
}
+static const uint16_t kGenoToDosage16[4] = {0, 16384, 32768, 65535};
+
+// These tables should be renamed in a way that distinguishes them from the
+// likes of kGenoToDosage16.
+const uint16_t kHcToDosage16[1024] = QUAD_TABLE256(0, 16384, 32768, 65535);
+
+PglErr IMPLPgrGetDMaybeSparse(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t max_sparse_dosage_ct, PgenReaderMain* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr, uint16_t* difflist_common_dosage_ptr, uint32_t* difflist_sample_ids) {
+ assert(vidx < pgrp->fi.raw_variant_ct);
+ *difflist_common_dosage_ptr = 1;
+ if (!sample_ct) {
+ return kPglRetSuccess;
+ }
+ const uint32_t vrtype = GetPgfiVrtype(&(pgrp->fi), vidx);
+ const uint32_t dosage_matters = dosage_present && VrtypeDosage(vrtype);
+ const uint32_t raw_sample_ct = pgrp->fi.raw_sample_ct;
+ const uint32_t subsetting_required = (sample_ct != raw_sample_ct);
+ if (dosage_matters && (((vrtype & 0x60) != 0x20) || (subsetting_required && VrtypeHphase(vrtype)))) {
+ // If dense dosages are present, we can immediately bail to PgrGetD().
+ //
+ // If a dosage-list is present, we punt the hardcall-phase + subsetting
+ // subcase for now. We can fill it in by calling
+ // ReadDifflistOrGenovecSubsetUnsafe() in no-subsetting mode (so we know
+ // the correct het_ct), calling SkipAux2(), and then subsetting. (Or
+ // ReadDifflistOrGenovecSubsetUnsafe() could optionally return raw_het_ct,
+ // but I expect that approach to degrade hotter-path performance by too
+ // much to be worth it.) Eventually, that code path will also need to
+ // support multiallelic variants.
+ return IMPLPgrGetD(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, genovec, dosage_present, dosage_main, dosage_ct_ptr);
+ }
+ *dosage_ct_ptr = 0;
+ uintptr_t* subsetted_raregeno = pgrp->workspace_raregeno_vec;
+ // If a dosage-list is present, we may need to merge a (sample ID, genotype)
+ // list returned by ReadDifflistOrGenovecSubsetUnsafe() with the (sample ID,
+ // dosage) list. So we save the IDs in the first list to a workspace buffer.
+ // However, if dosages aren't present (or we're ignoring them), we can write
+ // the sample IDs directly to the final destination.
+ uint32_t* subsetted_raregeno_sample_ids = dosage_matters? pgrp->workspace_difflist_sample_ids : difflist_sample_ids;
+ const unsigned char* fread_ptr = nullptr;
+ const unsigned char* fread_end = nullptr;
+ uint32_t difflist_common_geno;
+ uint32_t subsetted_raregeno_ct;
+ PglErr reterr = ReadDifflistOrGenovecSubsetUnsafe(sample_include, sample_include_cumulative_popcounts, sample_ct, max_sparse_dosage_ct, vidx, pgrp, &fread_ptr, &fread_end, genovec, &difflist_common_geno, subsetted_raregeno, subsetted_raregeno_sample_ids, &subsetted_raregeno_ct);
+ if (unlikely(reterr)) {
+ return reterr;
+ }
+ if (!dosage_matters) {
+ *dosage_ct_ptr = subsetted_raregeno_ct;
+ if (difflist_common_geno != UINT32_MAX) {
+ *difflist_common_dosage_ptr = kGenoToDosage16[difflist_common_geno];
+ GenoarrLookup256x2bx4(subsetted_raregeno, kHcToDosage16, subsetted_raregeno_ct, dosage_main);
+ }
+ return kPglRetSuccess;
+ }
+ const uintptr_t* allele_idx_offsets = pgrp->fi.allele_idx_offsets;
+ const uint32_t allele_ct = allele_idx_offsets? (allele_idx_offsets[vidx + 1] - allele_idx_offsets[vidx]) : 2;
+ if (allele_ct != 2) {
+ // Don't need aux1-skipping logic yet.
+#ifndef PGENLIB_NOPRINT
+ fputs("multiallelic dosages not yet supported by PgrGetDMaybeSparse()\n", stderr);
+#endif
+ return kPglRetNotYetSupported;
+ }
+ if (VrtypeHphase(vrtype)) {
+ assert(!subsetting_required);
+ const uint32_t raw_het_ct = CountNyp(subsetted_raregeno, kMask5555, raw_sample_ct);
+ reterr = SkipAux2(fread_end, raw_het_ct, &fread_ptr, nullptr);
+ if (unlikely(reterr)) {
+ return reterr;
+ }
+ }
+ // Only invoke sparse-return logic iff a sufficiently-short dosage-list is
+ // stored.
+ const uint32_t peek_raw_dosage_ct = PeekVint31(fread_ptr, fread_end);
+ if (peek_raw_dosage_ct + subsetted_raregeno_ct > max_sparse_dosage_ct) {
+ if (difflist_common_geno != UINT32_MAX) {
+ PgrDifflistToGenovecUnsafe(subsetted_raregeno, subsetted_raregeno_sample_ids, difflist_common_geno, sample_ct, subsetted_raregeno_ct, genovec);
+ }
+ return ParseDosage16(fread_ptr, fread_end, sample_include, sample_ct, vidx, allele_ct, pgrp, dosage_ct_ptr, nullptr, nullptr, nullptr, dosage_present, dosage_main);
+ }
+ *difflist_common_dosage_ptr = kGenoToDosage16[difflist_common_geno];
+ if (!subsetting_required) {
+ sample_include = nullptr;
+ }
+ subsetted_raregeno_sample_ids[subsetted_raregeno_ct] = sample_ct;
+ // Rest of this function is similar to ParseLdAndMergeDifflistSubset().
+ uint32_t raw_dosage_ct;
+ const unsigned char* group_info_iter;
+ reterr = ParseDifflistHeader(fread_end, raw_sample_ct, &fread_ptr, nullptr, &group_info_iter, &raw_dosage_ct);
+ if (unlikely(reterr)) {
+ return reterr;
+ }
+ const unsigned char* deltalist_iter = fread_ptr;
+ const unsigned char* cur_raw_dosage_main_start = fread_ptr;
+ reterr = SkipDeltalistIds(fread_end, group_info_iter, raw_dosage_ct, raw_sample_ct, 0, &cur_raw_dosage_main_start);
+ if (unlikely(reterr)) {
+ return reterr;
+ }
+ // raw_dosage_ct == 0 is technically permitted, but shouldn't happen in
+ // practice. So we handle it correctly but don't give it a fast-path.
+ const int32_t subgroup_idx_last = ((raw_dosage_ct + kBitsPerWordD2 - 1) / kBitsPerWordD2) - 1;
+ const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(raw_sample_ct);
+ uint32_t difflist_write_idx = 0;
+ uintptr_t subsetted_raregeno_word = 0;
+ uint32_t subsetted_raregeno_sample_idx = subsetted_raregeno_sample_ids[0];
+ uintptr_t raw_sample_idx = 0;
+ uint32_t sample_idx = 0;
+ uint32_t subsetted_raregeno_difflist_idx = 0;
+ uint32_t done = 0;
+ uint32_t subgroup_len_m1 = kBitsPerWordD2 - 1;
+ for (uint32_t subgroup_idx = 0; ; ++subgroup_idx) {
+ uint32_t dosage_idx_lowbits = 0;
+ if (S_CAST(int32_t, subgroup_idx) >= subgroup_idx_last) {
+ if (S_CAST(int32_t, subgroup_idx) > subgroup_idx_last) {
+ done = 1;
+ sample_idx = sample_ct;
+ goto IMPLPgrGetDMaybeSparse_finish;
+ }
+ subgroup_len_m1 &= raw_dosage_ct - 1;
+ }
+ if (!(subgroup_idx % (kPglDifflistGroupSize / kBitsPerWordD2))) {
+ raw_sample_idx = SubU32Load(group_info_iter, sample_id_byte_ct);
+ group_info_iter = &(group_info_iter[sample_id_byte_ct]);
+ } else {
+ // May as well use cur_raw_dosage_main_start over fread_end here.
+ raw_sample_idx += GetVint31(cur_raw_dosage_main_start, &deltalist_iter);
+ }
+ for (; ; ++dosage_idx_lowbits) {
+ if (unlikely(raw_sample_idx >= raw_sample_ct)) {
+ return kPglRetMalformedInput;
+ }
+ if ((!sample_include) || IsSet(sample_include, raw_sample_idx)) {
+ sample_idx = sample_include? RawToSubsettedPos(sample_include, sample_include_cumulative_popcounts, raw_sample_idx) : raw_sample_idx;
+ IMPLPgrGetDMaybeSparse_finish:
+ while (subsetted_raregeno_sample_idx < sample_idx) {
+ if (!(subsetted_raregeno_difflist_idx % kBitsPerWordD2)) {
+ subsetted_raregeno_word = subsetted_raregeno[subsetted_raregeno_difflist_idx / kBitsPerWordD2];
+ }
+ difflist_sample_ids[difflist_write_idx] = subsetted_raregeno_sample_idx;
+ dosage_main[difflist_write_idx] = kGenoToDosage16[subsetted_raregeno_word & 3];
+ ++difflist_write_idx;
+ ++subsetted_raregeno_difflist_idx;
+ subsetted_raregeno_word >>= 2;
+ subsetted_raregeno_sample_idx = subsetted_raregeno_sample_ids[subsetted_raregeno_difflist_idx];
+ }
+ if (subsetted_raregeno_sample_idx == sample_idx) {
+ if (done) {
+ *dosage_ct_ptr = difflist_write_idx;
+ return kPglRetSuccess;
+ }
+ if (!(subsetted_raregeno_difflist_idx % kBitsPerWordD2)) {
+ subsetted_raregeno_word = subsetted_raregeno[subsetted_raregeno_difflist_idx / kBitsPerWordD2];
+ }
+ ++subsetted_raregeno_difflist_idx;
+ subsetted_raregeno_word >>= 2;
+ subsetted_raregeno_sample_idx = subsetted_raregeno_sample_ids[subsetted_raregeno_difflist_idx];
+ }
+ difflist_sample_ids[difflist_write_idx] = sample_idx;
+ CopyFromUnalignedOffsetU16(&(dosage_main[difflist_write_idx]), cur_raw_dosage_main_start, dosage_idx_lowbits);
+ ++difflist_write_idx;
+ }
+ if (dosage_idx_lowbits == subgroup_len_m1) {
+ break;
+ }
+ raw_sample_idx += GetVint31(cur_raw_dosage_main_start, &deltalist_iter);
+ }
+ cur_raw_dosage_main_start = &(cur_raw_dosage_main_start[kBitsPerWordD2 * sizeof(int16_t)]);
+ }
+}
+
PglErr PgrGet1D(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_countvec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr) {
PgenReaderMain* pgrp = GetPgrp(pgr_ptr);
const uint32_t* sample_include_cumulative_popcounts = GetSicp(pssi);
diff --git a/external_libs/pgenlib/include/pgenlib_read.h b/external_libs/pgenlib/include/pgenlib_read.h
index 734e81a5..1fcedbd5 100644
--- a/external_libs/pgenlib/include/pgenlib_read.h
+++ b/external_libs/pgenlib/include/pgenlib_read.h
@@ -1,7 +1,7 @@
#ifndef __PGENLIB_READ_H__
#define __PGENLIB_READ_H__
-// This library is part of PLINK 2.0, copyright (C) 2005-2024 Shaun Purcell,
+// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
@@ -20,7 +20,11 @@
// pgenlib_read contains reader-specific code.
+#include <stdio.h>
+
#include "pgenlib_misc.h"
+#include "plink2_base.h"
+#include "plink2_bits.h"
#ifdef __cplusplus
namespace plink2 {
@@ -149,10 +153,14 @@ typedef struct PgenReaderMainStruct {
// can also be used for other purposes after we're done processing aux2.
uintptr_t* workspace_vec;
- // currently must hold (raw_sample_ct / kPglMaxDifflistLenDivisor)
- // entries; may need to double the sizes later
- // some top-level interface functions use these, so several lower-level
- // functions cannot
+ // workspace_raregeno_vec must hold 2 * (raw_sample_ct /
+ // kPglMaxDifflistLenDivisor) entries, due to PgrGetDMaybeSparse().
+ // workspace_difflist_sample_ids requires space for one more entry than that,
+ // due to a potential terminator element.
+ //
+ // Several other top-level interface functions use
+ // workspace_difflist_sample_ids. Lower-level functions should never refer
+ // directly to either of these two buffers.
uintptr_t* workspace_raregeno_vec;
uint32_t* workspace_difflist_sample_ids;
@@ -556,17 +564,10 @@ PglErr PgrGet(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex p
// difflist_common_geno to the common genotype value in that case. Otherwise,
// genovec is populated and difflist_common_geno is set to UINT32_MAX.
//
-// max_simple_difflist_len must be smaller than sample_ct.
-//
-// Note that the returned difflist_len can be much larger than
-// max_simple_difflist_len when the variant is LD-encoded; it's bounded by
-// 2 * (raw_sample_ct / kPglMaxDifflistLenDivisor).
-// (probable todo: this interface has... rather sharp edges, even relative to
-// the rest of this low-level library. Maybe it shouldn't be deleted, but it
-// would be better if there was a function that took a max_difflist_len
-// parameter, and it was safe for difflist_sample_ids to only be allocated up
-// to that length.)
-PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr);
+// max_difflist_len must be smaller than raw_sample_ct. The returned difflist
+// will never be longer than this. (Note that this was not true before 13 Feb
+// 2025.)
+PglErr PgrGetDifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_difflist_len, uint32_t vidx, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr);
// genocounts[0] = # hom ref, [1] = # het ref, [2] = two alts, [3] = missing
PglErr PgrGetCounts(const uintptr_t* __restrict sample_include, const uintptr_t* __restrict sample_include_interleaved_vec, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, PgenReader* pgr_ptr, STD_ARRAY_REF(uint32_t, 4) genocounts);
@@ -599,12 +600,12 @@ HEADER_INLINE PglErr PgrGetInv1(const uintptr_t* __restrict sample_include, PgrS
return IMPLPgrGetInv1(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, allele_idx, pgrp, allele_invcountvec);
}
-PglErr IMPLPgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr);
+PglErr IMPLPgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t max_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReaderMain* pgrp, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr);
-HEADER_INLINE PglErr PgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_simple_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
+HEADER_INLINE PglErr PgrGetInv1DifflistOrGenovec(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t max_difflist_len, uint32_t vidx, uint32_t allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec, uint32_t* __restrict difflist_common_geno_ptr, uintptr_t* __restrict main_raregeno, uint32_t* __restrict difflist_sample_ids, uint32_t* __restrict difflist_len_ptr) {
PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts);
- return IMPLPgrGetInv1DifflistOrGenovec(sample_include, sample_include_cumulative_popcounts, sample_ct, max_simple_difflist_len, vidx, allele_idx, pgrp, allele_invcountvec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr);
+ return IMPLPgrGetInv1DifflistOrGenovec(sample_include, sample_include_cumulative_popcounts, sample_ct, max_difflist_len, vidx, allele_idx, pgrp, allele_invcountvec, difflist_common_geno_ptr, main_raregeno, difflist_sample_ids, difflist_len_ptr);
}
PglErr IMPLPgrGet2(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t allele_idx0, uint32_t allele_idx1, PgenReaderMain* pgrp, uintptr_t* __restrict genovec);
@@ -667,6 +668,30 @@ HEADER_INLINE PglErr PgrGetD(const uintptr_t* __restrict sample_include, PgrSamp
return IMPLPgrGetD(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, pgrp, genovec, dosage_present, dosage_main, dosage_ct_ptr);
}
+extern const uint16_t kHcToDosage16[1024];
+
+PglErr IMPLPgrGetDMaybeSparse(const uintptr_t* __restrict sample_include, const uint32_t* __restrict sample_include_cumulative_popcounts, uint32_t sample_ct, uint32_t vidx, uint32_t max_sparse_dosage_ct, PgenReaderMain* pgrp, uintptr_t* __restrict genovec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr, uint16_t* difflist_common_dosage_ptr, uint32_t* difflist_sample_ids);
+
+// If the following conditions hold:
+// 1. Hardcalls and dosages are stored sparsely.
+// 2. Either no subsetting is requested, or neither multiallelic-hardcall nor
+// hardcall-phase data is present. (This condition may be removed in the
+// future.)
+// 3. PgrGetDifflistOrGenovec(..., max_difflist_len=max_sparse_dosage_ct, ...)
+// returns a sparse list, and the length of that sparse list plus the length
+// of the explicit-dosage list <= max_sparse_dosage_ct.
+// Then difflist_common_dosage is set to the common value (either 0, 32768, or
+// 65535), and {difflist_sample_ids, dosage_main, dosage_ct} provide a sparse
+// representation of the other values.
+// Otherwise, difflist_common_dosage is set to 1, and return values are as with
+// PgrGetD().
+// max_sparse_dosage_ct must be less than raw_sample_ct.
+HEADER_INLINE PglErr PgrGetDMaybeSparse(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, uint32_t max_sparse_dosage_ct, PgenReader* pgr_ptr, uintptr_t* __restrict genovec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr, uint16_t* difflist_common_dosage_ptr, uint32_t* difflist_sample_ids) {
+ PgenReaderMain* pgrp = &GET_PRIVATE(*pgr_ptr, m);
+ const uint32_t* sample_include_cumulative_popcounts = GET_PRIVATE(pssi, cumulative_popcounts);
+ return IMPLPgrGetDMaybeSparse(sample_include, sample_include_cumulative_popcounts, sample_ct, vidx, max_sparse_dosage_ct, pgrp, genovec, dosage_present, dosage_main, dosage_ct_ptr, difflist_common_dosage_ptr, difflist_sample_ids);
+}
+
PglErr PgrGet1D(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_countvec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr);
PglErr PgrGetInv1D(const uintptr_t* __restrict sample_include, PgrSampleSubsetIndex pssi, uint32_t sample_ct, uint32_t vidx, AlleleCode allele_idx, PgenReader* pgr_ptr, uintptr_t* __restrict allele_invcountvec, uintptr_t* __restrict dosage_present, uint16_t* dosage_main, uint32_t* dosage_ct_ptr);
diff --git a/external_libs/pgenlib/include/pgenlib_write.cc b/external_libs/pgenlib/include/pgenlib_write.cc
new file mode 100644
index 00000000..db96d566
--- /dev/null
+++ b/external_libs/pgenlib/include/pgenlib_write.cc
@@ -0,0 +1,2624 @@
+// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation; either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library. If not, see <http://www.gnu.org/licenses/>.
+
+#include "pgenlib_write.h"
+
+#include <assert.h>
+#include <errno.h>
+#include <string.h>
+#include <unistd.h>  // unlink()
+
+#include "plink2_bits.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+static inline PgenWriterCommon* GetPwcp(STPgenWriter* spgwp) {
+ return &GET_PRIVATE(*spgwp, pwc);
+}
+
+static inline FILE** GetPgenOutfilep(STPgenWriter* spgwp) {
+ return &GET_PRIVATE(*spgwp, pgen_outfile);
+}
+
+static inline FILE** GetPgiOrFinalPgenOutfilep(STPgenWriter* spgwp) {
+ return &GET_PRIVATE(*spgwp, pgi_or_final_pgen_outfile);
+}
+
+static inline char** GetFnameBufp(STPgenWriter* spgwp) {
+ return &GET_PRIVATE(*spgwp, fname_buf);
+}
+
+void GenovecInvertCopyUnsafe(const uintptr_t* __restrict genovec, uint32_t sample_ct, uintptr_t* __restrict genovec_inverted_copy) {
+ // flips 0 to 2 and vice versa.
+ // "unsafe" because trailing bits are not zeroed out.
+ const uint32_t vec_ct = NypCtToVecCt(sample_ct);
+ assert(IsVecAligned(genovec));
+ const VecW not_m1 = VCONST_W(kMaskAAAA);
+ const VecW* vin_ptr = R_CAST(const VecW*, genovec);
+ VecW* vout_ptr = R_CAST(VecW*, genovec_inverted_copy);
+ for (uint32_t vidx = 0; vidx != vec_ct; ++vidx) {
+ const VecW cur_vec = vin_ptr[vidx];
+ // flip high bit iff low bit is unset
+ vout_ptr[vidx] = cur_vec ^ vecw_and_notfirst(vecw_slli(cur_vec, 1), not_m1);
+ }
+}
+
+void PreinitSpgw(STPgenWriter* spgwp) {
+ *GetPgenOutfilep(spgwp) = nullptr;
+ *GetPgiOrFinalPgenOutfilep(spgwp) = nullptr;
+ *GetFnameBufp(spgwp) = nullptr;
+}
+
+void PreinitMpgw(MTPgenWriter* mpgwp) {
+ mpgwp->pgen_outfile = nullptr;
+ mpgwp->pgi_or_final_pgen_outfile = nullptr;
+ mpgwp->fname_buf = nullptr;
+}
+
+// OpenFail and WriteFail errors can now refer to either the .pgen, .pgen.tmp,
+// or .pgen.pgi file.
+PglErr PwcInitPhase1(const char* __restrict fname, uintptr_t* explicit_nonref_flags, PgenExtensionLl* header_exts, PgenExtensionLl* footer_exts, uint32_t variant_ct_limit, uint32_t sample_ct, PgenWriteMode write_mode, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, uintptr_t vrec_len_byte_ct, PgenWriterCommon* pwcp, FILE** pgen_outfile_ptr, FILE** pgi_or_final_pgen_outfile_ptr, char** fname_buf_ptr) {
+ pwcp->explicit_nonref_flags = nullptr;
+ if (nonref_flags_storage == 3) {
+ if (unlikely(!explicit_nonref_flags)) {
+ return kPglRetImproperFunctionCall;
+ }
+ pwcp->explicit_nonref_flags = explicit_nonref_flags;
+ }
+ pwcp->header_exts = header_exts;
+ pwcp->footer_exts = footer_exts;
+ pwcp->variant_ct_limit = variant_ct_limit;
+ pwcp->sample_ct = sample_ct;
+ pwcp->phase_dosage_gflags = phase_dosage_gflags;
+ pwcp->nonref_flags_storage = nonref_flags_storage;
+ pwcp->vrec_len_byte_ct = vrec_len_byte_ct;
+#ifndef NDEBUG
+ pwcp->vblock_fpos = nullptr;
+ pwcp->vrec_len_buf = nullptr;
+ pwcp->vrtype_buf = nullptr;
+ pwcp->fwrite_buf = nullptr;
+ pwcp->fwrite_bufp = nullptr;
+ pwcp->genovec_hets_buf = nullptr;
+ pwcp->genovec_invert_buf = nullptr;
+ pwcp->ldbase_genovec = nullptr;
+ pwcp->ldbase_raregeno = nullptr;
+ pwcp->ldbase_difflist_sample_ids = nullptr;
+#endif
+ pwcp->vidx = 0;
+
+ *pgen_outfile_ptr = nullptr;
+ *pgi_or_final_pgen_outfile_ptr = nullptr;
+ *fname_buf_ptr = nullptr;
+ const int32_t ext_present = (header_exts != nullptr) || (footer_exts != nullptr);
+ const int32_t third_byte = ((write_mode == kPgenWriteSeparateIndex)? 0x20 : 0x10) + ext_present;
+ if (write_mode != kPgenWriteBackwardSeek) {
+ const uint32_t fname_slen = strlen(fname);
+ if (fname_slen > kPglFnamesize - 5) {
+ return kPglRetMalformedInput;
+ }
+ pwcp->vblock_fpos_offset = 3;
+ if (write_mode == kPgenWriteAndCopy) {
+ if (unlikely(pgl_malloc(fname_slen + 5, fname_buf_ptr))) {
+ return kPglRetNomem;
+ }
+ char* fname_iter = memcpya(*fname_buf_ptr, fname, fname_slen);
+ // In the kPgenWriteAndCopy case, pgen_outfile is the *temporary* .pgen
+ // output.
+ strcpy_k(fname_iter, ".tmp");
+ *pgen_outfile_ptr = fopen(*fname_buf_ptr, FOPEN_WB);
+ if (unlikely(!(*pgen_outfile_ptr))) {
+ return kPglRetOpenFail;
+ }
+ fwrite_unlocked("l\x1b", 2, 1, *pgen_outfile_ptr);
+ if (unlikely(putc_checked(third_byte + 0x10, *pgen_outfile_ptr))) {
+ return kPglRetWriteFail;
+ }
+ } else {
+ // kPgenWriteSeparateIndex
+ char fname_buf[kPglFnamesize];
+ char* fname_iter = memcpya(fname_buf, fname, fname_slen);
+ strcpy_k(fname_iter, ".pgi");
+ *pgi_or_final_pgen_outfile_ptr = fopen(fname_buf, FOPEN_WB);
+ if (unlikely(!(*pgi_or_final_pgen_outfile_ptr))) {
+ return kPglRetOpenFail;
+ }
+ fwrite_unlocked("l\x1b", 2, 1, *pgi_or_final_pgen_outfile_ptr);
+ if (unlikely(putc_checked(third_byte + 0x10, *pgi_or_final_pgen_outfile_ptr))) {
+ return kPglRetWriteFail;
+ }
+ }
+ }
+ FILE* header_ff = fopen(fname, FOPEN_WB);
+ if (unlikely(!header_ff)) {
+ return kPglRetOpenFail;
+ }
+ if (write_mode != kPgenWriteAndCopy) {
+ *pgen_outfile_ptr = header_ff;
+ } else {
+ *pgi_or_final_pgen_outfile_ptr = header_ff;
+ }
+ fwrite_unlocked("l\x1b", 2, 1, header_ff);
+ if (unlikely(putc_checked(third_byte, header_ff))) {
+ return kPglRetWriteFail;
+ }
+ if (write_mode != kPgenWriteBackwardSeek) {
+ return kPglRetSuccess;
+ }
+
+ uint64_t header_bytes_left = PglHeaderBaseEndOffset(variant_ct_limit, vrec_len_byte_ct, phase_dosage_gflags != kfPgenGlobal0, nonref_flags_storage == 3) - 3;
+ if (ext_present) {
+ // Additional header contents:
+ // 1. Flag varint describing which header extensions are present.
+ // 2. Flag varint describing which footer extensions are present.
+ // 3. If at least one footer extension present, uint64 byte offset of
+ // footer start.
+ // 4. For each present header extension, a varint indicating its byte
+ // length.
+ // 5. Bodies of header extensions, in order.
+
+ // Fixed minimum size of (1) + (2).
+ header_bytes_left += 2;
+ if (header_exts) {
+ PgenExtensionLl* header_exts_iter = header_exts;
+ int32_t prev_type_idx = -1;
+ do {
+ const int32_t cur_type_idx = header_exts_iter->type_idx;
+ if (cur_type_idx <= prev_type_idx) {
+ return kPglRetImproperFunctionCall;
+ }
+ const uint64_t cur_size = header_exts_iter->size;
+ if (unlikely(cur_size >= (1LLU << (kBitsPerWord - 1)))) {
+ return kPglRetImproperFunctionCall;
+ }
+ header_bytes_left += cur_size + VintBytect(cur_size);
+
+ prev_type_idx = cur_type_idx;
+ header_exts_iter = header_exts_iter->next;
+ } while (header_exts_iter);
+ header_bytes_left += S_CAST(uint32_t, prev_type_idx) / 7;
+ }
+ if (footer_exts) {
+ PgenExtensionLl* footer_exts_iter = footer_exts;
+ int32_t prev_type_idx = -1;
+ do {
+ const int32_t cur_type_idx = footer_exts_iter->type_idx;
+ if (cur_type_idx <= prev_type_idx) {
+ return kPglRetImproperFunctionCall;
+ }
+ prev_type_idx = cur_type_idx;
+ footer_exts_iter = footer_exts_iter->next;
+ } while (footer_exts_iter);
+ // 8 more fixed bytes for (3).
+ header_bytes_left += sizeof(int64_t) + S_CAST(uint32_t, prev_type_idx) / 7;
+ }
+ }
+
+ // this should be the position of the first variant
+ pwcp->vblock_fpos_offset = 3 + header_bytes_left;
+
+ uintptr_t zeroed_cachelines_needed = DivUp(header_bytes_left, kCacheline);
+ if (zeroed_cachelines_needed > (kPglFwriteBlockSize / kCacheline)) {
+ zeroed_cachelines_needed = kPglFwriteBlockSize / kCacheline;
+ }
+ // could wait until fwrite_buf is allocated, and make sure it's aligned?
+ unsigned char zerobuf[kPglFwriteBlockSize];
+ memset(zerobuf, 0, zeroed_cachelines_needed * kCacheline);
+ while (header_bytes_left > kPglFwriteBlockSize) {
+ fwrite_unlocked(zerobuf, kPglFwriteBlockSize, 1, header_ff);
+ header_bytes_left -= kPglFwriteBlockSize;
+ }
+ if (unlikely(fwrite_checked(zerobuf, header_bytes_left, header_ff))) {
+ return kPglRetWriteFail;
+ }
+ return kPglRetSuccess;
+}
+
+uintptr_t CountSpgwAllocCachelinesRequired(uint32_t variant_ct_limit, uint32_t sample_ct, PgenGlobalFlags phase_dosage_gflags, uint32_t max_vrec_len) {
+ // vblock_fpos
+ const uint32_t vblock_ct_limit = DivUp(variant_ct_limit, kPglVblockSize);
+ uintptr_t cachelines_required = Int64CtToCachelineCt(vblock_ct_limit);
+
+ // vrec_len_buf
+ const uintptr_t vrec_len_byte_ct = BytesToRepresentNzU32(max_vrec_len);
+ cachelines_required += DivUp(variant_ct_limit * vrec_len_byte_ct, kCacheline);
+
+ // vrtype_buf
+ if (phase_dosage_gflags) {
+ cachelines_required += DivUp(variant_ct_limit, kCacheline);
+ } else {
+ cachelines_required += DivUp(variant_ct_limit, kCacheline * 2);
+ }
+
+ // genovec_hets_buf, genovec_invert_buf, ldbase_genovec
+ cachelines_required += 3 * NypCtToCachelineCt(sample_ct);
+
+ const uint32_t max_difflist_len = 2 * (sample_ct / kPglMaxDifflistLenDivisor);
+ // ldbase_raregeno
+ cachelines_required += NypCtToCachelineCt(max_difflist_len);
+
+ // ldbase_difflist_sample_ids
+ cachelines_required += 1 + (max_difflist_len / kInt32PerCacheline);
+
+ // fwrite_buf
+ // + (5 + sizeof(AlleleCode)) * kPglDifflistGroupSize to avoid buffer
+ // overflow in middle of difflist writing
+ // TODO: document which code approaches this limit. sizeof(AlleleCode)
+ // implies it's one of the multiallelic subcases?
+ cachelines_required += DivUp(max_vrec_len + kPglFwriteBlockSize + (5 + sizeof(AlleleCode)) * kPglDifflistGroupSize, kCacheline);
+ // possible todo: dosage (doesn't currently need an allocation, but that's
+ // unlikely to remain true--e.g. get_ref_nonref_genotype_counts_and_dosages
+ // tends to use workspace_vec when a function it calls doesn't use it...)
+
+ // probable todo: multiallelic (phased) dosage
+ return cachelines_required;
+}
+
+static_assert(kPglMaxAlleleCt == 255, "Need to update SpgwInitPhase1Ex().");
+PglErr SpgwInitPhase1Ex(const char* __restrict fname, const uintptr_t* __restrict allele_idx_offsets, uintptr_t* __restrict explicit_nonref_flags, PgenExtensionLl* header_exts, PgenExtensionLl* footer_exts, uint32_t variant_ct_limit, uint32_t sample_ct, uint32_t allele_ct_upper_bound, PgenWriteMode write_mode, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, STPgenWriter* spgwp, uintptr_t* alloc_cacheline_ct_ptr, uint32_t* max_vrec_len_ptr) {
+ assert(variant_ct_limit);
+ assert(sample_ct);
+
+ // separate from MpgwInitPhase1's version of this computation since the
+ // latter wants a better bound on the compressed size of an entire vblock
+ // than max_vrec_len * kPglVblockSize...
+ uint64_t max_vrec_len = NypCtToByteCt(sample_ct);
+ uintptr_t max_alt_ct_p1 = 2;
+ if (allele_idx_offsets) {
+ const uint32_t variant_ct = variant_ct_limit;
+ if (allele_idx_offsets[variant_ct] != 2 * variant_ct) {
+ assert(allele_idx_offsets[0] == 0);
+ assert(allele_idx_offsets[variant_ct] > 2 * variant_ct);
+ max_alt_ct_p1 = 3;
+ uintptr_t prev_offset = 0;
+ for (uint32_t vidx = 1; vidx <= variant_ct; ++vidx) {
+ const uintptr_t cur_offset = allele_idx_offsets[vidx];
+ if (cur_offset - prev_offset > max_alt_ct_p1) {
+ max_alt_ct_p1 = cur_offset - prev_offset;
+ }
+ prev_offset = cur_offset;
+ }
+ }
+ } else if (allele_ct_upper_bound) {
+ max_alt_ct_p1 = allele_ct_upper_bound;
+ }
+ if (max_alt_ct_p1 > 2) {
+ // see comments in middle of MpgwInitPhase1()
+ // bugfix (29 Mar 2023): forgot (sample_ct + 6) / 8 term
+ max_vrec_len += 2 + sizeof(AlleleCode) + (sample_ct + 6) / 8 + GetAux1bAlleleEntryByteCt(max_alt_ct_p1, sample_ct - 1);
+ // try to permit uncompressed records to be larger than this, only error
+ // out when trying to write a larger compressed record.
+ }
+ if (phase_dosage_gflags & kfPgenGlobalHardcallPhasePresent) {
+ // phasepresent, phaseinfo
+ max_vrec_len += 2 * DivUp(sample_ct, CHAR_BIT);
+ }
+ if (phase_dosage_gflags & kfPgenGlobalDosagePresent) {
+ const uint32_t dphase_gflag = (phase_dosage_gflags / kfPgenGlobalDosagePhasePresent) & 1;
+ // aux3, aux7
+ max_vrec_len += (1 + dphase_gflag) * DivUp(sample_ct, 8);
+ // aux4, aux8
+ max_vrec_len += (2 + 2 * dphase_gflag) * S_CAST(uint64_t, sample_ct);
+ // todo: multiallelic dosage
+ }
+#ifdef __LP64__
+ if (max_vrec_len >= kPglMaxBytesPerVariant) {
+ max_vrec_len = kPglMaxBytesPerVariant;
+ }
+#else
+ if (unlikely(max_vrec_len > kMaxBytesPerIO + 1 - kPglFwriteBlockSize)) {
+ return kPglRetNomem;
+ }
+#endif
+ *max_vrec_len_ptr = max_vrec_len;
+ const uintptr_t vrec_len_byte_ct = BytesToRepresentNzU32(max_vrec_len);
+
+ PgenWriterCommon* pwcp = GetPwcp(spgwp);
+ FILE** pgen_outfilep = GetPgenOutfilep(spgwp);
+ FILE** pgi_or_final_pgen_outfilep = GetPgiOrFinalPgenOutfilep(spgwp);
+ char** fname_bufp = GetFnameBufp(spgwp);
+ PglErr reterr = PwcInitPhase1(fname, explicit_nonref_flags, header_exts, footer_exts, variant_ct_limit, sample_ct, write_mode, phase_dosage_gflags, nonref_flags_storage, vrec_len_byte_ct, pwcp, pgen_outfilep, pgi_or_final_pgen_outfilep, fname_bufp);
+ if (likely(!reterr)) {
+ *alloc_cacheline_ct_ptr = CountSpgwAllocCachelinesRequired(variant_ct_limit, sample_ct, phase_dosage_gflags, max_vrec_len);
+ }
+ return reterr;
+}
+
+static_assert(kPglMaxAlleleCt == 255, "Need to update MpgwInitPhase1().");
+void MpgwInitPhase1(const uintptr_t* __restrict allele_idx_offsets, uint32_t variant_ct, uint32_t sample_ct, PgenGlobalFlags phase_dosage_gflags, uintptr_t* alloc_base_cacheline_ct_ptr, uint64_t* alloc_per_thread_cacheline_ct_ptr, uint32_t* vrec_len_byte_ct_ptr, uint64_t* vblock_cacheline_ct_ptr) {
+ // variant_ct must be exact due to how MpgwFlush() works.
+ assert(variant_ct);
+ assert(sample_ct);
+ // vblock_fpos
+ const uint32_t vblock_ct = DivUp(variant_ct, kPglVblockSize);
+ uint32_t alloc_base_cacheline_ct = Int64CtToCachelineCt(vblock_ct);
+
+ // vrtype_buf
+ if (phase_dosage_gflags) {
+ alloc_base_cacheline_ct += DivUp(variant_ct, kCacheline);
+ } else {
+ alloc_base_cacheline_ct += DivUp(variant_ct, kCacheline * 2);
+ }
+
+ // pwcs
+ uint64_t alloc_per_thread_cacheline_ct = DivUp(sizeof(PgenWriterCommon), kCacheline);
+
+ // genovec_hets_buf, genovec_invert_buf, ldbase_genovec
+ // (could make genovec_hets_buf allocation conditional, but not worth the
+ // additional load this puts on import functions)
+ alloc_per_thread_cacheline_ct += 3 * NypCtToCachelineCt(sample_ct);
+
+ const uint32_t max_difflist_len = 2 * (sample_ct / kPglMaxDifflistLenDivisor);
+ // ldbase_raregeno
+ alloc_per_thread_cacheline_ct += NypCtToCachelineCt(max_difflist_len);
+
+ // ldbase_difflist_sample_ids
+ alloc_per_thread_cacheline_ct += 1 + (max_difflist_len / kInt32PerCacheline);
+
+ uint64_t max_vrec_len = NypCtToByteCt(sample_ct);
+ if (phase_dosage_gflags & kfPgenGlobalHardcallPhasePresent) {
+ max_vrec_len += 2 * DivUp(sample_ct, CHAR_BIT);
+ }
+ const uint32_t dosage_gflag = (phase_dosage_gflags / kfPgenGlobalDosagePresent) & 1;
+ const uint32_t dosage_phase_gflag = (phase_dosage_gflags / kfPgenGlobalDosagePhasePresent) & 1;
+ if (dosage_gflag) {
+ max_vrec_len += ((1 + dosage_phase_gflag) * DivUp(sample_ct, CHAR_BIT)) + (2 + 2 * dosage_phase_gflag) * S_CAST(uint64_t, sample_ct);
+ }
+ const uint32_t max_vblock_size = MINV(variant_ct, kPglVblockSize);
+ // max_vrec_len is a uint64_t
+ uint64_t max_vblock_byte_ct = max_vrec_len * max_vblock_size;
+ if (allele_idx_offsets && (allele_idx_offsets[variant_ct] != 2 * variant_ct)) {
+ assert(allele_idx_offsets[0] == 0);
+ assert(allele_idx_offsets[variant_ct] > 2 * variant_ct);
+ // When multiallelic variants are present, larger write buffers are needed.
+ // we compute the largest possible size here.
+ //
+ // One way to hit the maximum size is to have exactly 1 ref/altx genotype,
+ // all other genotypes are altx/alty, and all genotypes include a rarealt.
+ // Then, we need 2 + sizeof(AlleleCode) bytes for the format byte and
+ // aux1a, (sample_ct + 6) / 8 bytes for the bitarray at the front of aux1b,
+ // and
+ // alt ct aux1_last_quarter bytes required
+ // ------ --------------------------------
+ // 2 (sample_ct - 1 + 7) / 8
+ // 3-4 (sample_ct - 1 + 1) / 2
+ // 5-16 (sample_ct - 1)
+ // 17-256 2 * (sample_ct - 1)
+ //
+ // (todo: also take multiallelic dosage into account)
+ assert(!dosage_gflag);
+ uintptr_t prev_offset = 0;
+ uint32_t vidx = 0;
+ const uint32_t extra_bytes_base = 2 + sizeof(AlleleCode) + (sample_ct + 6) / 8;
+ const uint64_t extra_bytes_max = kPglMaxBytesPerVariant - max_vrec_len;
+ uint64_t extra_byte_cts[4];
+ uint32_t extra_alt_ceil = kPglMaxAlleleCt;
+
+ // alt_ct == 2
+ uint64_t cur_extra_byte_ct = extra_bytes_base + (sample_ct + 6) / 8;
+ extra_byte_cts[0] = cur_extra_byte_ct;
+ extra_byte_cts[1] = 0; // force initialization
+ extra_byte_cts[2] = 0;
+ extra_byte_cts[3] = 0;
+ if (cur_extra_byte_ct >= extra_bytes_max) {
+ extra_alt_ceil = 2;
+ } else {
+ // alt_ct in [3, 4]
+ cur_extra_byte_ct = extra_bytes_base + (sample_ct / 2);
+ extra_byte_cts[1] = cur_extra_byte_ct;
+ if (cur_extra_byte_ct >= extra_bytes_max) {
+ extra_alt_ceil = 3;
+ } else {
+ // alt_ct in [5, 16]
+ cur_extra_byte_ct = extra_bytes_base + sample_ct - 1;
+ extra_byte_cts[2] = cur_extra_byte_ct;
+ if (cur_extra_byte_ct >= extra_bytes_max) {
+ extra_alt_ceil = 5;
+ } else {
+ // alt_ct in [17, 256]
+ cur_extra_byte_ct = extra_bytes_base + 2 * (sample_ct - 1);
+ extra_byte_cts[3] = cur_extra_byte_ct;
+ if (cur_extra_byte_ct >= extra_bytes_max) {
+ extra_alt_ceil = 16;
+ }
+ }
+ }
+ }
+ uint32_t extra_alt_ceil_ct = 0;
+ const uint64_t uncompressed_biallelic_vrec_len = max_vrec_len;
+ uint32_t altx_seen_mask = 0;
+ uint32_t max_alt_ct_p1 = 3;
+ for (uint32_t vblock_end = vidx + kPglVblockSize; ; vblock_end += kPglVblockSize) {
+ if (vblock_end > variant_ct) {
+ if (vidx == variant_ct) {
+ break;
+ }
+ vblock_end = variant_ct;
+ }
+ uint64_t cur_vblock_byte_ct = uncompressed_biallelic_vrec_len * (vblock_end - vidx);
+ uint32_t altx_seen[4];
+ ZeroU32Arr(4, altx_seen);
+ for (; vidx < vblock_end; ) {
+ const uintptr_t cur_offset = allele_idx_offsets[++vidx];
+ const uint32_t alt_ct_p1 = cur_offset - prev_offset;
+ if (alt_ct_p1 > 2) {
+ if (alt_ct_p1 >= extra_alt_ceil) {
+ ++extra_alt_ceil_ct;
+ } else {
+ // don't need to track this when we hit the ceiling
+ if (alt_ct_p1 > max_alt_ct_p1) {
+ max_alt_ct_p1 = alt_ct_p1;
+ }
+
+ if (alt_ct_p1 < 6) {
+ altx_seen[(alt_ct_p1 != 3)] += 1;
+ } else {
+ altx_seen[2 + (alt_ct_p1 >= 17)] += 1;
+ }
+ }
+ }
+ prev_offset = cur_offset;
+ }
+ cur_vblock_byte_ct += extra_alt_ceil_ct * extra_bytes_max;
+ for (uint32_t uii = 0; uii != 4; ++uii) {
+ if (altx_seen[uii]) {
+ const uint32_t cur_seen_ct = altx_seen[uii];
+ altx_seen_mask |= 1 << uii;
+ cur_vblock_byte_ct += cur_seen_ct * extra_byte_cts[uii];
+ }
+ }
+ if (cur_vblock_byte_ct > max_vblock_byte_ct) {
+ max_vblock_byte_ct = cur_vblock_byte_ct;
+ }
+ }
+ if (extra_alt_ceil_ct) {
+ max_vrec_len = kPglMaxBytesPerVariant;
+ } else {
+ max_vrec_len = uncompressed_biallelic_vrec_len + extra_byte_cts[bsru32(altx_seen_mask)];
+ }
+ }
+ if (max_vrec_len >= kPglMaxBytesPerVariant) {
+ max_vrec_len = kPglMaxBytesPerVariant;
+ max_vblock_byte_ct = kPglMaxBytesPerVariant * S_CAST(uint64_t, max_vblock_size);
+ }
+
+ // vrec_len_buf
+ // previously used overlapping uint32_t writes-to-memory, but that was
+ // incompatible with multithreaded compression
+ *vrec_len_byte_ct_ptr = BytesToRepresentNzU32(max_vrec_len);
+ *alloc_base_cacheline_ct_ptr = alloc_base_cacheline_ct + DivUp(S_CAST(uintptr_t, variant_ct) * (*vrec_len_byte_ct_ptr), kCacheline);
+
+ // main write buffer
+ *vblock_cacheline_ct_ptr = DivUpU64(max_vblock_byte_ct, kCacheline);
+ *alloc_per_thread_cacheline_ct_ptr = alloc_per_thread_cacheline_ct + (*vblock_cacheline_ct_ptr);
+}
+
+
+// Phase-2 initialization for one or more PgenWriterCommon instances: carves
+// the caller-provided pwc_alloc arena into (i) per-file buffers shared by
+// all threads (vblock_fpos, vrec_len_buf, vrtype_buf) and (ii) per-thread
+// scratch buffers (genovec workspaces, LD-base difflist storage, and the
+// write buffer).  Assumes pwcs[0]'s phase-1 fields (variant_ct_limit,
+// sample_ct, phase_dosage_gflags, vrec_len_byte_ct) are already filled in,
+// and that pwc_alloc is large enough (size computed during phase 1).
+void PwcInitPhase2(uintptr_t fwrite_cacheline_ct, uint32_t thread_ct, PgenWriterCommon** pwcs, unsigned char* pwc_alloc) {
+  const uint32_t variant_ct_limit = pwcs[0]->variant_ct_limit;
+
+  unsigned char* alloc_iter = pwc_alloc;
+  const uint32_t vblock_ct = DivUp(variant_ct_limit, kPglVblockSize);
+  const PgenGlobalFlags phase_dosage_gflags = pwcs[0]->phase_dosage_gflags;
+  uint32_t vrtype_buf_byte_ct;
+  if (phase_dosage_gflags) {
+    // One full byte per variant-record type (cacheline-rounded).
+    vrtype_buf_byte_ct = RoundUpPow2(variant_ct_limit, kCacheline);
+  } else {
+    // Only 4 bits per variant-record type are needed in this case.
+    vrtype_buf_byte_ct = DivUp(variant_ct_limit, kCacheline * 2) * kCacheline;
+  }
+  pwcs[0]->vblock_fpos = S_CAST(uint64_t*, arena_alloc_raw_rd(vblock_ct * sizeof(int64_t), &alloc_iter));
+
+  pwcs[0]->vrec_len_buf = S_CAST(unsigned char*, arena_alloc_raw_rd(variant_ct_limit * pwcs[0]->vrec_len_byte_ct, &alloc_iter));
+
+  pwcs[0]->vrtype_buf = S_CAST(uintptr_t*, arena_alloc_raw(vrtype_buf_byte_ct, &alloc_iter));
+  // the PwcAppend... functions assume these bytes are zeroed out
+  memset(pwcs[0]->vrtype_buf, 0, vrtype_buf_byte_ct);
+
+  const uint32_t sample_ct = pwcs[0]->sample_ct;
+  const uint32_t genovec_byte_alloc = NypCtToCachelineCt(sample_ct) * kCacheline;
+  // Upper bound on accepted difflist length; matches the assert in
+  // PwcAppendBiallelicDifflistLimitedMain().
+  const uint32_t max_difflist_len = 2 * (sample_ct / kPglMaxDifflistLenDivisor);
+  for (uint32_t tidx = 0; tidx != thread_ct; ++tidx) {
+    if (tidx) {
+      // Threads beyond the first share the per-file metadata buffers.
+      pwcs[tidx]->vblock_fpos = pwcs[0]->vblock_fpos;
+      pwcs[tidx]->vrec_len_buf = pwcs[0]->vrec_len_buf;
+      pwcs[tidx]->vrtype_buf = pwcs[0]->vrtype_buf;
+    }
+    pwcs[tidx]->genovec_hets_buf = S_CAST(uintptr_t*, arena_alloc_raw(genovec_byte_alloc, &alloc_iter));
+    pwcs[tidx]->genovec_invert_buf = S_CAST(uintptr_t*, arena_alloc_raw(genovec_byte_alloc, &alloc_iter));
+    pwcs[tidx]->ldbase_genovec = S_CAST(uintptr_t*, arena_alloc_raw(genovec_byte_alloc, &alloc_iter));
+
+    pwcs[tidx]->ldbase_raregeno = S_CAST(uintptr_t*, arena_alloc_raw(NypCtToCachelineCt(max_difflist_len) * kCacheline, &alloc_iter));
+    // +1 since a sample_ct sentinel may be appended to the sample-ID list
+    // (see e.g. PwcAppendBiallelicDifflistLimitedMain).
+    pwcs[tidx]->ldbase_difflist_sample_ids = S_CAST(uint32_t*, arena_alloc_raw_rd((max_difflist_len + 1) * sizeof(int32_t), &alloc_iter));
+
+    pwcs[tidx]->fwrite_buf = alloc_iter;
+    pwcs[tidx]->fwrite_bufp = alloc_iter;
+    alloc_iter = &(alloc_iter[fwrite_cacheline_ct * kCacheline]);
+  }
+}
+
+// Single-threaded phase-2 writer initialization: finishes setting up an
+// STPgenWriter by carving spgw_alloc into the writer's buffers.  The write
+// buffer is sized so that one maximum-length variant record plus a flush
+// block (with difflist-overhead slack) always fits.
+void SpgwInitPhase2(uint32_t max_vrec_len, STPgenWriter* spgwp, unsigned char* spgw_alloc) {
+  PgenWriterCommon* pwcp = GetPwcp(spgwp);
+  const uintptr_t write_buf_byte_ct = max_vrec_len + kPglFwriteBlockSize + (5 + sizeof(AlleleCode)) * kPglDifflistGroupSize;
+  PwcInitPhase2(DivUp(write_buf_byte_ct, kCacheline), 1, &pwcp, spgw_alloc);
+}
+
+// Phase-2 initialization for a multithreaded .pgen writer.  Lays out one
+// cacheline-aligned PgenWriterCommon per thread at the start of mpgw_alloc,
+// runs phase-1 initialization on the first (which opens the output file(s)),
+// clones its state into the others, then carves the remainder of the arena
+// into shared and per-thread buffers via PwcInitPhase2().
+PglErr MpgwInitPhase2Ex(const char* __restrict fname, uintptr_t* __restrict explicit_nonref_flags, PgenExtensionLl* header_exts, PgenExtensionLl* footer_exts, uint32_t variant_ct, uint32_t sample_ct, PgenWriteMode write_mode, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, uint32_t vrec_len_byte_ct, uintptr_t vblock_cacheline_ct, uint32_t thread_ct, unsigned char* mpgw_alloc, MTPgenWriter* mpgwp) {
+  assert(thread_ct);
+  const uintptr_t pwc_byte_ct = RoundUpPow2(sizeof(PgenWriterCommon), kCacheline);
+  for (uint32_t tidx = 0; tidx != thread_ct; ++tidx) {
+    mpgwp->pwcs[tidx] = R_CAST(PgenWriterCommon*, &(mpgw_alloc[tidx * pwc_byte_ct]));
+  }
+  PglErr reterr = PwcInitPhase1(fname, explicit_nonref_flags, header_exts, footer_exts, variant_ct, sample_ct, write_mode, phase_dosage_gflags, nonref_flags_storage, vrec_len_byte_ct, mpgwp->pwcs[0], &(mpgwp->pgen_outfile), &(mpgwp->pgi_or_final_pgen_outfile), &(mpgwp->fname_buf));
+  if (unlikely(reterr)) {
+    return reterr;
+  }
+  mpgwp->thread_ct = thread_ct;
+  for (uint32_t tidx = 1; tidx != thread_ct; ++tidx) {
+    *(mpgwp->pwcs[tidx]) = *(mpgwp->pwcs[0]);
+    // Each thread handles its own kPglVblockSize-variant block, so thread
+    // tidx starts at variant index tidx * kPglVblockSize.
+    mpgwp->pwcs[tidx]->vidx = tidx * kPglVblockSize;
+  }
+  PwcInitPhase2(vblock_cacheline_ct, thread_ct, mpgwp->pwcs, &(mpgw_alloc[thread_ct * pwc_byte_ct]));
+  return kPglRetSuccess;
+}
+
+
+// Counts the 2-bit genotype entries where genovec differs from
+// ldbase_genovec (*ld_diff_ctp) and where it differs from the *inverse* of
+// ldbase_genovec (*ld_inv_diff_ctp), using a vectorized
+// popcount-of-2-bit-fields.  Used to decide whether (inverted-)LD
+// compression of this record is worthwhile.
+void CountLdAndInvertedLdDiffs(const uintptr_t* __restrict ldbase_genovec, const uintptr_t* __restrict genovec, uint32_t sample_ct, uint32_t* ld_diff_ctp, uint32_t* ld_inv_diff_ctp) {
+  // Requires trailing bits to be zeroed out.
+  const uint32_t word_ct = NypCtToWordCt(sample_ct);
+  const uintptr_t* genovec_end = &(genovec[word_ct]);
+  uint32_t ld_diff_ct = 0;
+  uint32_t ld_inv_diff_ct = 0;
+  // construct the words we want to popcount_nypvec_01 on the fly
+  const VecW m1 = VCONST_W(kMask5555);
+  const VecW m2 = VCONST_W(kMask3333);
+  const VecW m4 = VCONST_W(kMask0F0F);
+  const VecW* ldbase_vvec_iter = R_CAST(const VecW*, ldbase_genovec);
+  const VecW* geno_vvec_iter = R_CAST(const VecW*, genovec);
+  VecW acc_ld = vecw_setzero();
+  VecW acc_ld_inv = vecw_setzero();
+  // 60 vectors per outer pass: the inner loop consumes 3 vectors per
+  // iteration and adds at most 12 to each byte-wide accumulator lane, so 20
+  // iterations stay safely below the 255 per-byte overflow limit.
+  uintptr_t cur_incr = 60;
+  for (uintptr_t full_vecs_left = 3 * (word_ct / (3 * kWordsPerVec)); ; full_vecs_left -= cur_incr) {
+    if (full_vecs_left < 60) {
+      if (!full_vecs_left) {
+        ld_diff_ct = HsumW(acc_ld);
+        ld_inv_diff_ct = HsumW(acc_ld_inv);
+        break;
+      }
+      cur_incr = full_vecs_left;
+    }
+    VecW inner_acc_ld = vecw_setzero();
+    VecW inner_acc_ld_inv = vecw_setzero();
+    const VecW* geno_vvec_stop = &(geno_vvec_iter[cur_incr]);
+    do {
+      VecW loader_ldbase1 = *ldbase_vvec_iter++;
+      VecW loader_geno1 = *geno_vvec_iter++;
+      VecW loader_ldbase2 = *ldbase_vvec_iter++;
+      VecW loader_geno2 = *geno_vvec_iter++;
+      VecW xor1 = loader_ldbase1 ^ loader_geno1;
+      VecW xor2 = loader_ldbase2 ^ loader_geno2;
+      VecW xor_shifted1 = vecw_srli(xor1, 1);
+      VecW xor_shifted2 = vecw_srli(xor2, 1);
+      // xor(_low)  xor_shifted  loader_geno  result
+      //         1                                 1
+      //         0            0            0       1
+      //         0            0            1       0
+      //         0            1            0       0
+      //         0            1            1       1
+      // gah, don't see a way to avoid throwing in an extra xor for
+      // loader_geno...
+      VecW count_ld_inv = (xor1 | (xor_shifted1 ^ loader_geno1 ^ m1)) & m1;
+      loader_ldbase1 = *ldbase_vvec_iter++;
+      VecW count_ld = (xor1 | xor_shifted1) & m1;
+      loader_geno1 = *geno_vvec_iter++;
+      count_ld_inv = count_ld_inv + ((xor2 | (xor_shifted2 ^ loader_geno2 ^ m1)) & m1);
+      xor1 = loader_ldbase1 ^ loader_geno1;
+      count_ld = count_ld + ((xor2 | xor_shifted2) & m1);
+      xor_shifted1 = vecw_srli(xor1, 1);
+      count_ld_inv = count_ld_inv + ((xor1 | (xor_shifted1 ^ loader_geno1 ^ m1)) & m1);
+      count_ld = count_ld + ((xor1 | xor_shifted1) & m1);
+      // now count_ld and count_ld_inv each have 64 2-bit values from 0-3
+
+      count_ld_inv = (count_ld_inv & m2) + (vecw_srli(count_ld_inv, 2) & m2);
+      count_ld = (count_ld & m2) + (vecw_srli(count_ld, 2) & m2);
+      // now they have 32 4-bit values from 0-6
+
+      inner_acc_ld_inv = inner_acc_ld_inv + ((count_ld_inv + vecw_srli(count_ld_inv, 4)) & m4);
+      inner_acc_ld = inner_acc_ld + ((count_ld + vecw_srli(count_ld, 4)) & m4);
+    } while (geno_vvec_iter < geno_vvec_stop);
+    const VecW m0 = vecw_setzero();
+    acc_ld_inv = acc_ld_inv + vecw_bytesum(inner_acc_ld_inv, m0);
+    acc_ld = acc_ld + vecw_bytesum(inner_acc_ld, m0);
+  }
+  // Scalar cleanup for the trailing words not covered by full 3-vector
+  // groups.
+  const uintptr_t* ldbase_iter = DowncastKVecWToW(ldbase_vvec_iter);
+  const uintptr_t* genovec_iter = DowncastKVecWToW(geno_vvec_iter);
+  while (genovec_iter < genovec_end) {
+    uintptr_t ldbase_word = *ldbase_iter++;
+    uintptr_t geno_word = *genovec_iter++;
+    uintptr_t xor_result = ldbase_word ^ geno_word;
+    uintptr_t xor_result_shifted = xor_result >> 1;
+    ld_diff_ct += Popcount01Word((xor_result | xor_result_shifted) & kMask5555);
+    ld_inv_diff_ct += Popcount01Word((xor_result | (xor_result_shifted ^ (~geno_word))) & kMask5555);
+  }
+  *ld_diff_ctp = ld_diff_ct;
+  // trailing entries in last word are always "different";
+  // (-sample_ct) & (kBitsPerWordD2 - 1) == number of unused trailing nyp
+  // slots in the final word.
+  *ld_inv_diff_ctp = ld_inv_diff_ct - ((-sample_ct) & (kBitsPerWordD2 - 1));
+}
+
+// Difflist-vs-difflist analogue of CountLdAndInvertedLdDiffs(): walks two
+// sorted difflists, counts sample-index collisions, and derives the diff
+// counts from (sum of list lengths) - 2 * collision_ct, plus the collisions
+// whose stored (or inverted) genotypes still disagree.
+void CountLdAndInvertedLdDiffsList(const uintptr_t* __restrict ldbase_raregeno, const uint32_t* __restrict ldbase_difflist_sample_ids, const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t ldbase_difflist_len, uint32_t difflist_len, uint32_t* ld_diff_ctp, uint32_t* ld_inv_diff_ctp) {
+  // assumes ldbase_difflist_sample_ids[ldbase_difflist_len] == sample_ct
+
+  // only the count(s) with aligned common_geno values are valid.  e.g. if
+  // ldbase_common_geno and difflist_common_geno are both zero, the ld_inv_diff
+  // return value can be anything, while if they're both three, ld_diff and
+  // ld_inv_diff are both accurate.
+
+  // some similarities to ParseLdAndMergeDifflistSubset(), but much simpler.
+  // noticeably slower than CountLdAndInvertedLdDiffs() when the lists aren't
+  // tiny.
+  // possible todo: take threshold into account?
+
+  uint32_t collision_ct = 0;
+  uint32_t ld_diff_ct = 0;
+  uint32_t ld_inv_diff_ct = 0;
+  uint32_t ldbase_sample_idx = ldbase_difflist_sample_ids[0];
+  uint32_t ldbase_difflist_idx = 1;
+  // this loop is a bit slow.  attempt to bail halfway through?
+  for (uint32_t difflist_idx = 0; difflist_idx != difflist_len; ++difflist_idx) {
+    const uint32_t raw_sample_idx = difflist_sample_ids[difflist_idx];
+    // Advance the ldbase cursor to the first entry >= raw_sample_idx; the
+    // sample_ct sentinel at the end guarantees termination.
+    while (ldbase_sample_idx < raw_sample_idx) {
+      ldbase_sample_idx = ldbase_difflist_sample_ids[ldbase_difflist_idx++];
+    }
+    if (ldbase_sample_idx > raw_sample_idx) {
+      continue;
+    }
+    const uint32_t cur_raregeno = GetNyparrEntry(raregeno, difflist_idx);
+    const uint32_t cur_ldbase_raregeno = GetNyparrEntry(ldbase_raregeno, ldbase_difflist_idx - 1);
+    // (6 - g) & 3 swaps genotypes 0 <-> 2 while fixing 1 and 3: the
+    // "inversion" map.
+    const uint32_t cur_inv_raregeno = (6 - cur_raregeno) & 3;
+    ld_diff_ct += (cur_ldbase_raregeno != cur_raregeno);
+    ldbase_sample_idx = ldbase_difflist_sample_ids[ldbase_difflist_idx++];
+    ++collision_ct;
+    ld_inv_diff_ct += (cur_ldbase_raregeno != cur_inv_raregeno);
+  }
+  // no more collisions, don't actually need to look at rest of
+  // ldbase_difflist
+  const uint32_t base_diff_ct = ldbase_difflist_len + difflist_len - 2 * collision_ct;
+  *ld_diff_ctp = base_diff_ct + ld_diff_ct;
+  *ld_inv_diff_ctp = base_diff_ct + ld_inv_diff_ct;
+}
+
+// Serializes the difflist of positions where genovec differs from
+// ldbase_genovec (or, when ldbase_genovec is nullptr, from the constant
+// genotype common_geno) into the write buffer.  Record layout: vint
+// difflist_len; per-group first sample IDs (sample_id_byte_ct bytes each);
+// per-group extra-vint-byte counts; packed 2-bit genotypes; vint sample-ID
+// deltas.  Returns the number of bytes written.  The caller-supplied
+// difflist_len must be exact -- it is the loop's only termination condition.
+uint32_t SaveLdDifflist(const uintptr_t* __restrict genovec, const uintptr_t* __restrict ldbase_genovec, uintptr_t common_geno, uint32_t difflist_len, PgenWriterCommon* pwcp) {
+  unsigned char* fwrite_bufp = pwcp->fwrite_bufp;
+  if (!difflist_len) {
+    // Empty difflist is stored as a single zero byte.
+    *fwrite_bufp = 0;
+    pwcp->fwrite_bufp = &(fwrite_bufp[1]);
+    return 1;
+  }
+  unsigned char* fwrite_bufp_start = fwrite_bufp;
+  fwrite_bufp = Vint32Append(difflist_len, fwrite_bufp);
+  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(pwcp->sample_ct);
+  const uintptr_t common_geno_word = common_geno * kMask5555;
+  const uint32_t group_ct = DivUp(difflist_len, kPglDifflistGroupSize);
+  unsigned char* group_first_sample_ids_iter = fwrite_bufp;
+  unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]);
+  unsigned char* raregeno_biter = &(extra_byte_cts_iter[group_ct - 1]);
+  fwrite_bufp = &(extra_byte_cts_iter[group_ct + (difflist_len - 1) / 4]);
+  unsigned char* last_group_vint_start = fwrite_bufp;
+  uintptr_t raregeno_word = 0;
+  uint32_t last_sample_idx = 0;
+  uint32_t difflist_idx = 0;
+  for (uint32_t widx = 0; ; ++widx) {
+    const uintptr_t cur_geno_word = genovec[widx];
+    uintptr_t xor_word = ldbase_genovec? ldbase_genovec[widx] : common_geno_word;
+    xor_word ^= cur_geno_word;
+    if (xor_word) {
+      const uint32_t sample_idx_base = widx * kBitsPerWordD2;
+      do {
+        const uint32_t sample_idx_lowbits = ctzw(xor_word) / 2;
+        const uint32_t new_sample_idx = sample_idx_base + sample_idx_lowbits;
+        raregeno_word |= ((cur_geno_word >> (2 * sample_idx_lowbits)) & 3) << (2 * (difflist_idx % kBitsPerWordD2));
+        if (!(difflist_idx % kPglDifflistGroupSize)) {
+          // First entry of each group: sample ID stored absolutely (not as a
+          // vint delta), plus the previous group's extra-vint-byte count.
+          group_first_sample_ids_iter = memcpyua(group_first_sample_ids_iter, &new_sample_idx, sample_id_byte_ct);
+          if (difflist_idx) {
+            *extra_byte_cts_iter++ = S_CAST(uintptr_t, fwrite_bufp - last_group_vint_start) - (kPglDifflistGroupSize - 1);
+          }
+          last_group_vint_start = fwrite_bufp;
+        } else {
+          assert(new_sample_idx >= last_sample_idx + 1);
+          fwrite_bufp = Vint32Append(new_sample_idx - last_sample_idx, fwrite_bufp);
+        }
+        ++difflist_idx;
+        last_sample_idx = new_sample_idx;
+        if (difflist_idx == difflist_len) {
+          // Flush the final partial raregeno word and commit the write.
+          SubwordStore(raregeno_word, 1 + (((difflist_len - 1) / 4) % sizeof(intptr_t)), raregeno_biter);
+          pwcp->fwrite_bufp = fwrite_bufp;
+          return fwrite_bufp - fwrite_bufp_start;
+        }
+        if (!(difflist_idx % kBitsPerWordD2)) {
+          AppendW(raregeno_word, &raregeno_biter);
+          raregeno_word = 0;
+        }
+        // Clear the 2-bit entry just processed and continue within this word.
+        xor_word &= (~(3 * k1LU)) << (2 * sample_idx_lowbits);
+      } while (xor_word);
+    }
+  }
+}
+
+// Recodes genovec into genovec_buf for onebit compression.  common2_code
+// identifies the two "common" genotype values (it equals
+// larger_common_geno + 3 * smaller_common_geno; see the SaveOnebit caller),
+// and each 2-bit entry is remapped so that 00 = smaller common genotype,
+// 01 = larger common genotype, and the high bit is set iff the genotype is
+// one of the two uncommon values.  One branch per realizable common2_code
+// value (1, 2, 3, 5, 6, 9).
+void OnebitPreprocessBuf(const uintptr_t* __restrict genovec, uint32_t sample_ct, uint32_t common2_code, uintptr_t* __restrict genovec_buf) {
+  assert(sample_ct);
+  const uint32_t vec_ct = NypCtToVecCt(sample_ct);
+  // todo: look for better ways to perform some of these operations
+  const VecW* geno_vvec_iter = R_CAST(const VecW*, genovec);
+  const VecW* geno_vvec_end = &(geno_vvec_iter[vec_ct]);
+  VecW* write_iter = R_CAST(VecW*, genovec_buf);
+  const VecW m1 = VCONST_W(kMask5555);
+  if (common2_code < 5) {
+    if (common2_code == 1) {
+      // common genotypes 0 and 1:
+      // 11 -> 10, everything else unchanged
+      // todo: check if these loops are actually faster as simple while loops
+      // todo: check if it's better to unroll these loops to process 2 __m128is
+      // at a time
+      do {
+        const VecW cur_geno = *geno_vvec_iter++;
+        *write_iter++ = vecw_and_notfirst(m1 & vecw_srli(cur_geno, 1), cur_geno);
+      } while (geno_vvec_iter < geno_vvec_end);
+    } else if (common2_code == 3) {
+      // common genotypes 0 and 3:
+      // 00 -> 00, 01 -> 10, 10 -> 10, 11 -> 01
+      do {
+        const VecW cur_geno = *geno_vvec_iter++;
+        const VecW cur_geno_rshift = vecw_srli(cur_geno, 1);
+        const VecW cur_geno_xor_masked = (cur_geno ^ cur_geno_rshift) & m1;
+        const VecW cur_geno_or_masked = (cur_geno | cur_geno_rshift) & m1;
+        *write_iter++ = cur_geno_xor_masked + cur_geno_or_masked;
+      } while (geno_vvec_iter < geno_vvec_end);
+    } else {
+      assert(common2_code == 2);
+      // common genotypes 0 and 2:
+      // 00 -> 00, 01 -> 10, 10 -> 01, 11 -> 10
+      do {
+        const VecW cur_geno = *geno_vvec_iter++;
+        const VecW cur_geno_or_masked = (cur_geno | vecw_srli(cur_geno, 1)) & m1;
+        const VecW cur_geno_lowbits = cur_geno & m1;
+        *write_iter++ = cur_geno_lowbits + cur_geno_or_masked;
+      } while (geno_vvec_iter < geno_vvec_end);
+    }
+  } else {
+    if (common2_code == 5) {
+      // common genotypes 1 and 2:
+      // 00 -> 10, 01 -> 00, 10 -> 01, 11 -> 10
+      do {
+        const VecW cur_geno = *geno_vvec_iter++;
+        const VecW cur_geno_rshift = vecw_srli(cur_geno, 1);
+        const VecW cur_geno_not_xor_masked = vecw_and_notfirst(cur_geno ^ cur_geno_rshift, m1);
+        const VecW cur_geno_rshift_masked = cur_geno_rshift & m1;
+        *write_iter++ = cur_geno_not_xor_masked + (cur_geno_not_xor_masked | cur_geno_rshift_masked);
+      } while (geno_vvec_iter < geno_vvec_end);
+    } else if (common2_code == 9) {
+      // common genotypes 2 and 3:
+      // 00 -> 10, 01 -> 10, 10 -> 00, 11 -> 01
+      const VecW not_m1 = VCONST_W(kMaskAAAA);
+      do {
+        const VecW cur_geno = *geno_vvec_iter++;
+        *write_iter++ = (cur_geno ^ not_m1) - vecw_and_notfirst(not_m1, vecw_and_notfirst(vecw_srli(cur_geno, 1), cur_geno));
+      } while (geno_vvec_iter < geno_vvec_end);
+    } else {
+      assert(common2_code == 6);
+      // common genotypes 1 and 3:
+      // 00 -> 10, 01 -> 00, 10 -> 10, 11 -> 01
+      do {
+        const VecW cur_geno = *geno_vvec_iter++;
+        const VecW cur_geno_not_lowbits = vecw_and_notfirst(cur_geno, m1);
+        const VecW cur_geno_rshift_masked = vecw_srli(cur_geno, 1) & m1;
+        *write_iter++ = cur_geno_not_lowbits + (cur_geno_not_lowbits | cur_geno_rshift_masked);
+      } while (geno_vvec_iter < geno_vvec_end);
+    }
+  }
+}
+
+// Serializes genovec in "onebit" form (vrtype 1): a header byte
+// (common2_code), a 1-bit-per-sample bitmap distinguishing the two common
+// genotypes, and a difflist of the onebit_difflist_len samples with an
+// uncommon genotype.  Returns the number of bytes written.
+uint32_t SaveOnebit(const uintptr_t* __restrict genovec, uint32_t common2_code, uint32_t onebit_difflist_len, PgenWriterCommon* pwcp) {
+  // Uses ldbase_genovec as a temporary buffer.
+
+  // common2_code is expected to have the difference between the common
+  // genotype values in bits 0-1, and the smaller common genotype value in bits
+  // 2-3.
+  unsigned char* fwrite_bufp_start = pwcp->fwrite_bufp;
+  *fwrite_bufp_start = common2_code;
+  const uint32_t sample_ct = pwcp->sample_ct;
+  uintptr_t* __restrict genovec_buf = pwcp->ldbase_genovec;
+  // There's a 4-byte-interleaved format which is slightly more efficient for
+  // unsubsetted handling (~10 fewer compression/decompression operations per
+  // 32 genotypes), but that's only a 1-2% speedup, which probably isn't worth
+  // the more annoying subsetting.
+  //
+  // Any 10s and 11s are saved as 00 in this part.
+  // Similar logic is used to handle the other five possibilities (00/10,
+  // 00/11, 01/10, 01/11, 10/11); all of them should be expected to actually
+  // happen.  (E.g. 01/11 can happen at a high MAF variant when there's lots of
+  // missing data.)  To reduce branching, we preprocess genovec_buf to have
+  // even bit set iff the corresponding genotype is equal to the high common
+  // genotype value, and odd bit set iff the corresponding genotype is one of
+  // the two uncommon values.  (There may be a better way to do this, analogous
+  // to the simpler decompression algorithm.)
+  OnebitPreprocessBuf(genovec, sample_ct, common2_code, genovec_buf);
+  ZeroTrailingNyps(sample_ct, genovec_buf);
+  const uint32_t word_read_ct = NypCtToWordCt(sample_ct);
+  // Pack the even (low) bits into the bitmap that follows the header byte.
+  PackWordsToHalfwordsMask(genovec_buf, word_read_ct, &(fwrite_bufp_start[1]));
+  // 1 header byte + DivUp(sample_ct, 8) bitmap bytes; note that
+  // (sample_ct + 15) / 8 == DivUp(sample_ct, 8) + 1.
+  const uint32_t onebit_block_len = (sample_ct + 15) / CHAR_BIT;
+  unsigned char* fwrite_bufp = Vint32Append(onebit_difflist_len, &(fwrite_bufp_start[onebit_block_len]));
+  // the rest is almost identical to SaveLdDifflist()
+  if (!onebit_difflist_len) {
+    pwcp->fwrite_bufp = fwrite_bufp;
+    return (onebit_block_len + 1);
+  }
+  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(pwcp->sample_ct);
+  const uint32_t group_ct = DivUp(onebit_difflist_len, kPglDifflistGroupSize);
+  unsigned char* group_first_sample_ids_iter = fwrite_bufp;
+  unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]);
+  unsigned char* raregeno_biter = &(extra_byte_cts_iter[group_ct - 1]);
+  fwrite_bufp = &(extra_byte_cts_iter[group_ct + (onebit_difflist_len - 1) / 4]);
+  unsigned char* last_group_vint_start = fwrite_bufp;
+  uintptr_t raregeno_word = 0;
+  uint32_t last_sample_idx = 0;
+  uint32_t difflist_idx = 0;
+  for (uint32_t widx = 0; ; ++widx) {
+    // Odd (high) bits of genovec_buf mark the uncommon-genotype samples.
+    uintptr_t xor_word = genovec_buf[widx] & kMaskAAAA;
+    if (xor_word) {
+      const uintptr_t cur_geno_word = genovec[widx];
+      const uint32_t sample_idx_base = widx * kBitsPerWordD2;
+
+      do {
+        const uint32_t sample_idx_lowbits = ctzw(xor_word) / 2;
+        const uint32_t new_sample_idx = sample_idx_base + sample_idx_lowbits;
+        if (!(difflist_idx % kBitsPerWordD2)) {
+          if (!(difflist_idx % kPglDifflistGroupSize)) {
+            // First entry of each group: absolute sample ID, no vint delta.
+            SubU32StoreMov(new_sample_idx, sample_id_byte_ct, &group_first_sample_ids_iter);
+            if (difflist_idx) {
+              *extra_byte_cts_iter++ = S_CAST(uintptr_t, fwrite_bufp - last_group_vint_start) - (kPglDifflistGroupSize - 1);
+              AppendW(raregeno_word, &raregeno_biter);
+              raregeno_word = 0;
+            }
+            last_group_vint_start = fwrite_bufp;
+            goto SaveOnebit_skip_delta_write;
+          }
+          AppendW(raregeno_word, &raregeno_biter);
+          raregeno_word = 0;
+        }
+        assert(new_sample_idx >= last_sample_idx + 1);
+        fwrite_bufp = Vint32Append(new_sample_idx - last_sample_idx, fwrite_bufp);
+      SaveOnebit_skip_delta_write:
+        raregeno_word |= ((cur_geno_word >> (2 * sample_idx_lowbits)) & 3) << (2 * (difflist_idx % kBitsPerWordD2));
+        ++difflist_idx;
+        last_sample_idx = new_sample_idx;
+        xor_word &= xor_word - 1;
+      } while (xor_word);
+      // trailing bits of genovec_buf guaranteed to be zeroed out
+      if (difflist_idx == onebit_difflist_len) {
+        SubwordStore(raregeno_word, 1 + (((onebit_difflist_len - 1) / 4) % sizeof(intptr_t)), raregeno_biter);
+        pwcp->fwrite_bufp = fwrite_bufp;
+        return fwrite_bufp - fwrite_bufp_start;
+      }
+    }
+  }
+}
+
+// Core biallelic-genotype record writer; returns vrec_len.  Chooses the
+// cheapest of four storage modes -- LD / inverted-LD difflist vs. the
+// previous record (vrtype 2/3), onebit (vrtype 1), difflist vs. a constant
+// genotype (vrtype 4 + most_common_geno), or the uncompressed 2-bit array
+// (vrtype 0) -- writes the record to the write buffer, and updates the
+// writer's LD-base state.  Optionally reports the het (genocounts[1]) and
+// altxy (genocounts[2]) counts.  Trailing bits of genovec must be zeroed
+// (see the leading assert).
+uint32_t PwcAppendBiallelicGenovecMain(const uintptr_t* __restrict genovec, uint32_t vidx, PgenWriterCommon* pwcp, uint32_t* het_ct_ptr, uint32_t* altxy_ct_ptr, unsigned char* vrtype_ptr) {
+  const uint32_t sample_ct = pwcp->sample_ct;
+  assert((!(sample_ct % kBitsPerWordD2)) || (!(genovec[sample_ct / kBitsPerWordD2] >> (2 * (sample_ct % kBitsPerWordD2)))));
+  STD_ARRAY_DECL(uint32_t, 4, genocounts);
+  GenoarrCountFreqsUnsafe(genovec, sample_ct, genocounts);
+  if (het_ct_ptr) {
+    *het_ct_ptr = genocounts[1];
+    if (altxy_ct_ptr) {
+      *altxy_ct_ptr = genocounts[2];
+    }
+  }
+  // Determine the two most common genotype values and their counts.
+  uint32_t most_common_geno = (genocounts[1] > genocounts[0]);
+  uint32_t second_most_common_geno = 1 - most_common_geno;
+  uint32_t largest_geno_ct = genocounts[most_common_geno];
+  uint32_t second_largest_geno_ct = genocounts[second_most_common_geno];
+  for (uint32_t cur_geno = 2; cur_geno != 4; ++cur_geno) {
+    const uint32_t cur_geno_ct = genocounts[cur_geno];
+    if (cur_geno_ct > second_largest_geno_ct) {
+      if (cur_geno_ct > largest_geno_ct) {
+        second_largest_geno_ct = largest_geno_ct;
+        second_most_common_geno = most_common_geno;
+        largest_geno_ct = cur_geno_ct;
+        most_common_geno = cur_geno;
+      } else {
+        second_largest_geno_ct = cur_geno_ct;
+        second_most_common_geno = cur_geno;
+      }
+    }
+  }
+  const uint32_t difflist_len = sample_ct - largest_geno_ct;
+  const uint32_t rare_2_geno_ct_sum = difflist_len - second_largest_geno_ct;
+  // average of 10-11 bits per difflist entry
+  const uint32_t sample_ctd8 = sample_ct / 8;
+  const uint32_t sample_ctd64 = sample_ct / 64;
+  uint32_t max_difflist_len = sample_ctd8 - 2 * sample_ctd64 + rare_2_geno_ct_sum;
+  if (max_difflist_len > sample_ctd8) {
+    max_difflist_len = sample_ctd8;
+  }
+  // most_common_geno == 1 is excluded here; note that vrtype 4 +
+  // most_common_geno would collide with vrtype 5 in that case.
+  const uint32_t difflist_viable = (most_common_geno != 1) && (difflist_len <= max_difflist_len);
+
+  uintptr_t* ldbase_genovec = pwcp->ldbase_genovec;
+  STD_ARRAY_REF(uint32_t, 4) ldbase_genocounts = pwcp->ldbase_genocounts;
+  if (!(vidx % kPglVblockSize)) {
+    // beginning of a variant block.  save raw fpos in header; LD compression
+    // prohibited.
+
+    // er, need to use a relative offset in the multithreaded case, absolute
+    // position isn't known
+    pwcp->vblock_fpos[vidx / kPglVblockSize] = pwcp->vblock_fpos_offset + S_CAST(uintptr_t, pwcp->fwrite_bufp - pwcp->fwrite_buf);
+  } else if (difflist_len > sample_ctd64) {
+    // do not use LD compression if there are at least this many differences.
+    // tune this threshold in the future.
+    const uint32_t ld_diff_threshold = difflist_viable? (difflist_len - sample_ctd64) : max_difflist_len;
+    // number of changes between current genovec and LD reference is bounded
+    // below by sum(genocounts[x] - ldbase_genocounts[x]) / 2
+    const int32_t count02_limit = 2 * ld_diff_threshold - abs_i32(genocounts[1] - ldbase_genocounts[1]) + abs_i32(genocounts[3] - ldbase_genocounts[3]);
+    if ((S_CAST(int32_t, abs_i32(genocounts[0] - ldbase_genocounts[0]) + abs_i32(genocounts[2] - ldbase_genocounts[2])) < count02_limit) || (S_CAST(int32_t, abs_i32(genocounts[0] - ldbase_genocounts[2]) + abs_i32(genocounts[2] - ldbase_genocounts[0])) < count02_limit)) {
+      uint32_t ld_diff_ct;
+      uint32_t ld_inv_diff_ct;
+      // okay, perform a brute-force diff
+      // (could check LD vs. inverted LD separately?)
+      if (pwcp->ldbase_common_geno < 4) {
+        // LD base is currently stored in sparse form; unpack to
+        // ldbase_genovec
+        PgrDifflistToGenovecUnsafe(pwcp->ldbase_raregeno, pwcp->ldbase_difflist_sample_ids, pwcp->ldbase_common_geno, sample_ct, pwcp->ldbase_difflist_len, ldbase_genovec);
+        ZeroTrailingNyps(sample_ct, ldbase_genovec);
+        pwcp->ldbase_common_geno = UINT32_MAX;
+      }
+      CountLdAndInvertedLdDiffs(ldbase_genovec, genovec, sample_ct, &ld_diff_ct, &ld_inv_diff_ct);
+      if ((ld_diff_ct < ld_diff_threshold) || (ld_inv_diff_ct < ld_diff_threshold)) {
+        const uintptr_t invert_before_compressing = (ld_inv_diff_ct < ld_diff_ct);
+        *vrtype_ptr = 2 + invert_before_compressing;
+        if (invert_before_compressing) {
+          GenovecInvertCopyUnsafe(genovec, sample_ct, pwcp->genovec_invert_buf);
+          ld_diff_ct = ld_inv_diff_ct;
+        }
+        return SaveLdDifflist(invert_before_compressing? pwcp->genovec_invert_buf : genovec, ldbase_genovec, 0, ld_diff_ct, pwcp);
+      }
+    }
+  }
+  // LD compression not chosen; this record becomes the new LD base.
+  const uint32_t genovec_word_ct = NypCtToWordCt(sample_ct);
+  STD_ARRAY_COPY(genocounts, 4, ldbase_genocounts);
+  pwcp->ldbase_common_geno = UINT32_MAX;
+  if ((!difflist_viable) && (rare_2_geno_ct_sum < sample_ct / (2 * kPglMaxDifflistLenDivisor))) {
+    *vrtype_ptr = 1;
+    uint32_t larger_common_geno = second_most_common_geno;
+    uint32_t smaller_common_geno = most_common_geno;
+    if (most_common_geno > second_most_common_geno) {
+      larger_common_geno = most_common_geno;
+      smaller_common_geno = second_most_common_geno;
+    }
+    // NOTE: SaveOnebit clobbers ldbase_genovec, so the ldbase copy below
+    // must happen after the call.
+    const uint32_t vrec_len = SaveOnebit(genovec, larger_common_geno + (smaller_common_geno * 3), rare_2_geno_ct_sum, pwcp);
+    memcpy(ldbase_genovec, genovec, genovec_word_ct * sizeof(intptr_t));
+    return vrec_len;
+  }
+  memcpy(ldbase_genovec, genovec, genovec_word_ct * sizeof(intptr_t));
+  if (difflist_viable) {
+#ifdef FUTURE_ENCODER
+    if ((!difflist_len) && (!most_common_geno)) {
+      *vrtype_ptr = 5;
+      return 0;
+    }
+#endif
+    *vrtype_ptr = 4 + most_common_geno;
+    return SaveLdDifflist(genovec, nullptr, most_common_geno, difflist_len, pwcp);
+  }
+  // Fallback: store the raw 2-bit array.
+  *vrtype_ptr = 0;
+  const uint32_t vrec_len = NypCtToByteCt(sample_ct);
+  pwcp->fwrite_bufp = memcpyua(pwcp->fwrite_bufp, genovec, vrec_len);
+  return vrec_len;
+}
+
+// Appends one biallelic genotype record, then records its length and record
+// type in the writer's per-variant metadata and advances the variant index.
+void PwcAppendBiallelicGenovec(const uintptr_t* __restrict genovec, PgenWriterCommon* pwcp) {
+  const uint32_t cur_vidx = pwcp->vidx;
+  unsigned char cur_vrtype;
+  const uint32_t cur_vrec_len = PwcAppendBiallelicGenovecMain(genovec, cur_vidx, pwcp, nullptr, nullptr, &cur_vrtype);
+  pwcp->vidx = cur_vidx + 1;
+  const uintptr_t len_byte_ct = pwcp->vrec_len_byte_ct;
+  SubU32Store(cur_vrec_len, len_byte_ct, &(pwcp->vrec_len_buf[cur_vidx * len_byte_ct]));
+  // vrtype storage: one byte per variant when phase/dosage tracks are
+  // possible, 4 bits per variant otherwise (vrtype_buf was zero-initialized,
+  // so OR-ing the nibble in is sufficient).
+  if (pwcp->phase_dosage_gflags) {
+    DowncastToUc(pwcp->vrtype_buf)[cur_vidx] = cur_vrtype;
+  } else {
+    pwcp->vrtype_buf[cur_vidx / kBitsPerWordD4] |= S_CAST(uintptr_t, cur_vrtype) << (4 * (cur_vidx % kBitsPerWordD4));
+  }
+}
+
+// Flushes the single-threaded writer's in-memory buffer to disk once at
+// least kPglFwriteBlockSize bytes have accumulated.  Returns 1 on write
+// failure, 0 otherwise (including the below-threshold no-op case).
+BoolErr SpgwFlush(STPgenWriter* spgwp) {
+  PgenWriterCommon* pwcp = GetPwcp(spgwp);
+  unsigned char* buf_start = pwcp->fwrite_buf;
+  if (pwcp->fwrite_bufp < &(buf_start[kPglFwriteBlockSize])) {
+    // Not enough buffered data yet; keep accumulating.
+    return 0;
+  }
+  const uintptr_t flush_byte_ct = pwcp->fwrite_bufp - buf_start;
+  FILE** pgen_outfilep = GetPgenOutfilep(spgwp);
+  if (unlikely(fwrite_checked(buf_start, flush_byte_ct, *pgen_outfilep))) {
+    return 1;
+  }
+  pwcp->vblock_fpos_offset += flush_byte_ct;
+  pwcp->fwrite_bufp = buf_start;
+  return 0;
+}
+
+
+// Writes an LD-difflist record directly from two sparse representations: the
+// new variant's difflist ({difflist_raregeno, difflist_sample_ids}) and the
+// LD-base difflist cached in pwcp.  Merges the two sorted sample-ID lists,
+// emitting one entry per position where the genotypes disagree, until
+// ld_diff_ct entries (precomputed by CountLdAndInvertedLdDiffsList()) have
+// been written.  Returns the number of bytes written.
+uint32_t SaveLdTwoListDelta(const uintptr_t* __restrict difflist_raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t ld_diff_ct, PgenWriterCommon* pwcp) {
+  // assumes ldbase_difflist_sample_ids[ldbase_difflist_len] == sample_ct, and
+  // difflist_sample_ids[ldbase_difflist_len] == sample_ct.
+  // assumes biallelic data.
+
+  // similar to SaveLdDifflist() and, to a lesser degree,
+  // ParseLdAndMergeDifflistSubset()
+  unsigned char* fwrite_bufp = pwcp->fwrite_bufp;
+  if (!ld_diff_ct) {
+    // Empty difflist is stored as a single zero byte.
+    *fwrite_bufp = 0;
+    pwcp->fwrite_bufp = &(fwrite_bufp[1]);
+    return 1;
+  }
+  unsigned char* fwrite_bufp_start = fwrite_bufp;
+  fwrite_bufp = Vint32Append(ld_diff_ct, fwrite_bufp);
+  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(pwcp->sample_ct);
+  const uint32_t group_ct = DivUp(ld_diff_ct, kPglDifflistGroupSize);
+  const uint32_t ldbase_common_geno = pwcp->ldbase_common_geno;
+  assert(ldbase_common_geno < 4);
+  const uintptr_t* __restrict ldbase_raregeno = pwcp->ldbase_raregeno;
+  const uint32_t* __restrict ldbase_sample_ids = pwcp->ldbase_difflist_sample_ids;
+  unsigned char* group_first_sample_ids_iter = fwrite_bufp;
+  unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]);
+  unsigned char* raregeno_write_biter = &(extra_byte_cts_iter[group_ct - 1]);
+  fwrite_bufp = &(extra_byte_cts_iter[group_ct + (ld_diff_ct - 1) / 4]);
+  unsigned char* last_group_vint_start = fwrite_bufp;
+  uintptr_t ldbase_raregeno_word = 0;
+  uintptr_t difflist_raregeno_word = 0;
+  uintptr_t raregeno_write_word = 0;
+  uint32_t last_sample_idx = 0;
+
+  uint32_t next_ldbase_sample_idx = ldbase_sample_ids[0];
+  uint32_t next_difflist_sample_idx = difflist_sample_ids[0];
+  uint32_t ldbase_idx = 0;
+  uint32_t difflist_idx = 0;
+  uint32_t diff_written_ct = 0;
+  while (diff_written_ct < ld_diff_ct) {
+    uintptr_t cur_geno;
+    uint32_t new_sample_idx;
+    // Advance whichever cursor(s) point at the smaller sample index; the
+    // 2-bit genotype words are consumed lazily, two bits at a time.
+    if (next_ldbase_sample_idx <= next_difflist_sample_idx) {
+      ldbase_raregeno_word >>= 2;
+      if (!(ldbase_idx % kBitsPerWordD2)) {
+        ldbase_raregeno_word = ldbase_raregeno[ldbase_idx / kBitsPerWordD2];
+      }
+      ++ldbase_idx;
+    }
+    if (next_difflist_sample_idx <= next_ldbase_sample_idx) {
+      difflist_raregeno_word >>= 2;
+      if (!(difflist_idx % kBitsPerWordD2)) {
+        difflist_raregeno_word = difflist_raregeno[difflist_idx / kBitsPerWordD2];
+      }
+      new_sample_idx = next_difflist_sample_idx;
+      ++difflist_idx;
+      cur_geno = difflist_raregeno_word & 3;
+      next_difflist_sample_idx = difflist_sample_ids[difflist_idx];
+      if (next_ldbase_sample_idx == new_sample_idx) {
+        // Collision: both lists mention this sample; skip if genotypes agree.
+        next_ldbase_sample_idx = ldbase_sample_ids[ldbase_idx];
+        if (cur_geno == (ldbase_raregeno_word & 3)) {
+          continue;
+        }
+      }
+    } else {
+      // Sample appears only in the ldbase difflist; the new variant's
+      // genotype there is its common genotype, which the caller guarantees
+      // matches ldbase_common_geno (see PwcAppendBiallelicDifflistLimitedMain
+      // -- TODO(review): confirm this also covers the inverted path).
+      cur_geno = ldbase_common_geno;
+      new_sample_idx = next_ldbase_sample_idx;
+      next_ldbase_sample_idx = ldbase_sample_ids[ldbase_idx];
+    }
+    raregeno_write_word |= cur_geno << (2 * (diff_written_ct % kBitsPerWordD2));
+    if (!(diff_written_ct % kPglDifflistGroupSize)) {
+      // First entry of each group: absolute sample ID, plus the previous
+      // group's extra-vint-byte count.
+      SubU32StoreMov(new_sample_idx, sample_id_byte_ct, &group_first_sample_ids_iter);
+      if (diff_written_ct) {
+        *extra_byte_cts_iter++ = S_CAST(uintptr_t, fwrite_bufp - last_group_vint_start) - (kPglDifflistGroupSize - 1);
+      }
+      last_group_vint_start = fwrite_bufp;
+    } else {
+      fwrite_bufp = Vint32Append(new_sample_idx - last_sample_idx, fwrite_bufp);
+    }
+    last_sample_idx = new_sample_idx;
+    ++diff_written_ct;
+    if (!(diff_written_ct % kBitsPerWordD2)) {
+      AppendW(raregeno_write_word, &raregeno_write_biter);
+      raregeno_write_word = 0;
+    }
+  }
+  // Flush the final partial raregeno word, if any.
+  if (diff_written_ct % kBitsPerWordD2) {
+    SubwordStore(raregeno_write_word, 1 + (((ld_diff_ct - 1) / 4) % kBytesPerWord), raregeno_write_biter);
+  }
+  pwcp->fwrite_bufp = fwrite_bufp;
+  return fwrite_bufp - fwrite_bufp_start;
+}
+
+// Serializes the writer's cached LD-base difflist verbatim, using the same
+// record layout as SaveLdDifflist().  Returns the number of bytes written.
+uint32_t SaveLdInputList(PgenWriterCommon* pwcp) {
+  // simply "copies" ldbase_{raregeno,difflist_sample_ids,difflist_len} to the
+  // write buffer.
+  unsigned char* fwrite_bufp = pwcp->fwrite_bufp;
+  const uint32_t difflist_len = pwcp->ldbase_difflist_len;
+  if (!difflist_len) {
+    // Empty difflist is stored as a single zero byte.
+    *fwrite_bufp = 0;
+    pwcp->fwrite_bufp = &(fwrite_bufp[1]);
+    return 1;
+  }
+  unsigned char* fwrite_bufp_start = fwrite_bufp;
+  fwrite_bufp = Vint32Append(difflist_len, fwrite_bufp);
+  const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(pwcp->sample_ct);
+  const uint32_t group_ct = DivUp(difflist_len, kPglDifflistGroupSize);
+  const uint32_t* __restrict difflist_sample_ids = pwcp->ldbase_difflist_sample_ids;
+  unsigned char* group_first_sample_ids_iter = fwrite_bufp;
+  unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]);
+  // The packed 2-bit genotypes can be copied wholesale since they're already
+  // in the on-disk format.
+  fwrite_bufp = memcpyua(&(extra_byte_cts_iter[group_ct - 1]), pwcp->ldbase_raregeno, NypCtToByteCt(difflist_len));
+  unsigned char* last_group_vint_start = nullptr;
+  uint32_t last_sample_idx = 0;
+  for (uint32_t difflist_idx = 0; difflist_idx != difflist_len; ++difflist_idx) {
+    const uint32_t new_sample_idx = difflist_sample_ids[difflist_idx];
+    if (!(difflist_idx % kPglDifflistGroupSize)) {
+      // First entry of each group: absolute sample ID, plus the previous
+      // group's extra-vint-byte count.
+      SubU32StoreMov(new_sample_idx, sample_id_byte_ct, &group_first_sample_ids_iter);
+      if (difflist_idx) {
+        *extra_byte_cts_iter++ = S_CAST(uintptr_t, fwrite_bufp - last_group_vint_start) - (kPglDifflistGroupSize - 1);
+      }
+      last_group_vint_start = fwrite_bufp;
+    } else {
+      // assert(new_sample_idx >= last_sample_idx + 1);
+      fwrite_bufp = Vint32Append(new_sample_idx - last_sample_idx, fwrite_bufp);
+    }
+    last_sample_idx = new_sample_idx;
+  }
+  pwcp->fwrite_bufp = fwrite_bufp;
+  return fwrite_bufp - fwrite_bufp_start;
+}
+
+uint32_t PwcAppendBiallelicDifflistLimitedMain(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t vidx, uint32_t difflist_common_geno, uint32_t difflist_len, PgenWriterCommon* pwcp, unsigned char* vrtype_ptr) {
+ const uint32_t sample_ct = pwcp->sample_ct;
+ // caller's responsibility not to exceed this limit
+ assert(difflist_len <= 2 * (sample_ct / kPglMaxDifflistLenDivisor));
+
+ // trailing bits of raregeno must be zeroed out
+
+ assert(difflist_common_geno < 4);
+ assert((!(difflist_len % kBitsPerWordD2)) || (!(raregeno[difflist_len / kBitsPerWordD2] >> (2 * (difflist_len % kBitsPerWordD2)))));
+ assert(difflist_sample_ids[difflist_len] == sample_ct);
+ STD_ARRAY_DECL(uint32_t, 4, genocounts);
+ GenoarrCountFreqsUnsafe(raregeno, difflist_len, genocounts);
+ assert(!genocounts[difflist_common_geno]);
+ genocounts[difflist_common_geno] = sample_ct - difflist_len;
+ uint32_t second_most_common_geno = difflist_common_geno? 0 : 1;
+ uint32_t second_largest_geno_ct = genocounts[second_most_common_geno];
+ for (uint32_t cur_geno = second_most_common_geno + 1; cur_geno != 4; ++cur_geno) {
+ if (cur_geno == difflist_common_geno) {
+ continue;
+ }
+ const uint32_t cur_geno_ct = genocounts[cur_geno];
+ if (cur_geno_ct > second_largest_geno_ct) {
+ second_most_common_geno = cur_geno;
+ second_largest_geno_ct = cur_geno_ct;
+ }
+ }
+ const uint32_t rare_2_geno_ct_sum = difflist_len - second_largest_geno_ct;
+ const uint32_t sample_ctd8 = sample_ct / 8;
+ const uint32_t sample_ctd64 = sample_ct / 64;
+ uint32_t max_difflist_len = sample_ctd8 - 2 * sample_ctd64 + rare_2_geno_ct_sum;
+ if (max_difflist_len > sample_ctd8) {
+ max_difflist_len = sample_ctd8;
+ }
+ const uint32_t difflist_viable = (difflist_common_geno != 1) && (difflist_len <= max_difflist_len);
+ STD_ARRAY_REF(uint32_t, 4) ldbase_genocounts = pwcp->ldbase_genocounts;
+ if (!(vidx % kPglVblockSize)) {
+ pwcp->vblock_fpos[vidx / kPglVblockSize] = pwcp->vblock_fpos_offset + S_CAST(uintptr_t, pwcp->fwrite_bufp - pwcp->fwrite_buf);
+ } else if (difflist_len > sample_ctd64) {
+ const uint32_t ld_diff_threshold = difflist_viable? (difflist_len - sample_ctd64) : max_difflist_len;
+ // number of changes between current genovec and LD reference is bounded
+ // below by sum(genocounts[x] - ldbase_genocounts[x]) / 2
+ const int32_t count02_limit = 2 * ld_diff_threshold - abs_i32(genocounts[1] - ldbase_genocounts[1]) + abs_i32(genocounts[3] - ldbase_genocounts[3]);
+ if ((S_CAST(int32_t, abs_i32(genocounts[0] - ldbase_genocounts[0]) + abs_i32(genocounts[2] - ldbase_genocounts[2])) < count02_limit) || (S_CAST(int32_t, abs_i32(genocounts[0] - ldbase_genocounts[2]) + abs_i32(genocounts[2] - ldbase_genocounts[0])) < count02_limit)) {
+ uint32_t ld_diff_ct;
+ uint32_t ld_inv_diff_ct;
+ if (pwcp->ldbase_common_geno < 4) {
+ pwcp->ldbase_difflist_sample_ids[pwcp->ldbase_difflist_len] = sample_ct;
+ CountLdAndInvertedLdDiffsList(pwcp->ldbase_raregeno, pwcp->ldbase_difflist_sample_ids, raregeno, difflist_sample_ids, pwcp->ldbase_difflist_len, difflist_len, &ld_diff_ct, &ld_inv_diff_ct);
+ const uint32_t difflist_common_geno_inv = (6 - difflist_common_geno) & 3;
+ if (pwcp->ldbase_common_geno != difflist_common_geno) {
+ ld_diff_ct = ld_diff_threshold;
+ }
+ if (pwcp->ldbase_common_geno != difflist_common_geno_inv) {
+ ld_inv_diff_ct = ld_diff_threshold;
+ }
+ if ((ld_diff_ct < ld_diff_threshold) || (ld_inv_diff_ct < ld_diff_threshold)) {
+ const uintptr_t invert_before_compressing = (ld_inv_diff_ct < ld_diff_ct);
+ *vrtype_ptr = 2 + invert_before_compressing;
+ if (invert_before_compressing) {
+ GenovecInvertCopyUnsafe(raregeno, difflist_len, pwcp->genovec_invert_buf);
+ // difflist_common_geno = difflist_common_geno_inv;
+ ld_diff_ct = ld_inv_diff_ct;
+ }
+ return SaveLdTwoListDelta(invert_before_compressing? pwcp->genovec_invert_buf : raregeno, difflist_sample_ids, ld_diff_ct, pwcp);
+ }
+ } else {
+ uintptr_t* __restrict genobuf = pwcp->genovec_invert_buf;
+ PgrDifflistToGenovecUnsafe(raregeno, difflist_sample_ids, difflist_common_geno, sample_ct, difflist_len, genobuf);
+ ZeroTrailingNyps(sample_ct, genobuf);
+ CountLdAndInvertedLdDiffs(pwcp->ldbase_genovec, genobuf, sample_ct, &ld_diff_ct, &ld_inv_diff_ct);
+ if ((ld_diff_ct < ld_diff_threshold) || (ld_inv_diff_ct < ld_diff_threshold)) {
+ const uintptr_t invert_before_compressing = (ld_inv_diff_ct < ld_diff_ct);
+ *vrtype_ptr = 2 + invert_before_compressing;
+ if (invert_before_compressing) {
+ GenovecInvertUnsafe(sample_ct, genobuf);
+ ld_diff_ct = ld_inv_diff_ct;
+ }
+ return SaveLdDifflist(genobuf, pwcp->ldbase_genovec, 0, ld_diff_ct, pwcp);
+ }
+ }
+ }
+ }
+ STD_ARRAY_COPY(genocounts, 4, ldbase_genocounts);
+ if (difflist_viable) {
+ *vrtype_ptr = 4 + difflist_common_geno;
+ memcpy(pwcp->ldbase_raregeno, raregeno, NypCtToByteCt(difflist_len));
+ memcpy(pwcp->ldbase_difflist_sample_ids, difflist_sample_ids, difflist_len * sizeof(int32_t));
+ // memcpy(pwcp->ldbase_difflist_sample_ids, difflist_sample_ids, (difflist_len + 1) * sizeof(int32_t));
+ pwcp->ldbase_common_geno = difflist_common_geno;
+ pwcp->ldbase_difflist_len = difflist_len;
+ return SaveLdInputList(pwcp);
+ }
+ pwcp->ldbase_common_geno = UINT32_MAX;
+ const uint32_t use_onebit = (rare_2_geno_ct_sum < sample_ct / (2 * kPglMaxDifflistLenDivisor));
+ uintptr_t* genobuf = use_onebit? pwcp->genovec_invert_buf : pwcp->ldbase_genovec;
+ PgrDifflistToGenovecUnsafe(raregeno, difflist_sample_ids, difflist_common_geno, sample_ct, difflist_len, genobuf);
+ ZeroTrailingNyps(sample_ct, genobuf);
+ *vrtype_ptr = use_onebit;
+ if (use_onebit) {
+ uint32_t larger_common_geno = second_most_common_geno;
+ uint32_t smaller_common_geno = difflist_common_geno;
+ if (difflist_common_geno > second_most_common_geno) {
+ larger_common_geno = difflist_common_geno;
+ smaller_common_geno = second_most_common_geno;
+ }
+ const uint32_t vrec_len = SaveOnebit(genobuf, larger_common_geno + (smaller_common_geno * 3), rare_2_geno_ct_sum, pwcp);
+ memcpy(pwcp->ldbase_genovec, genobuf, NypCtToWordCt(sample_ct) * sizeof(uintptr_t));
+ return vrec_len;
+ }
+ const uint32_t vrec_len = NypCtToByteCt(sample_ct);
+ pwcp->fwrite_bufp = memcpyua(pwcp->fwrite_bufp, genobuf, vrec_len);
+ return vrec_len;
+}
+
+// Appends one biallelic variant, supplied in difflist (sparse) form, to the
+// write buffer, and records its vrec_len and vrtype in the per-variant
+// metadata arrays.
+// * raregeno / difflist_sample_ids describe the samples whose genotype
+//   differs from difflist_common_geno; the Main() helper chooses the actual
+//   on-disk representation and returns the record length.
+// * Increments pwcp->vidx as a side effect.
+void PwcAppendBiallelicDifflistLimited(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t difflist_common_geno, uint32_t difflist_len, PgenWriterCommon* pwcp) {
+ const uint32_t vidx = pwcp->vidx;
+ unsigned char vrtype;
+ const uint32_t vrec_len = PwcAppendBiallelicDifflistLimitedMain(raregeno, difflist_sample_ids, vidx, difflist_common_geno, difflist_len, pwcp, &vrtype);
+ const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct;
+ pwcp->vidx += 1;
+ SubU32Store(vrec_len, vrec_len_byte_ct, &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]));
+ if (!pwcp->phase_dosage_gflags) {
+ // Without phase/dosage tracks, vrtypes are packed 4 bits per variant.
+ pwcp->vrtype_buf[vidx / kBitsPerWordD4] |= S_CAST(uintptr_t, vrtype) << (4 * (vidx % kBitsPerWordD4));
+ } else {
+ // Otherwise each variant gets a full vrtype byte.
+ DowncastToUc(pwcp->vrtype_buf)[vidx] = vrtype;
+ }
+}
+
+
+// Adds incr to *vrec_len_ptr, returning 1 (without updating *vrec_len_ptr) if
+// the result would exceed kPglMaxBytesPerVariant.  The overflow check is only
+// compiled on 64-bit builds; 32-bit builds skip it and always return 0.
+static inline BoolErr CheckedVrecLenIncr(uintptr_t incr, uint32_t* vrec_len_ptr) {
+ // maybe track vrec_left instead of vrec_len...
+#ifdef __LP64__
+ const uintptr_t new_vrec_len = *vrec_len_ptr + incr;
+ if (unlikely(new_vrec_len > kPglMaxBytesPerVariant)) {
+ return 1;
+ }
+ *vrec_len_ptr = new_vrec_len;
+ return 0;
+#else
+ *vrec_len_ptr += incr;
+ return 0;
+#endif
+}
+
+// ok if delta_bitarr trailing bits set
+//
+// Writes a deltalist (difference-encoded sample-index list) for the set bits
+// of delta_bitarr to the write buffer, advancing pwcp->fwrite_bufp and adding
+// the number of bytes written to *vrec_len_ptr.
+// Layout produced here: vint-encoded deltalist_len, then one raw
+// sample_id_byte_ct-byte first-sample-index per group of
+// kPglDifflistGroupSize entries, then (group_ct - 1) extra-byte counts, then
+// the vint-encoded within-group deltas.
+// Returns 1 on vrec_len overflow (record would exceed kPglMaxBytesPerVariant),
+// in which case the buffer contents past the old fwrite_bufp are garbage but
+// fwrite_bufp itself is unchanged.
+BoolErr PwcAppendDeltalist(const uintptr_t* delta_bitarr, uint32_t deltalist_len, PgenWriterCommon* pwcp, uint32_t* vrec_len_ptr) {
+ assert(deltalist_len);
+ unsigned char* fwrite_bufp = pwcp->fwrite_bufp;
+ unsigned char* fwrite_bufp_start = fwrite_bufp;
+ fwrite_bufp = Vint32Append(deltalist_len, fwrite_bufp);
+ const uint32_t sample_id_byte_ct = BytesToRepresentNzU32(pwcp->sample_ct);
+ const uint32_t group_ct = DivUp(deltalist_len, kPglDifflistGroupSize);
+ unsigned char* group_first_sample_ids_iter = fwrite_bufp;
+ // Reserve space for the fixed-width header portion up front.
+ if (unlikely(CheckedVrecLenIncr(S_CAST(uintptr_t, fwrite_bufp - fwrite_bufp_start) + group_ct * sample_id_byte_ct + (group_ct - 1), vrec_len_ptr))) {
+ return 1;
+ }
+ unsigned char* extra_byte_cts_iter = &(fwrite_bufp[group_ct * sample_id_byte_ct]);
+ fwrite_bufp_start = &(extra_byte_cts_iter[group_ct - 1]);
+ fwrite_bufp = fwrite_bufp_start;
+#ifdef __LP64__
+ const intptr_t vrec_left = kPglMaxBytesPerVariant - (*vrec_len_ptr);
+#endif
+ unsigned char* last_group_vint_start = nullptr;
+ uintptr_t last_sample_idx = 0;
+ uintptr_t new_sample_idx_base = 0;
+ uintptr_t delta_bits = delta_bitarr[0];
+ for (uint32_t deltalist_idx = 0; deltalist_idx != deltalist_len; ++deltalist_idx) {
+ const uintptr_t new_sample_idx = BitIter1(delta_bitarr, &new_sample_idx_base, &delta_bits);
+ if (!(deltalist_idx % kPglDifflistGroupSize)) {
+ // Group boundary: store the absolute sample index, and the previous
+ // group's vint-byte overhead.
+ SubU32StoreMov(new_sample_idx, sample_id_byte_ct, &group_first_sample_ids_iter);
+ if (deltalist_idx) {
+ *extra_byte_cts_iter++ = S_CAST(uintptr_t, fwrite_bufp - last_group_vint_start) - (kPglDifflistGroupSize - 1);
+ }
+#ifdef __LP64__
+ // Note that this inequality goes in the opposite direction from the
+ // usual PtrCheck.
+ if (unlikely(S_CAST(intptr_t, fwrite_bufp - fwrite_bufp_start) > vrec_left)) {
+ return 1;
+ }
+#endif
+ last_group_vint_start = fwrite_bufp;
+ } else {
+ assert(new_sample_idx >= last_sample_idx + 1);
+ fwrite_bufp = Vint32Append(new_sample_idx - last_sample_idx, fwrite_bufp);
+ }
+ last_sample_idx = new_sample_idx;
+ }
+ pwcp->fwrite_bufp = fwrite_bufp;
+ return CheckedVrecLenIncr(fwrite_bufp - fwrite_bufp_start, vrec_len_ptr);
+}
+
+static_assert(sizeof(AlleleCode) == 1, "PwcAppendMultiallelicMain() needs to be updated.");
+// Appends the main genotype track for one variant, followed by the
+// multiallelic patch tracks when patch_01_ct or patch_10_ct is nonzero.
+// * patch_01_set/patch_01_vals identify het calls whose non-ref allele code
+//   must be patched; patch_10_set/patch_10_vals likewise for altx/alty calls
+//   (two codes per entry).  Patch values are stored with a -2 (het) or -1
+//   (altxy) offset, and the per-entry width depends on allele_ct.
+// * A format byte precedes the patch payload; low nibble describes the
+//   patch_01 representation (15 = absent, 1 = deltalist), high nibble the
+//   patch_10 representation (240 = absent, 16 = deltalist) — presumably
+//   mirroring the .pgen format spec; verify against that document.
+// * When genovec_hets_ptr is non-null, *genovec_hets_ptr / *het_ct_ptr are
+//   set to describe the true heterozygous-call set: patch_10 entries with
+//   two distinct allele codes are counted (and marked) as hets.
+// * Returns 1 on vrec_len overflow, 0 on success (with *vrtype_ptr and
+//   *vrec_len_ptr filled).
+BoolErr PwcAppendMultiallelicMain(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t allele_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, uint32_t vidx, PgenWriterCommon* pwcp, const uintptr_t** genovec_hets_ptr, uint32_t* het_ct_ptr, unsigned char* vrtype_ptr, uint32_t* vrec_len_ptr) {
+ uint32_t genovec_het_ct;
+ uint32_t genovec_altxy_ct;
+ uint32_t vrec_len = PwcAppendBiallelicGenovecMain(genovec, vidx, pwcp, &genovec_het_ct, &genovec_altxy_ct, vrtype_ptr);
+ if ((!patch_01_ct) && (!patch_10_ct)) {
+ // No patches: this reduces to the biallelic case.
+ if (genovec_hets_ptr) {
+ *genovec_hets_ptr = genovec;
+ *het_ct_ptr = genovec_het_ct;
+ }
+ *vrec_len_ptr = vrec_len;
+ return 0;
+ }
+ assert(allele_ct > 2);
+ // Reserve the format byte; its final value isn't known until both patch
+ // tracks have been written.
+ unsigned char* format_byte_ptr = pwcp->fwrite_bufp;
+ ++vrec_len;
+ pwcp->fwrite_bufp += 1;
+ unsigned char format_byte = 0;
+ if (!patch_01_ct) {
+ format_byte = 15;
+ } else {
+ // We could use a different bitarray vs. deltalist threshold, since the
+ // usual threshold was chosen under the assumption that
+ // (maximum allowed deltalist index + 1) == bitarray length.
+ // However, the subsetting-speed advantage of the deltalist representation
+ // can be expected to make up for its larger size, so we may actually want
+ // to increase instead of decrease the current 1/9 fraction. For now, I'll
+ // punt on this decision.
+ const uint32_t max_deltalist_entry_ct = genovec_het_ct / kPglMaxDeltalistLenDivisor;
+ if (patch_01_ct < max_deltalist_entry_ct) {
+ // impossible for this to run out of space
+ PwcAppendDeltalist(patch_01_set, patch_01_ct, pwcp, &vrec_len);
+ format_byte = 1;
+ } else {
+ // Dense representation: one bit per het call in genovec.
+ const uint32_t genovec_het_ctb = DivUp(genovec_het_ct, CHAR_BIT);
+ CopyGenomatchSubset(patch_01_set, genovec, kMask5555, 0, genovec_het_ct, pwcp->fwrite_bufp);
+ pwcp->fwrite_bufp = &(pwcp->fwrite_bufp[genovec_het_ctb]);
+ vrec_len += genovec_het_ctb;
+ }
+ if (allele_ct > 3) {
+ // allele_ct == 3 needs no value track: the patched allele can only be
+ // alt2.
+ if (allele_ct <= 18) {
+ const AlleleCode* patch_01_vals_iter = patch_01_vals;
+
+ // todo: revisit this logic, we may not always want to write a word at
+ // a time
+ unsigned char* write_fixed_width_start = pwcp->fwrite_bufp;
+ uint32_t written_byte_ct;
+ if (allele_ct == 4) {
+ // 1 bit per entry, clear = ref/alt2, set = ref/alt3
+#ifdef USE_SSE2
+ const uint32_t fullvec_ct = patch_01_ct / kBytesPerVec;
+ if (fullvec_ct) {
+ // parallel-equality-to-3 check, movemask
+ // (+125 or <<7 followed by movemask also works)
+ // todo: better ARM implementation
+ for (uint32_t vec_idx = 0; vec_idx != fullvec_ct; ++vec_idx) {
+ VecUc cur_vec = vecuc_loadu(patch_01_vals_iter);
+ patch_01_vals_iter = &(patch_01_vals_iter[kBytesPerVec]);
+ const Vec8thUint v8ui = vecuc_movemask(VecWToUc(vecw_slli(VecUcToW(cur_vec), 7)));
+ CopyToUnalignedOffsetV8(write_fixed_width_start, &v8ui, vec_idx);
+ }
+ }
+ const uint32_t remainder = patch_01_ct % kBytesPerVec;
+ if (remainder) {
+ const uintptr_t* cur_read_iter = R_CAST(const uintptr_t*, patch_01_vals_iter);
+ unsigned char* write_uc = &(write_fixed_width_start[fullvec_ct * sizeof(Vec8thUint)]);
+ const uint32_t word_ct_m1 = (remainder - 1) / kBytesPerWord;
+ for (uint32_t widx = 0; ; ++widx) {
+ uintptr_t cur_word;
+ if (widx >= word_ct_m1) {
+ if (widx > word_ct_m1) {
+ break;
+ }
+ cur_word = SubwordLoad(cur_read_iter, ModNz(remainder, kBytesPerWord));
+ } else {
+ cur_word = *cur_read_iter++;
+ }
+# ifdef USE_AVX2
+ cur_word = _pext_u64(cur_word, kMask0101);
+# else
+ // extract the low bit from each byte
+ cur_word = cur_word & kMask0101;
+ cur_word = (cur_word * 0x102040810204080LLU) >> 56;
+# endif
+ write_uc[widx] = cur_word;
+ }
+ }
+#else // !USE_SSE2
+ const uint32_t word_ct_m1 = (patch_01_ct - 1) / kBitsPerWord;
+ uint32_t loop_len = kBitsPerWord;
+ for (uint32_t widx = 0; ; ++widx) {
+ if (widx >= word_ct_m1) {
+ if (widx > word_ct_m1) {
+ break;
+ }
+ loop_len = ModNz(patch_01_ct, kBitsPerWord);
+ }
+ uintptr_t cur_word = 0;
+ // Descending loop so the shift-accumulate produces little-endian
+ // bit order.
+ for (uint32_t uii = loop_len - 1; ; --uii) {
+ cur_word |= patch_01_vals_iter[uii] & 1;
+ if (!uii) {
+ break;
+ }
+ cur_word <<= 1;
+ }
+ CopyToUnalignedOffsetW(write_fixed_width_start, &cur_word, widx);
+ patch_01_vals_iter = &(patch_01_vals_iter[loop_len]);
+ }
+#endif // !USE_SSE2
+ written_byte_ct = DivUp(patch_01_ct, 8);
+ } else if (allele_ct <= 6) {
+ // 2 bits per entry
+ const uint32_t word_ct_m1 = (patch_01_ct - 1) / kBitsPerWordD2;
+ uint32_t loop_len = kBitsPerWordD2;
+ for (uint32_t widx = 0; ; ++widx) {
+ if (widx >= word_ct_m1) {
+ if (widx > word_ct_m1) {
+ break;
+ }
+ loop_len = ModNz(patch_01_ct, kBitsPerWordD2);
+ }
+ uintptr_t cur_word = 0;
+ for (uint32_t uii = loop_len - 1; ; --uii) {
+ // todo:
+ // 1. parallel-subtract (now all bytes are in 0..3)
+ // 2. bitwise or with right-shift-6
+ // 3. bitwise or with right-shift-12
+ // 4. _mm{256}_shuffle_epi8() gather from bytes 0, 4, 8, ...
+ // SSE4.2:
+ // 5. store result of _mm_extract_epi32(, 0)
+ // AVX2:
+ // 5. store results of _mm256_extract_epi32(, 0) and (, 4)
+ cur_word |= patch_01_vals_iter[uii] - 2;
+ if (!uii) {
+ break;
+ }
+ cur_word <<= 2;
+ }
+ CopyToUnalignedOffsetW(write_fixed_width_start, &cur_word, widx);
+ patch_01_vals_iter = &(patch_01_vals_iter[loop_len]);
+ }
+ written_byte_ct = DivUp(patch_01_ct, 4);
+ } else {
+ // 4 bits per entry
+ const uint32_t word_ct_m1 = (patch_01_ct - 1) / kBitsPerWordD4;
+ uint32_t loop_len = kBitsPerWordD4;
+ for (uint32_t widx = 0; ; ++widx) {
+ if (widx >= word_ct_m1) {
+ if (widx > word_ct_m1) {
+ break;
+ }
+ loop_len = ModNz(patch_01_ct, kBitsPerWordD4);
+ }
+ uintptr_t cur_word = 0;
+ for (uint32_t uii = loop_len - 1; ; --uii) {
+ // todo: replace this with parallel-subtract, followed by bitwise
+ // or with right-shift-4, followed by _mm{256}_shuffle_epi8()
+ // gather from even bytes, followed by _mm{256}_extract_epi64()
+ // (or _mm256_permute4x64_epi64() followed by
+ // _mm256_extracti128_si256()? test this.)
+ // Or in non-SSE4.2 case, use end of PackWordToHalfword logic.
+ cur_word |= patch_01_vals_iter[uii] - 2;
+ if (!uii) {
+ break;
+ }
+ cur_word <<= 4;
+ }
+ CopyToUnalignedOffsetW(write_fixed_width_start, &cur_word, widx);
+ patch_01_vals_iter = &(patch_01_vals_iter[loop_len]);
+ }
+ written_byte_ct = DivUp(patch_01_ct, 2);
+ }
+ pwcp->fwrite_bufp = &(pwcp->fwrite_bufp[written_byte_ct]);
+ vrec_len += written_byte_ct;
+ } else {
+ // 1 byte per entry
+ // need 2-byte and 3-byte cases here for larger AlleleCode, those also
+ // need to check for vrec_len overflow
+ unsigned char* payload = pwcp->fwrite_bufp;
+ for (uint32_t patch_idx = 0; patch_idx != patch_01_ct; ++patch_idx) {
+ // todo: check if the compiler vectorizes this
+ payload[patch_idx] = patch_01_vals[patch_idx] - 2;
+ }
+ pwcp->fwrite_bufp = &(pwcp->fwrite_bufp[patch_01_ct]);
+ vrec_len += patch_01_ct;
+ }
+ }
+ }
+ if (!patch_10_ct) {
+ format_byte |= 240;
+ if (genovec_hets_ptr) {
+ *genovec_hets_ptr = genovec;
+ *het_ct_ptr = genovec_het_ct;
+ }
+ } else {
+ const uint32_t max_deltalist_entry_ct = genovec_altxy_ct / kPglMaxDeltalistLenDivisor;
+ if (patch_10_ct < max_deltalist_entry_ct) {
+ // can't actually fail for now, but cannot ignore overflow check if
+ // sizeof(AlleleCode) > 1
+ if (unlikely(PwcAppendDeltalist(patch_10_set, patch_10_ct, pwcp, &vrec_len))) {
+ return 1;
+ }
+ format_byte |= 16;
+ } else {
+ // Dense representation: one bit per altxy call in genovec.
+ const uintptr_t genovec_altxy_ctb = DivUp(genovec_altxy_ct, CHAR_BIT);
+ if (unlikely(CheckedVrecLenIncr(genovec_altxy_ctb, &vrec_len))) {
+ return 1;
+ }
+ CopyGenomatchSubset(patch_10_set, genovec, kMaskAAAA, 0, genovec_altxy_ct, pwcp->fwrite_bufp);
+ pwcp->fwrite_bufp = &(pwcp->fwrite_bufp[genovec_altxy_ctb]);
+ }
+ if (allele_ct <= 17) {
+ const AlleleCode* patch_10_vals_iter = patch_10_vals;
+ unsigned char* write_fixed_width_start = pwcp->fwrite_bufp;
+ uint32_t bytes_to_write;
+ if (allele_ct == 3) {
+ // 1 bit per entry, clear = alt1/alt2, set = alt2/alt2
+ bytes_to_write = DivUp(patch_10_ct, 8);
+ if (unlikely(CheckedVrecLenIncr(bytes_to_write, &vrec_len))) {
+ return 1;
+ }
+#ifdef USE_SSE2
+ const uint32_t fullvec_ct = patch_10_ct / (kBytesPerVec / 2);
+ if (fullvec_ct) {
+# if defined(USE_SHUFFLE8) && !defined(USE_AVX2)
+ // SSE4.2: _mm_shuffle_epi8() to gather even bytes, parallel
+ // equality-to-2 check, movemask
+ // (+126 or <<6 followed by movemask also works)
+ // todo: better ARM implementation
+ const VecUc gather_even = vecuc_setr8(0, 2, 4, 6, 8, 10, 12, 14,
+ -1, -1, -1, -1, -1, -1, -1, -1);
+ for (uint32_t vec_idx = 0; vec_idx != fullvec_ct; ++vec_idx) {
+ const VecUc cur_vec = vecuc_loadu(patch_10_vals_iter);
+ patch_10_vals_iter = &(patch_10_vals_iter[kBytesPerVec]);
+ const VecUc even_vec = vecuc_shuffle8(cur_vec, gather_even);
+ write_fixed_width_start[vec_idx] = vecuc_movemask(VecWToUc(vecw_slli(VecUcToW(even_vec), 6)));
+ }
+# else
+ // SSE2/AVX2: parallel equality-to-{2, 2} check, movemask,
+ // PackWordToHalfword
+ // (uint16_t +126 followed by movemask also works, and avoids an
+ // extra masking step in SSE2 case)
+ // (<<6 doesn't work here)
+ const VecU16 all126 = vecu16_set1(126);
+ for (uint32_t vec_idx = 0; vec_idx != fullvec_ct; ++vec_idx) {
+ VecU16 cur_vec = vecu16_loadu(patch_10_vals_iter);
+ patch_10_vals_iter = &(patch_10_vals_iter[kBytesPerVec]);
+ const uint32_t cur_u32 = vecu16_movemask(cur_vec + all126);
+ const Vec16thUint v16ui = PackVec8thUintTo16th(cur_u32);
+ CopyToUnalignedOffsetV16(write_fixed_width_start, &v16ui, vec_idx);
+ }
+# endif
+ }
+ const uint32_t remainder = patch_10_ct % (kBytesPerVec / 2);
+ if (remainder) {
+ uint32_t cur_u32 = 0;
+ // Only the first allele code of each pair matters here (1 vs. 2).
+ for (uint32_t uii = remainder - 1; ; --uii) {
+ cur_u32 += patch_10_vals_iter[2 * uii];
+ if (!uii) {
+ break;
+ }
+ cur_u32 <<= 1;
+ }
+ // this effectively subtracts 1 from each entry
+ const Vec16thUint v16ui = cur_u32 + 1 - (1U << remainder);
+ CopyToUnalignedOffsetV16(write_fixed_width_start, &v16ui, fullvec_ct);
+ patch_10_vals_iter = &(patch_10_vals_iter[remainder * 2]);
+ }
+#else // !USE_SSE2
+ const uint32_t word_ct_m1 = (patch_10_ct - 1) / kBitsPerWord;
+ uint32_t loop_len = kBitsPerWord;
+ for (uint32_t widx = 0; ; ++widx) {
+ if (widx >= word_ct_m1) {
+ if (widx > word_ct_m1) {
+ break;
+ }
+ loop_len = ModNz(patch_10_ct, kBitsPerWord);
+ }
+ uintptr_t cur_word = 0;
+ for (uint32_t uii = loop_len - 1; ; --uii) {
+ cur_word |= patch_10_vals_iter[2 * uii] - 1;
+ if (!uii) {
+ break;
+ }
+ cur_word <<= 1;
+ }
+ CopyToUnalignedOffsetW(write_fixed_width_start, &cur_word, widx);
+ patch_10_vals_iter = &(patch_10_vals_iter[loop_len * 2]);
+ }
+#endif // !USE_SSE2
+ } else if (allele_ct <= 5) {
+ // 2 bits per half-entry
+ bytes_to_write = DivUp(patch_10_ct, 2);
+ if (unlikely(CheckedVrecLenIncr(bytes_to_write, &vrec_len))) {
+ return 1;
+ }
+ // this may be worse than simple loop, check this later
+ const uint32_t word_ct_m1 = (patch_10_ct - 1) / kBitsPerWordD4;
+ uint32_t loop_len = kBitsPerWordD2;
+ for (uint32_t widx = 0; ; ++widx) {
+ if (widx >= word_ct_m1) {
+ if (widx > word_ct_m1) {
+ break;
+ }
+ loop_len = 2 * ModNz(patch_10_ct, kBitsPerWordD4);
+ }
+ uintptr_t cur_word = 0;
+ for (uint32_t uii = loop_len - 1; ; --uii) {
+ // todo: same as 2-bit patch_01 case, except starting with
+ // parallel-subtract 1 instead of 2
+ cur_word |= patch_10_vals_iter[uii] - 1;
+ if (!uii) {
+ break;
+ }
+ cur_word <<= 2;
+ }
+ CopyToUnalignedOffsetW(write_fixed_width_start, &cur_word, widx);
+ patch_10_vals_iter = &(patch_10_vals_iter[loop_len]);
+ }
+ } else {
+ // 4 bits per half-entry
+ bytes_to_write = patch_10_ct;
+ if (unlikely(CheckedVrecLenIncr(bytes_to_write, &vrec_len))) {
+ return 1;
+ }
+ const uint32_t word_ct_m1 = (patch_10_ct - 1) / kBytesPerWord;
+ uint32_t loop_len = kBitsPerWordD4;
+ for (uint32_t widx = 0; ; ++widx) {
+ if (widx >= word_ct_m1) {
+ if (widx > word_ct_m1) {
+ break;
+ }
+ loop_len = 2 * ModNz(patch_10_ct, kBytesPerWord);
+ }
+ uintptr_t cur_word = 0;
+ for (uint32_t uii = loop_len - 1; ; --uii) {
+ cur_word |= patch_10_vals_iter[uii] - 1;
+ if (!uii) {
+ break;
+ }
+ cur_word <<= 4;
+ }
+ CopyToUnalignedOffsetW(write_fixed_width_start, &cur_word, widx);
+ patch_10_vals_iter = &(patch_10_vals_iter[loop_len]);
+ }
+ }
+ pwcp->fwrite_bufp = &(pwcp->fwrite_bufp[bytes_to_write]);
+ } else {
+ // 1 byte per half-entry
+ // need 2- and 3-byte cases for larger AlleleCode
+ unsigned char* payload = pwcp->fwrite_bufp;
+ if (unlikely(CheckedVrecLenIncr(patch_10_ct * (2 * k1LU), &vrec_len))) {
+ return 1;
+ }
+ const uint32_t patch_10_ct_x2 = patch_10_ct * 2;
+ // hopefully the compiler automatically vectorizes this when
+ // sizeof(AlleleCode) == 1
+ for (uint32_t uii = 0; uii != patch_10_ct_x2; ++uii) {
+ payload[uii] = patch_10_vals[uii] - 1;
+ }
+ pwcp->fwrite_bufp = &(pwcp->fwrite_bufp[patch_10_ct * 2]);
+ }
+ if (genovec_hets_ptr) {
+ // This is somewhat redundant with the earlier steps, but let's get
+ // everything working before doing more optimization.
+ const uint32_t sample_ct = pwcp->sample_ct;
+ const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
+ const Halfword* patch_10_set_alias = DowncastKWToHW(patch_10_set);
+ const AlleleCode* patch_10_vals_iter = patch_10_vals;
+ const AlleleCode* patch_10_vals_end = &(patch_10_vals[patch_10_ct * 2]);
+ uintptr_t* genovec_hets = nullptr;
+ uint32_t het_ct = genovec_het_ct;
+ // todo: try inverting this loop, at least in 64-bit case. perform
+ // movemask-based scan for heterozygous patch_10_vals entries, then
+ // advance widx/detect_10 as needed.
+ for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
+ uint32_t patch_10_hw = patch_10_set_alias[widx];
+ if (patch_10_hw) {
+ do {
+ const AlleleCode val1 = *patch_10_vals_iter++;
+ const AlleleCode val2 = *patch_10_vals_iter++;
+ const uintptr_t lowbit = patch_10_hw & (-patch_10_hw);
+ if (val1 != val2) {
+ // altx/alty with x != y is really a het; lazily copy genovec
+ // the first time we find one.
+ if (!genovec_hets) {
+ genovec_hets = pwcp->genovec_hets_buf;
+ memcpy(genovec_hets, genovec, sample_ctl2 * sizeof(intptr_t));
+ }
+ // clear high bit, set low bit
+ genovec_hets[widx] ^= lowbit * lowbit * 3;
+ ++het_ct;
+ }
+ patch_10_hw ^= lowbit;
+ } while (patch_10_hw);
+ if (patch_10_vals_iter == patch_10_vals_end) {
+ break;
+ }
+ }
+ }
+ *het_ct_ptr = het_ct;
+ if (genovec_hets) {
+ *genovec_hets_ptr = genovec_hets;
+ } else {
+ *genovec_hets_ptr = genovec;
+ }
+ }
+ }
+ *format_byte_ptr = format_byte;
+ // Bit 3 of vrtype flags the presence of the multiallelic patch track.
+ *vrtype_ptr |= 8;
+ *vrec_len_ptr = vrec_len;
+ return 0;
+}
+
+// Appends one multiallelic variant (genovec + sparse patch tracks) and
+// records its vrec_len and vrtype in the per-variant metadata arrays.
+// Returns 1 on vrec_len overflow, in which case pwcp->vidx is NOT advanced;
+// otherwise increments pwcp->vidx and returns 0.
+BoolErr PwcAppendMultiallelicSparse(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t allele_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, PgenWriterCommon* pwcp) {
+ const uint32_t vidx = pwcp->vidx;
+ unsigned char vrtype;
+ uint32_t vrec_len;
+ // Caller doesn't need the het-set, so pass nullptr for those out-params.
+ if (unlikely(PwcAppendMultiallelicMain(genovec, patch_01_set, patch_01_vals, patch_10_set, patch_10_vals, allele_ct, patch_01_ct, patch_10_ct, vidx, pwcp, nullptr, nullptr, &vrtype, &vrec_len))) {
+ return 1;
+ }
+ pwcp->vidx += 1;
+ const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct;
+ SubU32Store(vrec_len, vrec_len_byte_ct, &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]));
+ if (!pwcp->phase_dosage_gflags) {
+ // Packed 4-bit vrtypes when no phase/dosage tracks are in play.
+ pwcp->vrtype_buf[vidx / kBitsPerWordD4] |= S_CAST(uintptr_t, vrtype) << (4 * (vidx % kBitsPerWordD4));
+ } else {
+ DowncastToUc(pwcp->vrtype_buf)[vidx] = vrtype;
+ }
+ return 0;
+}
+
+// Converts a dense wide_codes representation (two AlleleCodes per sample,
+// kMissingAlleleCode = missing) into the sparse genoarr + patch-track
+// representation consumed by the multiallelic append functions.
+// * genoarr gets the 2-bit downcoded genotypes (0=hom-ref, 1=het-ref,
+//   2=altx/alty, 3=missing).
+// * patch_01_set/patch_01_vals receive het-ref calls whose alt code is > 1;
+//   patch_10_set/patch_10_vals receive altx/alty calls whose second code is
+//   > 1 (both codes stored).
+// * Trailing bits of genoarr/patch sets in the final word are zero-filled
+//   via the loop's halfword stores.
+// Assumes sample_ct > 0 (word_ct_m1 computation underflows otherwise —
+// TODO confirm callers guarantee this).
+void PglMultiallelicDenseToSparse(const AlleleCode* __restrict wide_codes, uint32_t sample_ct, uintptr_t* __restrict genoarr, uintptr_t* __restrict patch_01_set, AlleleCode* __restrict patch_01_vals, uintptr_t* __restrict patch_10_set, AlleleCode* __restrict patch_10_vals, uint32_t* __restrict patch_01_ct_ptr, uint32_t* __restrict patch_10_ct_ptr) {
+ const uint32_t word_ct_m1 = (sample_ct - 1) / kBitsPerWordD2;
+ const AlleleCode* wide_codes_iter = wide_codes;
+ Halfword* patch_01_set_alias = DowncastWToHW(patch_01_set);
+ Halfword* patch_10_set_alias = DowncastWToHW(patch_10_set);
+ AlleleCode* patch_01_vals_iter = patch_01_vals;
+ AlleleCode* patch_10_vals_iter = patch_10_vals;
+ uint32_t loop_len = kBitsPerWordD2;
+ for (uint32_t widx = 0; ; ++widx) {
+ if (widx >= word_ct_m1) {
+ if (widx > word_ct_m1) {
+ // Done: patch counts fall out of the iterator positions.
+ *patch_01_ct_ptr = patch_01_vals_iter - patch_01_vals;
+ *patch_10_ct_ptr = S_CAST(uintptr_t, patch_10_vals_iter - patch_10_vals) / 2;
+ return;
+ }
+ loop_len = ModNz(sample_ct, kBitsPerWordD2);
+ }
+ uintptr_t geno_word = 0;
+ uint32_t patch_01_hw = 0;
+ uint32_t patch_10_hw = 0;
+ // ascending loop may be worse than descending here
+ for (uint32_t sample_idx_lowbits = 0; sample_idx_lowbits != loop_len; ++sample_idx_lowbits) {
+ // todo: try making this movemask-based in 64-bit case
+ // parallel checks for equality-to-255, equality-to-0, inequality-to-1
+ // rarealts are inequality-to-1 and (not 0 or 255); can use ctzu32() and
+ // &= x - 1 to iterate
+ // hom-ref iff double-equality to 0
+ // het-ref iff low equality-to-0 bit set, high clear
+ // missing iff equality-to-255 (better be double)
+ // altx-alty otherwise
+ const AlleleCode first_code = *wide_codes_iter++;
+ const AlleleCode second_code = *wide_codes_iter++;
+ uintptr_t cur_geno;
+ if (first_code) {
+ if (first_code == kMissingAlleleCode) {
+ cur_geno = 3;
+ } else {
+ cur_geno = 2;
+ if (second_code > 1) {
+ // altx/alty beyond alt1/alt1: record both codes.
+ patch_10_hw |= 1U << sample_idx_lowbits;
+ *patch_10_vals_iter++ = first_code;
+ *patch_10_vals_iter++ = second_code;
+ }
+ }
+ } else if (!second_code) {
+ // hom-ref: geno bits stay 0, nothing to record.
+ continue;
+ } else {
+ cur_geno = 1;
+ if (second_code > 1) {
+ // het-ref with a rarealt: record the alt code.
+ patch_01_hw |= 1U << sample_idx_lowbits;
+ *patch_01_vals_iter++ = second_code;
+ }
+ }
+ geno_word |= cur_geno << (2 * sample_idx_lowbits);
+ }
+ genoarr[widx] = geno_word;
+ patch_01_set_alias[widx] = patch_01_hw;
+ patch_10_set_alias[widx] = patch_10_hw;
+ }
+}
+
+static_assert(sizeof(AlleleCode) == 1, "PglMultiallelicSparseToDense() needs to be updated.");
+// Inverse of PglMultiallelicDenseToSparse(): expands the sparse
+// genoarr + patch-track representation back to two AlleleCodes per sample in
+// wide_codes, optionally remapping allele codes.
+// * remap == nullptr means identity mapping; otherwise remap[old] = new.
+//   Each output pair is stored with the smaller code first; when a remap
+//   swaps the order of a pair's codes, the corresponding bit of the optional
+//   flipped bitarray is set (flipped may be nullptr if the caller doesn't
+//   care).
+// * genoarr trailing bits need not be clean (the remainder handling below
+//   masks them off).
+void PglMultiallelicSparseToDense(const uintptr_t* __restrict genoarr, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, const AlleleCode* __restrict remap, uint32_t sample_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, uintptr_t* __restrict flipped, AlleleCode* __restrict wide_codes) {
+ if (flipped) {
+ ZeroWArr(BitCtToWordCt(sample_ct), flipped);
+ }
+ if ((!remap) || ((!remap[0]) && (remap[1] == 1))) {
+ // Identity (or identity-on-{0,1}) remap: bulk-expand genoarr with the
+ // precomputed 256-entry lookup table.
+ GenoarrLookup256x2bx4(genoarr, kHcToAlleleCodes, sample_ct, wide_codes);
+ if (!remap) {
+ if (patch_01_ct) {
+ uintptr_t sample_idx_base = 0;
+ uintptr_t cur_bits = patch_01_set[0];
+ AlleleCode* wide_codes1 = &(wide_codes[1]);
+ for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
+ const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits);
+ wide_codes1[2 * sample_idx] = patch_01_vals[uii];
+ }
+ }
+ if (patch_10_ct) {
+ uintptr_t sample_idx_base = 0;
+ uintptr_t cur_bits = patch_10_set[0];
+ // Copy both codes of each pair with one DoubleAlleleCode store.
+ const DoubleAlleleCode* patch_10_vals_alias = R_CAST(const DoubleAlleleCode*, patch_10_vals);
+ DoubleAlleleCode* wide_codes_alias = R_CAST(DoubleAlleleCode*, wide_codes);
+ for (uint32_t uii = 0; uii != patch_10_ct; ++uii) {
+ const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_idx_base, &cur_bits);
+ wide_codes_alias[sample_idx] = patch_10_vals_alias[uii];
+ }
+ }
+ return;
+ }
+ } else {
+ // 0 -> (remap[0], remap[0])
+ // 1 -> (remap[0], remap[1])
+ // 2 -> (remap[1], remap[1])
+ // 3 -> (255, 255)
+ // 256x4 lookup table usually too expensive to reinitialize for every
+ // single variant. 16x2 should be a good compromise?
+ // todo: try Expand1bitTo8 strategy in SSE4.2+ case, see if that's fast
+ // enough to remove 0/1 special case
+ const uint32_t remap0 = remap[0];
+ const uint32_t remap1 = remap[1];
+ uint32_t table4[4];
+ table4[0] = remap0 * 257;
+ table4[1] = remap0 + remap1 * 256;
+ table4[2] = remap1 * 257;
+ table4[3] = 0xffff;
+ if (remap1 < remap0) {
+ // Keep pairs ordered smaller-code-first; hets become (remap1, remap0)
+ // and count as flipped.
+ table4[1] = remap1 + remap0 * 256;
+ if (flipped) {
+ const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
+ PackWordsToHalfwordsInvmatch(genoarr, kMaskAAAA, sample_ctl2, flipped);
+ }
+ }
+ // could have an InitLookup16x2bx2 function for this
+ uint32_t table16[16];
+ for (uint32_t uii = 0; uii != 4; ++uii) {
+ const uint32_t high_bits = table4[uii] << 16;
+ uint32_t* table_row = &(table16[uii * 4]);
+ for (uint32_t ujj = 0; ujj != 4; ++ujj) {
+ table_row[ujj] = high_bits | table4[ujj];
+ }
+ }
+ // Expand 4 genotypes (1 byte of genoarr) per table16 lookup pair.
+ const uint32_t sample_ctd4 = sample_ct / 4;
+ const unsigned char* genoarr_alias = DowncastKToUc(genoarr);
+ uint32_t* wide_codes_alias_iter = R_CAST(uint32_t*, wide_codes);
+ for (uint32_t byte_idx = 0; byte_idx != sample_ctd4; ++byte_idx) {
+ const uint32_t geno_byte = genoarr_alias[byte_idx];
+ *wide_codes_alias_iter++ = table16[geno_byte & 15];
+ *wide_codes_alias_iter++ = table16[geno_byte >> 4];
+ }
+ const uint32_t remainder = sample_ct % 4;
+ if (remainder) {
+ uint32_t geno_byte = genoarr_alias[sample_ctd4];
+ if (remainder & 2) {
+ *wide_codes_alias_iter++ = table16[geno_byte & 15];
+ geno_byte = geno_byte >> 4;
+ }
+ if (remainder & 1) {
+ uint16_t* last_code_pair_ptr = DowncastU32ToU16(wide_codes_alias_iter);
+ // guess it's easy enough to defend against dirty trailing bits for now
+ *last_code_pair_ptr = table4[geno_byte & 3];
+ }
+ }
+ }
+ if (patch_01_vals) {
+ uintptr_t sample_idx_base = 0;
+ uintptr_t cur_bits = patch_01_set[0];
+ const AlleleCode remap0 = remap[0];
+ const AlleleCode remap1 = remap[1];
+ if ((!remap0) || ((!flipped) && (remap0 == 1) && (!remap1))) {
+ // no flips possible
+ AlleleCode* wide_codes1 = &(wide_codes[1]);
+ for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
+ const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits);
+ wide_codes1[2 * sample_idx] = remap[patch_01_vals[uii]];
+ }
+ } else if (!flipped) {
+ for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
+ const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits);
+ const AlleleCode ac = remap[patch_01_vals[uii]];
+ if (ac > remap0) {
+ // bugfix (4 Jul 2024): cannot assume wide_codes[2 * sample_idx] ==
+ // remap0. It's remap1 when remap1 < remap0.
+ wide_codes[2 * sample_idx] = remap0;
+ wide_codes[2 * sample_idx + 1] = ac;
+ } else {
+ wide_codes[2 * sample_idx] = ac;
+ wide_codes[2 * sample_idx + 1] = remap0;
+ }
+ }
+ } else if (remap1 >= remap0) {
+ // flipped-tracking variants: hets default to unflipped here, so only
+ // set the bit when the patched code sorts below remap0.
+ for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
+ const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits);
+ const AlleleCode ac = remap[patch_01_vals[uii]];
+ if (ac > remap0) {
+ wide_codes[2 * sample_idx + 1] = ac;
+ } else {
+ wide_codes[2 * sample_idx] = ac;
+ wide_codes[2 * sample_idx + 1] = remap0;
+ SetBit(sample_idx, flipped);
+ }
+ }
+ } else {
+ // remap1 < remap0: hets were pre-marked flipped above, so clear the
+ // bit for pairs that end up in (remap0, ac) order.
+ for (uint32_t uii = 0; uii != patch_01_ct; ++uii) {
+ const uintptr_t sample_idx = BitIter1(patch_01_set, &sample_idx_base, &cur_bits);
+ const AlleleCode ac = remap[patch_01_vals[uii]];
+ if (ac > remap0) {
+ wide_codes[2 * sample_idx] = remap0;
+ wide_codes[2 * sample_idx + 1] = ac;
+ ClearBit(sample_idx, flipped);
+ } else {
+ wide_codes[2 * sample_idx] = ac;
+ wide_codes[2 * sample_idx + 1] = remap0;
+ }
+ }
+ }
+ }
+ if (patch_10_ct) {
+ uintptr_t sample_idx_base = 0;
+ uintptr_t cur_bits = patch_10_set[0];
+ if (!flipped) {
+ for (uint32_t uii = 0; uii != patch_10_ct; ++uii) {
+ const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_idx_base, &cur_bits);
+ const AlleleCode ac0 = remap[patch_10_vals[2 * uii]];
+ const AlleleCode ac1 = remap[patch_10_vals[2 * uii + 1]];
+ if (ac0 <= ac1) {
+ wide_codes[2 * sample_idx] = ac0;
+ wide_codes[2 * sample_idx + 1] = ac1;
+ } else {
+ wide_codes[2 * sample_idx] = ac1;
+ wide_codes[2 * sample_idx + 1] = ac0;
+ }
+ }
+ } else {
+ for (uint32_t uii = 0; uii != patch_10_ct; ++uii) {
+ const uintptr_t sample_idx = BitIter1(patch_10_set, &sample_idx_base, &cur_bits);
+ const AlleleCode ac0 = remap[patch_10_vals[2 * uii]];
+ const AlleleCode ac1 = remap[patch_10_vals[2 * uii + 1]];
+ if (ac0 <= ac1) {
+ wide_codes[2 * sample_idx] = ac0;
+ wide_codes[2 * sample_idx + 1] = ac1;
+ } else {
+ // Remap reversed the pair's order: store sorted and mark flipped.
+ wide_codes[2 * sample_idx] = ac1;
+ wide_codes[2 * sample_idx + 1] = ac0;
+ SetBit(sample_idx, flipped);
+ }
+ }
+ }
+ }
+}
+
+// tolerates extraneous phaseinfo bits
+BoolErr AppendHphase(const uintptr_t* __restrict genoarr_hets, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, uint32_t het_ct, uint32_t phasepresent_ct, PgenWriterCommon* pwcp, unsigned char* vrtype_ptr, uint32_t* vrec_len_ptr) {
+ assert(phasepresent_ct);
+ const uint32_t sample_ct = pwcp->sample_ct;
+ *vrtype_ptr += 16;
+ const uint32_t het_ctp1_8 = 1 + (het_ct / CHAR_BIT);
+ unsigned char* fwrite_bufp_iter = pwcp->fwrite_bufp;
+ uintptr_t phaseinfo_write_word = 0;
+ uint32_t phaseinfo_write_idx_lowbits;
+ if (het_ct == phasepresent_ct) {
+ // no need to write phasepresent; just write phaseinfo directly to output
+ // buffer
+ if (unlikely(CheckedVrecLenIncr(het_ctp1_8, vrec_len_ptr))) {
+ return 1;
+ }
+ CopyGenomatchSubset(phaseinfo, genoarr_hets, kMask5555, 1, het_ct, pwcp->fwrite_bufp);
+ fwrite_bufp_iter = &(pwcp->fwrite_bufp[het_ctp1_8]);
+ } else {
+ // this is a minor variant of ExpandThenSubsetBytearr()
+ if (unlikely(CheckedVrecLenIncr(het_ctp1_8 + DivUp(phasepresent_ct, 8), vrec_len_ptr))) {
+ return 1;
+ }
+ const uint32_t sample_ctl2 = NypCtToWordCt(sample_ct);
+ uintptr_t* phaseinfo_tmp = pwcp->genovec_invert_buf;
+ uintptr_t* phaseinfo_tmp_iter = phaseinfo_tmp;
+ uint32_t phasepresent_write_idx_lowbits = 1;
+ phaseinfo_write_idx_lowbits = 0;
+ uintptr_t phasepresent_write_word = 1; // first bit set
+ for (uint32_t widx = 0; widx != sample_ctl2; ++widx) {
+ const uintptr_t geno_word = genoarr_hets[widx];
+ uintptr_t geno_hets = Word01(geno_word);
+ if (geno_hets) {
+ const uint32_t phasepresent_halfword = DowncastKWToHW(phasepresent)[widx];
+ if (phasepresent_halfword) {
+ const uint32_t phaseinfo_halfword = DowncastKWToHW(phaseinfo)[widx];
+ do {
+ const uint32_t sample_idx_lowbits = ctzw(geno_hets) / 2;
+ if ((phasepresent_halfword >> sample_idx_lowbits) & 1) {
+ phasepresent_write_word |= k1LU << phasepresent_write_idx_lowbits;
+ phaseinfo_write_word |= S_CAST(uintptr_t, (phaseinfo_halfword >> sample_idx_lowbits) & k1LU) << phaseinfo_write_idx_lowbits;
+ if (++phaseinfo_write_idx_lowbits == kBitsPerWord) {
+ *phaseinfo_tmp_iter++ = phaseinfo_write_word;
+ phaseinfo_write_word = 0;
+ phaseinfo_write_idx_lowbits = 0;
+ }
+ }
+ if (++phasepresent_write_idx_lowbits == kBitsPerWord) {
+ AppendW(phasepresent_write_word, &fwrite_bufp_iter);
+ phasepresent_write_word = 0;
+ phasepresent_write_idx_lowbits = 0;
+ }
+ geno_hets &= geno_hets - 1;
+ } while (geno_hets);
+ } else {
+ phasepresent_write_idx_lowbits += PopcountWord(geno_hets);
+ if (phasepresent_write_idx_lowbits >= kBitsPerWord) {
+ AppendW(phasepresent_write_word, &fwrite_bufp_iter);
+ phasepresent_write_word = 0;
+ phasepresent_write_idx_lowbits -= kBitsPerWord;
+ }
+ }
+ }
+ }
+ if (phasepresent_write_idx_lowbits) {
+ const uint32_t cur_byte_ct = DivUp(phasepresent_write_idx_lowbits, CHAR_BIT);
+ // er, safe to write the entire word...
+ SubwordStoreMov(phasepresent_write_word, cur_byte_ct, &fwrite_bufp_iter);
+ }
+ fwrite_bufp_iter = memcpyua(fwrite_bufp_iter, phaseinfo_tmp, sizeof(intptr_t) * (phaseinfo_tmp_iter - phaseinfo_tmp));
+ if (phaseinfo_write_idx_lowbits) {
+ const uint32_t cur_byte_ct = DivUp(phaseinfo_write_idx_lowbits, CHAR_BIT);
+ SubwordStoreMov(phaseinfo_write_word, cur_byte_ct, &fwrite_bufp_iter);
+ }
+ assert(S_CAST(uintptr_t, fwrite_bufp_iter - pwcp->fwrite_bufp) == het_ctp1_8 + DivUp(phasepresent_ct, 8));
+ }
+ pwcp->fwrite_bufp = fwrite_bufp_iter;
+ return 0;
+}
+
+void PwcAppendBiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, PgenWriterCommon* pwcp) {
+ // assumes phase_dosage_gflags is nonzero
+ const uint32_t vidx = pwcp->vidx;
+ unsigned char* vrtype_dest = &(DowncastToUc(pwcp->vrtype_buf)[vidx]);
+ uint32_t het_ct;
+ uint32_t vrec_len = PwcAppendBiallelicGenovecMain(genovec, vidx, pwcp, &het_ct, nullptr, vrtype_dest);
+ const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct;
+ const uint32_t sample_ct = pwcp->sample_ct;
+ const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
+ pwcp->vidx += 1;
+ unsigned char* vrec_len_dest = &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]);
+ const uint32_t phasepresent_ct = phasepresent? PopcountWords(phasepresent, sample_ctl) : het_ct;
+ if (phasepresent_ct) {
+ AppendHphase(genovec, phasepresent, phaseinfo, het_ct, phasepresent_ct, pwcp, vrtype_dest, &vrec_len);
+ }
+ SubU32Store(vrec_len, vrec_len_byte_ct, vrec_len_dest);
+}
+
+BoolErr PwcAppendMultiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, uint32_t allele_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, PgenWriterCommon* pwcp) {
+ const uint32_t vidx = pwcp->vidx;
+ const uintptr_t* genovec_hets;
+ unsigned char vrtype;
+ uint32_t het_ct;
+ uint32_t vrec_len;
+ if (unlikely(PwcAppendMultiallelicMain(genovec, patch_01_set, patch_01_vals, patch_10_set, patch_10_vals, allele_ct, patch_01_ct, patch_10_ct, vidx, pwcp, &genovec_hets, &het_ct, &vrtype, &vrec_len))) {
+ return 1;
+ }
+ const uint32_t sample_ct = pwcp->sample_ct;
+ const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
+ const uint32_t phasepresent_ct = phasepresent? PopcountWords(phasepresent, sample_ctl) : het_ct;
+ unsigned char* vrtype_dest = &(DowncastToUc(pwcp->vrtype_buf)[vidx]);
+ *vrtype_dest = vrtype;
+ if (phasepresent_ct) {
+ if (unlikely(AppendHphase(genovec_hets, phasepresent, phaseinfo, het_ct, phasepresent_ct, pwcp, vrtype_dest, &vrec_len))) {
+ return 1;
+ }
+ }
+ pwcp->vidx += 1;
+ const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct;
+ SubU32Store(vrec_len, vrec_len_byte_ct, &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]));
+ return 0;
+}
+
+// ok if dosage_present trailing bits set
+BoolErr AppendDosage16(const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, uint32_t dphase_ct, PgenWriterCommon* pwcp, unsigned char* vrtype_ptr, uint32_t* vrec_len_ptr) {
+ const uint32_t sample_ct = pwcp->sample_ct;
+ const uint32_t max_deltalist_entry_ct = sample_ct / kPglMaxDeltalistLenDivisor;
+ if (dosage_ct < max_deltalist_entry_ct) {
+ // case 1: store dosage IDs as deltalist.
+ if (unlikely(PwcAppendDeltalist(dosage_present, dosage_ct, pwcp, vrec_len_ptr))) {
+ return 1;
+ }
+ *vrtype_ptr += 0x20;
+ } else if ((dosage_ct == sample_ct) && ((!dphase_ct) || (dphase_ct == sample_ct))) {
+ // case 2: fixed-width, no need to store dosage IDs at all.
+ // dosage_main permitted to have 65535 = missing
+ // bugfix (9 Dec 2018): since this forces fixed-width dosage-phase storage,
+ // also require dphase_ct == 0 or dphase_ct == sample_ct.
+ *vrtype_ptr += 0x40;
+ } else {
+ // case 3: save dosage_present bitarray directly.
+ const uint32_t sample_ctb = DivUp(sample_ct, CHAR_BIT);
+ if (unlikely(CheckedVrecLenIncr(sample_ctb, vrec_len_ptr))) {
+ return 1;
+ }
+ pwcp->fwrite_bufp = memcpyua(pwcp->fwrite_bufp, dosage_present, sample_ctb);
+ *vrtype_ptr += 0x60;
+ }
+ if (unlikely(CheckedVrecLenIncr(dosage_ct * sizeof(int16_t), vrec_len_ptr))) {
+ return 1;
+ }
+ pwcp->fwrite_bufp = memcpyua(pwcp->fwrite_bufp, dosage_main, dosage_ct * sizeof(int16_t));
+ return 0;
+}
+
+// ok if dosage_present trailing bits set
+BoolErr PwcAppendBiallelicGenovecDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, PgenWriterCommon* pwcp) {
+ // safe to call this even when entire file has no phase/dosage info
+ const uint32_t vidx = pwcp->vidx;
+ unsigned char vrtype;
+ uint32_t vrec_len = PwcAppendBiallelicGenovecMain(genovec, vidx, pwcp, nullptr, nullptr, &vrtype);
+ const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct;
+ pwcp->vidx += 1;
+ unsigned char* vrec_len_dest = &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]);
+ if (dosage_ct) {
+ if (unlikely(AppendDosage16(dosage_present, dosage_main, dosage_ct, 0, pwcp, &vrtype, &vrec_len))) {
+ return 1;
+ }
+ }
+ SubU32Store(vrec_len, vrec_len_byte_ct, vrec_len_dest);
+ if (!pwcp->phase_dosage_gflags) {
+ pwcp->vrtype_buf[vidx / kBitsPerWordD4] |= S_CAST(uintptr_t, vrtype) << (4 * (vidx % kBitsPerWordD4));
+ } else {
+ DowncastToUc(pwcp->vrtype_buf)[vidx] = vrtype;
+ }
+ return 0;
+}
+
+// NOT ok if phasepresent trailing bits set, but ok for dosage_present
+BoolErr PwcAppendBiallelicGenovecHphaseDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, PgenWriterCommon* pwcp) {
+ // assumes there is phase and/or dosage data in output file, otherwise
+ // vrtype_dest needs to be replaced
+
+ // this mostly overlaps with PwcAppendBiallelicGenovecHphase(); probably
+ // get rid of the latter
+ const uint32_t vidx = pwcp->vidx;
+ unsigned char* vrtype_dest = &(DowncastToUc(pwcp->vrtype_buf)[vidx]);
+ uint32_t het_ct;
+ uint32_t vrec_len = PwcAppendBiallelicGenovecMain(genovec, vidx, pwcp, &het_ct, nullptr, vrtype_dest);
+ const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct;
+ const uint32_t sample_ct = pwcp->sample_ct;
+ const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
+ pwcp->vidx += 1;
+ unsigned char* vrec_len_dest = &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]);
+ const uint32_t phasepresent_ct = phasepresent? PopcountWords(phasepresent, sample_ctl) : het_ct;
+ if (phasepresent_ct) {
+ AppendHphase(genovec, phasepresent, phaseinfo, het_ct, phasepresent_ct, pwcp, vrtype_dest, &vrec_len);
+ }
+ if (dosage_ct) {
+ if (unlikely(AppendDosage16(dosage_present, dosage_main, dosage_ct, 0, pwcp, vrtype_dest, &vrec_len))) {
+ return 1;
+ }
+ }
+ SubU32Store(vrec_len, vrec_len_byte_ct, vrec_len_dest);
+ return 0;
+}
+
+BoolErr AppendDphase16(const uintptr_t* __restrict dosage_present, const uintptr_t* __restrict dphase_present, const int16_t* dphase_delta, uint32_t dosage_ct, uint32_t dphase_ct, PgenWriterCommon* pwcp, unsigned char* vrtype_ptr, uint32_t* vrec_len_ptr) {
+ assert(dphase_ct);
+ *vrtype_ptr += 0x80;
+ if (dphase_ct != pwcp->sample_ct) {
+ // bugfix (9 Dec 2018): forgot to separate fixed-width phased-dosage case
+ // bugfix (12 Feb 2019): double-added dphase_ct * sizeof(int16_t)
+ const uint32_t dphase_present_byte_ct = DivUp(dosage_ct, CHAR_BIT);
+ if (unlikely(CheckedVrecLenIncr(dphase_present_byte_ct, vrec_len_ptr))) {
+ return 1;
+ }
+ CopyBitarrSubsetToUnaligned(dphase_present, dosage_present, dosage_ct, pwcp->fwrite_bufp);
+ pwcp->fwrite_bufp = &(pwcp->fwrite_bufp[dphase_present_byte_ct]);
+ }
+ // bugfix (2 Jan 2019): forgot to update vrec_len here
+ if (unlikely(CheckedVrecLenIncr(dphase_ct * sizeof(int16_t), vrec_len_ptr))) {
+ return 1;
+ }
+ pwcp->fwrite_bufp = memcpyua(pwcp->fwrite_bufp, dphase_delta, dphase_ct * sizeof(int16_t));
+ return 0;
+}
+
+BoolErr PwcAppendBiallelicGenovecDphase16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uintptr_t* __restrict dphase_present, const uint16_t* dosage_main, const int16_t* dphase_delta, uint32_t dosage_ct, uint32_t dphase_ct, PgenWriterCommon* pwcp) {
+ // assumes there is phase and/or dosage data in output file, otherwise
+ // vrtype_dest needs to be replaced
+ const uint32_t vidx = pwcp->vidx;
+ unsigned char* vrtype_dest = &(DowncastToUc(pwcp->vrtype_buf)[vidx]);
+ uint32_t het_ct;
+ uint32_t vrec_len = PwcAppendBiallelicGenovecMain(genovec, vidx, pwcp, &het_ct, nullptr, vrtype_dest);
+ const uintptr_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct;
+ const uint32_t sample_ct = pwcp->sample_ct;
+ const uint32_t sample_ctl = BitCtToWordCt(sample_ct);
+ pwcp->vidx += 1;
+ unsigned char* vrec_len_dest = &(pwcp->vrec_len_buf[vidx * vrec_len_byte_ct]);
+ const uint32_t phasepresent_ct = phasepresent? PopcountWords(phasepresent, sample_ctl) : het_ct;
+ if (phasepresent_ct) {
+ AppendHphase(genovec, phasepresent, phaseinfo, het_ct, phasepresent_ct, pwcp, vrtype_dest, &vrec_len);
+ }
+ if (dosage_ct) {
+ if (unlikely(AppendDosage16(dosage_present, dosage_main, dosage_ct, dphase_ct, pwcp, vrtype_dest, &vrec_len))) {
+ return 1;
+ }
+ if (dphase_ct) {
+ if (unlikely(AppendDphase16(dosage_present, dphase_present, dphase_delta, dosage_ct, dphase_ct, pwcp, vrtype_dest, &vrec_len))) {
+ return 1;
+ }
+ }
+ }
+ SubU32Store(vrec_len, vrec_len_byte_ct, vrec_len_dest);
+ return 0;
+}
+
+uint32_t PhaseOrDosagePresent(const uintptr_t* vrtype_buf, uint32_t variant_ct) {
+ // Assumes trailing bits of vrtype_buf are zeroed out. (This happens during
+ // vrtype_buf initialization.)
+ const uint32_t word_ct = DivUp(variant_ct, kBytesPerWord);
+ for (uint32_t widx = 0; widx != word_ct; ++widx) {
+ if (vrtype_buf[widx] & (kMask0101 * 0xf0)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+// ext_iter == nullptr ok
+IntErr AppendExtVarint(const PgenExtensionLl* ext_iter, FILE* pgen) {
+ int32_t cur_outbyte = 0;
+ uint32_t flushed_byte_ct = 0;
+ while (ext_iter) {
+ const uint32_t cur_type_idx = ext_iter->type_idx;
+ const uint32_t cur_type_idx_div7 = cur_type_idx / 7;
+ const uint32_t bytes_to_flush = cur_type_idx_div7 - flushed_byte_ct;
+ if (bytes_to_flush) {
+ putc_unlocked(cur_outbyte | 128, pgen);
+ for (uint32_t uii = 1; uii != bytes_to_flush; ++uii) {
+ putc_unlocked(128, pgen);
+ }
+ cur_outbyte = 0;
+ flushed_byte_ct = cur_type_idx_div7;
+ }
+ cur_outbyte |= 1 << (cur_type_idx % 7);
+
+ ext_iter = ext_iter->next;
+ }
+ return putc_checked(cur_outbyte, pgen);
+}
+
+PglErr AppendExtSizesAndContents(const PgenExtensionLl* exts, FILE* pgen) {
+ const PgenExtensionLl* exts_iter = exts;
+ do {
+ const uint64_t size = exts_iter->size;
+ if (unlikely(size >= (1LLU << (kBitsPerWord - 1)))) {
+ return kPglRetImproperFunctionCall;
+ }
+ FPutVint64(size, pgen);
+ exts_iter = exts_iter->next;
+ } while (exts_iter);
+ exts_iter = exts;
+ do {
+ if (unlikely(fwrite_checked(exts_iter->contents, exts_iter->size, pgen))) {
+ return kPglRetWriteFail;
+ }
+ exts_iter = exts_iter->next;
+ } while (exts_iter);
+ return kPglRetSuccess;
+}
+
+PglErr PwcFinish(PgenWriterCommon* pwcp, FILE** pgen_outfile_ptr, FILE** pgi_or_final_pgen_outfile_ptr, char** fname_buf_ptr) {
+ uint64_t footer_fpos = 0;
+ if (pwcp->footer_exts) {
+ footer_fpos = ftello(*pgen_outfile_ptr);
+ if (!(*fname_buf_ptr)) {
+ PglErr reterr = AppendExtSizesAndContents(pwcp->footer_exts, *pgen_outfile_ptr);
+ if (unlikely(reterr)) {
+ return reterr;
+ }
+ }
+ }
+ const uint32_t variant_ct = pwcp->vidx;
+ FILE** header_ff_ptr;
+ if (*pgi_or_final_pgen_outfile_ptr) {
+ // kPgenWriteSeparateIndex, kPgenWriteAndCopy
+ assert(variant_ct);
+ assert(pwcp->variant_ct_limit >= variant_ct);
+ if (pwcp->phase_dosage_gflags) {
+ // Check if there is at least one variant with a phase or dosage data
+ // track.
+ if (!PhaseOrDosagePresent(pwcp->vrtype_buf, variant_ct)) {
+ // If not, shrink the index.
+ Reduce8to4bitInplaceUnsafe(variant_ct, pwcp->vrtype_buf);
+ pwcp->phase_dosage_gflags = kfPgenGlobal0;
+ }
+ }
+ if (unlikely(fclose_null(pgen_outfile_ptr))) {
+ return kPglRetWriteFail;
+ }
+ header_ff_ptr = pgi_or_final_pgen_outfile_ptr;
+ if (*fname_buf_ptr) {
+ *pgen_outfile_ptr = fopen(*fname_buf_ptr, FOPEN_RB);
+ if (unlikely(!(*pgen_outfile_ptr))) {
+ return kPglRetOpenFail;
+ }
+ }
+ } else {
+ // kPgenWriteBackwardSeek
+ assert(pwcp->variant_ct_limit == variant_ct);
+ header_ff_ptr = pgen_outfile_ptr;
+ if (unlikely(fseeko(*header_ff_ptr, 3, SEEK_SET))) {
+ return kPglRetWriteFail;
+ }
+ }
+ FILE* header_ff = *header_ff_ptr;
+ const PgenGlobalFlags phase_dosage_gflags = pwcp->phase_dosage_gflags;
+ fwrite_unlocked(&variant_ct, sizeof(int32_t), 1, header_ff);
+ fwrite_unlocked(&(pwcp->sample_ct), sizeof(int32_t), 1, header_ff);
+
+ const uint32_t vrec_len_byte_ct = pwcp->vrec_len_byte_ct;
+ const uint32_t control_byte = (vrec_len_byte_ct - 1) + (4 * (phase_dosage_gflags != 0)) + (pwcp->nonref_flags_storage << 6);
+ putc_unlocked(control_byte, header_ff);
+
+ const uint32_t vblock_ct = DivUp(variant_ct, kPglVblockSize);
+ if (*fname_buf_ptr) {
+ // Current vblock_fpos values are with no index. Determine the size of the
+ // index and add it to all array entries.
+
+ // variant_ct, sample_ct, control_byte, vblock_fpos itself
+ uintptr_t index_size = 9 + vblock_ct * sizeof(int64_t);
+ // vrtypes
+ if (phase_dosage_gflags != 0) {
+ index_size += variant_ct;
+ } else {
+ index_size += DivUp(variant_ct, 2);
+ }
+ // vrec_lens
+ index_size += vrec_len_byte_ct * S_CAST(uintptr_t, variant_ct);
+ // nonref_flags
+ if (pwcp->explicit_nonref_flags) {
+ index_size += DivUp(variant_ct, CHAR_BIT);
+ }
+ for (uint32_t vblock_idx = 0; vblock_idx != vblock_ct; ++vblock_idx) {
+ pwcp->vblock_fpos[vblock_idx] += index_size;
+ }
+ }
+ fwrite_unlocked(pwcp->vblock_fpos, vblock_ct * sizeof(int64_t), 1, header_ff);
+ const unsigned char* vrtype_buf_iter = DowncastToUc(pwcp->vrtype_buf);
+ const unsigned char* vrec_len_buf_iter = pwcp->vrec_len_buf;
+ uint32_t vrec_iter_incr = kPglVblockSize * vrec_len_byte_ct;
+ uint32_t vrtype_buf_iter_incr = phase_dosage_gflags? kPglVblockSize : (kPglVblockSize / 2);
+ uint32_t nonref_flags_write_byte_ct = kPglVblockSize / CHAR_BIT;
+ const unsigned char* vrec_len_buf_last = &(vrec_len_buf_iter[S_CAST(uintptr_t, vblock_ct - 1) * vrec_iter_incr]);
+ uintptr_t* explicit_nonref_flags = pwcp->explicit_nonref_flags;
+ uintptr_t* explicit_nonref_flags_iter = explicit_nonref_flags;
+ for (; ; vrec_len_buf_iter = &(vrec_len_buf_iter[vrec_iter_incr])) {
+ if (vrec_len_buf_iter >= vrec_len_buf_last) {
+ if (vrec_len_buf_iter > vrec_len_buf_last) {
+ break;
+ }
+ const uint32_t vblock_size = ModNz(variant_ct, kPglVblockSize);
+ vrtype_buf_iter_incr = phase_dosage_gflags? vblock_size : DivUp(vblock_size, 2);
+ vrec_iter_incr = vblock_size * vrec_len_byte_ct;
+ nonref_flags_write_byte_ct = DivUp(vblock_size, CHAR_BIT);
+ }
+ // 4b(i): array of 4-bit or 1-byte vrtypes
+ fwrite_unlocked(vrtype_buf_iter, vrtype_buf_iter_incr, 1, header_ff);
+ vrtype_buf_iter = &(vrtype_buf_iter[vrtype_buf_iter_incr]);
+
+ // 4b(ii): array of variant record lengths
+ if (unlikely(fwrite_checked(vrec_len_buf_iter, vrec_iter_incr, header_ff))) {
+ return kPglRetWriteFail;
+ }
+
+ // If we were writing allele_idx_offsets, that would happen here.
+
+ // 4b(iii): explicit nonref flags
+ if (explicit_nonref_flags) {
+ if (unlikely(fwrite_checked(explicit_nonref_flags_iter, nonref_flags_write_byte_ct, header_ff))) {
+ return kPglRetWriteFail;
+ }
+ explicit_nonref_flags_iter = &(explicit_nonref_flags_iter[kPglVblockSize / kBitsPerWord]);
+ }
+ }
+ if (pwcp->header_exts || pwcp->footer_exts) {
+ if (unlikely(AppendExtVarint(pwcp->header_exts, header_ff) ||
+ AppendExtVarint(pwcp->footer_exts, header_ff))) {
+ return kPglRetWriteFail;
+ }
+ if (pwcp->footer_exts) {
+ if (*fname_buf_ptr) {
+ // In kPgenWriteAndCopy mode, initial footer_fpos value is relative to
+ // the start of the .pgen body. We now need to add the header size.
+ footer_fpos += ftello(header_ff) + sizeof(int64_t);
+ const PgenExtensionLl* header_exts_iter = pwcp->header_exts;
+ while (header_exts_iter) {
+ const uintptr_t cur_size = header_exts_iter->size;
+ footer_fpos += cur_size + VintBytect(cur_size);
+ header_exts_iter = header_exts_iter->next;
+ }
+ }
+ if (unlikely(!fwrite_unlocked(&footer_fpos, sizeof(int64_t), 1, header_ff))) {
+ return kPglRetWriteFail;
+ }
+ }
+ if (pwcp->header_exts) {
+ PglErr reterr = AppendExtSizesAndContents(pwcp->header_exts, header_ff);
+ if (unlikely(reterr)) {
+ return reterr;
+ }
+ }
+ }
+ if (!(*fname_buf_ptr)) {
+ return fclose_null(header_ff_ptr)? kPglRetWriteFail : kPglRetSuccess;
+ }
+ // Append temporary-.pgen body (excluding the 3 bytes containing the leading
+ // magic number) to final output file.
+ unsigned char copybuf[kPglFwriteBlockSize + 3];
+ uintptr_t nbyte = fread(copybuf, 1, kPglFwriteBlockSize + 3, *pgen_outfile_ptr);
+ if (unlikely((nbyte <= 3) || (!memequal_k(copybuf, "l\x1b", 2)))) {
+ return kPglRetRewindFail;
+ }
+ nbyte -= 3;
+ if (unlikely(!fwrite_unlocked(&(copybuf[3]), nbyte, 1, header_ff))) {
+ return kPglRetWriteFail;
+ }
+ while (1) {
+ nbyte = fread(copybuf, 1, kPglFwriteBlockSize, *pgen_outfile_ptr);
+ if (!nbyte) {
+ break;
+ }
+ if (unlikely(!fwrite_unlocked(copybuf, nbyte, 1, header_ff))) {
+ return kPglRetWriteFail;
+ }
+ }
+ if (pwcp->footer_exts) {
+ PglErr reterr = AppendExtSizesAndContents(pwcp->footer_exts, header_ff);
+ if (unlikely(reterr)) {
+ return kPglRetWriteFail;
+ }
+ }
+ if (unlikely(fclose_null(pgen_outfile_ptr) ||
+ unlink(*fname_buf_ptr) ||
+ fclose_null(header_ff_ptr))) {
+ return kPglRetWriteFail;
+ }
+ free(*fname_buf_ptr);
+ *fname_buf_ptr = nullptr;
+ return kPglRetSuccess;
+}
+
+PglErr SpgwFinish(STPgenWriter* spgwp) {
+ PgenWriterCommon* pwcp = GetPwcp(spgwp);
+ FILE** pgen_outfilep = GetPgenOutfilep(spgwp);
+ if (unlikely(fwrite_checked(pwcp->fwrite_buf, pwcp->fwrite_bufp - pwcp->fwrite_buf, *pgen_outfilep))) {
+ return kPglRetWriteFail;
+ }
+ FILE** pgi_or_final_pgen_outfilep = GetPgiOrFinalPgenOutfilep(spgwp);
+ char** fname_bufp = GetFnameBufp(spgwp);
+ return PwcFinish(pwcp, pgen_outfilep, pgi_or_final_pgen_outfilep, fname_bufp);
+}
+
+PglErr MpgwFlush(MTPgenWriter* mpgwp) {
+ PgenWriterCommon* pwcp = mpgwp->pwcs[0];
+ uint32_t vidx = RoundDownPow2(pwcp->vidx - 1, kPglVblockSize);
+ uint32_t thread_ct = mpgwp->thread_ct;
+ const uint32_t variant_ct = pwcp->variant_ct_limit;
+ const uint32_t is_last_flush = ((vidx + thread_ct * kPglVblockSize) >= variant_ct);
+ if (is_last_flush) {
+ thread_ct = DivUp(variant_ct - vidx, kPglVblockSize);
+ }
+ uint64_t* vblock_fpos = pwcp->vblock_fpos;
+ FILE* pgen_outfile = mpgwp->pgen_outfile;
+ const uint32_t vidx_incr = (thread_ct - 1) * kPglVblockSize;
+ uint64_t cur_vblock_fpos = ftello(pgen_outfile);
+ for (uint32_t tidx = 0; tidx != thread_ct; ++tidx) {
+ vblock_fpos[(vidx / kPglVblockSize) + tidx] = cur_vblock_fpos;
+ PgenWriterCommon* cur_pwcp = mpgwp->pwcs[tidx];
+ uintptr_t cur_vblock_byte_ct = cur_pwcp->fwrite_bufp - cur_pwcp->fwrite_buf;
+ if (unlikely(fwrite_checked(cur_pwcp->fwrite_buf, cur_vblock_byte_ct, pgen_outfile))) {
+ return kPglRetWriteFail;
+ }
+ cur_pwcp->vidx += vidx_incr;
+ cur_pwcp->fwrite_bufp = cur_pwcp->fwrite_buf;
+ cur_vblock_fpos += cur_vblock_byte_ct;
+ }
+ if (!is_last_flush) {
+ return kPglRetSuccess;
+ }
+ pwcp->vidx = variant_ct;
+ return PwcFinish(pwcp, &(mpgwp->pgen_outfile), &(mpgwp->pgi_or_final_pgen_outfile), &(mpgwp->fname_buf));
+}
+
+BoolErr CleanupSpgw(STPgenWriter* spgwp, PglErr* reterrp) {
+ // assume file is open if spgw.pgen_outfile is not null
+ // memory is the responsibility of the caller for now
+ FILE** pgen_outfilep = GetPgenOutfilep(spgwp);
+ FILE** pgi_or_final_pgen_outfilep = GetPgiOrFinalPgenOutfilep(spgwp);
+ char** fname_bufp = GetFnameBufp(spgwp);
+ BoolErr fclose_err = 0;
+ if (*pgi_or_final_pgen_outfilep) {
+ fclose_err = fclose_null(pgi_or_final_pgen_outfilep);
+ }
+ if (*pgen_outfilep) {
+ if (fclose_null(pgen_outfilep)) {
+ fclose_err = 1;
+ }
+ }
+ if (*fname_bufp) {
+ free(*fname_bufp);
+ *fname_bufp = nullptr;
+ }
+ if (unlikely(fclose_err && (*reterrp == kPglRetSuccess))) {
+ *reterrp = kPglRetWriteFail;
+ }
+ return fclose_err;
+}
+
+BoolErr CleanupMpgw(MTPgenWriter* mpgwp, PglErr* reterrp) {
+ if (!mpgwp) {
+ return 0;
+ }
+ FILE** pgen_outfilep = &(mpgwp->pgen_outfile);
+ FILE** pgi_or_final_pgen_outfilep = &(mpgwp->pgi_or_final_pgen_outfile);
+ char** fname_bufp = &(mpgwp->fname_buf);
+ BoolErr fclose_err = 0;
+ if (*pgi_or_final_pgen_outfilep) {
+ fclose_err = fclose_null(pgi_or_final_pgen_outfilep);
+ }
+ if (*pgen_outfilep) {
+ if (fclose_null(pgen_outfilep)) {
+ fclose_err = 1;
+ }
+ }
+ if (*fname_bufp) {
+ free(*fname_bufp);
+ *fname_bufp = nullptr;
+ }
+ if (unlikely(fclose_err && (*reterrp == kPglRetSuccess))) {
+ *reterrp = kPglRetWriteFail;
+ }
+ return fclose_err;
+}
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
diff --git a/external_libs/pgenlib/include/pgenlib_write.h b/external_libs/pgenlib/include/pgenlib_write.h
new file mode 100644
index 00000000..801b85b5
--- /dev/null
+++ b/external_libs/pgenlib/include/pgenlib_write.h
@@ -0,0 +1,393 @@
+#ifndef __PGENLIB_WRITE_H__
+#define __PGENLIB_WRITE_H__
+
+// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell,
+// Christopher Chang.
+//
+// This library is free software: you can redistribute it and/or modify it
+// under the terms of the GNU Lesser General Public License as published by the
+// Free Software Foundation; either version 3 of the License, or (at your
+// option) any later version.
+//
+// This library is distributed in the hope that it will be useful, but WITHOUT
+// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+// FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License
+// for more details.
+//
+// You should have received a copy of the GNU Lesser General Public License
+// along with this library. If not, see <http://www.gnu.org/licenses/>.
+
+
+// pgenlib_write contains writer-specific code.
+
+#include "pgenlib_misc.h"
+#include "plink2_base.h"
+
+#ifdef __cplusplus
+namespace plink2 {
+#endif
+
+typedef struct PgenWriterCommonStruct {
+ // was marked noncopyable, but, well, gcc 9 caught me cheating (memcpying the
+ // whole struct) in the multithreaded writer implementation. So, copyable
+ // now.
+
+ // This must be the true variant count when writing a regular .pgen; but if
+ // the index is being written to a separate .pgen.pgi file, it is okay for
+ // this to just be an upper bound.
+ uint32_t variant_ct_limit;
+
+ uint32_t sample_ct;
+ PgenGlobalFlags phase_dosage_gflags; // subset of gflags
+ unsigned char nonref_flags_storage;
+
+ // I'll cache this for now
+ uintptr_t vrec_len_byte_ct;
+
+ // There should be a single copy of these arrays shared by all threads.
+ uint64_t* vblock_fpos;
+ unsigned char* vrec_len_buf;
+ uintptr_t* vrtype_buf;
+ uintptr_t* explicit_nonref_flags; // usually nullptr
+
+ // needed for multiallelic-phased case
+ uintptr_t* genovec_hets_buf;
+
+ PgenExtensionLl* header_exts;
+ PgenExtensionLl* footer_exts;
+
+ // should match ftello() return value in singlethreaded case, but be set to
+ // zero in multithreaded case
+ uint64_t vblock_fpos_offset;
+
+
+ // The remainder of this data structure is unshared.
+ STD_ARRAY_DECL(uint32_t, 4, ldbase_genocounts);
+
+ // these must hold sample_ct entries
+ // genovec_invert_buf also used as phaseinfo and dphase_present temporary
+ // storage
+ uintptr_t* genovec_invert_buf;
+ uintptr_t* ldbase_genovec;
+
+ // these must hold 2 * (sample_ct / kPglMaxDifflistLenDivisor) entries
+ uintptr_t* ldbase_raregeno;
+ uint32_t* ldbase_difflist_sample_ids; // 1 extra entry, == sample_ct
+
+ // this must fit 64k variants in multithreaded case
+ unsigned char* fwrite_buf;
+ unsigned char* fwrite_bufp;
+
+ uint32_t ldbase_common_geno; // UINT32_MAX if ldbase_genovec present
+ uint32_t ldbase_difflist_len;
+
+ uint32_t vidx;
+} PgenWriterCommon;
+
+// Given packed arrays of unphased biallelic genotypes in uncompressed plink2
+// binary format (00 = hom ref, 01 = het ref/alt1, 10 = hom alt1, 11 =
+// missing), {S,M}TPgenWriter performs difflist (sparse variant), one bit
+// (mostly-two-value), and LD compression before writing to disk, and flushes
+// the header at the end.
+//
+// The flush requires a backward seek if write_mode is set to
+// kPgenWriteBackwardSeek during initialization. Otherwise, the (possibly
+// temporary) .pgen is written sequentially; in the kPgenWriteSeparateIndex
+// mode, the index is then saved to a separate .pgen.pgi file, while in the
+// kPgenWriteAndCopy mode, the real .pgen is only created at the end, by
+// writing the index and then appending the body of the first .pgen (which is
+// then deleted).
+//
+// Slightly over 128 KiB (kPglFwriteBlockSize) of stack space is currently
+// required. We use plain fwrite() instead of write() to reduce the amount of
+// necessary platform-specific logic; a 128 KiB write-block size keeps the
+// associated performance penalty to a minimum.
+//
+// MTPgenWriter has additional restrictions:
+// * You must process large blocks of variants at a time (64k per thread).
+// * You must know the true variant_ct (and, in the multiallelic case, the true
+// allele_idx_offsets) during initialization.
+// Thus, STPgenWriter is still worth using in many cases.
+
+ENUM_U31_DEF_START()
+ kPgenWriteBackwardSeek,
+ kPgenWriteSeparateIndex,
+ kPgenWriteAndCopy
+ENUM_U31_DEF_END(PgenWriteMode);
+
+typedef struct STPgenWriterStruct {
+ MOVABLE_BUT_NONCOPYABLE(STPgenWriterStruct);
+#ifdef __cplusplus
+ PgenWriterCommon& GET_PRIVATE_pwc() { return pwc; }
+ PgenWriterCommon const& GET_PRIVATE_pwc() const { return pwc; }
+ FILE*& GET_PRIVATE_pgen_outfile() { return pgen_outfile; }
+ FILE* const& GET_PRIVATE_pgen_outfile() const { return pgen_outfile; }
+ FILE*& GET_PRIVATE_pgi_or_final_pgen_outfile() { return pgi_or_final_pgen_outfile; }
+ FILE* const& GET_PRIVATE_pgi_or_final_pgen_outfile() const { return pgi_or_final_pgen_outfile; }
+ char*& GET_PRIVATE_fname_buf() { return fname_buf; }
+ char* const& GET_PRIVATE_fname_buf() const { return fname_buf; }
+ private:
+#endif
+ PgenWriterCommon pwc;
+ FILE* pgen_outfile;
+ FILE* pgi_or_final_pgen_outfile;
+ // Initialized in kPgenWriteAndCopy mode. It is necessary on at least macOS
+ // to close and reopen the initially-written .pgen; freopen() with
+ // filename=nullptr can't be used to change from write-mode to read-mode.
+ char* fname_buf;
+} STPgenWriter;
+
+typedef struct MTPgenWriterStruct {
+ MOVABLE_BUT_NONCOPYABLE(MTPgenWriterStruct);
+ FILE* pgen_outfile;
+ FILE* pgi_or_final_pgen_outfile;
+ char* fname_buf;
+ uint32_t thread_ct;
+ PgenWriterCommon* pwcs[];
+} MTPgenWriter;
+
+HEADER_INLINE uint32_t SpgwGetVariantCt(STPgenWriter* spgwp) {
+ PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
+ return pwcp->variant_ct_limit;
+}
+
+HEADER_INLINE uint32_t SpgwGetSampleCt(STPgenWriter* spgwp) {
+ PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
+ return pwcp->sample_ct;
+}
+
+HEADER_INLINE uint32_t SpgwGetVidx(STPgenWriter* spgwp) {
+ PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
+ return pwcp->vidx;
+}
+
+void PreinitSpgw(STPgenWriter* spgwp);
+
+void PreinitMpgw(MTPgenWriter* mpgwp);
+
+// phase_dosage_gflags zero vs. nonzero is most important: this determines size
+// of header. Otherwise, setting more flags than necessary just increases
+// memory requirements.
+//
+// nonref_flags_storage values:
+// 0 = no info stored
+// 1 = always trusted
+// 2 = always untrusted
+// 3 = use explicit_nonref_flags
+//
+// Caller is responsible for printing open-fail error message.
+//
+// In the multiallelic case, allele_idx_offsets can be nullptr if
+// allele_ct_upper_bound is provided.
+//
+// If write_mode is kPgenWriteBackwardSeek, variant_ct_limit must be the actual
+// variant count. Otherwise, variant_ct_limit just needs to be an upper bound,
+// unless you're writing multiallelic variants and you don't specify
+// allele_ct_upper_bound. If you're using a plink2-style memory workspace,
+// variant_ct_limit := min(kPglMaxVariantCt, bigstack_left() / 8) is a
+// reasonable default: this tends to consume less than half of your remaining
+// workspace, without falling below the number of variants you can
+// realistically load on the system.
+//
+// The body of explicit_nonref_flags doesn't need to be filled until final
+// flush.
+// For header extensions, type_idx and size must be correct when this function
+// is called, but contents doesn't need to be filled until final flush.
+// For footer extensions, type_idx must be correct when this function is
+// called, but size and contents don't need to be filled until final flush.
+// Extension linked-lists must be ordered with lower type_idx first.
+PglErr SpgwInitPhase1Ex(const char* __restrict fname, const uintptr_t* __restrict allele_idx_offsets, uintptr_t* __restrict explicit_nonref_flags, PgenExtensionLl* header_exts, PgenExtensionLl* footer_exts, uint32_t variant_ct_limit, uint32_t sample_ct, uint32_t allele_ct_upper_bound, PgenWriteMode write_mode, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, STPgenWriter* spgwp, uintptr_t* alloc_cacheline_ct_ptr, uint32_t* max_vrec_len_ptr);
+
+HEADER_INLINE PglErr SpgwInitPhase1(const char* __restrict fname, const uintptr_t* __restrict allele_idx_offsets, uintptr_t* __restrict explicit_nonref_flags, uint32_t variant_ct_limit, uint32_t sample_ct, uint32_t allele_ct_upper_bound, PgenWriteMode write_mode, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, STPgenWriter* spgwp, uintptr_t* alloc_cacheline_ct_ptr, uint32_t* max_vrec_len_ptr) {
+ return SpgwInitPhase1Ex(fname, allele_idx_offsets, explicit_nonref_flags, nullptr, nullptr, variant_ct_limit, sample_ct, allele_ct_upper_bound, write_mode, phase_dosage_gflags, nonref_flags_storage, spgwp, alloc_cacheline_ct_ptr, max_vrec_len_ptr);
+}
+
+void SpgwInitPhase2(uint32_t max_vrec_len, STPgenWriter* spgwp, unsigned char* spgw_alloc);
+
+// moderately likely that there isn't enough memory to use the maximum number
+// of threads, so this returns per-thread memory requirements before forcing
+// the caller to specify thread count
+// (eventually should write code which falls back on STPgenWriter
+// when there isn't enough memory for even a single 64k variant block, at least
+// for the most commonly used plink 2.0 functions)
+void MpgwInitPhase1(const uintptr_t* __restrict allele_idx_offsets, uint32_t variant_ct, uint32_t sample_ct, PgenGlobalFlags phase_dosage_gflags, uintptr_t* alloc_base_cacheline_ct_ptr, uint64_t* alloc_per_thread_cacheline_ct_ptr, uint32_t* vrec_len_byte_ct_ptr, uint64_t* vblock_cacheline_ct_ptr);
+
+// Caller is responsible for printing open-fail error message.
+PglErr MpgwInitPhase2Ex(const char* __restrict fname, uintptr_t* __restrict explicit_nonref_flags, PgenExtensionLl* header_exts, PgenExtensionLl* footer_exts, uint32_t variant_ct, uint32_t sample_ct, PgenWriteMode write_mode, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, uint32_t vrec_len_byte_ct, uintptr_t vblock_cacheline_ct, uint32_t thread_ct, unsigned char* mpgw_alloc, MTPgenWriter* mpgwp);
+
+HEADER_INLINE PglErr MpgwInitPhase2(const char* __restrict fname, uintptr_t* __restrict explicit_nonref_flags, uint32_t variant_ct, uint32_t sample_ct, PgenWriteMode write_mode, PgenGlobalFlags phase_dosage_gflags, uint32_t nonref_flags_storage, uint32_t vrec_len_byte_ct, uintptr_t vblock_cacheline_ct, uint32_t thread_ct, unsigned char* mpgw_alloc, MTPgenWriter* mpgwp) {
+ return MpgwInitPhase2Ex(fname, explicit_nonref_flags, nullptr, nullptr, variant_ct, sample_ct, write_mode, phase_dosage_gflags, nonref_flags_storage, vrec_len_byte_ct, vblock_cacheline_ct, thread_ct, mpgw_alloc, mpgwp);
+}
+
+
+// trailing bits of genovec must be zeroed out
+void PwcAppendBiallelicGenovec(const uintptr_t* __restrict genovec, PgenWriterCommon* pwcp);
+
+BoolErr SpgwFlush(STPgenWriter* spgwp);
+
+HEADER_INLINE PglErr SpgwAppendBiallelicGenovec(const uintptr_t* __restrict genovec, STPgenWriter* spgwp) {
+ if (unlikely(SpgwFlush(spgwp))) {
+ return kPglRetWriteFail;
+ }
+ PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
+ PwcAppendBiallelicGenovec(genovec, pwcp);
+ return kPglRetSuccess;
+}
+
+// trailing bits of raregeno must be zeroed out
+// all raregeno entries assumed to be unequal to difflist_common_geno; the
+// difflist should be compacted first if this isn't true
+// difflist_len must be <= 2 * (sample_ct / kPglMaxDifflistLenDivisor);
+// there's an assert checking this
+void PwcAppendBiallelicDifflistLimited(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t difflist_common_geno, uint32_t difflist_len, PgenWriterCommon* pwcp);
+
+HEADER_INLINE PglErr SpgwAppendBiallelicDifflistLimited(const uintptr_t* __restrict raregeno, const uint32_t* __restrict difflist_sample_ids, uint32_t difflist_common_geno, uint32_t difflist_len, STPgenWriter* spgwp) {
+ if (unlikely(SpgwFlush(spgwp))) {
+ return kPglRetWriteFail;
+ }
+ PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
+ PwcAppendBiallelicDifflistLimited(raregeno, difflist_sample_ids, difflist_common_geno, difflist_len, pwcp);
+ return kPglRetSuccess;
+}
+
+// Two interfaces for appending multiallelic hardcalls:
+// 1. sparse: genovec, bitarray+values describing ref/altx hardcalls which
+// aren't ref/alt1, bitarray+values describing altx/alty hardcalls which
+// aren't alt1/alt1.
+// patch_01_vals[] contains one entry per relevant sample (only need altx
+// index), patch_10_vals[] contains two.
+// Ok if patch_01_ct == patch_10_ct == 0; in this case no aux1 track is
+// saved and bit 3 of vrtype is not set. (Note that multiallelic dosage may
+// still be present when vrtype bit 3 is unset.)
+// 2. generic dense: takes a length-2n array of AlleleCode allele codes.
+// Assumes [2k] <= [2k+1] for each k. Instead of providing direct API
+// functions for this, we just provide a dense -> sparse helper function.
+//
+// All arrays should be vector-aligned.
+BoolErr PwcAppendMultiallelicSparse(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t allele_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, PgenWriterCommon* pwcp);
+
+HEADER_INLINE PglErr SpgwAppendMultiallelicSparse(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, uint32_t allele_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, STPgenWriter* spgwp) {
+ if (unlikely(SpgwFlush(spgwp))) {
+ return kPglRetWriteFail;
+ }
+ PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
+ if (unlikely(PwcAppendMultiallelicSparse(genovec, patch_01_set, patch_01_vals, patch_10_set, patch_10_vals, allele_ct, patch_01_ct, patch_10_ct, pwcp))) {
+ return kPglRetVarRecordTooLarge;
+ }
+ return kPglRetSuccess;
+}
+
+// This may not zero out trailing halfword of patch_{01,10}_set.
+void PglMultiallelicDenseToSparse(const AlleleCode* __restrict wide_codes, uint32_t sample_ct, uintptr_t* __restrict genoarr, uintptr_t* __restrict patch_01_set, AlleleCode* __restrict patch_01_vals, uintptr_t* __restrict patch_10_set, AlleleCode* __restrict patch_10_vals, uint32_t* __restrict patch_01_ct_ptr, uint32_t* __restrict patch_10_ct_ptr);
+
+// If remap is not nullptr, this simultaneously performs a rotation operation:
+// wide_codes[2n] and [2n+1] are set to remap[geno[n]] rather than geno[n], and
+// bit n of flipped (if flipped non-null) is set iff phase orientation is
+// flipped (i.e. wide_codes[2n] was larger than wide_codes[2n+1] before the
+// final reordering pass; caller needs to know this to properly update
+// phaseinfo, dphase_delta, multidphase_delta).
+// It currently assumes no present alleles are being mapped to 'missing'.
+void PglMultiallelicSparseToDense(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, const AlleleCode* __restrict remap, uint32_t sample_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, uintptr_t* __restrict flipped, AlleleCode* __restrict wide_codes);
+
+// phasepresent == nullptr ok, that indicates that ALL heterozygous calls are
+// phased. Caller should use e.g. PwcAppendBiallelicGenovec() if it's known
+// in advance that no calls are phased.
+// Ok for phaseinfo to have bits set at non-het calls, NOT currently okay for
+// phasepresent
+void PwcAppendBiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, PgenWriterCommon* pwcp);
+
+// phasepresent == nullptr ok
+// ok for trailing bits of phaseinfo to not be zeroed out, NOT currently ok for
+// phasepresent
+HEADER_INLINE PglErr SpgwAppendBiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, STPgenWriter* spgwp) {
+ if (unlikely(SpgwFlush(spgwp))) {
+ return kPglRetWriteFail;
+ }
+ PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
+ PwcAppendBiallelicGenovecHphase(genovec, phasepresent, phaseinfo, pwcp);
+ return kPglRetSuccess;
+}
+
+BoolErr PwcAppendMultiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, uint32_t allele_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, PgenWriterCommon* pwcp);
+
+HEADER_INLINE PglErr SpgwAppendMultiallelicGenovecHphase(const uintptr_t* __restrict genovec, const uintptr_t* __restrict patch_01_set, const AlleleCode* __restrict patch_01_vals, const uintptr_t* __restrict patch_10_set, const AlleleCode* __restrict patch_10_vals, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, uint32_t allele_ct, uint32_t patch_01_ct, uint32_t patch_10_ct, STPgenWriter* spgwp) {
+ if (unlikely(SpgwFlush(spgwp))) {
+ return kPglRetWriteFail;
+ }
+ PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
+ if (unlikely(PwcAppendMultiallelicGenovecHphase(genovec, patch_01_set, patch_01_vals, patch_10_set, patch_10_vals, phasepresent, phaseinfo, allele_ct, patch_01_ct, patch_10_ct, pwcp))) {
+ return kPglRetVarRecordTooLarge;
+ }
+ return kPglRetSuccess;
+}
+
+
+// dosage_main[] has length dosage_ct, not sample_ct
+// ok for trailing bits of dosage_present to not be zeroed out
+BoolErr PwcAppendBiallelicGenovecDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, PgenWriterCommon* pwcp);
+
+HEADER_INLINE PglErr SpgwAppendBiallelicGenovecDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, STPgenWriter* spgwp) {
+ if (unlikely(SpgwFlush(spgwp))) {
+ return kPglRetWriteFail;
+ }
+ PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
+ if (unlikely(PwcAppendBiallelicGenovecDosage16(genovec, dosage_present, dosage_main, dosage_ct, pwcp))) {
+ return kPglRetVarRecordTooLarge;
+ }
+ return kPglRetSuccess;
+}
+
+BoolErr PwcAppendBiallelicGenovecHphaseDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, PgenWriterCommon* pwcp);
+
+HEADER_INLINE PglErr SpgwAppendBiallelicGenovecHphaseDosage16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uint16_t* dosage_main, uint32_t dosage_ct, STPgenWriter* spgwp) {
+ if (unlikely(SpgwFlush(spgwp))) {
+ return kPglRetWriteFail;
+ }
+ PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
+ if (unlikely(PwcAppendBiallelicGenovecHphaseDosage16(genovec, phasepresent, phaseinfo, dosage_present, dosage_main, dosage_ct, pwcp))) {
+ return kPglRetVarRecordTooLarge;
+ }
+ return kPglRetSuccess;
+}
+
+// dosage_present cannot be null for nonzero dosage_ct
+// trailing bits of dosage_present MUST be zeroed out
+// could make dosage_main[] have length dosage_ct + dphase_ct instead of having
+// separate dphase_delta[]?
+BoolErr PwcAppendBiallelicGenovecDphase16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uintptr_t* __restrict dphase_present, const uint16_t* dosage_main, const int16_t* dphase_delta, uint32_t dosage_ct, uint32_t dphase_ct, PgenWriterCommon* pwcp);
+
+HEADER_INLINE PglErr SpgwAppendBiallelicGenovecDphase16(const uintptr_t* __restrict genovec, const uintptr_t* __restrict phasepresent, const uintptr_t* __restrict phaseinfo, const uintptr_t* __restrict dosage_present, const uintptr_t* dphase_present, const uint16_t* dosage_main, const int16_t* dphase_delta, uint32_t dosage_ct, uint32_t dphase_ct, STPgenWriter* spgwp) {
+ if (unlikely(SpgwFlush(spgwp))) {
+ return kPglRetWriteFail;
+ }
+ PgenWriterCommon* pwcp = &GET_PRIVATE(*spgwp, pwc);
+ if (unlikely(PwcAppendBiallelicGenovecDphase16(genovec, phasepresent, phaseinfo, dosage_present, dphase_present, dosage_main, dphase_delta, dosage_ct, dphase_ct, pwcp))) {
+ return kPglRetVarRecordTooLarge;
+ }
+ return kPglRetSuccess;
+}
+
+// Writes footer if present, backfills header, then closes the file.
+PglErr SpgwFinish(STPgenWriter* spgwp);
+
+// Last flush automatically writes footer if present, backfills header, and
+// closes the file.
+// (caller should set mpgwp = nullptr after that)
+PglErr MpgwFlush(MTPgenWriter* mpgwp);
+
+
+// these close the file if open, but do not free any memory
+// CleanupMpgw() handles mpgwp == nullptr, since it shouldn't be allocated on
+// the stack
+// error-return iff reterr was success and was changed to kPglRetWriteFail
+// (i.e. an error message should be printed), though this is not relevant for
+// plink2
+BoolErr CleanupSpgw(STPgenWriter* spgwp, PglErr* reterrp);
+
+BoolErr CleanupMpgw(MTPgenWriter* mpgwp, PglErr* reterrp);
+
+#ifdef __cplusplus
+} // namespace plink2
+#endif
+
+#endif // __PGENLIB_WRITE_H__
diff --git a/external_libs/pgenlib/include/plink2_base.cc b/external_libs/pgenlib/include/plink2_base.cc
index 58f7f4e1..69339e0b 100644
--- a/external_libs/pgenlib/include/plink2_base.cc
+++ b/external_libs/pgenlib/include/plink2_base.cc
@@ -1,4 +1,4 @@
-// This library is part of PLINK 2.0, copyright (C) 2005-2024 Shaun Purcell,
+// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
@@ -21,6 +21,13 @@
namespace plink2 {
#endif
+const char kErrprintfFopen[] = "Error: Failed to open %s : %s.\n";
+const char kErrprintfFread[] = "Error: %s read failure: %s.\n";
+const char kErrprintfRewind[] = "Error: %s could not be scanned twice. (Process-substitution/named-pipe input is not permitted in this use case.)\n";
+const char kErrstrNomem[] = "Error: Out of memory. The --memory flag may be helpful.\n";
+const char kErrstrWrite[] = "Error: File write failure: %s.\n";
+const char kErrprintfDecompress[] = "Error: %s decompression failure: %s.\n";
+
uint64_t g_failed_alloc_attempt_size = 0;
#if (((__GNUC__ == 4) && (__GNUC_MINOR__ < 7)) || (__GNUC__ >= 11)) && !defined(__APPLE__)
@@ -359,7 +366,7 @@ int32_t memequal(const void* m1, const void* m2, uintptr_t byte_ct) {
// tried unrolling this, doesn't make a difference
const VecUc v1 = vecuc_loadu(&(m1_alias[vidx]));
const VecUc v2 = vecuc_loadu(&(m2_alias[vidx]));
- if (vecuc_movemask(v1 == v2) != kVec8thUintMax) {
+ if (!vecucs_are_equal(v1, v2)) {
return 0;
}
}
@@ -369,7 +376,7 @@ int32_t memequal(const void* m1, const void* m2, uintptr_t byte_ct) {
const uintptr_t final_offset = byte_ct - kBytesPerVec;
const VecUc v1 = vecuc_loadu(&(m1_uc[final_offset]));
const VecUc v2 = vecuc_loadu(&(m2_uc[final_offset]));
- if (vecuc_movemask(v1 == v2) != kVec8thUintMax) {
+ if (!vecucs_are_equal(v1, v2)) {
return 0;
}
}
@@ -453,22 +460,39 @@ int32_t Memcmp(const void* m1, const void* m2, uintptr_t byte_ct) {
for (uintptr_t vidx = 0; vidx != fullvec_ct; ++vidx) {
const VecUc v1 = vecuc_loadu(&(m1_alias[vidx]));
const VecUc v2 = vecuc_loadu(&(m2_alias[vidx]));
+# ifndef SIMDE_ARM_NEON_A32V8_NATIVE
// is this even worthwhile now in non-AVX2 case?
const uint32_t movemask_result = vecuc_movemask(v1 == v2);
if (movemask_result != kVec8thUintMax) {
+ // todo: check if this is faster to do outside of vector-space
const uintptr_t diff_pos = vidx * kBytesPerVec + ctzu32(~movemask_result);
return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1;
}
+# else
+ const uint64_t eq_nybbles = arm_shrn4_uc(v1 == v2);
+ if (eq_nybbles != UINT64_MAX) {
+ const uintptr_t diff_pos = vidx * kBytesPerVec + ctzw(~eq_nybbles) / 4;
+ return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1;
+ }
+# endif
}
if (byte_ct % kBytesPerVec) {
const uintptr_t final_offset = byte_ct - kBytesPerVec;
const VecUc v1 = vecuc_loadu(&(m1_uc[final_offset]));
const VecUc v2 = vecuc_loadu(&(m2_uc[final_offset]));
+# ifndef SIMDE_ARM_NEON_A32V8_NATIVE
const uint32_t movemask_result = vecuc_movemask(v1 == v2);
if (movemask_result != kVec8thUintMax) {
const uintptr_t diff_pos = final_offset + ctzu32(~movemask_result);
return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1;
}
+# else
+ const uint64_t eq_nybbles = arm_shrn4_uc(v1 == v2);
+ if (eq_nybbles != UINT64_MAX) {
+ const uintptr_t diff_pos = final_offset + ctzw(~eq_nybbles) / 4;
+ return (m1_uc[diff_pos] < m2_uc[diff_pos])? -1 : 1;
+ }
+# endif
}
return 0;
}
@@ -633,21 +657,37 @@ uintptr_t FirstUnequal4(const void* arr1, const void* arr2, uintptr_t nbytes) {
for (uintptr_t vidx = 0; vidx != vec_ct; ++vidx) {
const VecUc v1 = vecuc_loadu(&(arr1_alias[vidx]));
const VecUc v2 = vecuc_loadu(&(arr2_alias[vidx]));
+# ifndef SIMDE_ARM_NEON_A32V8_NATIVE
const uint32_t eq_result = vecw_movemask(v1 == v2);
if (eq_result != kVec8thUintMax) {
return vidx * kBytesPerVec + ctzu32(~eq_result);
}
+# else
+ const uint64_t eq_nybbles = arm_shrn4_uc(v1 == v2);
+ if (eq_nybbles != UINT64_MAX) {
+ return vidx * kBytesPerVec + ctzw(~eq_nybbles) / 4;
+ }
+# endif
}
if (nbytes % kBytesPerVec) {
const uintptr_t final_offset = nbytes - kBytesPerVec;
const char* s1 = S_CAST(const char*, arr1);
const char* s2 = S_CAST(const char*, arr2);
- const VecW v1 = vecw_loadu(&(s1[final_offset]));
- const VecW v2 = vecw_loadu(&(s2[final_offset]));
- const uint32_t eq_result = vecw_movemask(v1 == v2);
+ // bugfix (5 Jul 2025): this must be VecUc on ARM, not VecW
+ // (disturbing that arm_shrn4_uc() call compiled at all...)
+ const VecUc v1 = vecuc_loadu(&(s1[final_offset]));
+ const VecUc v2 = vecuc_loadu(&(s2[final_offset]));
+# ifndef SIMDE_ARM_NEON_A32V8_NATIVE
+ const uint32_t eq_result = vecuc_movemask(v1 == v2);
if (eq_result != kVec8thUintMax) {
return final_offset + ctzu32(~eq_result);
}
+# else
+ const uint64_t eq_nybbles = arm_shrn4_uc(v1 == v2);
+ if (eq_nybbles != UINT64_MAX) {
+ return final_offset + ctzw(~eq_nybbles) / 4;
+ }
+# endif
}
return nbytes;
}
@@ -688,6 +728,7 @@ uintptr_t CountVintsNonempty(const unsigned char* buf, const unsigned char* buf_
const uintptr_t ending_addr = R_CAST(uintptr_t, buf_end);
const VecUc* buf_vlast = R_CAST(const VecUc*, RoundDownPow2(ending_addr - 1, kBytesPerVec));
const uint32_t leading_byte_ct = starting_addr - R_CAST(uintptr_t, buf_viter);
+ // todo: better ARM implementation
Vec8thUint vint_ends = (UINT32_MAX << leading_byte_ct) & (~vecuc_movemask(*buf_viter));
uintptr_t total = 0;
while (buf_viter != buf_vlast) {
diff --git a/external_libs/pgenlib/include/plink2_base.h b/external_libs/pgenlib/include/plink2_base.h
index b9d9a29d..e29b9d0d 100644
--- a/external_libs/pgenlib/include/plink2_base.h
+++ b/external_libs/pgenlib/include/plink2_base.h
@@ -1,7 +1,7 @@
#ifndef __PLINK2_BASE_H__
#define __PLINK2_BASE_H__
-// This library is part of PLINK 2.0, copyright (C) 2005-2024 Shaun Purcell,
+// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
@@ -77,10 +77,11 @@
// valid input types, NOT counting VecW*.
-// gcc 8.3.0 has been miscompiling the ParseOnebitUnsafe() function in
-// pgenlib_read.cc for the last several years. gcc 8.4 does not have this
-// problem, and neither does any other gcc major version I've tested to date.
-#ifndef __clang__
+// gcc 8.3.0 has been miscompiling the vector code in the ParseOnebitUnsafe()
+// function in pgenlib_read.cc for the last several years. gcc 8.4 does not
+// have this problem, and neither does any other gcc major version I've tested
+// to date.
+#if !defined(__clang__) && defined(__LP64__)
# if (__GNUC__ == 8) && (__GNUC_MINOR__ < 4)
# error "gcc 8.3 is known to have a miscompilation bug that was fixed in 8.4."
# endif
@@ -106,24 +107,33 @@
// 10000 * major + 100 * minor + patch
// Exception to CONSTI32, since we want the preprocessor to have access
// to this value. Named with all caps as a consequence.
-#define PLINK2_BASE_VERNUM 815
+#define PLINK2_BASE_VERNUM 822
+// We now try to adhere to include-what-you-use in simple cases. However,
+// we don't want to repeat either platform-specific ifdefs, or stuff like
+// "#define _FILE_OFFSET_BITS 64" before "#include ", so we use
+// "IWYU pragma: export" for those cases.
+// TODO: unfortunately, there is no straightforward way to tell IWYU to stop
+// looking behind the likes of or "../simde/x86/sse2.h". I'm
+// manually ignoring the associated false-positives for now, but this should be
+// systematized.
#define _FILE_OFFSET_BITS 64
+#ifdef __USE_MINGW_ANSI_STDIO
+# undef __USE_MINGW_ANSI_STDIO
+#endif
+#define __USE_MINGW_ANSI_STDIO 1
-#include
-#include
-#include
-#include // offsetof()
-#include
+#include
#ifndef __STDC_FORMAT_MACROS
# define __STDC_FORMAT_MACROS 1
#endif
-#include
+#include // IWYU pragma: export
#include // CHAR_BIT, PATH_MAX
-
-// #define NDEBUG
-#include
+#include // offsetof()
+#include // IWYU pragma: export
+#include
+#include
#ifdef _WIN32
// needed for EnterCriticalSection, etc.
@@ -135,19 +145,21 @@
# ifndef WIN32_LEAN_AND_MEAN
# define WIN32_LEAN_AND_MEAN
# endif
-# include
+# include // IWYU pragma: export
#endif
#if __cplusplus >= 201103L
-# include
+# include // IWYU pragma: export
#endif
#ifdef __LP64__
-// TODO: working no-SSE2 fallback on 64-bit little-endian platforms unsupported
-// by simde. Can perform early test by compiling on M1/M2 without simde.
+// Possible todo: working no-SSE2 fallback on 64-bit little-endian platforms
+// unsupported by simde. Can perform early test by compiling on arm64 without
+// simde.
+// (But not yet aware of a use case that matters.)
# define USE_SSE2
# ifdef __x86_64__
-# include
+# include // IWYU pragma: export
# else
# define SIMDE_ENABLE_NATIVE_ALIASES
// Since e.g. an old zstd system header breaks the build, and plink2 is
@@ -156,9 +168,9 @@
// are manually updated as necessary.
// To use system headers, define IGNORE_BUNDLED_{ZSTD,LIBDEFLATE.SIMDE}.
# ifdef IGNORE_BUNDLED_SIMDE
-# include
+# include // IWYU pragma: export
# else
-# include "../simde/x86/sse2.h"
+# include "../simde/x86/sse2.h" // IWYU pragma: export
# endif
# ifdef SIMDE_ARM_NEON_A32V8_NATIVE
// For Apple M1, we effectively use SSE2 + constrained _mm_shuffle_epi8().
@@ -218,6 +230,8 @@ namespace plink2 {
// RoundUpPow2()...), or (ii) it allows a useful static_assert to be inserted
// for a hardcoded constant.
# if __cplusplus >= 201103L
+# define HAS_CONSTEXPR
+# define PREFER_CONSTEXPR constexpr
# define HEADER_CINLINE constexpr
# define CSINLINE static constexpr
# if __cplusplus > 201103L
@@ -228,6 +242,7 @@ namespace plink2 {
# define CSINLINE2 static inline
# endif
# else
+# define PREFER_CONSTEXPR const
# define HEADER_CINLINE inline
# define HEADER_CINLINE2 inline
# define CSINLINE static inline
@@ -241,6 +256,7 @@ namespace plink2 {
# endif
# endif
#else
+# define PREFER_CONSTEXPR const
# define HEADER_INLINE static inline
# define HEADER_CINLINE static inline
# define HEADER_CINLINE2 static inline
@@ -352,7 +368,13 @@ HEADER_INLINE unsigned char* CToUc(char* pp) {
// * IntErr allows implicit conversion from int, but conversion back to
// int32_t requires an explicit cast. It mainly serves as a holding pen for
// C standard library error return values, which can be negative.
-#if __cplusplus >= 201103L
+#if __cplusplus >= 201103L && !defined(NO_CPP11_TYPE_ENFORCEMENT)
+// Cython doesn't support these, and the previous workaround of forcing C++98
+// is breaking down. So allow this C++11 feature to be selectively disabled.
+# define CPP11_TYPE_ENFORCEMENT
+#endif
+
+#ifdef CPP11_TYPE_ENFORCEMENT
struct PglErr {
enum class ec
#else
@@ -397,7 +419,7 @@ typedef enum
kPglRetHelp = 125,
kPglRetLongLine = 126,
kPglRetEof = 127}
-#if __cplusplus >= 201103L
+#ifdef CPP11_TYPE_ENFORCEMENT
;
PglErr() {}
@@ -466,7 +488,7 @@ const PglErr kPglRetEof = PglErr::ec::kPglRetEof;
PglErr;
#endif
-#if __cplusplus >= 201103L
+#ifdef CPP11_TYPE_ENFORCEMENT
// allow efficient arithmetic on these, but force them to require explicit
// int32_t/uint32_t casts; only permit implicit assignment from
// int32_t/uint32_t by default.
@@ -512,6 +534,13 @@ typedef int32_t IntErr;
typedef uint32_t BoolErr;
#endif
+extern const char kErrprintfFopen[];
+extern const char kErrprintfFread[];
+extern const char kErrprintfRewind[];
+extern const char kErrstrNomem[];
+extern const char kErrstrWrite[];
+extern const char kErrprintfDecompress[];
+
// make this work on 32-bit as well as 64-bit systems, across
// Windows/OS X/Linux
// (todo: clean this up a bit. it's inherently a baling-wire-and-duct-tape
@@ -557,16 +586,9 @@ typedef uint32_t BoolErr;
# endif
#endif
-#ifdef _WIN32
-# undef PRId64
-# undef PRIu64
-# define PRId64 "I64d"
-# define PRIu64 "I64u"
-#else
-# ifdef __cplusplus
-# ifndef PRId64
-# define PRId64 "lld"
-# endif
+#ifdef __cplusplus
+# ifndef PRId64
+# define PRId64 "lld"
# endif
#endif
@@ -633,26 +655,13 @@ HEADER_INLINE uint32_t bsrw(unsigned long ulii) {
#endif
#ifdef __LP64__
-# ifdef _WIN32 // i.e. Win64
-
-# undef PRIuPTR
-# undef PRIdPTR
-# define PRIuPTR PRIu64
-# define PRIdPTR PRId64
-# define PRIxPTR2 "016I64x"
-
-# else // not _WIN32
-
-# ifndef PRIuPTR
-# define PRIuPTR "lu"
-# endif
-# ifndef PRIdPTR
-# define PRIdPTR "ld"
-# endif
-# define PRIxPTR2 "016lx"
-
-# endif // Win64
-
+# ifndef PRIuPTR
+# define PRIuPTR "lu"
+# endif
+# ifndef PRIdPTR
+# define PRIdPTR "ld"
+# endif
+# define PRIxPTR2 "016lx"
#else // not __LP64__
// without this, we get ridiculous warning spew...
@@ -665,6 +674,15 @@ HEADER_INLINE uint32_t bsrw(unsigned long ulii) {
# define PRIdPTR "ld"
# endif
+# ifdef _WIN32
+# ifndef PRIuPTR
+# define PRIuPTR "u"
+# endif
+# ifndef PRIdPTR
+# define PRIdPTR "d"
+# endif
+# endif
+
# define PRIxPTR2 "08lx"
#endif
@@ -943,8 +961,6 @@ HEADER_INLINE VecI8 veci8_set1(char cc) {
return VecToI8( _mm256_set1_epi8(cc));
}
-// TODO: on ARM, replace most movemask uses:
-// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
HEADER_INLINE uint32_t vecw_movemask(VecW vv) {
return _mm256_movemask_epi8(WToVec(vv));
}
@@ -1160,10 +1176,18 @@ HEADER_INLINE VecW vecw_sad(VecW v1, VecW v2) {
return VecToW(_mm256_sad_epu8(WToVec(v1), WToVec(v2)));
}
+HEADER_INLINE VecUc vecuc_add(VecUc v1, VecUc v2) {
+ return VecToUc(_mm256_add_epi8(UcToVec(v1), UcToVec(v2)));
+}
+
HEADER_INLINE VecUc vecuc_adds(VecUc v1, VecUc v2) {
return VecToUc(_mm256_adds_epu8(UcToVec(v1), UcToVec(v2)));
}
+HEADER_INLINE VecUc vecuc_signed_cmpgt(VecUc v1, VecUc v2) {
+ return VecToUc(_mm256_cmpgt_epi8(UcToVec(v1), UcToVec(v2)));
+}
+
HEADER_INLINE VecU16 vecu16_min8(VecU16 v1, VecU16 v2) {
return VecToU16(_mm256_min_epu8(U16ToVec(v1), U16ToVec(v2)));
}
@@ -1554,15 +1578,16 @@ HEADER_INLINE VecUc vecuc_gather_odd(VecUc src_lo, VecUc src_hi) {
# ifdef USE_SHUFFLE8
# ifdef SIMDE_ARM_NEON_A64V8_NATIVE
// See simde_mm_shuffle_epi8().
-// In the future, this may need to be written more carefully in the
-// IGNORE_BUNDLED_SIMDE case. But this is compatible with simde v0.7.x and
-// v0.8.x.
-SIMDE_FUNCTION_ATTRIBUTES simde__m128i _mm_shuffle_epi8(simde__m128i a, simde__m128i b) {
- simde__m128i_private a_ = simde__m128i_to_private(a);
- simde__m128i_private b_ = simde__m128i_to_private(b);
- simde__m128i_private r_;
- r_.neon_i8 = vqtbl1q_s8(a_.neon_i8, b_.neon_u8);
- return simde__m128i_from_private(r_);
+HEADER_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) {
+ SIMDE_ALIGN_TO_16 int8x16_t a_;
+ SIMDE_ALIGN_TO_16 uint8x16_t b_;
+ SIMDE_ALIGN_TO_16 int8x16_t r_;
+ memcpy(&a_, &a, sizeof(a_));
+ memcpy(&b_, &b, sizeof(b_));
+ r_ = vqtbl1q_s8(a_, b_);
+ __m128i r;
+ memcpy(&r, &r_, sizeof(r));
+ return r;
}
# endif
HEADER_INLINE VecW vecw_shuffle8(VecW table, VecW indexes) {
@@ -1643,10 +1668,18 @@ HEADER_INLINE VecW vecw_sad(VecW v1, VecW v2) {
return VecToW(_mm_sad_epu8(WToVec(v1), WToVec(v2)));
}
+HEADER_INLINE VecUc vecuc_add(VecUc v1, VecUc v2) {
+ return VecToUc(_mm_add_epi8(UcToVec(v1), UcToVec(v2)));
+}
+
HEADER_INLINE VecUc vecuc_adds(VecUc v1, VecUc v2) {
return VecToUc(_mm_adds_epu8(UcToVec(v1), UcToVec(v2)));
}
+HEADER_INLINE VecUc vecuc_signed_cmpgt(VecUc v1, VecUc v2) {
+ return VecToUc(_mm_cmpgt_epi8(UcToVec(v1), UcToVec(v2)));
+}
+
HEADER_INLINE VecU16 vecu16_min8(VecU16 v1, VecU16 v2) {
return VecToU16(_mm_min_epu8(U16ToVec(v1), U16ToVec(v2)));
}
@@ -1906,7 +1939,6 @@ CONSTI32(kFloatPerFVec, kBytesPerFVec / 4);
CONSTI32(kDoublePerDVec, kBytesPerDVec / 8);
#if defined(__APPLE__) && defined(__LP64__) && !defined(__x86_64__)
-// TODO: make this 128 once that stops breaking code
# define CACHELINE128
CONSTI32(kCacheline, 128);
#else
@@ -1987,7 +2019,7 @@ HEADER_INLINE void PrintVecD(const VecD* vv_ptr, const char* preprint) {
fputs("\n", stdout);
}
-#if __cplusplus >= 201103L
+#ifdef CPP11_TYPE_ENFORCEMENT
// Main application of std::array in this codebase is enforcing length when
// passing references between functions. Conversely, if the array type has
// different lengths in different functions (e.g. col_skips[]/col_types[]), we
@@ -2806,6 +2838,115 @@ HEADER_INLINE uint32_t PopcountVec8thUint(uint32_t val) {
# endif
#endif
+#ifdef USE_SSE2
+// On ARM, emulated movemask isn't great. But there are alternative
+// instructions that efficiently perform what you usually want to do with
+// movemasks:
+// https://community.arm.com/arm-community-blogs/b/infrastructure-solutions-blog/posts/porting-x86-vector-bitmask-optimizations-to-arm-neon
+
+# ifndef SIMDE_ARM_NEON_A32V8_NATIVE
+// vec0255 refers to a VecUc where all bytes are equal to 0 or 255.
+// (possible todo: define this as its own type, with automatic downcast to
+// VecUc but not the other way around.)
+//
+// Return value in nonzero case is architecture-dependent (though always
+// nonzero). So, best to only use this in other architecture-specific code;
+// hence the leading underscore in the function name.
+HEADER_INLINE uint64_t _vec0255_is_nonzero(VecUc vv) {
+ return vecuc_movemask(vv);
+}
+
+HEADER_INLINE uint32_t vec0255_set_ct(VecUc vv) {
+ return PopcountVec8thUint(vecuc_movemask(vv));
+}
+
+HEADER_INLINE uint32_t vec0255_is_all_set(VecUc vv) {
+ return (vecuc_movemask(vv) == kVec8thUintMax);
+}
+
+HEADER_INLINE uint32_t vec0255u16_is_all_set(VecU16 vv) {
+ return (vecu16_movemask(vv) == kVec8thUintMax);
+}
+
+HEADER_INLINE uint32_t m128is_are_equal(__m128i v1, __m128i v2) {
+ return (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 65535);
+}
+# else
+HEADER_INLINE uint64_t arm_shrn4_uc(VecUc vv) {
+ uint16x8_t vv_;
+ memcpy(&vv_, &vv, sizeof(vv_));
+ return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vv_, 4)), 0);
+}
+
+HEADER_INLINE uint64_t arm_shrn4_i8(VecI8 vv) {
+ uint16x8_t vv_;
+ memcpy(&vv_, &vv, sizeof(vv_));
+ return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vv_, 4)), 0);
+}
+
+HEADER_INLINE uint64_t arm_shrn4_u16(VecU16 vv) {
+ uint16x8_t vv_;
+ memcpy(&vv_, &vv, sizeof(vv_));
+ return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vv_, 4)), 0);
+}
+
+HEADER_INLINE uint64_t arm_shrn4_m128i(__m128i vv) {
+ uint16x8_t vv_;
+ memcpy(&vv_, &vv, sizeof(vv_));
+ return vget_lane_u64(vreinterpret_u64_u8(vshrn_n_u16(vv_, 4)), 0);
+}
+
+HEADER_INLINE uint64_t _vec0255_is_nonzero(VecUc vv) {
+ return arm_shrn4_uc(vv);
+}
+
+// set_bits4 must only have bits in 0, 4, ..., 60 set.
+// move this out of ARM-only ifdef if we ever want this elsewhere.
+HEADER_INLINE uint32_t popcount_bits4(uint64_t set_bits4) {
+ // Branchlessly count the number of set bits, taking advantage of the limited
+ // set of positions they can be in.
+ // Multiplication by the magic constant kMask1111 usually puts the sum of
+ // all 16 bits of interest in the high nybble of the result... except that
+ // the nybble overflows when all 16 bits are set. We work around this by
+ // (i) multiplying by (kMask1111 >> 4) instead, which excludes the lowest bit
+ // from the high-nybble sum, and
+ // (ii) then adding the lowest bit afterward.
+ const uint32_t set_ct_excluding_lowest = (set_bits4 * (kMask1111 >> 4)) >> 60;
+ return set_ct_excluding_lowest + (set_bits4 & 1);
+}
+
+// xx must have all nybbles equal to 0 or 15.
+HEADER_INLINE uint32_t count_set_nybbles(uint64_t xx) {
+ return popcount_bits4(xx & kMask1111);
+}
+
+HEADER_INLINE uint32_t vec0255_set_ct(VecUc vv) {
+ return count_set_nybbles(arm_shrn4_uc(vv));
+}
+
+HEADER_INLINE uint32_t vec0255_is_all_set(VecUc vv) {
+ return (arm_shrn4_uc(vv) == UINT64_MAX);
+}
+
+HEADER_INLINE uint32_t vec0255u16_is_all_set(VecU16 vv) {
+ return (arm_shrn4_u16(vv) == UINT64_MAX);
+}
+
+HEADER_INLINE uint32_t m128is_are_equal(__m128i v1, __m128i v2) {
+ const uint64_t set_nybbles = arm_shrn4_m128i(_mm_cmpeq_epi8(v1, v2));
+ return (set_nybbles == UINT64_MAX);
+}
+# endif
+
+HEADER_INLINE uint32_t vecucs_are_equal(VecUc v1, VecUc v2) {
+ return vec0255_is_all_set(v1 == v2);
+}
+
+HEADER_INLINE uint32_t vecu16s_are_equal(VecU16 v1, VecU16 v2) {
+ return vec0255u16_is_all_set(v1 == v2);
+}
+#endif
+
// Downcasts don't risk alignment issues.
HEADER_INLINE unsigned char* DowncastToUc(void* pp) {
return S_CAST(unsigned char*, pp);
@@ -3167,7 +3308,12 @@ HEADER_INLINE void aligned_free_cond(void* aligned_ptr) {
}
}
-// C spec is slightly broken here
+// Common pattern is to initialize a data structure, then use it without
+// mutating it, then free it.
+// Yes, in principle you should have a non-const pointer to the data structure
+// to free it. But my experience is that that makes too many beneficial uses
+// of const less ergonomic, without offering enough in return. So we cheat a
+// bit, in a way that can be systematically cleaned up later if necessary.
HEADER_INLINE void free_const(const void* memptr) {
free(K_CAST(void*, memptr));
}
@@ -3314,7 +3460,7 @@ template <> struct MemequalKImpl<16> {
static int32_t MemequalK(const void* m1, const void* m2) {
const __m128i v1 = _mm_loadu_si128(S_CAST(const __m128i*, m1));
const __m128i v2 = _mm_loadu_si128(S_CAST(const __m128i*, m2));
- return (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 65535);
+ return m128is_are_equal(v1, v2);
}
};
@@ -3325,7 +3471,7 @@ template struct MemequalKImpl > {
const __m128i v1 = _mm_loadu_si128(S_CAST(const __m128i*, m1));
const __m128i v2 = _mm_loadu_si128(S_CAST(const __m128i*, m2));
return
- (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 65535) &&
+ m128is_are_equal(v1, v2) &&
((*R_CAST(const uint64_t*, &(m1_uc[N - 8]))) == (*R_CAST(const uint64_t*, &(m2_uc[N - 8]))));
}
};
@@ -3334,14 +3480,14 @@ template struct MemequalKImpl > {
static int32_t MemequalK(const void* m1, const void* m2) {
__m128i v1 = _mm_loadu_si128(S_CAST(const __m128i*, m1));
__m128i v2 = _mm_loadu_si128(S_CAST(const __m128i*, m2));
- if (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) != 65535) {
+ if (!m128is_are_equal(v1, v2)) {
return 0;
}
const unsigned char* m1_uc = S_CAST(const unsigned char*, m1);
const unsigned char* m2_uc = S_CAST(const unsigned char*, m2);
v1 = _mm_loadu_si128(R_CAST(const __m128i*, &(m1_uc[N - 16])));
v2 = _mm_loadu_si128(R_CAST(const __m128i*, &(m2_uc[N - 16])));
- return (_mm_movemask_epi8(_mm_cmpeq_epi8(v1, v2)) == 65535);
+ return m128is_are_equal(v1, v2);
}
};
@@ -3578,7 +3724,7 @@ HEADER_INLINE char* strcpya(char* __restrict dst, const void* __restrict src) {
return memcpya(dst, src, slen);
}
-#if defined(__LP64__) && (__cplusplus >= 201103L)
+#if __cplusplus >= 201103L
constexpr uint32_t CompileTimeSlen(const char* k_str) {
return k_str[0]? (1 + CompileTimeSlen(&(k_str[1]))) : 0;
}
@@ -3900,6 +4046,18 @@ HEADER_INLINE void ZeroHwArr(uintptr_t entry_ct, Halfword* hwarr) {
memset(hwarr, 0, entry_ct * sizeof(Halfword));
}
+HEADER_INLINE void ZeroFArr(uintptr_t entry_ct, float* farr) {
+ for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) {
+ *farr++ = 0.0;
+ }
+}
+
+HEADER_INLINE void ZeroDArr(uintptr_t entry_ct, double* darr) {
+ for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) {
+ *darr++ = 0.0;
+ }
+}
+
HEADER_INLINE void SetAllWArr(uintptr_t entry_ct, uintptr_t* warr) {
// todo: test this against vecset()
for (uintptr_t idx = 0; idx != entry_ct; ++idx) {
@@ -3907,6 +4065,12 @@ HEADER_INLINE void SetAllWArr(uintptr_t entry_ct, uintptr_t* warr) {
}
}
+HEADER_INLINE void SetAllU32Arr(uintptr_t entry_ct, uint32_t* u32arr) {
+ for (uintptr_t ulii = 0; ulii != entry_ct; ulii++) {
+ *u32arr++ = ~0U;
+ }
+}
+
// tried _bzhi_u64() in AVX2 case, it was actually worse on my Mac (more
// opaque to compiler?)
@@ -4157,7 +4321,7 @@ HEADER_INLINE uint32_t VintBytect(uintptr_t ulii) {
// enum base type to make it safe for the enum to serve as the flagset type.
// * Implicit conversion to int is not prevented for now, since I'm trying to
// keep PglErr-style code duplication to a minimum.
-#if __cplusplus >= 201103L
+#ifdef CPP11_TYPE_ENFORCEMENT
// could avoid the typedef here, but that leads to a bit more verbosity.
# define FLAGSET_DEF_START() typedef enum : uint32_t {
@@ -4269,7 +4433,7 @@ private: \
# define ENUM_U31_DEF_START() typedef enum : uint32_t {
# define ENUM_U31_DEF_END(tname) } tname
-#else // !__cplusplus >= 201103L
+#else // !CPP11_TYPE_ENFORCEMENT
# define FLAGSET_DEF_START() enum {
# define FLAGSET_DEF_END(tname) } ; \
diff --git a/external_libs/pgenlib/include/plink2_bits.cc b/external_libs/pgenlib/include/plink2_bits.cc
index 08ef9de5..0b2a5048 100644
--- a/external_libs/pgenlib/include/plink2_bits.cc
+++ b/external_libs/pgenlib/include/plink2_bits.cc
@@ -14,7 +14,6 @@
// You should have received a copy of the GNU Lesser General Public License
// along with this library. If not, see .
-
#include "plink2_bits.h"
#ifdef __cplusplus
@@ -387,7 +386,7 @@ uintptr_t NextNonmissingUnsafe(const uintptr_t* genoarr, uintptr_t loc) {
*/
uint32_t AdvBoundedTo1Bit(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil) {
- // safe version.
+ // Can overread a single word if loc == ceil.
const uintptr_t* bitarr_iter = &(bitarr[loc / kBitsPerWord]);
uintptr_t ulii = (*bitarr_iter) >> (loc % kBitsPerWord);
if (ulii) {
@@ -406,7 +405,7 @@ uint32_t AdvBoundedTo1Bit(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil)
}
uintptr_t AdvBoundedTo0Bit(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil) {
- assert(ceil >= 1);
+ // Can overread a single word if loc == ceil.
const uintptr_t* bitarr_ptr = &(bitarr[loc / kBitsPerWord]);
uintptr_t ulii = (~(*bitarr_ptr)) >> (loc % kBitsPerWord);
if (ulii) {
@@ -1832,6 +1831,27 @@ uintptr_t PopcountBytesMasked(const void* bitarr, const uintptr_t* mask_arr, uin
#endif
}
+uintptr_t PopcountBitRange(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) {
+ uintptr_t start_idxl = start_idx / kBitsPerWord;
+ const uintptr_t start_idxlr = start_idx & (kBitsPerWord - 1);
+ const uintptr_t end_idxl = end_idx / kBitsPerWord;
+ const uintptr_t end_idxlr = end_idx & (kBitsPerWord - 1);
+ uintptr_t ct = 0;
+ if (start_idxl == end_idxl) {
+ return PopcountWord(bitvec[start_idxl] & ((k1LU << end_idxlr) - (k1LU << start_idxlr)));
+ }
+ if (start_idxlr) {
+ ct = PopcountWord(bitvec[start_idxl++] >> start_idxlr);
+ }
+ if (end_idxl > start_idxl) {
+ ct += PopcountWordsNzbase(bitvec, start_idxl, end_idxl);
+ }
+ if (end_idxlr) {
+ ct += PopcountWord(bzhi(bitvec[end_idxl], end_idxlr));
+ }
+ return ct;
+}
+
void FillCumulativePopcounts(const uintptr_t* subset_mask, uint32_t word_ct, uint32_t* cumulative_popcounts) {
assert(word_ct);
const uint32_t word_ct_m1 = word_ct - 1;
@@ -2066,6 +2086,7 @@ void TransposeBitblock64(const uintptr_t* read_iter, uintptr_t read_ul_stride, u
// buf0 and buf1 must both be 32KiB vector-aligned buffers when
// kCacheline==64, and 128KiB when kCacheline==128.
+ // todo: better ARM implementation
const uint32_t buf0_row_ct = DivUp(write_row_ct, 64);
{
uintptr_t* buf0_ul = DowncastVecWToW(buf0);
@@ -2916,7 +2937,8 @@ void Reduce8to4bitInplaceUnsafe(uintptr_t entry_ct, uintptr_t* mainvec) {
vmainvec[write_vidx] = vecw_gather_even(v0, v1, m8);
}
uintptr_t write_idx = fullvec_ct * kWordsPerVec;
- if (write_idx == entry_ct * 2) {
+ // bugfix (9 Jun 2025): mixed up units in this comparison
+ if (write_idx * kBitsPerWordD4 == entry_ct) {
return;
}
#else
@@ -2940,6 +2962,100 @@ void Reduce8to4bitInplaceUnsafe(uintptr_t entry_ct, uintptr_t* mainvec) {
mainvec[write_idx] = bzhi_max(write_word, remaining_entry_ct * 4);
}
+// advances forward_ct set bits; forward_ct must be positive. (stays put if
+// forward_ct == 1 and current bit is set. may want to tweak this interface,
+// easy to introduce off-by-one bugs...)
+// In usual 64-bit case, also assumes bitvec is 16-byte aligned and the end of
+// the trailing 16-byte block can be safely read from.
+uintptr_t FindNth1BitFrom(const uintptr_t* bitvec, uintptr_t cur_pos, uintptr_t forward_ct) {
+ assert(forward_ct);
+ uintptr_t widx = cur_pos / kBitsPerWord;
+ uintptr_t ulii = cur_pos % kBitsPerWord;
+ const uintptr_t* bptr = &(bitvec[widx]);
+ uintptr_t uljj;
+ uintptr_t ulkk;
+#ifdef __LP64__
+ const VecW* vptr;
+ assert(IsVecAligned(bitvec));
+#endif
+ if (ulii) {
+ uljj = (*bptr) >> ulii;
+ ulkk = PopcountWord(uljj);
+ if (ulkk >= forward_ct) {
+ FindNth1BitFrom_finish:
+ return widx * kBitsPerWord + ulii + WordBitIdxToUidx(uljj, forward_ct - 1);
+ }
+ forward_ct -= ulkk;
+ ++widx;
+ ++bptr;
+ }
+ ulii = 0;
+#ifdef __LP64__
+ while (widx & (kWordsPerVec - k1LU)) {
+ uljj = *bptr;
+ ulkk = PopcountWord(uljj);
+ if (ulkk >= forward_ct) {
+ goto FindNth1BitFrom_finish;
+ }
+ forward_ct -= ulkk;
+ ++widx;
+ ++bptr;
+ }
+ vptr = R_CAST(const VecW*, bptr);
+#ifdef USE_AVX2
+ while (forward_ct > kBitsPerWord * (16 * kWordsPerVec)) {
+ uljj = (forward_ct - 1) / (kBitsPerWord * kWordsPerVec);
+ ulkk = PopcountVecsAvx2(vptr, uljj);
+ vptr = &(vptr[uljj]);
+ forward_ct -= ulkk;
+ }
+#else
+ while (forward_ct > kBitsPerWord * (3 * kWordsPerVec)) {
+ uljj = ((forward_ct - 1) / (kBitsPerWord * (3 * kWordsPerVec))) * 3;
+ // yeah, yeah, this is suboptimal if we have SSE4.2
+ ulkk = PopcountVecsNoAvx2(vptr, uljj);
+ vptr = &(vptr[uljj]);
+ forward_ct -= ulkk;
+ }
+#endif
+ bptr = R_CAST(const uintptr_t*, vptr);
+ while (forward_ct > kBitsPerWord) {
+ forward_ct -= PopcountWord(*bptr++);
+ }
+#else
+ while (forward_ct > kBitsPerWord) {
+ uljj = (forward_ct - 1) / kBitsPerWord;
+ ulkk = PopcountWords(bptr, uljj);
+ bptr = &(bptr[uljj]);
+ forward_ct -= ulkk;
+ }
+#endif
+ for (; ; ++bptr) {
+ uljj = *bptr;
+ ulkk = PopcountWord(uljj);
+ if (ulkk >= forward_ct) {
+ widx = bptr - bitvec;
+ goto FindNth1BitFrom_finish;
+ }
+ forward_ct -= ulkk;
+ }
+}
+
+void FillU32SubsetStarts(const uintptr_t* subset, uint32_t thread_ct, uint32_t start, uint64_t bit_ct, uint32_t* starts) {
+ assert(bit_ct);
+ uint32_t uidx = AdvTo1Bit(subset, start);
+ uint32_t prev_bit_idx = 0;
+ starts[0] = uidx;
+ for (uint32_t tidx = 1; tidx != thread_ct; ++tidx) {
+ const uint32_t bit_idx = (tidx * bit_ct) / thread_ct;
+ if (bit_idx != prev_bit_idx) {
+ uidx = FindNth1BitFrom(subset, uidx + 1, bit_idx - prev_bit_idx);
+ prev_bit_idx = bit_idx;
+ }
+ starts[tidx] = uidx;
+ }
+}
+
#ifdef __cplusplus
} // namespace plink2
#endif
diff --git a/external_libs/pgenlib/include/plink2_bits.h b/external_libs/pgenlib/include/plink2_bits.h
index c86d2ee6..3026bfc7 100644
--- a/external_libs/pgenlib/include/plink2_bits.h
+++ b/external_libs/pgenlib/include/plink2_bits.h
@@ -1,7 +1,7 @@
#ifndef __PLINK2_BITS_H__
#define __PLINK2_BITS_H__
-// This library is part of PLINK 2.0, copyright (C) 2005-2024 Shaun Purcell,
+// This library is part of PLINK 2.0, copyright (C) 2005-2025 Shaun Purcell,
// Christopher Chang.
//
// This library is free software: you can redistribute it and/or modify it
@@ -20,6 +20,10 @@
// Bitarray support. (Inline single-word operations are in plink2_base.h.)
+#include
+#include
+#include
+
#include "plink2_base.h"
#ifdef __cplusplus
@@ -106,8 +110,10 @@ uintptr_t AdvTo0Bit(const uintptr_t* bitarr, uintptr_t loc);
// uintptr_t NextNonmissingUnsafe(const uintptr_t* genoarr, uintptr_t loc);
+// Can overread a single word if loc == ceil.
uint32_t AdvBoundedTo1Bit(const uintptr_t* bitarr, uint32_t loc, uint32_t ceil);
+// Can overread a single word if loc == ceil.
uintptr_t AdvBoundedTo0Bit(const uintptr_t* bitarr, uintptr_t loc, uintptr_t ceil);
uintptr_t FindLast1BitBefore(const uintptr_t* bitarr, uintptr_t loc);
@@ -284,6 +290,35 @@ uintptr_t PopcountWordsXor(const uintptr_t* __restrict bitvec1_iter, const uintp
// uintptr_t PopcountWordsIntersect3(const uintptr_t* __restrict bitvec1_iter, const uintptr_t* __restrict bitvec2_iter, const uintptr_t* __restrict bitvec3_iter, uintptr_t word_ct);
+#ifdef __LP64__
+HEADER_INLINE uintptr_t PopcountWordsNzbase(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) {
+ uintptr_t prefix_ct = 0;
+# ifdef USE_AVX2
+ while (start_idx & 3) {
+ if (end_idx == start_idx) {
+ return prefix_ct;
+ }
+ prefix_ct += PopcountWord(bitvec[start_idx++]);
+ }
+# else
+ if (start_idx & 1) {
+ if (end_idx == start_idx) {
+ return 0;
+ }
+ prefix_ct = PopcountWord(bitvec[start_idx++]);
+ }
+# endif // USE_AVX2
+ return prefix_ct + PopcountWords(&(bitvec[start_idx]), end_idx - start_idx);
+}
+#else
+HEADER_INLINE uintptr_t PopcountWordsNzbase(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx) {
+ return PopcountWords(&(bitvec[start_idx]), end_idx - start_idx);
+}
+#endif
+
+// start_idx == end_idx ok
+uintptr_t PopcountBitRange(const uintptr_t* bitvec, uintptr_t start_idx, uintptr_t end_idx);
+
// requires positive word_ct
// stay agnostic a bit longer re: word_ct := DIV_UP(entry_ct, kBitsPerWord)
// vs. word_ct := 1 + (entry_ct / kBitsPerWord)
@@ -636,6 +671,15 @@ HEADER_INLINE void AssignNybblearrEntry(uint32_t idx, uintptr_t newval, uintptr_
// positive.
void Reduce8to4bitInplaceUnsafe(uintptr_t entry_ct, uintptr_t* mainvec);
+// forward_ct must be positive. Stays put if forward_ct == 1 and current bit
+// is set.
+// In usual 64-bit case, also assumes bitvec is vector aligned.
+uintptr_t FindNth1BitFrom(const uintptr_t* bitvec, uintptr_t cur_pos, uintptr_t forward_ct);
+
+// This function assumes (bit_ct * (thread_ct - 1)) < 2^64.
+// bit_ct must be positive, but can be smaller than thread_ct
+void FillU32SubsetStarts(const uintptr_t* subset, uint32_t thread_ct, uint32_t start, uint64_t bit_ct, uint32_t* starts);
+
#ifdef __cplusplus
} // namespace plink2
#endif
diff --git a/src/Data.cpp b/src/Data.cpp
index d5e49d2f..25a69238 100755
--- a/src/Data.cpp
+++ b/src/Data.cpp
@@ -2040,11 +2040,16 @@ void Data::print_test_info(){
if(params.write_masks) {
- bm.write_info(¶ms, &in_filters, sout);
- sout << " * user specified to write masks (in PLINK bed format)\n";
- if(params.dosage_mode)
- sout << " +dosages will be converted to hardcalls\n";
- if(params.write_setlist)
+ if(params.mask_format_pgen) {
+ bm.write_info_pgen(¶ms, &in_filters, sout);
+ sout << " * user specified to write masks (in PLINK2 pgen format with dosage support)\n";
+ } else {
+ bm.write_info(¶ms, &in_filters, sout);
+ sout << " * user specified to write masks (in PLINK bed format)\n";
+ if(params.dosage_mode)
+ sout << " +dosages will be converted to hardcalls\n";
+ }
+ if(params.write_setlist)
bm.prep_setlists(files.new_sets, files.out_file, sout);
}
diff --git a/src/Masks.cpp b/src/Masks.cpp
index 632f8140..2a4988bb 100644
--- a/src/Masks.cpp
+++ b/src/Masks.cpp
@@ -58,6 +58,7 @@ void GenoMask::prep_run(struct param& params, struct in_files const& files){
vc_collapse_MAC = params.skat_collapse_MAC;
w_vc_cust_weights = params.vc_with_weights;
write_masks = params.write_masks;
+ mask_format_pgen = params.mask_format_pgen;
write_snplist = params.write_mask_snplist;
force_singleton = params.aaf_file_wSingletons;
verbose = params.verbose || params.debug;
@@ -701,8 +702,14 @@ void GenoMask::computeMasks(struct param* params, struct filter* filters, const
tmpsnp.allele2 = buffer.str();
if(write_masks && in_bed(index_start)) {
- write_genovec(index_start);
- write_genobim(tmpsnp);
+ if(mask_format_pgen) {
+ write_pgen_variant(index_start);
+ write_pgen_pvar(tmpsnp);
+ pgen_variant_ct++;
+ } else {
+ write_genovec(index_start);
+ write_genobim(tmpsnp);
+ }
if(write_setlist) append_setlist(index_start, tmpsnp.ID);
}
@@ -959,7 +966,7 @@ void GenoMask::buildMask(int const& isnp, int const& chrom, uint32_t const& phys
if(take_comphet) maskvec = maskvec.min(2);
// if dosages were given and writing to PLINK bed, convert dosages to hardcalls
- if(params->dosage_mode && write_masks) maskvec = maskvec.round();
+ if(params->dosage_mode && write_masks && !mask_format_pgen) maskvec = maskvec.round();
// get counts
for (int i = 0, index = 0; i < filters->ind_ignore.size(); i++) {
@@ -1015,7 +1022,12 @@ void GenoMask::buildMask(int const& isnp, int const& chrom, uint32_t const& phys
maskvec(index++) = ds;
}
//cerr << maskvec.matrix().transpose().array().head(5) << endl << endl << maskvec.mean()<nChrom) mac = total; // use MAC assuming diploid coding
@@ -1167,6 +1179,72 @@ void GenoMask::write_famfile(struct param* params, struct filter const* filters,
params->FIDvec.clear();
}
+void GenoMask::write_info_pgen(struct param* params, struct filter const* filters, mstream& sout){
+ using namespace plink2;
+
+ uint32_t sample_ct = filters->ind_in_analysis.count();
+
+ // Write .psam file (PLINK2 sample file with header)
+ string psam_fname = gfile_prefix + ".psam";
+ Files psam_out;
+ psam_out.openForWrite(psam_fname, sout);
+ psam_out << "#FID\tIID\tSEX\n";
+ for (int i = 0; i < filters->ind_in_analysis.size(); i++) {
+ if( filters->ind_in_analysis(i) ){
+ psam_out <<
+ params->FIDvec[i][0] << "\t" <<
+ params->FIDvec[i][1] << "\t" <<
+ params->sex(i) << "\n";
+ }
+ }
+ psam_out.closeFile();
+
+ // Initialize pgen writer
+ PreinitSpgw(&spgw);
+ uintptr_t alloc_cacheline_ct;
+ uint32_t max_vrec_len;
+
+ PglErr reterr = SpgwInitPhase1(
+ (gfile_prefix + ".pgen").c_str(),
+ nullptr, // allele_idx_offsets (biallelic)
+ nullptr, // explicit_nonref_flags
+ nmasks_total, // variant_ct_limit
+ sample_ct,
+ 2, // allele_ct_upper_bound (biallelic)
+ kPgenWriteBackwardSeek, // write mode
+ kfPgenGlobalDosagePresent, // phase_dosage_gflags
+ 0, // nonref_flags_storage
+ &spgw,
+ &alloc_cacheline_ct,
+ &max_vrec_len
+ );
+
+ if(reterr != kPglRetSuccess) {
+ throw "failed to initialize pgen writer";
+ }
+
+ // Allocate memory
+ spgw_alloc = (unsigned char*) malloc(alloc_cacheline_ct * kCacheline);
+ SpgwInitPhase2(max_vrec_len, &spgw, spgw_alloc);
+
+ // Initialize dosage storage
+ dosage_vals.resize(nmasks_total);
+ dosage_present.resize(nmasks_total);
+ dosage_counts.resize(nmasks_total);
+
+ size_t dosage_present_size = (sample_ct + kBitsPerWord - 1) / kBitsPerWord;
+ for(int i = 0; i < nmasks_total; i++) {
+ dosage_vals[i].resize(sample_ct);
+ dosage_present[i].resize(dosage_present_size);
+ std::fill(dosage_present[i].begin(), dosage_present[i].end(), 0);
+ }
+
+ // Open .pvar file
+ string fname = gfile_prefix + ".pvar";
+ openStream(&outfile_bim, fname, std::ios::out, sout);
+ outfile_bim << "#CHROM\tPOS\tID\tREF\tALT\n";
+}
+
void GenoMask::reset_gvec(){
gvec.resize(nmasks_total);
for(int i = 0; i < nmasks_total; i++)
@@ -1184,7 +1262,7 @@ void GenoMask::make_genovec(int const& isnp, Ref mask, struct fil
if( !filters->ind_in_analysis(i) ) continue;
// round to nearest int
- hc = (int) (mask(i) + 0.5);
+ hc = (int) (mask(i) + 0.5);
// using 'ref-last':
// 00 -> hom. alt
@@ -1226,6 +1304,84 @@ void GenoMask::write_genovec(int const& isnp){
}
+void GenoMask::make_pgen_dosage(int const& isnp, Ref mask, struct filter const* filters){
+ using namespace plink2;
+
+ uint32_t dosage_ct = 0;
+
+ // Build densely packed dosage array (only non-missing samples)
+ for(int i = 0, index = 0; i < mask.size(); i++){
+ if( !filters->ind_in_analysis(i) ) continue;
+
+ double dosage_dbl = mask(i);
+
+ // Convert double to uint16_t dosage (0-32768 range, 16384 = 1.0, 32768 = 2.0)
+ uint16_t dosage_val = (uint16_t)(dosage_dbl * 16384.0 + 0.5);
+
+ // Set dosage_present bit for all samples (including 0 dosages)
+ size_t word_idx = index / kBitsPerWord;
+ size_t bit_idx = index % kBitsPerWord;
+ dosage_present[isnp][word_idx] |= (1ULL << bit_idx);
+
+ // Store in packed array at position dosage_ct
+ dosage_vals[isnp][dosage_ct] = dosage_val;
+ dosage_ct++;
+
+ index++;
+ }
+
+ dosage_counts[isnp] = dosage_ct;
+}
+
+void GenoMask::write_pgen_variant(int const& isnp){
+ using namespace plink2;
+
+ // Create hardcall genovec (all missing for now - can optimize later)
+ uint32_t sample_ct = SpgwGetSampleCt(&spgw);
+ size_t genovec_size = NypCtToVecCt(sample_ct) * kBytesPerVec;
+ uintptr_t* genovec = (uintptr_t*) calloc(genovec_size, 1); // Initialize to zero
+
+ // Set all sample bits to missing (0b11), but leave trailing bits zero
+ // Each sample occupies 2 bits (a "nyp" in plink2 terms), so sample_ct samples span 2*sample_ct bits
+ uint32_t full_word_ct = sample_ct / kBitsPerWordD2;
+ uint32_t remainder_nyp = sample_ct % kBitsPerWordD2;
+
+ // Set full words to all missing (all bits = 1, so 0b11 for each sample)
+ for(uint32_t i = 0; i < full_word_ct; i++) {
+ genovec[i] = ~k0LU; // All bits set
+ }
+
+ // Set remainder samples to missing, but clear trailing bits beyond sample_ct
+ if(remainder_nyp > 0) {
+ // Create mask: lower (remainder_nyp * 2) bits set to 1 (0b11 for each sample), rest zero
+ // This ensures bits beyond sample_ct are zero (required by pgenlib assertion)
+ uintptr_t mask = (~k0LU) >> (kBitsPerWord - (2 * remainder_nyp));
+ genovec[full_word_ct] = mask;
+ }
+
+ PglErr reterr = SpgwAppendBiallelicGenovecDosage16(
+ genovec,
+ dosage_present[isnp].data(),
+ dosage_vals[isnp].data(),
+ dosage_counts[isnp],
+ &spgw
+ );
+
+ free(genovec);
+
+ if(reterr != kPglRetSuccess) {
+ throw "failed to write pgen variant";
+ }
+}
+
+void GenoMask::write_pgen_pvar(struct snp const& tsnp){
+ // Write variant info to pvar file
+ // Format: #CHROM POS ID REF ALT
+ outfile_bim << tsnp.chrom << "\t" << tsnp.physpos << "\t"
+ << tsnp.ID << "\t" << tsnp.allele1 << "\t"
+ << tsnp.allele2 << endl;
+}
+
// get list of indices for each mask (across all AAF bins)
void GenoMask::build_map(map>& mask_map){
@@ -1367,8 +1523,23 @@ void GenoMask::make_setlist(string const& sname, int const& chr, uint32_t const&
}
void GenoMask::closeFiles(){
+ using namespace plink2;
+
outfile_bim.close();
- outfile_bed.close();
+ if(mask_format_pgen) {
+ // Update variant count to actual number written (fixes assertion in SpgwFinish)
+ PgenWriterCommon* pwcp = &GET_PRIVATE(spgw, pwc);
+ pwcp->variant_ct_limit = pgen_variant_ct;
+ PglErr reterr = kPglRetSuccess;
+ SpgwFinish(&spgw);
+ CleanupSpgw(&spgw, &reterr);
+ if(spgw_alloc) {
+ free(spgw_alloc);
+ spgw_alloc = nullptr;
+ }
+ } else {
+ outfile_bed.close();
+ }
if(write_setlist){
for(size_t i = 0; i < setfiles.size(); i++)
setfiles[i]->closeFile();
diff --git a/src/Masks.hpp b/src/Masks.hpp
index a11bc59a..1430b44d 100644
--- a/src/Masks.hpp
+++ b/src/Masks.hpp
@@ -27,6 +27,7 @@
#ifndef MASK_H
#define MASK_H
+#include "../external_libs/pgenlib/include/pgenlib_write.h"
class GenoMask {
@@ -46,6 +47,14 @@ class GenoMask {
Files snplist_out;
std::vector > list_snps;//contains snplist
+ // For pgen output
+ plink2::STPgenWriter spgw;
+ unsigned char* spgw_alloc;
+ bool mask_format_pgen = false;
+ std::vector> dosage_vals; // dosages in pgen format
+ std::vector> dosage_present; // bitarray for dosages
+ std::vector dosage_counts; // count per mask
+ uint32_t pgen_variant_ct = 0; // actual number of variants written to pgen
double tol = 1e-6;
double minAAF = 1e-7, default_aaf = .01;
int n_aaf_bins, max_aaf_bins = 12, nmasks_total;
@@ -93,6 +102,10 @@ class GenoMask {
void write_genobim(struct snp const&);
void setAllBitsZero(const int&);
void setAllBitsOne(const int&);
+ void write_info_pgen(struct param*, struct filter const*, mstream&);
+ void make_pgen_dosage(const int&, Eigen::Ref, struct filter const*);
+ void write_pgen_variant(const int&);
+ void write_pgen_pvar(struct snp const&);
std::string build_header();
void build_map(std::map>&);
void prep_snplist(const std::string&,mstream& sout);
diff --git a/src/Regenie.cpp b/src/Regenie.cpp
index ff2c0477..877ec346 100755
--- a/src/Regenie.cpp
+++ b/src/Regenie.cpp
@@ -243,7 +243,8 @@ void read_params_and_check(int& argc, char *argv[], struct param* params, struct
("weights-col", "column index (1-based) for user-defined weights in annotation file", cxxopts::value(params->vc_weight_col))
("joint", "comma spearated list of joint tests to perform", cxxopts::value(params->burden),"STRING")
("singleton-carrier", "define singletons as variants with a single carrier in the sample")
- ("write-mask", "write masks in PLINK bed/bim/fam format")
+ ("write-mask", "write masks to file")
+ ("mask-format", "format for writing masks: 'bed' (PLINK bed/bim/fam, default) or 'pgen' (PLINK2 pgen/pvar/psam with dosage support)", cxxopts::value(), "STRING")
("mask-lovo", "apply Leave-One-Variant-Out (LOVO) scheme when building masks (,,)", cxxopts::value(),"STRING")
("mask-lodo", "apply Leave-One-Domain-Out (LODO) scheme when building masks (,,)", cxxopts::value(),"STRING")
("skip-test", "skip computing association tests after building masks")
@@ -500,6 +501,16 @@ void read_params_and_check(int& argc, char *argv[], struct param* params, struct
if( vm.count("mask-lovo") ) params->mask_loo = true;
if( vm.count("mask-lodo") ) params->mask_lodo = true;
if( vm.count("write-mask") ) params->write_masks = true;
+ if( vm.count("mask-format") ) {
+ std::string format = vm["mask-format"].as();
+ if(format == "pgen") {
+ params->mask_format_pgen = true;
+ } else if(format == "bed") {
+ params->mask_format_pgen = false;
+ } else {
+ throw "invalid value for --mask-format: must be 'bed' or 'pgen'";
+ }
+ }
if( vm.count("write-setlist") ) params->write_setlist = true;
if( vm.count("write-mask-snplist") ) params->write_mask_snplist = true;
if( vm.count("skip-test") ) params->skip_test = true;
@@ -971,7 +982,10 @@ void read_params_and_check(int& argc, char *argv[], struct param* params, struct
}
if(params->write_masks){
sout << "WARNING: cannot use --write-mask with --mask-lovo.\n";
- params->write_masks = false; valid_args[ "write-mask" ] = false;
+ params->write_masks = false;
+ params->mask_format_pgen = false;
+ valid_args[ "write-mask" ] = false;
+ valid_args[ "mask-format" ] = false;
}
if(params->joint_test)
throw "cannot use --joint with --mask-lovo";
@@ -996,7 +1010,10 @@ void read_params_and_check(int& argc, char *argv[], struct param* params, struct
}
if(params->write_masks){
sout << "WARNING: cannot use --write-mask with --mask-lodo.\n";
- params->write_masks = false; valid_args[ "write-mask" ] = false;
+ params->write_masks = false;
+ params->mask_format_pgen = false;
+ valid_args[ "write-mask" ] = false;
+ valid_args[ "mask-format" ] = false;
}
valid_args[ "vc-maxAAF" ] = valid_args[ "aaf-bins" ] = false;
}
@@ -1014,6 +1031,10 @@ void read_params_and_check(int& argc, char *argv[], struct param* params, struct
if(params->build_mask) throw "option --force-mac-filter cannot be used when building masks";
} else valid_args[ "force-mac-filter" ] = false;
+ if(vm.count("mask-format") && !params->write_masks){
+ sout << "ERROR: --mask-format can only be used with --write-mask.\n";
+ exit(EXIT_FAILURE);
+ }
if(!params->build_mask && params->write_masks) {params->write_masks = false; valid_args[ "write-mask" ] = false;}
if(!params->build_mask && params->check_mask_files) {params->check_mask_files = false; valid_args[ "check-burden-files" ] = false;}
if(!params->build_mask && params->strict_check_burden) {params->strict_check_burden = false; valid_args[ "strict-check-burden" ] = false;}
diff --git a/src/Regenie.hpp b/src/Regenie.hpp
index b0d0c393..c78f6ec2 100644
--- a/src/Regenie.hpp
+++ b/src/Regenie.hpp
@@ -383,7 +383,7 @@ struct param {
bool aaf_file_wSingletons = false;//for choosing snps in singleton masks
bool singleton_carriers = false; // carrier count used to define singletons
uint64 max_bsize = 0; // number of SNPs per variant set
- bool write_masks = false, write_setlist = false, write_mask_snplist = false; //write masks to bed file
+ bool write_masks = false, mask_format_pgen = false, write_setlist = false, write_mask_snplist = false; //write masks to bed/pgen file
bool mask_loo = false, mask_lodo = false;
bool use_max_bsize = false; // set bsize to max set size
bool p_joint_only = false;
diff --git a/src/Step2_Models.cpp b/src/Step2_Models.cpp
index f64026ce..f8fc4047 100644
--- a/src/Step2_Models.cpp
+++ b/src/Step2_Models.cpp
@@ -2683,8 +2683,12 @@ std::string print_summary(Files* ofile, string const& out, std::vector