From 0ab98c166770aaf974cbeb4df598a5479f45f6f7 Mon Sep 17 00:00:00 2001 From: Nae-Chyun Chen Date: Thu, 11 Sep 2025 21:48:02 -0700 Subject: [PATCH 1/2] Add index size logging with per-contig stats --- src/chain.cpp | 32 ++++++++++++++++++++++++++++ src/chain.hpp | 1 + src/leviosam.cpp | 17 +++++++++++++-- src/leviosam.hpp | 54 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 102 insertions(+), 2 deletions(-) diff --git a/src/chain.cpp b/src/chain.cpp index d1d9450..9c0bb8b 100644 --- a/src/chain.cpp +++ b/src/chain.cpp @@ -210,6 +210,38 @@ bool ChainMap::interval_map_sanity_check() { return true; } +void ChainMap::log_index_size(size_t bytes) const { + size_t total_intervals = 0; + size_t total_bases = 0; + std::vector>> stats; + stats.reserve(interval_map.size()); + for (const auto &kv : interval_map) { + size_t n = kv.second.size(); + size_t bases = 0; + for (const auto &intvl : kv.second) { + bases += static_cast(intvl.source_end) - + static_cast(intvl.source_start); + } + total_intervals += n; + total_bases += bases; + stats.push_back({kv.first, {n, bases}}); + } + size_t avg_len = total_intervals ? total_bases / total_intervals : 0; + std::cerr << "[I::log_index_size] contigs=" << interval_map.size() + << " intervals=" << total_intervals << " bases=" << total_bases + << " avg_len=" << avg_len; + if (bytes) + std::cerr << " bytes=" << bytes; + std::cerr << "\n"; + for (const auto &s : stats) { + size_t avg = s.second.first ? s.second.second / s.second.first : 0; + std::cerr << "[I::log_index_size] " << s.first + << " intervals=" << s.second.first + << " bases=" << s.second.second + << " avg_len=" << avg << "\n"; + } +} + /* Get the rank in the start bitvector at contig[pos] * If pos > len(contig): return rank(contig[-1]) * diff --git a/src/chain.hpp b/src/chain.hpp index cc23e0a..f338463 100644 --- a/src/chain.hpp +++ b/src/chain.hpp @@ -63,6 +63,7 @@ class ChainMap { void debug_print_interval_map(); void debug_print_intervals(std::string contig, const int n); bool interval_map_sanity_check(); + void log_index_size(size_t bytes = 0) const; int get_start_rank(std::string contig, int pos); int get_end_rank(std::string contig, int pos); bool update_interval_indexes(const std::string contig, const int32_t pos, diff --git a/src/leviosam.cpp b/src/leviosam.cpp index abe74ac..b2c1f7a 100644 --- a/src/leviosam.cpp +++ b/src/leviosam.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include "aln.hpp" #include "collate.hpp" @@ -112,7 +113,8 @@ void serialize_run(lift_opts args) { args.haplotype, args.name_map, args.length_map)); std::ofstream o(fn_index, std::ios::binary); - l.serialize(o); + size_t bytes = l.serialize(o); + l.log_index_size(bytes); std::cerr << "[I::serialize_run] levioSAM VcfMap saved to " << fn_index << "\n"; // ChainMap @@ -121,7 +123,8 @@ void serialize_run(lift_opts args) { chain::ChainMap cfp(args.chain_fname, args.verbose, args.allowed_cigar_changes, args.length_map); std::ofstream o(fn_index, std::ios::binary); - cfp.serialize(o); + size_t bytes = cfp.serialize(o); + cfp.log_index_size(bytes); std::cerr << "[I::serialize_run] levioSAM ChainMap saved to " << fn_index << "\n"; } else { @@ -299,10 +302,13 @@ std::map load_fasta(std::string ref_name) { } void lift_run(lift_opts args) { + size_t chain_bytes = 0; chain::ChainMap chain_map = [&] { if (args.chainmap_fname != "") { std::cerr << "[I::lift_run] Loading levioSAM index..."; std::ifstream in(args.chainmap_fname, std::ios::binary); + std::ifstream fs(args.chainmap_fname, std::ios::binary | std::ios::ate); + chain_bytes = fs.tellg(); return chain::ChainMap(in, args.verbose, args.allowed_cigar_changes); } else if (args.chain_fname != "") { @@ -319,10 +325,13 @@ void lift_run(lift_opts args) { return chain::ChainMap(); } }(); + size_t lift_bytes = 0; lift::LiftMap lift_map = [&] { if (args.lift_fname != "") { std::cerr << "[I::lift_run] Loading levioSAM index..."; std::ifstream in(args.lift_fname, std::ios::binary); + std::ifstream fs(args.lift_fname, std::ios::binary | std::ios::ate); + lift_bytes = fs.tellg(); return lift::LiftMap(in); // if "-l" not specified, then create a levioSAM } else if (args.vcf_fname != "") { @@ -349,6 +358,10 @@ void lift_run(lift_opts args) { exit(1); } std::cerr << "done\n"; + if (args.chainmap_fname != "" || args.chain_fname != "") + chain_map.log_index_size(chain_bytes); + if (args.lift_fname != "" || args.vcf_fname != "") + lift_map.log_index_size(lift_bytes); samFile *sam_fp = (args.sam_fname == "") ? sam_open("-", "r") diff --git a/src/leviosam.hpp b/src/leviosam.hpp index 7a198e6..9bf2f51 100644 --- a/src/leviosam.hpp +++ b/src/leviosam.hpp @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -395,6 +396,21 @@ class Lift { : ins_rs0(ins.size() - 1) + 1; } + size_t count_ins() const { + sdsl::sd_vector<>::rank_1_type r(&ins); + return r(ins.size()); + } + + size_t count_del() const { + sdsl::sd_vector<>::rank_1_type r(&del); + return r(del.size()); + } + + size_t count_snp() const { + sdsl::sd_vector<>::rank_1_type r(&snp); + return r(snp.size()); + } + // returns size of s2 sequence // size_t s2_len() { // return del[del.size() - 1] ? del_rs0(del.size() - 1) : @@ -769,6 +785,44 @@ class LiftMap { length_map = LevioSamUtils::load_lengthmap(in); } + void log_index_size(size_t bytes = 0) const { + struct CStats { + size_t ins; + size_t del; + size_t snp; + }; + std::vector> stats; + stats.reserve(s2_map.size()); + size_t total_ins = 0, total_del = 0, total_snp = 0; + for (const auto &kv : s2_map) { + const auto &l = lmap[kv.second]; + size_t ins = l.count_ins(); + size_t del = l.count_del(); + size_t snp = l.count_snp(); + total_ins += ins; + total_del += del; + total_snp += snp; + stats.push_back({kv.first, {ins, del, snp}}); + } + size_t total_var = total_ins + total_del + total_snp; + std::cerr << "[I::log_index_size] contigs=" << s2_map.size() + << " variants=" << total_var + << " ins=" << total_ins + << " del=" << total_del + << " snp=" << total_snp; + if (bytes) + std::cerr << " bytes=" << bytes; + std::cerr << "\n"; + for (const auto &s : stats) { + size_t var = s.second.ins + s.second.del + s.second.snp; + std::cerr << "[I::log_index_size] " << s.first + << " variants=" << var + << " ins=" << s.second.ins + << " del=" << s.second.del + << " snp=" << s.second.snp << "\n"; + } + } + // get names and lengths of s1 sequences std::pair, std::vector> get_s1_lens() { std::vector lengths; From 2eda174fe625ac2d65659b71ebb3f533f13780cd Mon Sep 17 00:00:00 2001 From: milkschen Date: Thu, 9 Oct 2025 14:47:41 -0700 Subject: [PATCH 2/2] remove liftmap changes. minor formatting changes --- src/chain.cpp | 24 ++++++++++++++------- src/leviosam.cpp | 12 +++-------- src/leviosam.hpp | 54 ------------------------------------------------ 3 files changed, 19 insertions(+), 71 deletions(-) diff --git a/src/chain.cpp b/src/chain.cpp index 9c0bb8b..a65e27c 100644 --- a/src/chain.cpp +++ b/src/chain.cpp @@ -210,6 +210,17 @@ bool ChainMap::interval_map_sanity_check() { return true; } +/** + * @brief Logs statistics about the chain index. + * + * This function reports the number of contigs, intervals, bases, and average + * interval length present in the chain interval map. If a nonzero byte count is + * provided, it also logs the memory footprint in bytes. Per-contig statistics, + * such as the number of intervals and bases, as well as the average interval + * length, are reported for each contig. + * + * @param bytes (optional) The size in bytes of the loaded index to report. + */ void ChainMap::log_index_size(size_t bytes) const { size_t total_intervals = 0; size_t total_bases = 0; @@ -227,18 +238,15 @@ void ChainMap::log_index_size(size_t bytes) const { stats.push_back({kv.first, {n, bases}}); } size_t avg_len = total_intervals ? total_bases / total_intervals : 0; - std::cerr << "[I::log_index_size] contigs=" << interval_map.size() + std::cerr << "[I::chain::log_index_size] contigs=" << interval_map.size() << " intervals=" << total_intervals << " bases=" << total_bases << " avg_len=" << avg_len; - if (bytes) - std::cerr << " bytes=" << bytes; - std::cerr << "\n"; + if (bytes) std::cerr << " bytes=" << bytes << "\n"; for (const auto &s : stats) { size_t avg = s.second.first ? s.second.second / s.second.first : 0; - std::cerr << "[I::log_index_size] " << s.first - << " intervals=" << s.second.first - << " bases=" << s.second.second - << " avg_len=" << avg << "\n"; + std::cerr << "[I::chain::log_index_size] " << s.first + << ": intervals=" << s.second.first + << " bases=" << s.second.second << " avg_len=" << avg << "\n"; } } diff --git a/src/leviosam.cpp b/src/leviosam.cpp index b2c1f7a..48c9002 100644 --- a/src/leviosam.cpp +++ b/src/leviosam.cpp @@ -113,8 +113,7 @@ void serialize_run(lift_opts args) { args.haplotype, args.name_map, args.length_map)); std::ofstream o(fn_index, std::ios::binary); - size_t bytes = l.serialize(o); - l.log_index_size(bytes); + l.serialize(o); std::cerr << "[I::serialize_run] levioSAM VcfMap saved to " << fn_index << "\n"; // ChainMap @@ -305,14 +304,14 @@ void lift_run(lift_opts args) { size_t chain_bytes = 0; chain::ChainMap chain_map = [&] { if (args.chainmap_fname != "") { - std::cerr << "[I::lift_run] Loading levioSAM index..."; + std::cerr << "[I::lift_run] Loading levioSAM 2 index..."; std::ifstream in(args.chainmap_fname, std::ios::binary); std::ifstream fs(args.chainmap_fname, std::ios::binary | std::ios::ate); chain_bytes = fs.tellg(); return chain::ChainMap(in, args.verbose, args.allowed_cigar_changes); } else if (args.chain_fname != "") { - std::cerr << "[I::lift_run] Building levioSAM index..."; + std::cerr << "[I::lift_run] Building levioSAM 2 index..."; if (args.length_map.size() == 0) { std::cerr << "[E::lift_run] No length map is found. Please " "set -F properly.\n"; @@ -325,13 +324,10 @@ void lift_run(lift_opts args) { return chain::ChainMap(); } }(); - size_t lift_bytes = 0; lift::LiftMap lift_map = [&] { if (args.lift_fname != "") { std::cerr << "[I::lift_run] Loading levioSAM index..."; std::ifstream in(args.lift_fname, std::ios::binary); - std::ifstream fs(args.lift_fname, std::ios::binary | std::ios::ate); - lift_bytes = fs.tellg(); return lift::LiftMap(in); // if "-l" not specified, then create a levioSAM } else if (args.vcf_fname != "") { @@ -360,8 +356,6 @@ void lift_run(lift_opts args) { std::cerr << "done\n"; if (args.chainmap_fname != "" || args.chain_fname != "") chain_map.log_index_size(chain_bytes); - if (args.lift_fname != "" || args.vcf_fname != "") - lift_map.log_index_size(lift_bytes); samFile *sam_fp = (args.sam_fname == "") ? sam_open("-", "r") diff --git a/src/leviosam.hpp b/src/leviosam.hpp index 9bf2f51..7a198e6 100644 --- a/src/leviosam.hpp +++ b/src/leviosam.hpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -396,21 +395,6 @@ class Lift { : ins_rs0(ins.size() - 1) + 1; } - size_t count_ins() const { - sdsl::sd_vector<>::rank_1_type r(&ins); - return r(ins.size()); - } - - size_t count_del() const { - sdsl::sd_vector<>::rank_1_type r(&del); - return r(del.size()); - } - - size_t count_snp() const { - sdsl::sd_vector<>::rank_1_type r(&snp); - return r(snp.size()); - } - // returns size of s2 sequence // size_t s2_len() { // return del[del.size() - 1] ? del_rs0(del.size() - 1) : @@ -785,44 +769,6 @@ class LiftMap { length_map = LevioSamUtils::load_lengthmap(in); } - void log_index_size(size_t bytes = 0) const { - struct CStats { - size_t ins; - size_t del; - size_t snp; - }; - std::vector> stats; - stats.reserve(s2_map.size()); - size_t total_ins = 0, total_del = 0, total_snp = 0; - for (const auto &kv : s2_map) { - const auto &l = lmap[kv.second]; - size_t ins = l.count_ins(); - size_t del = l.count_del(); - size_t snp = l.count_snp(); - total_ins += ins; - total_del += del; - total_snp += snp; - stats.push_back({kv.first, {ins, del, snp}}); - } - size_t total_var = total_ins + total_del + total_snp; - std::cerr << "[I::log_index_size] contigs=" << s2_map.size() - << " variants=" << total_var - << " ins=" << total_ins - << " del=" << total_del - << " snp=" << total_snp; - if (bytes) - std::cerr << " bytes=" << bytes; - std::cerr << "\n"; - for (const auto &s : stats) { - size_t var = s.second.ins + s.second.del + s.second.snp; - std::cerr << "[I::log_index_size] " << s.first - << " variants=" << var - << " ins=" << s.second.ins - << " del=" << s.second.del - << " snp=" << s.second.snp << "\n"; - } - } - // get names and lengths of s1 sequences std::pair, std::vector> get_s1_lens() { std::vector lengths;