diff --git a/src/chain.cpp b/src/chain.cpp index d1d9450..a65e27c 100644 --- a/src/chain.cpp +++ b/src/chain.cpp @@ -210,6 +210,46 @@ bool ChainMap::interval_map_sanity_check() { return true; } +/** + * @brief Logs statistics about the chain index. + * + * This function reports the number of contigs, intervals, bases, and average + * interval length present in the chain interval map. If a nonzero byte count is + * provided, it also logs the memory footprint in bytes. Per-contig statistics, + * such as the number of intervals and bases, as well as the average interval + * length, are reported for each contig. + * + * @param bytes (optional) The size in bytes of the loaded index to report. + */ +void ChainMap::log_index_size(size_t bytes) const { + size_t total_intervals = 0; + size_t total_bases = 0; + std::vector>> stats; + stats.reserve(interval_map.size()); + for (const auto &kv : interval_map) { + size_t n = kv.second.size(); + size_t bases = 0; + for (const auto &intvl : kv.second) { + bases += static_cast(intvl.source_end) - + static_cast(intvl.source_start); + } + total_intervals += n; + total_bases += bases; + stats.push_back({kv.first, {n, bases}}); + } + size_t avg_len = total_intervals ? total_bases / total_intervals : 0; + std::cerr << "[I::chain::log_index_size] contigs=" << interval_map.size() + << " intervals=" << total_intervals << " bases=" << total_bases + << " avg_len=" << avg_len; + if (bytes) std::cerr << " bytes=" << bytes << "\n"; + for (const auto &s : stats) { + size_t avg = s.second.first ? s.second.second / s.second.first : 0; + std::cerr << "[I::chain::log_index_size] " << s.first + << ": intervals=" << s.second.first + << " bases=" << s.second.second << " avg_len=" << avg << "\n"; + } +} + /* Get the rank in the start bitvector at contig[pos] * If pos > len(contig): return rank(contig[-1]) * diff --git a/src/chain.hpp b/src/chain.hpp index cc23e0a..f338463 100644 --- a/src/chain.hpp +++ b/src/chain.hpp @@ -63,6 +63,7 @@ class ChainMap { void debug_print_interval_map(); void debug_print_intervals(std::string contig, const int n); bool interval_map_sanity_check(); + void log_index_size(size_t bytes = 0) const; int get_start_rank(std::string contig, int pos); int get_end_rank(std::string contig, int pos); bool update_interval_indexes(const std::string contig, const int32_t pos, diff --git a/src/leviosam.cpp b/src/leviosam.cpp index abe74ac..48c9002 100644 --- a/src/leviosam.cpp +++ b/src/leviosam.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include "aln.hpp" #include "collate.hpp" @@ -121,7 +122,8 @@ void serialize_run(lift_opts args) { chain::ChainMap cfp(args.chain_fname, args.verbose, args.allowed_cigar_changes, args.length_map); std::ofstream o(fn_index, std::ios::binary); - cfp.serialize(o); + size_t bytes = cfp.serialize(o); + cfp.log_index_size(bytes); std::cerr << "[I::serialize_run] levioSAM ChainMap saved to " << fn_index << "\n"; } else { @@ -299,14 +301,17 @@ std::map load_fasta(std::string ref_name) { } void lift_run(lift_opts args) { + size_t chain_bytes = 0; chain::ChainMap chain_map = [&] { if (args.chainmap_fname != "") { - std::cerr << "[I::lift_run] Loading levioSAM index..."; + std::cerr << "[I::lift_run] Loading levioSAM 2 index..."; std::ifstream in(args.chainmap_fname, std::ios::binary); + std::ifstream fs(args.chainmap_fname, std::ios::binary | std::ios::ate); + chain_bytes = fs.tellg(); return chain::ChainMap(in, args.verbose, args.allowed_cigar_changes); } else if (args.chain_fname != "") { - std::cerr << "[I::lift_run] Building levioSAM index..."; + std::cerr << "[I::lift_run] Building levioSAM 2 index..."; if (args.length_map.size() == 0) { std::cerr << "[E::lift_run] No length map is found. Please " "set -F properly.\n"; @@ -349,6 +354,8 @@ void lift_run(lift_opts args) { exit(1); } std::cerr << "done\n"; + if (args.chainmap_fname != "" || args.chain_fname != "") + chain_map.log_index_size(chain_bytes); samFile *sam_fp = (args.sam_fname == "") ? sam_open("-", "r")