diff --git a/src/chain.cpp b/src/chain.cpp index a65e27c..2542c67 100644 --- a/src/chain.cpp +++ b/src/chain.cpp @@ -19,6 +19,14 @@ #include "leviosam_utils.hpp" namespace chain { +/** + * @brief Default constructor for Interval. + * + * Creates an empty interval with default values: + * - Empty target string + * - Zero offset and coordinates + * - Positive strand (true) + */ Interval::Interval() { target = ""; offset = 0; @@ -27,6 +35,18 @@ Interval::Interval() { strand = true; } +/** + * @brief Parameterized constructor for Interval. + * + * Creates an interval with specified alignment coordinates and properties. + * An interval represents a gapless alignment segment between source and target references. + * + * @param t Target reference/contig name + * @param so Source start position (0-based) + * @param se Source end position (0-based, exclusive) + * @param o Offset to convert source positions to target positions + * @param ss Strand orientation (true = forward/+, false = reverse/-) + */ Interval::Interval(std::string t, int32_t so, int32_t se, int32_t o, bool ss) { target = t; offset = o; @@ -35,6 +55,14 @@ Interval::Interval(std::string t, int32_t so, int32_t se, int32_t o, bool ss) { strand = ss; } +/** + * @brief Constructs an Interval by loading data from a binary input stream. + * + * This constructor loads a previously serialized Interval from a binary stream. + * The data must have been written using the serialize() method. + * + * @param in Input stream containing the serialized Interval data + */ Interval::Interval(std::ifstream &in) { load(in); } void Interval::debug_print_interval() { @@ -84,11 +112,39 @@ void Interval::load(std::istream &in) { strand = load_strand; } +/** + * @brief Constructs a ChainMap by loading a pre-built index from an input stream. + * + * This constructor loads a previously serialized ChainMap index from a binary stream. + * The index must have been created using the serialize() method. + * + * @param in Input stream containing the serialized ChainMap data + * @param verbose Verbosity level (0=quiet, higher=more verbose) + * @param allowed_intvl_gaps Maximum allowed gaps between intervals for lift-over + */ ChainMap::ChainMap(std::ifstream &in, int verbose, int allowed_intvl_gaps) : verbose(verbose), allowed_intvl_gaps(allowed_intvl_gaps) { load(in); } +/** + * @brief Constructs a ChainMap by parsing a chain file and building the index. + * + * This constructor reads a chain file (typically from UCSC's liftOver tool) and + * builds an efficient index for performing lift-over operations. The chain file + * contains pairwise alignments between source and target reference sequences. + * + * The constructor: + * - Parses chain file format and extracts alignment intervals + * - Builds bit vectors for efficient interval queries + * - Sorts intervals and validates for overlaps + * - Exits with error if interval sanity check fails + * + * @param fname Path to the chain file to parse + * @param verbose Verbosity level (0=quiet, higher=more verbose) + * @param allowed_intvl_gaps Maximum allowed gaps between intervals for lift-over + * @param lm Length map containing chromosome/contig lengths for validation + */ ChainMap::ChainMap(std::string fname, int verbose, int allowed_intvl_gaps, LengthMap &lm) : verbose(verbose), allowed_intvl_gaps(allowed_intvl_gaps), length_map(lm) { @@ -132,8 +188,10 @@ ChainMap::ChainMap(std::string fname, int verbose, int allowed_intvl_gaps, if (verbose > 2) { debug_print_interval_map(); } - // TEMP - interval_map_sanity_check(); + if (!validate_intervals()) { + std::cerr << "[E::chain::build] Interval validation failed\n"; + exit(1); + } } /* Create start and end bitvectors when see a new `source`. @@ -187,17 +245,33 @@ void ChainMap::debug_print_intervals(std::string contig, const int n) { std::cerr << "\n"; } -/* Check if the interval map contains any overlaps in the source reference - * Logic: for each interval, its ending position <= next starting position +/** + * @brief Adds an interval to the specified contig for testing purposes. + * + * This method allows adding intervals to the interval map, primarily intended + * for unit testing scenarios where controlled interval data is needed. + * + * @param contig The contig/chromosome name to add the interval to + * @param interval The interval to add + */ +void ChainMap::add_interval(const std::string& contig, const Interval& interval) { + interval_map[contig].push_back(interval); +} + +/** + * @brief Validates that intervals in the interval map do not overlap in the source reference. + * + * For each interval in each contig, this function ensures that the end position + * of the current interval is less than or equal to the start position of the next interval. * - * Return true if pass; false otherwise. + * @return true if validation passes (no overlaps found), false otherwise. */ -bool ChainMap::interval_map_sanity_check() { +bool ChainMap::validate_intervals() { for (auto &itr : interval_map) { std::vector v = interval_map[itr.first]; for (int i = 0; i < v.size() - 1; i++) { if (v[i].source_end > v[i + 1].source_start) { - std::cerr << "[E::chain::interval_map_sanity_check] " + std::cerr << "[E::chain::validate_intervals] " << itr.first << "\n"; v[i].debug_print_interval(); v[i + 1].debug_print_interval(); @@ -205,8 +279,7 @@ bool ChainMap::interval_map_sanity_check() { } } } - std::cerr << "[I::chain::interval_map_sanity_check] Interval_map sanity " - "check: passed (no overlaps)\n"; + std::cerr << "[I::chain::validate_intervals] Interval validation: passed (no overlaps)\n"; return true; } diff --git a/src/chain.hpp b/src/chain.hpp index f338463..c46e9f0 100644 --- a/src/chain.hpp +++ b/src/chain.hpp @@ -62,7 +62,7 @@ class ChainMap { void sort_intervals(std::string contig); void debug_print_interval_map(); void debug_print_intervals(std::string contig, const int n); - bool interval_map_sanity_check(); + bool validate_intervals(); void log_index_size(size_t bytes = 0) const; int get_start_rank(std::string contig, int pos); int get_end_rank(std::string contig, int pos); @@ -115,6 +115,8 @@ class ChainMap { size_t serialize(std::ofstream &out); void load(std::ifstream &in); std::vector> length_map; + + void add_interval(const std::string& contig, const Interval& interval); private: void init_rs(); diff --git a/src/chain_test.cpp b/src/chain_test.cpp index 3652eb9..525bcea 100644 --- a/src/chain_test.cpp +++ b/src/chain_test.cpp @@ -539,6 +539,109 @@ TEST(ChainTest, UpdateIntervalIndexes) { EXPECT_EQ(eidx, 784); // TODO: why not 785 } +TEST(ChainTest, IntervalMapSanityCheckEmpty) { + chain::ChainMap cmap; + // Empty interval map should pass sanity check + EXPECT_EQ(cmap.validate_intervals(), true); +} + +TEST(ChainTest, IntervalMapSanityCheckSingleInterval) { + chain::ChainMap cmap; + // Single interval should pass sanity check + cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true)); + EXPECT_EQ(cmap.validate_intervals(), true); +} + +TEST(ChainTest, IntervalMapSanityCheckNoOverlaps) { + chain::ChainMap cmap; + // Add non-overlapping intervals + cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true)); + cmap.add_interval("chr1", chain::Interval("chr1_dest", 250, 350, 50, true)); + cmap.add_interval("chr1", chain::Interval("chr1_dest", 400, 500, 50, true)); + + EXPECT_EQ(cmap.validate_intervals(), true); +} + +TEST(ChainTest, IntervalMapSanityCheckWithOverlaps) { + chain::ChainMap cmap; + // Add overlapping intervals - first interval ends at 200, second starts at 150 + cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true)); + cmap.add_interval("chr1", chain::Interval("chr1_dest", 150, 250, 50, true)); + + EXPECT_EQ(cmap.validate_intervals(), false); +} + +TEST(ChainTest, IntervalMapSanityCheckAdjacentIntervals) { + chain::ChainMap cmap; + // Add adjacent intervals (no gap, no overlap) - should pass + cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true)); + cmap.add_interval("chr1", chain::Interval("chr1_dest", 200, 300, 50, true)); + + EXPECT_EQ(cmap.validate_intervals(), true); +} + +TEST(ChainTest, IntervalMapSanityCheckMultipleContigs) { + chain::ChainMap cmap; + // Add intervals to multiple contigs - some with overlaps, some without + // chr1: no overlaps (should pass) + cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true)); + cmap.add_interval("chr1", chain::Interval("chr1_dest", 250, 350, 50, true)); + + // chr2: has overlaps (should fail) + cmap.add_interval("chr2", chain::Interval("chr2_dest", 100, 200, 50, true)); + cmap.add_interval("chr2", chain::Interval("chr2_dest", 150, 250, 50, true)); + + // Should fail because chr2 has overlaps + EXPECT_EQ(cmap.validate_intervals(), false); +} + +TEST(ChainTest, IntervalMapSanityCheckAllContigsValid) { + chain::ChainMap cmap; + // Add intervals to multiple contigs - all valid (no overlaps) + // chr1: no overlaps + cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true)); + cmap.add_interval("chr1", chain::Interval("chr1_dest", 250, 350, 50, true)); + + // chr2: no overlaps + cmap.add_interval("chr2", chain::Interval("chr2_dest", 100, 200, 50, true)); + cmap.add_interval("chr2", chain::Interval("chr2_dest", 300, 400, 50, true)); + + // chr3: single interval + cmap.add_interval("chr3", chain::Interval("chr3_dest", 500, 600, 50, true)); + + // Should pass because all contigs have valid intervals + EXPECT_EQ(cmap.validate_intervals(), true); +} + +TEST(ChainTest, IntervalMapSanityCheckExactOverlap) { + chain::ChainMap cmap; + // Add intervals with exact overlap (same start/end positions) + cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true)); + cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true)); + + EXPECT_EQ(cmap.validate_intervals(), false); +} + +TEST(ChainTest, IntervalMapSanityCheckPartialOverlap) { + chain::ChainMap cmap; + // Add intervals with partial overlap + cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 300, 50, true)); + cmap.add_interval("chr1", chain::Interval("chr1_dest", 250, 400, 50, true)); + + EXPECT_EQ(cmap.validate_intervals(), false); +} + +TEST(ChainTest, IntervalMapSanityCheckDestOverlap) { + chain::ChainMap cmap; + // The dest intervals overlap, but the source intervals do not + // Dest intervals: chr1_dest: [100, 300), chr1_dest: [100, 200) + // Source intervals: chr1: [100, 300), chr1: [300, 400) + cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 300, 0, true)); + cmap.add_interval("chr1", chain::Interval("chr1_dest", 300, 400, -200, true)); + + EXPECT_EQ(cmap.validate_intervals(), true); +} + int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS();