Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 82 additions & 9 deletions src/chain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,14 @@
#include "leviosam_utils.hpp"

namespace chain {
/**
* @brief Default constructor for Interval.
*
* Creates an empty interval with default values:
* - Empty target string
* - Zero offset and coordinates
* - Positive strand (true)
*/
Interval::Interval() {
target = "";
offset = 0;
Expand All @@ -27,6 +35,18 @@ Interval::Interval() {
strand = true;
}

/**
* @brief Parameterized constructor for Interval.
*
* Creates an interval with specified alignment coordinates and properties.
* An interval represents a gapless alignment segment between source and target references.
*
* @param t Target reference/contig name
* @param so Source start position (0-based)
* @param se Source end position (0-based, exclusive)
* @param o Offset to convert source positions to target positions
* @param ss Strand orientation (true = forward/+, false = reverse/-)
*/
Interval::Interval(std::string t, int32_t so, int32_t se, int32_t o, bool ss) {
target = t;
offset = o;
Expand All @@ -35,6 +55,14 @@ Interval::Interval(std::string t, int32_t so, int32_t se, int32_t o, bool ss) {
strand = ss;
}

/**
* @brief Constructs an Interval by loading data from a binary input stream.
*
* This constructor loads a previously serialized Interval from a binary stream.
* The data must have been written using the serialize() method.
*
* @param in Input stream containing the serialized Interval data
*/
Interval::Interval(std::ifstream &in) { load(in); }

void Interval::debug_print_interval() {
Expand Down Expand Up @@ -84,11 +112,39 @@ void Interval::load(std::istream &in) {
strand = load_strand;
}

/**
* @brief Constructs a ChainMap by loading a pre-built index from an input stream.
*
* This constructor loads a previously serialized ChainMap index from a binary stream.
* The index must have been created using the serialize() method.
*
* @param in Input stream containing the serialized ChainMap data
* @param verbose Verbosity level (0=quiet, higher=more verbose)
* @param allowed_intvl_gaps Maximum allowed gaps between intervals for lift-over
*/
ChainMap::ChainMap(std::ifstream &in, int verbose, int allowed_intvl_gaps)
: verbose(verbose), allowed_intvl_gaps(allowed_intvl_gaps) {
load(in);
}

/**
* @brief Constructs a ChainMap by parsing a chain file and building the index.
*
* This constructor reads a chain file (typically from UCSC's liftOver tool) and
* builds an efficient index for performing lift-over operations. The chain file
* contains pairwise alignments between source and target reference sequences.
*
* The constructor:
* - Parses chain file format and extracts alignment intervals
* - Builds bit vectors for efficient interval queries
* - Sorts intervals and validates for overlaps
* - Exits with error if interval sanity check fails
*
* @param fname Path to the chain file to parse
* @param verbose Verbosity level (0=quiet, higher=more verbose)
* @param allowed_intvl_gaps Maximum allowed gaps between intervals for lift-over
* @param lm Length map containing chromosome/contig lengths for validation
*/
ChainMap::ChainMap(std::string fname, int verbose, int allowed_intvl_gaps,
LengthMap &lm)
: verbose(verbose), allowed_intvl_gaps(allowed_intvl_gaps), length_map(lm) {
Expand Down Expand Up @@ -132,8 +188,10 @@ ChainMap::ChainMap(std::string fname, int verbose, int allowed_intvl_gaps,
if (verbose > 2) {
debug_print_interval_map();
}
// TEMP
interval_map_sanity_check();
if (!validate_intervals()) {
std::cerr << "[E::chain::build] Interval validation failed\n";
exit(1);
}
}

/* Create start and end bitvectors when see a new `source`.
Expand Down Expand Up @@ -187,26 +245,41 @@ void ChainMap::debug_print_intervals(std::string contig, const int n) {
std::cerr << "\n";
}

/* Check if the interval map contains any overlaps in the source reference
* Logic: for each interval, its ending position <= next starting position
/**
* @brief Adds an interval to the specified contig for testing purposes.
*
* This method allows adding intervals to the interval map, primarily intended
* for unit testing scenarios where controlled interval data is needed.
*
* @param contig The contig/chromosome name to add the interval to
* @param interval The interval to add
*/
void ChainMap::add_interval(const std::string& contig, const Interval& interval) {
interval_map[contig].push_back(interval);
}

/**
* @brief Validates that intervals in the interval map do not overlap in the source reference.
*
* For each interval in each contig, this function ensures that the end position
* of the current interval is less than or equal to the start position of the next interval.
*
* Return true if pass; false otherwise.
* @return true if validation passes (no overlaps found), false otherwise.
*/
bool ChainMap::interval_map_sanity_check() {
bool ChainMap::validate_intervals() {
for (auto &itr : interval_map) {
std::vector<chain::Interval> v = interval_map[itr.first];
for (int i = 0; i < v.size() - 1; i++) {
if (v[i].source_end > v[i + 1].source_start) {
std::cerr << "[E::chain::interval_map_sanity_check] "
std::cerr << "[E::chain::validate_intervals] "
<< itr.first << "\n";
v[i].debug_print_interval();
v[i + 1].debug_print_interval();
return false;
}
}
}
std::cerr << "[I::chain::interval_map_sanity_check] Interval_map sanity "
"check: passed (no overlaps)\n";
std::cerr << "[I::chain::validate_intervals] Interval validation: passed (no overlaps)\n";
return true;
}

Expand Down
4 changes: 3 additions & 1 deletion src/chain.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ class ChainMap {
void sort_intervals(std::string contig);
void debug_print_interval_map();
void debug_print_intervals(std::string contig, const int n);
bool interval_map_sanity_check();
bool validate_intervals();
void log_index_size(size_t bytes = 0) const;
int get_start_rank(std::string contig, int pos);
int get_end_rank(std::string contig, int pos);
Expand Down Expand Up @@ -115,6 +115,8 @@ class ChainMap {
size_t serialize(std::ofstream &out);
void load(std::ifstream &in);
std::vector<std::pair<std::string, int32_t>> length_map;

void add_interval(const std::string& contig, const Interval& interval);

private:
void init_rs();
Expand Down
103 changes: 103 additions & 0 deletions src/chain_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,109 @@ TEST(ChainTest, UpdateIntervalIndexes) {
EXPECT_EQ(eidx, 784); // TODO: why not 785
}

TEST(ChainTest, IntervalMapSanityCheckEmpty) {
chain::ChainMap cmap;
// Empty interval map should pass sanity check
EXPECT_EQ(cmap.validate_intervals(), true);
}

TEST(ChainTest, IntervalMapSanityCheckSingleInterval) {
chain::ChainMap cmap;
// Single interval should pass sanity check
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
EXPECT_EQ(cmap.validate_intervals(), true);
}

TEST(ChainTest, IntervalMapSanityCheckNoOverlaps) {
chain::ChainMap cmap;
// Add non-overlapping intervals
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
cmap.add_interval("chr1", chain::Interval("chr1_dest", 250, 350, 50, true));
cmap.add_interval("chr1", chain::Interval("chr1_dest", 400, 500, 50, true));

EXPECT_EQ(cmap.validate_intervals(), true);
}

TEST(ChainTest, IntervalMapSanityCheckWithOverlaps) {
chain::ChainMap cmap;
// Add overlapping intervals - first interval ends at 200, second starts at 150
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
cmap.add_interval("chr1", chain::Interval("chr1_dest", 150, 250, 50, true));

EXPECT_EQ(cmap.validate_intervals(), false);
}

TEST(ChainTest, IntervalMapSanityCheckAdjacentIntervals) {
chain::ChainMap cmap;
// Add adjacent intervals (no gap, no overlap) - should pass
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
cmap.add_interval("chr1", chain::Interval("chr1_dest", 200, 300, 50, true));

EXPECT_EQ(cmap.validate_intervals(), true);
}

TEST(ChainTest, IntervalMapSanityCheckMultipleContigs) {
chain::ChainMap cmap;
// Add intervals to multiple contigs - some with overlaps, some without
// chr1: no overlaps (should pass)
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
cmap.add_interval("chr1", chain::Interval("chr1_dest", 250, 350, 50, true));

// chr2: has overlaps (should fail)
cmap.add_interval("chr2", chain::Interval("chr2_dest", 100, 200, 50, true));
cmap.add_interval("chr2", chain::Interval("chr2_dest", 150, 250, 50, true));

// Should fail because chr2 has overlaps
EXPECT_EQ(cmap.validate_intervals(), false);
}

TEST(ChainTest, IntervalMapSanityCheckAllContigsValid) {
chain::ChainMap cmap;
// Add intervals to multiple contigs - all valid (no overlaps)
// chr1: no overlaps
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
cmap.add_interval("chr1", chain::Interval("chr1_dest", 250, 350, 50, true));

// chr2: no overlaps
cmap.add_interval("chr2", chain::Interval("chr2_dest", 100, 200, 50, true));
cmap.add_interval("chr2", chain::Interval("chr2_dest", 300, 400, 50, true));

// chr3: single interval
cmap.add_interval("chr3", chain::Interval("chr3_dest", 500, 600, 50, true));

// Should pass because all contigs have valid intervals
EXPECT_EQ(cmap.validate_intervals(), true);
}

TEST(ChainTest, IntervalMapSanityCheckExactOverlap) {
chain::ChainMap cmap;
// Add intervals with exact overlap (same start/end positions)
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 200, 50, true));

EXPECT_EQ(cmap.validate_intervals(), false);
}

TEST(ChainTest, IntervalMapSanityCheckPartialOverlap) {
chain::ChainMap cmap;
// Add intervals with partial overlap
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 300, 50, true));
cmap.add_interval("chr1", chain::Interval("chr1_dest", 250, 400, 50, true));

EXPECT_EQ(cmap.validate_intervals(), false);
}

TEST(ChainTest, IntervalMapSanityCheckDestOverlap) {
chain::ChainMap cmap;
// The dest intervals overlap, but the source intervals do not
// Dest intervals: chr1_dest: [100, 300), chr1_dest: [100, 200)
// Source intervals: chr1: [100, 300), chr1: [300, 400)
cmap.add_interval("chr1", chain::Interval("chr1_dest", 100, 300, 0, true));
cmap.add_interval("chr1", chain::Interval("chr1_dest", 300, 400, -200, true));

EXPECT_EQ(cmap.validate_intervals(), true);
}

int main(int argc, char **argv) {
testing::InitGoogleTest(&argc, argv);
return RUN_ALL_TESTS();
Expand Down
Loading