diff --git a/CMakeLists.txt b/CMakeLists.txt index 723b3c5..e5968b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -174,55 +174,58 @@ target_link_libraries(leviosam ${SDSL_LIB}) target_link_libraries(leviosam pthread) target_link_libraries(leviosam z) +option(BUILD_TESTS "Set OFF to not compile the tests" ON) + +if(${BUILD_TESTS}) + # Download and unpack googletest at configure time + configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) + execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) + if(result) + message(FATAL_ERROR "CMake step for googletest failed: ${result}") + endif() + execute_process(COMMAND ${CMAKE_COMMAND} --build . + RESULT_VARIABLE result + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) + if(result) + message(FATAL_ERROR "Build step for googletest failed: ${result}") + endif() -# Download and unpack googletest at configure time -configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt) -execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) -if(result) - message(FATAL_ERROR "CMake step for googletest failed: ${result}") -endif() -execute_process(COMMAND ${CMAKE_COMMAND} --build . - RESULT_VARIABLE result - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download ) -if(result) - message(FATAL_ERROR "Build step for googletest failed: ${result}") -endif() - -# Prevent overriding the parent project's compiler/linker -# settings on Windows -set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) + # Prevent overriding the parent project's compiler/linker + # settings on Windows + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) -enable_testing() + enable_testing() -# Add googletest directly to our build. This defines -# the gtest and gtest_main targets. -add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src - ${CMAKE_CURRENT_BINARY_DIR}/googletest-build - EXCLUDE_FROM_ALL) + # Add googletest directly to our build. This defines + # the gtest and gtest_main targets. + add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src + ${CMAKE_CURRENT_BINARY_DIR}/googletest-build + EXCLUDE_FROM_ALL) -add_executable(leviosam_test src/leviosam_test.cpp) -target_link_libraries(leviosam_test lvsam) -target_link_libraries(leviosam_test ${HTS_LIB}) -target_link_libraries(leviosam_test ${SDSL_LIB}) -target_link_libraries(leviosam_test gtest gtest_main) + add_executable(leviosam_test src/leviosam_test.cpp) + target_link_libraries(leviosam_test lvsam) + target_link_libraries(leviosam_test ${HTS_LIB}) + target_link_libraries(leviosam_test ${SDSL_LIB}) + target_link_libraries(leviosam_test gtest gtest_main) -add_test(NAME leviosam_test COMMAND leviosam_test - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata -) + add_test(NAME leviosam_test COMMAND leviosam_test + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata + ) -add_executable(chain_test src/chain_test.cpp) -target_link_libraries(chain_test lvsam) -target_link_libraries(chain_test ${HTS_LIB}) -target_link_libraries(chain_test ${SDSL_LIB}) -target_link_libraries(chain_test gtest gtest_main) + add_executable(chain_test src/chain_test.cpp) + target_link_libraries(chain_test lvsam) + target_link_libraries(chain_test ${HTS_LIB}) + target_link_libraries(chain_test ${SDSL_LIB}) + target_link_libraries(chain_test gtest gtest_main) -add_test(NAME chain_test COMMAND chain_test - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata -) + add_test(NAME chain_test COMMAND chain_test + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/testdata + ) +endif() INSTALL(TARGETS leviosam DESTINATION bin) INSTALL(FILES src/leviosam.hpp DESTINATION include) diff --git a/src/leviosam.hpp b/src/leviosam.hpp index 077e6b3..f1d3bdb 100644 --- a/src/leviosam.hpp +++ b/src/leviosam.hpp @@ -103,7 +103,7 @@ namespace lift { // Serialization class Name2IdxMap: public std::unordered_map { public: - size_t serialize(std::ofstream& out) { + size_t serialize(std::ostream& out) { size_t size = 0; size_t map_size = this->size(); out.write(reinterpret_cast(&map_size), sizeof(map_size)); @@ -117,7 +117,7 @@ class Name2IdxMap: public std::unordered_map { return size; } - void load(std::ifstream& in) { + void load(std::istream& in) { size_t map_size; in.read(reinterpret_cast(&map_size), sizeof(map_size)); for (auto i = 0; i < map_size; ++i) { @@ -137,7 +137,7 @@ class Name2IdxMap: public std::unordered_map { class Name2NameMap : public std::unordered_map { public: - size_t serialize(std::ofstream& out) { + size_t serialize(std::ostream& out) { size_t size = 0; size_t map_size = this->size(); out.write(reinterpret_cast(&map_size), sizeof(map_size)); @@ -154,7 +154,7 @@ class Name2NameMap : public std::unordered_map { return size; } - void load(std::ifstream& in) { + void load(std::istream& in) { size_t map_size; in.read(reinterpret_cast(&map_size), sizeof(map_size)); for (auto i = 0; i < map_size; ++i) { @@ -173,6 +173,85 @@ class Name2NameMap : public std::unordered_map { } }; +// Builder class for lift construction +class Lift_builder +{ + friend class Lift; + +protected: + sdsl::bit_vector m_ibv; + sdsl::bit_vector m_dbv; + sdsl::bit_vector m_sbv; + size_t m_size = 0; // Size of the contig + size_t m_x = 0; + int m_ppos = 0; + int m_prev_is_ins = 0; // 1 if last variant is an insertion +public: + Lift_builder() + { + + } + + Lift_builder(const size_t l) + { + init(l); + } + + inline void init(const size_t l) + { + m_ibv = sdsl::bit_vector(l*2); + m_dbv = sdsl::bit_vector(l*2); + m_sbv = sdsl::bit_vector(l*2); + m_size = l; + m_x = 0; + m_ppos = 0; + m_prev_is_ins = 0; + } + + inline int prev_is_ins() const {return m_prev_is_ins;} + inline int contig_size() const {return m_size;} + + inline void set( + const size_t rpos, // Position in the reference + const int var_type, // Type of the variant + const int rlen, // Length of the reference allele variant + const int alen // Length of the alternate allele variant + ) + { + m_x += (rpos - m_ppos); // x should now be pointing to *start* of current variant wrt alignment + m_ppos = rpos; // m_ppos now pointing to *start* of current variant + if (var_type == VCF_INDEL) { + ++m_ppos; + ++m_x; + m_prev_is_ins = 0; + if (rlen < alen) { // ins + for (size_t i = 0; i < alen - rlen; ++i) { + m_ibv[m_x++] = 1; + } + m_prev_is_ins = 1; + } else if (rlen > alen) { // del + for (size_t i = 0; i < rlen - alen; ++i) { + m_dbv[m_x++] = 1; + ++m_ppos; // s1 advances, so m_ppos advances + } + } + --m_x; + --m_ppos; + } else if (var_type == VCF_SNP) { + m_sbv[m_x] = 1; // no need to increment x here? + } + } + + inline void finalize() + { + if (m_ppos > m_size) { + die( "something went wrong, we went past bounds of s1 sequence. exiting...\n"); + } + m_ibv.resize(m_x + (m_size - m_ppos)); + m_dbv.resize(m_x + (m_size - m_ppos)); + m_sbv.resize(m_x + (m_size - m_ppos)); + } +}; // liftover data structure for a single sequence class Lift { @@ -183,7 +262,7 @@ class Lift { init_rs_sls(); } - Lift(std::ifstream& in) { + Lift(std::istream& in) { this->load(in); } @@ -196,6 +275,16 @@ class Lift { init_rs_sls(); } + // construct liftover from builder + Lift(Lift_builder& builder) : + ins(builder.m_ibv), + del(builder.m_dbv), + snp(builder.m_sbv) + { + // create rank and select data-structures for everything + init_rs_sls(); + } + Lift(const Lift& rhs) : ins(rhs.ins), del(rhs.del), snp(rhs.snp) { init_rs_sls(); } @@ -375,7 +464,7 @@ class Lift { // } // Save to stream - size_t serialize(std::ofstream& out) const { + size_t serialize(std::ostream& out) const { size_t size = 0; size += ins.serialize(out); size += del.serialize(out); @@ -420,7 +509,7 @@ class LiftMap { ~LiftMap() {} - LiftMap(std::ifstream& in) { + LiftMap(std::istream& in) { this->load(in); } @@ -467,15 +556,13 @@ class LiftMap { int lmap_idx = 0; // current index of lmap - sdsl::bit_vector ibv, dbv, sbv; // ins, del, snp + Lift_builder builder; bool no_sample = (sample_name == ""); if (no_sample) fprintf(stderr, "no sample given, assuming GT=1 for all variants\n"); if (!no_sample && bcf_hdr_set_samples(hdr, sample_name.c_str(), 0)) die("error: sample does not exist!\n"); bcf1_t* rec = bcf_init(); - size_t x = 0; int rid = -1; - int ppos = 0; int tppos = 0; size_t l = 0; int hap = stoi(haplotype); @@ -484,13 +571,8 @@ class LiftMap { if (rec->rid != rid) { if (rid != -1) { // save the bitvector // condense the bit vectors and add to map - if (ppos > l) { - die( "something went wrong, we went past bounds of s1 sequence. exiting...\n"); - } - ibv.resize(x + (l - ppos)); - dbv.resize(x + (l - ppos)); - sbv.resize(x + (l - ppos)); - lmap.push_back(Lift(ibv,dbv,sbv)); + builder.finalize(); + lmap.push_back(Lift(builder)); } rid = rec->rid; if (get_names) { @@ -512,11 +594,7 @@ class LiftMap { } // initialize bit vectors for this contig // TODO: maybe set vec size to something less than 2l - ibv = sdsl::bit_vector(l*2); - dbv = sdsl::bit_vector(l*2); - sbv = sdsl::bit_vector(l*2); - x = 0; - ppos = 0; + builder.init(l); tppos = 0; // end of last variant processed } int32_t* gt_arr = NULL; @@ -555,40 +633,15 @@ class LiftMap { } // at this point, ppos and x should point to *end* of last variant in s1 & s2 alignment, resp. - x += (rec->pos - ppos); // x should now be pointing to *start* of current variant wrt alignment - ppos = rec->pos; // ppos now pointing to *start* of current variant - if (var_type == VCF_INDEL) { - ++ppos; - ++x; - prev_is_ins = 0; - if (rlen < alen) { // ins - for (size_t i = 0; i < alen - rlen; ++i) { - ibv[x++] = 1; - } - prev_is_ins = 1; - } else if (rlen > alen) { // del - for (size_t i = 0; i < rlen - alen; ++i) { - dbv[x++] = 1; - ++ppos; // s1 advances, so ppos advances - } - } - --x; - --ppos; - } else if (var_type == VCF_SNP) { - sbv[x] = 1; // no need to increment x here? - } + builder.set(rec->pos,var_type,rlen,alen); + prev_is_ins = builder.prev_is_ins(); tppos = rec->pos + rec->rlen - 1; // tppos points to end of this variant } else { continue; } } - if (ppos > l) { - die("something went wrong, we went past bounds of s1 sequence. exiting...\n"); - } - ibv.resize(x + (l - ppos)); - dbv.resize(x + (l - ppos)); - sbv.resize(x + (l - ppos)); - lmap.push_back(Lift(ibv,dbv,sbv)); + builder.finalize(); + lmap.push_back(Lift(builder)); length_map = ls; } @@ -695,7 +748,7 @@ class LiftMap { } // saves to stream - size_t serialize(std::ofstream& out) { + size_t serialize(std::ostream& out) { size_t size = 0; size_t nelems = lmap.size(); out.write(reinterpret_cast(&nelems), sizeof(nelems)); @@ -711,7 +764,7 @@ class LiftMap { } // loads from stream - void load(std::ifstream& in) { + void load(std::istream& in) { size_t nelems; in.read(reinterpret_cast(&nelems), sizeof(nelems)); for (auto i = 0; i < nelems; ++i) { diff --git a/src/leviosam_utils.cpp b/src/leviosam_utils.cpp index 9e188b5..3079088 100644 --- a/src/leviosam_utils.cpp +++ b/src/leviosam_utils.cpp @@ -219,13 +219,6 @@ void update_cigar( memmove(aln->data + cigar_st + n_cigar4, aln->data + cigar_st + fake_bytes, orig_len - (cigar_st + fake_bytes)); - // If new n_cigar is greater, copy the real CIGAR to the right place. - // Skipped this if new n_cigar is smaller than the original value. - if (n_cigar4 > fake_bytes){ - memcpy(aln->data + cigar_st, - aln->data + (n_cigar4 - fake_bytes) + 8, - n_cigar4); - } aln->core.n_cigar = new_cigar.size(); } for (int i = 0; i < aln->core.n_cigar; i++){ @@ -500,7 +493,7 @@ sam_hdr_t* fai_to_hdr(std::string fai_fn, const sam_hdr_t* const hdr_orig) { // Serialize a `vector>` object size_t serialize_lengthmap( - std::ofstream& out, std::vector> length_map + std::ostream& out, std::vector> length_map ) { size_t size = 0; size_t nelems = length_map.size(); @@ -518,7 +511,7 @@ size_t serialize_lengthmap( // Load a serialized `vector>` object -std::vector> load_lengthmap(std::ifstream& in) { +std::vector> load_lengthmap(std::istream& in) { size_t map_size; in.read(reinterpret_cast(&map_size), sizeof(map_size)); std::vector> length_map; diff --git a/src/leviosam_utils.hpp b/src/leviosam_utils.hpp index 555a2ef..4de497e 100644 --- a/src/leviosam_utils.hpp +++ b/src/leviosam_utils.hpp @@ -129,9 +129,9 @@ sam_hdr_t* lengthmap_to_hdr( std::vector> lm, const sam_hdr_t* const hdr_orig); -std::vector> load_lengthmap(std::ifstream& in); +std::vector> load_lengthmap(std::istream& in); size_t serialize_lengthmap( - std::ofstream& out, + std::ostream& out, std::vector> length_map); std::string make_cmd(int argc, char** argv);