diff --git a/.clang-tidy b/.clang-tidy index e0fb1a1..5c5547b 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -11,8 +11,8 @@ ImplementationFileExtensions: - cc - cpp - cxx -HeaderFilterRegex: '' -ExcludeHeaderFilterRegex: '' +HeaderFilterRegex: '.' +ExcludeHeaderFilterRegex: 'json.hpp|bamxx.hpp|popcnt.hpp|OptionParser.hpp|config.h|.*smithlab_cpp.*' FormatStyle: none CheckOptions: cert-dcl16-c.NewSuffixes: 'L;LL;LU;LLU' diff --git a/src/AbismalAlign.hpp b/src/AbismalAlign.hpp index 213c290..c870cdc 100644 --- a/src/AbismalAlign.hpp +++ b/src/AbismalAlign.hpp @@ -4,112 +4,106 @@ * * This file is part of ABISMAL. * - * ABISMAL is free software: you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. + * ABISMAL is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. * - * ABISMAL is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. + * ABISMAL is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. */ -#ifndef ABISMAL_ALIGN_HPP -#define ABISMAL_ALIGN_HPP +#ifndef SRC_ABISMAL_ALIGN_HPP_ +#define SRC_ABISMAL_ALIGN_HPP_ #include #include #include #include +#include #include #include #include "abismal_cigar_utils.hpp" -#include "dna_four_bit.hpp" +#include "dna_four_bit_bisulfite.hpp" -// AbismalAlign class has a templated function for the comparison -// operation between letters in a sequence. This function currently -// returns a boolean value, so only computes longest common -// subsequence. -typedef int16_t score_t; +// AbismalAlign class has a templated function for the comparison operation +// between letters in a sequence. This function currently returns a boolean +// value, so only computes longest common subsequence. +typedef std::int16_t score_t; typedef genome_four_bit_itr genome_iterator; typedef std::vector bam_cigar_t; -static inline score_t -count_deletions(const bam_cigar_t &cigar) { - score_t ans = 0; - // cppcheck-suppress-begin useStlAlgorithm - for (const auto &x : cigar) - if (abismal_bam_cigar_op(x) == ABISMAL_BAM_CDEL) - ans += abismal_bam_cigar_oplen(x); - // cppcheck-suppress-end useStlAlgorithm - return ans; +template +[[nodiscard]] static inline score_t +count_total_ops(const bam_cigar_t &cigar) { + static constexpr auto acc = [&](const score_t t, const auto x) { + return t + (abismal_bam_cigar_op(x) == op ? abismal_bam_cigar_oplen(x) : 0); + }; + return std::accumulate(std::cbegin(cigar), std::cend(cigar), 0, acc); } -static inline score_t -count_insertions(const bam_cigar_t &cigar) { - score_t ans = 0; - // cppcheck-suppress-begin useStlAlgorithm - for (const auto &x : cigar) - if (abismal_bam_cigar_op(x) == ABISMAL_BAM_CINS) - ans += abismal_bam_cigar_oplen(x); - // cppcheck-suppress-end useStlAlgorithm - return ans; -} - -// A specific namespace for simple match/mismatch scoring system and a -// 1 -1 -1 scoring scheme for edit distance. +// A specific namespace for simple match/mismatch scoring system and a 2 -3 -4 +// scoring scheme for edit distance. namespace simple_aln { -static const score_t match = 2; -static const score_t mismatch = -3; -static const score_t indel = -4; -static const std::array score_lookup = {match, mismatch}; +static constexpr score_t match = 2; +static constexpr score_t mismatch = -3; +static constexpr score_t indel = -4; +static constexpr auto score_lookup = std::array{ + match, + mismatch, +}; -inline score_t +[[nodiscard]] constexpr score_t default_score(const std::uint32_t len, const score_t diffs) { - return match * (len - diffs) + mismatch * diffs; + return static_cast(match * static_cast(len - diffs) + + mismatch * static_cast(diffs)); } -inline score_t +[[nodiscard]] inline score_t mismatch_score(const std::uint8_t q_base, const std::uint8_t t_base) { + // NOLINTNEXTLINE(*-constant-array-index) return score_lookup[(q_base & t_base) == 0]; } // edit distance as a function of aln_score and len -inline score_t +[[nodiscard]] inline score_t edit_distance(const score_t scr, const std::uint32_t len, const bam_cigar_t &cigar) { if (scr == 0) - return len; - const score_t ins = count_insertions(cigar); - const score_t del = count_deletions(cigar); + return static_cast(len); + const score_t ins = count_total_ops(cigar); + const score_t del = count_total_ops(cigar); - // A = S - (indel_pen) = match*M + mismatch*m + // A = S - (indel_penalty) = match*M + mismatch*m // B = len - ins = M + m // m = (match*(len - ins) - A)/(match - mismatch) - const score_t A = scr - indel * (ins + del); - const score_t mism = (match * (len - ins) - A) / (match - mismatch); + const score_t A = static_cast(scr - indel * (ins + del)); + const score_t mism = + static_cast((match * (len - ins) - A) / (match - mismatch)); - return mism + ins + del; + return static_cast(mism + ins + del); } -inline score_t +[[nodiscard]] inline score_t best_single_score(const std::uint32_t readlen) { - return match * readlen; + return static_cast(match * readlen); } -inline score_t +[[nodiscard]] inline score_t best_pair_score(const std::uint32_t readlen1, const std::uint32_t readlen2) { - return best_single_score(readlen1) + best_single_score(readlen2); + return static_cast(best_single_score(readlen1) + + best_single_score(readlen2)); } } // namespace simple_aln -template +template struct AbismalAlign { explicit AbismalAlign(const genome_iterator &target_start) : - target(target_start), bw(2 * max_off_diag + 1) {} + target{target_start}, bw{2 * max_off_diag + 1} {} template score_t @@ -125,27 +119,26 @@ struct AbismalAlign { reset(const std::uint32_t max_read_length); std::vector table; - std::vector traceback; - const genome_iterator target; - const std::size_t bw{}; + std::vector traceback; + const genome_iterator target; // NOLINT(*-avoid-const-or-ref-data-members) + const std::size_t bw; // NOLINT(*-avoid-const-or-ref-data-members) - // these are kept because they are needed in both - // align and build_cigar + // these are kept because they are needed in both align and build_cigar std::uint16_t q_sz_max{}; std::uint16_t q_sz{}; - static const std::size_t max_off_diag = 30; + static constexpr std::size_t max_off_diag{30}; }; -template +template void -AbismalAlign::reset( +AbismalAlign::reset( const std::uint32_t max_read_length) { // uses cigar q_sz_max = max_read_length; - // size of alignment matrix and traceback matrix is maximum query - // length times the width of the band around the diagonal + // size of alignment matrix and traceback matrix is maximum query length + // times the width of the band around the diagonal const std::size_t n_cells = (q_sz_max + bw) * bw; table.resize(n_cells); traceback.resize(n_cells, -1); // ADS: -1 no meaning for traceback @@ -157,22 +150,22 @@ static const std::uint8_t above_symbol = ABISMAL_BAM_CDEL; // D static const std::uint8_t diag_symbol = ABISMAL_BAM_CMATCH; // M static const std::uint8_t soft_clip_symbol = ABISMAL_BAM_CSOFT_CLIP; // S -static inline bool // consumes reference +[[nodiscard]] static inline bool // consumes reference is_deletion(const std::uint8_t c) { return c == above_symbol; } -static inline bool // does not consume reference +[[nodiscard]] static inline bool // does not consume reference is_insertion(const std::uint8_t c) { return c == left_symbol; } static inline void get_traceback(const std::size_t n_col, const std::vector &table, - const std::vector &traceback, + const std::vector &traceback, std::vector &cigar, std::size_t &the_row, std::size_t &the_col) { - int8_t prev_arrow = traceback[the_row * n_col + the_col]; + std::int8_t prev_arrow = traceback[the_row * n_col + the_col]; bool is_del = is_deletion(prev_arrow); bool is_ins = is_insertion(prev_arrow); the_row -= !is_ins; @@ -180,7 +173,7 @@ get_traceback(const std::size_t n_col, const std::vector &table, the_col += is_del; // ADS: straight up IS diagonal! std::uint32_t n = 1; while (table[the_row * n_col + the_col] > 0) { - const int8_t arrow = traceback[the_row * n_col + the_col]; + const std::int8_t arrow = traceback[the_row * n_col + the_col]; is_del = is_deletion(arrow); is_ins = is_insertion(arrow); the_row -= !is_ins; @@ -208,65 +201,74 @@ min16(const T a, const T b) { return ((a < b) ? a : b); } -static score_t +[[nodiscard]] static inline score_t get_best_score(const std::vector &table, const std::size_t n_cells, const std::size_t n_col, std::size_t &best_i, std::size_t &best_j) { - auto best_cell_itr = + const auto best_cell_itr = + // NOLINTNEXTLINE(*-narrowing-conversions) std::max_element(std::begin(table), std::begin(table) + n_cells); - const std::size_t best_cell = std::distance(std::begin(table), best_cell_itr); + const std::size_t best_cell = + std::distance(std::cbegin(table), best_cell_itr); best_i = best_cell / n_col; best_j = best_cell % n_col; return *best_cell_itr; } -// ADS: it seems like with some versions of GCC, the optimization from -// `-O3`, which seems to include loop vectorizations, breaks the -// function below and the other overload named `from_diag` -// below. Using the __attribute__ helps with GCC, and it should be -// ignored if not supported. +[[nodiscard]] static inline score_t +get_best_score(const std::vector &table, const std::size_t n_cells) { + // NOLINTNEXTLINE(*-narrowing-conversions) + return *std::max_element(std::cbegin(table), std::cbegin(table) + n_cells); +} -template +// ADS: it seems like with some versions of GCC, the optimization from `-O3`, +// which seems to include loop vectorizations, breaks the function below and +// the other overload named `from_diag` below. Using the __attribute__ helps +// with GCC, and it should be ignored if not supported. + +template +// NOLINTNEXTLINE(*-unknown-attributes) __attribute__((optimize("no-tree-loop-vectorize"))) void from_diag(T next_row, const T next_row_end, T cur_row, QueryConstItr query_seq, std::uint8_t ref_base) { while (next_row != next_row_end) { - const score_t score = scr_fun(*query_seq++, ref_base) + *cur_row++; + const score_t score = score_function(*query_seq++, ref_base) + *cur_row++; max16(*next_row++, score); } } -template +template void from_above(T above_itr, const T above_end, T target) { while (above_itr != above_end) { - const score_t score = *above_itr++ + indel_pen; + const score_t score = *above_itr++ + indel_penalty; max16(*target++, score); } } -// ADS: from_left is the same function as from_above, but uses -// different order on arguments, so rewritten to be more intuitive. -template +// ADS: from_left is the same function as from_above, but uses different order +// on arguments, so rewritten to be more intuitive. +template void from_left(T left_itr, T target, const T target_end) { while (target != target_end) { - const score_t score = *left_itr++ + indel_pen; + const score_t score = *left_itr++ + indel_penalty; max16(*target++, score); } } /********* SAME FUNCTIONS AS ABOVE BUT WITH TRACEBACK ********/ -template +template +// NOLINTNEXTLINE(*-unknown-attributes) __attribute__((optimize("no-tree-loop-vectorize"))) void from_diag(T next_row, const T next_row_end, T cur_row, QueryConstItr query_seq, std::uint8_t ref_base, U traceback) { while (next_row != next_row_end) { - const score_t score = scr_fun(*query_seq, ref_base) + *cur_row; + const score_t score = score_function(*query_seq, ref_base) + *cur_row; max16(*next_row, score); - *traceback = (*next_row == score) ? (diag_symbol) : (*traceback); + *traceback = (*next_row == score) ? diag_symbol : *traceback; ++traceback; ++next_row; ++query_seq; @@ -274,26 +276,26 @@ from_diag(T next_row, const T next_row_end, T cur_row, QueryConstItr query_seq, } } -template +template void from_above(T above_itr, const T above_end, T target, U traceback) { while (above_itr != above_end) { - const score_t score = *above_itr + indel_pen; + const score_t score = *above_itr + indel_penalty; max16(*target, score); - *traceback = (*target == score) ? (above_symbol) : (*traceback); + *traceback = (*target == score) ? above_symbol : *traceback; ++traceback; ++above_itr; ++target; } } -template +template void from_left(T left_itr, T target, const T target_end, U traceback) { while (target != target_end) { - const score_t score = *left_itr + indel_pen; + const score_t score = *left_itr + indel_penalty; max16(*target, score); - *traceback = (*target == score) ? (left_symbol) : (*traceback); + *traceback = (*target == score) ? left_symbol : *traceback; ++traceback; ++left_itr; ++target; @@ -311,14 +313,13 @@ make_default_cigar(const std::uint32_t len, bam_cigar_t &cigar) { cigar = {(len << ABISMAL_BAM_CIGAR_SHIFT)}; } -template +template template score_t -AbismalAlign::align(const score_t diffs, - const score_t max_diffs, - const std::vector &qseq, - const std::uint32_t t_pos) { +AbismalAlign::align( + const score_t diffs, const score_t max_diffs, + const std::vector &qseq, const std::uint32_t t_pos) { q_sz = qseq.size(); // edge case: diffs = 0 so alignment is "trivial" if (diffs == 0) @@ -329,12 +330,12 @@ AbismalAlign::align(const score_t diffs, min16(bw, static_cast(2 * min16(diffs, max_diffs) + 1)); const std::size_t n_cells = (q_sz + bandwidth) * bandwidth; - std::fill(std::begin(table), std::begin(table) + n_cells, 0); + std::fill_n(std::begin(table), n_cells, 0); if (do_traceback) - std::fill(std::begin(traceback), std::begin(traceback) + n_cells, -1); + std::fill_n(std::begin(traceback), n_cells, -1); - // GS: non-negative because of padding. The mapper - // must ensure t_pos is large enough when calling the function + // GS: non-negative because of padding. The mapper must ensure t_pos is + // large enough when calling the function const std::size_t t_beg = t_pos - ((bandwidth - 1) / 2); const std::size_t t_shift = q_sz + bandwidth; @@ -347,6 +348,10 @@ AbismalAlign::align(const score_t diffs, auto prev(std::begin(table)); auto cur(prev); + const auto sub_noneg = [](const auto i, const auto bw) { + return i > bw ? i - bw : 0; + }; + for (std::size_t i = 1; i < t_shift; ++i) { const std::size_t left = (i < bandwidth ? bandwidth - i : 0); const std::size_t right = min16(bandwidth, t_shift - i); @@ -354,33 +359,33 @@ AbismalAlign::align(const score_t diffs, cur += bandwidth; // next row in aln matrix if (do_traceback) { tb_cur += bandwidth; // next row in traceback - from_diag(cur + left, cur + right, prev + left, - q_itr + (i > bandwidth ? i - bandwidth : 0), *t_itr, - tb_cur + left); - from_above(prev + left + 1, prev + right, cur + left, - tb_cur + left); - from_left(cur + left, cur + left + 1, cur + right, - tb_cur + left + 1); + from_diag(cur + left, cur + right, prev + left, + q_itr + sub_noneg(i, bandwidth), *t_itr, + tb_cur + left); + from_above(prev + left + 1, prev + right, cur + left, + tb_cur + left); + from_left(cur + left, cur + left + 1, cur + right, + tb_cur + left + 1); } else { - from_diag(cur + left, cur + right, prev + left, - q_itr + (i > bandwidth ? i - bandwidth : 0), *t_itr); - from_above(prev + left + 1, prev + right, cur + left); - from_left(cur + left, cur + left + 1, cur + right); + from_diag(cur + left, cur + right, prev + left, + q_itr + sub_noneg(i, bandwidth), *t_itr); + from_above(prev + left + 1, prev + right, cur + left); + from_left(cur + left, cur + left + 1, cur + right); } ++t_itr; prev += bandwidth; // update previous row } // locate the end of the alignment as max score - std::size_t the_row = 0, the_col = 0; - return get_best_score(table, n_cells, bandwidth, the_row, the_col); + return get_best_score(table, n_cells); } -template +template void -AbismalAlign::build_cigar_len_and_pos( // uses cigar +AbismalAlign::build_cigar_len_and_pos( // uses cigar const score_t diffs, const score_t max_diffs, bam_cigar_t &cigar, std::uint32_t &len, std::uint32_t &t_pos) { // locate the end of the alignment as max score @@ -390,8 +395,8 @@ AbismalAlign::build_cigar_len_and_pos( // uses cigar std::size_t the_row = 0, the_col = 0; const score_t r = get_best_score(table, n_cells, bandwidth, the_row, the_col); - // GS: unlikely, but possible, case where the score = 0, which - // degenerates CIGAR string below + // GS: unlikely, but possible, case where the score = 0, which degenerates + // CIGAR string below if (r == 0 || diffs == 0) { make_default_cigar(q_sz, cigar); len = q_sz; @@ -430,4 +435,4 @@ AbismalAlign::build_cigar_len_and_pos( // uses cigar t_pos = t_beg + the_row; } -#endif +#endif // SRC_ABISMAL_ALIGN_HPP_ diff --git a/src/AbismalIndex.cpp b/src/AbismalIndex.cpp index b8f2a4a..e4b6b04 100644 --- a/src/AbismalIndex.cpp +++ b/src/AbismalIndex.cpp @@ -16,7 +16,7 @@ */ #include "AbismalIndex.hpp" -#include "dna_four_bit.hpp" +#include "dna_four_bit_bisulfite.hpp" #include "bamxx.hpp" @@ -34,18 +34,15 @@ #include #include -// NOLINTBEGIN(*-unix.Stream,*cert-err33-c,*-avoid-const-or-ref-data-members,*-avoid-magic-numbers,*-narrowing-conversions,*-owning-memory,*-constant-array-index,*-pro-type-reinterpret-cast) +// NOLINTBEGIN(*-unix.Stream,*-avoid-magic-numbers,*-narrowing-conversions) using abismal_clock = std::chrono::steady_clock; using std::chrono::time_point; - -template using num_lim = std::numeric_limits; +using genome_iterator = genome_four_bit_itr; bool AbismalIndex::VERBOSE = false; std::size_t AbismalIndex::n_threads_global = 1; -using genome_iterator = genome_four_bit_itr; - static std::string delta_seconds(const time_point &a) { const auto b = abismal_clock::now(); @@ -90,7 +87,8 @@ load_target_regions(const std::string &target_filename) std::vector targets{}; std::string line; std::string chrom; - std::size_t start_pos = 0, end_pos = 0; + std::size_t start_pos{}; + std::size_t end_pos{}; while (getline(in, line)) { std::istringstream iss(line); if (!(iss >> chrom)) @@ -120,7 +118,7 @@ mask_non_target( } } -auto +[[nodiscard]] static inline auto contiguous_n(const std::vector &genome) -> std::vector> { std::vector> r{}; @@ -143,18 +141,18 @@ contiguous_n(const std::vector &genome) } template -static inline auto +[[nodiscard]] static inline auto get_exclude_itrs( const G &genome, const std::vector> &exclude) -> std::vector> { - std::vector> exclude_itrs; + typedef std::pair gi_pair; const auto g_beg = genome_iterator(std::cbegin(genome)); - // cppcheck-suppress-begin useStlAlgorithm + std::vector exclude_itrs; exclude_itrs.reserve(std::size(exclude)); - for (const auto &i : exclude) - exclude_itrs.emplace_back(g_beg + i.first, g_beg + i.second); - // cppcheck-suppress-end useStlAlgorithm + std::transform( + std::cbegin(exclude), std::cend(exclude), std::back_inserter(exclude_itrs), + [&](const auto &e) { return gi_pair{g_beg + e.first, g_beg + e.second}; }); return exclude_itrs; } @@ -171,30 +169,28 @@ replace_included_n( } } -static inline std::size_t +[[nodiscard]] static inline std::size_t get_compressed_size(const std::size_t original_size) { const std::size_t nt_per_word = 2 * sizeof(original_size); return original_size / nt_per_word + (original_size % nt_per_word != 0); } -static auto +[[nodiscard]] static auto sort_by_chrom(const std::vector &names, const std::vector &u) -> std::vector { - // This function will exclude any target regions that are not - // corresponding to chromosomes in the given reference genome. + // This function will exclude any target regions that are not corresponding + // to chromosomes in the given reference genome. std::vector t(std::size(u)); auto t_itr = std::begin(t); for (std::size_t i = 0; i < std::size(names); ++i) { const auto u_itr = - find_if(std::cbegin(u), std::cend(u), - [&](const g_interval &x) { return x.chrom == names[i]; }); + std::find_if(std::cbegin(u), std::cend(u), + [&](const g_interval &x) { return x.chrom == names[i]; }); const auto v_itr = std::find_if(u_itr, std::cend(u), [&](const g_interval &x) { return x.chrom != names[i]; }); - if (!std::is_sorted(u_itr, v_itr)) throw std::runtime_error("target regions not sorted"); - std::copy(u_itr, v_itr, t_itr); t_itr += std::distance(u_itr, v_itr); } @@ -344,13 +340,13 @@ AbismalIndex::get_bucket_sizes_two() { while (gi != gi_lim) shift_hash_key(*gi++, hash_key); - auto itl_itr = std::cbegin(is_two_let); + auto itl_itr = std::cbegin(is_two_letter); auto keep_itr = std::cbegin(keep); auto nidx_itr = std::cbegin(exclude); // general loop to count positions for corresponding buckets const auto lim = cl.get_genome_size() - seed::key_weight + 1; - // const auto itl_lim = std::cbegin(is_two_let) + lim; + // const auto itl_lim = std::cbegin(is_two_letter) + lim; for (std::size_t i = 0; i < lim; ++i) { shift_hash_key(*gi++, hash_key); assert(nidx_itr != std::cend(exclude)); @@ -378,15 +374,15 @@ AbismalIndex::get_bucket_sizes_three() { else counter_a.resize(counter_size_three + 1, 0); - std::uint32_t hash_key = 0; auto gi = genome_iterator(std::cbegin(genome)); + std::uint32_t hash_key{}; // builds progressively // start building up the hash key const auto gi_lim = gi + (seed::key_weight_three - 1); while (gi != gi_lim) shift_three_key(*gi++, hash_key); - auto itl_itr = std::cbegin(is_two_let); + auto itl_itr = std::cbegin(is_two_letter); auto keep_itr = std::cbegin(keep); auto nidx_itr = std::cbegin(exclude); @@ -408,12 +404,12 @@ AbismalIndex::get_bucket_sizes_three() { } } -static inline std::uint32_t +[[nodiscard]] static inline std::uint32_t two_letter_cost(const std::uint32_t count) { return count; } -static inline std::uint64_t +[[nodiscard]] static inline std::uint64_t three_letter_cost(const std::uint32_t count_t, const std::uint32_t count_a) { return (count_t + count_a) >> 1; } @@ -475,8 +471,8 @@ AbismalIndex::select_two_letter_positions() { std::clog << "[selecting two-letter positions]"; // choose which have lower count under two-letters - is_two_let.resize(cl.get_genome_size(), false); - const auto itl_beg = std::begin(is_two_let); + is_two_letter.resize(cl.get_genome_size(), false); + const auto itl_beg = std::begin(is_two_letter); const auto g_beg = genome_iterator(std::cbegin(genome)); // ADS: this works below because seed::key_weight is always at least @@ -584,7 +580,7 @@ AbismalIndex::hash_genome() { for (std::size_t i = 0; i < lim; ++i) { shift_hash_key(*gi_two++, hash_two); assert(nidx_itr != std::cend(exclude)); - if (i < nidx_itr->first && keep[i] && is_two_let[i]) { + if (i < nidx_itr->first && keep[i] && is_two_letter[i]) { assert(counter[hash_two] > 0); index[--counter[hash_two]] = i; } @@ -603,7 +599,7 @@ AbismalIndex::hash_genome() { for (std::size_t i = 0; i < lim; ++i) { shift_three_key(*gi_three++, hash_t); assert(nidx_itr != std::cend(exclude)); - if (i < nidx_itr->first && keep[i] && !is_two_let[i]) { + if (i < nidx_itr->first && keep[i] && !is_two_letter[i]) { assert(counter_t[hash_t] > 0); index_t[--counter_t[hash_t]] = i; } @@ -622,7 +618,7 @@ AbismalIndex::hash_genome() { for (std::size_t i = 0; i < lim; ++i) { shift_three_key(*gi_three++, hash_a); assert(nidx_itr != std::cend(exclude)); - if (i < nidx_itr->first && keep[i] && !is_two_let[i]) { + if (i < nidx_itr->first && keep[i] && !is_two_letter[i]) { assert(counter_a[hash_a] > 0); index_a[--counter_a[hash_a]] = i; } @@ -640,7 +636,8 @@ AbismalIndex::hash_genome() { } struct dp_sol { - static const std::uint64_t sentinel = num_lim::max(); + static const std::uint64_t sentinel = + std::numeric_limits::max(); std::uint64_t cost{sentinel}; std::uint64_t prev{sentinel}; }; @@ -651,10 +648,8 @@ struct dp_sol { // or full. We need to ensure that the size of this deque ring buffer // is at least (window_size + 2). template struct fixed_ring_buffer { - constexpr static std::size_t qsz = - 32ul; // must be >= (seed::window_size + 2) - constexpr static std::size_t qsz_msk = - 31ul; // mask to keep positions in range + constexpr static auto qsz{32ul}; // must be >= (seed::window_size + 2) + constexpr static auto qsz_msk{31ul}; // mask to keep positions in range std::array h; @@ -670,12 +665,12 @@ template struct fixed_ring_buffer { auto back() -> T & { - return h[(b - 1) & qsz_msk]; + return h[(b - 1) & qsz_msk]; // NOLINT(*-constant-array-index) } auto front() -> T & { - return h[f]; + return h[f]; // NOLINT(*-constant-array-index) } auto @@ -692,7 +687,7 @@ template struct fixed_ring_buffer { auto push_back(T &&x) -> void { - h[b] = std::move(x); + h[b] = std::move(x); // NOLINT(*-constant-array-index) ++b; b &= qsz_msk; } @@ -717,10 +712,10 @@ add_sol(deque &helper, const std::uint64_t prev, const std::uint64_t cost) { } static inline std::uint64_t -hybrid_cost(const bool is_two_let, const std::uint32_t count_two, +hybrid_cost(const bool is_two_letter, const std::uint32_t count_two, const std::uint32_t count_t, const std::uint32_t count_a) { - return is_two_let ? two_letter_cost(count_two) - : three_letter_cost(count_t, count_a); + return is_two_letter ? two_letter_cost(count_two) + : three_letter_cost(count_t, count_a); } void @@ -781,7 +776,7 @@ AbismalIndex::compress_dp() { auto opt_itr = std::begin(opt); const auto opt_lim = std::cbegin(opt) + current_block_size; - auto itl = std::cbegin(is_two_let) + block_start; + auto itl = std::cbegin(is_two_letter) + block_start; // do the base cases for the dynamic programming: the solutions // for the first "window size" positions @@ -792,9 +787,9 @@ AbismalIndex::compress_dp() { shift_three_key(gi3_val, hash_t); shift_three_key(gi3_val, hash_a); - assert(hash_two < counter.size()); - assert(hash_t < counter_t.size()); - assert(hash_a < counter_a.size()); + assert(hash_two < std::size(counter)); + assert(hash_t < std::size(counter_t)); + assert(hash_a < std::size(counter_a)); const auto c = hybrid_cost(*itl++, counter[hash_two], counter_t[hash_t], counter_a[hash_a]); @@ -810,13 +805,13 @@ AbismalIndex::compress_dp() { shift_three_key(gi3_val, hash_t); shift_three_key(gi3_val, hash_a); - assert(hash_two < counter.size()); - assert(hash_t < counter_t.size()); - assert(hash_a < counter_a.size()); + assert(hash_two < std::size(counter)); + assert(hash_t < std::size(counter_t)); + assert(hash_a < std::size(counter_a)); const auto c = hybrid_cost(*itl++, counter[hash_two], counter_t[hash_t], counter_a[hash_a]); - assert(helper.size() > 0); + assert(std::size(helper) > 0); assert(opt_itr != std::end(opt)); *opt_itr = helper.front(); opt_itr->cost += c; @@ -824,8 +819,8 @@ AbismalIndex::compress_dp() { } // get solution for start of traceback - std::uint64_t opt_ans = num_lim::max(); - std::uint64_t last = num_lim::max(); + std::uint64_t opt_ans = std::numeric_limits::max(); + std::uint64_t last = std::numeric_limits::max(); for (i = current_block_size - 1; i >= current_block_size - seed::window_size; --i) { const auto cand_cost = opt[i].cost; @@ -871,15 +866,15 @@ struct BucketLess { return false; } - const genome_iterator g_start; + const genome_iterator g_start; // NOLINT(*-avoid-const-or-ref-data-members) }; template static inline three_letter_t -three_letter_num_srt(const std::uint8_t nt) { +three_letter_num_srt(const std::uint8_t x) { // C=T=0, A=1, G=4 // A=G=0, C=2, T=8 - return the_conv == c_to_t ? nt & 5 : nt & 10; + return the_conv == c_to_t ? x & 5 : x & 10; // NOLINT(*-avoid-magic-numbers) } template struct BucketLessThree { @@ -899,7 +894,7 @@ template struct BucketLessThree { return false; } - const genome_iterator g_start; + const genome_iterator g_start; // NOLINT(*-avoid-const-or-ref-data-members) }; void @@ -1037,7 +1032,8 @@ AbismalIndex::write(const std::string &index_file) const { const auto fwrite_var = [&](FILE *out, const auto &x) { return std::fwrite(&x, sizeof(x), 1, out) != 1; }; - FILE *out = fopen(index_file.data(), "wb"); + + FILE *out = std::fopen(index_file.data(), "wb"); // NOLINT(*-owning-memory) if (!out) throw std::runtime_error("cannot open output file " + index_file); @@ -1064,7 +1060,7 @@ AbismalIndex::write(const std::string &index_file) const { out) != index_size_three) throw std::runtime_error("failed writing index"); - if (fclose(out) != 0) + if (std::fclose(out) != 0) // NOLINT(*-owning-memory) throw std::runtime_error("problem closing file: " + index_file); } @@ -1083,7 +1079,7 @@ AbismalIndex::read(const std::string &index_file) { }; static const std::string error_msg("failed loading index file"); - FILE *in = fopen(index_file.data(), "rb"); + FILE *in = std::fopen(index_file.data(), "rb"); // NOLINT(*-owning-memory) if (!in) throw std::runtime_error("cannot open input file " + index_file); @@ -1138,13 +1134,14 @@ AbismalIndex::read(const std::string &index_file) { index_size_three) throw std::runtime_error(error_msg); - if (fclose(in) != 0) + if (std::fclose(in) != 0) // NOLINT(*-owning-memory) throw std::runtime_error("problem closing file: " + index_file); } std::ostream & ChromLookup::write(std::ostream &out) const { const auto write_var = [&](const auto &x) { + // NOLINTNEXTLINE(*-reinterpret-cast) out.write(reinterpret_cast(&x), sizeof(x)); }; const std::uint32_t n_chroms = std::size(names); @@ -1154,6 +1151,7 @@ ChromLookup::write(std::ostream &out) const { write_var(name_size); out.write(names[i].data(), name_size); } + // NOLINTNEXTLINE(*-reinterpret-cast) out.write(reinterpret_cast(starts.data()), sizeof(std::uint32_t) * (n_chroms + 1)); return out; @@ -1169,9 +1167,13 @@ ChromLookup::write(FILE *out) const { for (std::size_t i = 0; i < n_chroms; ++i) { const std::uint32_t name_size = std::size(names[i]); fwrite_var(out, name_size); - std::fwrite(names[i].data(), 1, name_size, out); + if (std::fwrite(names[i].data(), 1, name_size, out) != name_size) + throw std::runtime_error("failure writing index file"); } - std::fwrite(starts.data(), sizeof(std::uint32_t), n_chroms + 1, out); + const auto n_starts = n_chroms + 1; + if (std::fwrite(starts.data(), sizeof(std::uint32_t), n_starts, out) != + n_starts) + throw std::runtime_error("failure writing index file"); return out; } @@ -1186,30 +1188,29 @@ ChromLookup::write(const std::string &outfile) const { std::istream & ChromLookup::read(std::istream &in) { const auto read_var = [&](auto &x) { - in.read(reinterpret_cast(&x), sizeof(x)); + const auto n = sizeof(x); + in.read(reinterpret_cast(&x), n); // NOLINT(*-reinterpret-cast) + }; + const auto read_chunk = [&](auto x, const auto n) { + in.read(reinterpret_cast(x), n); // NOLINT(*-reinterpret-cast) }; - // read the number of chroms std::uint32_t n_chroms{}; - read_var(n_chroms); + read_var(n_chroms); // read the number of chroms - // allocate the number of chroms - names.resize(n_chroms); + names.resize(n_chroms); // allocate the number of chroms // get each chrom name for (std::size_t i = 0; i < n_chroms; ++i) { std::uint32_t name_size{}; - // get the size of the chrom name - read_var(name_size); - // allocate the chrom name - names[i].resize(name_size); - // read the chrom name - in.read(reinterpret_cast(names[i].data()), name_size); + read_var(name_size); // get the size of the chrom name + names[i].resize(name_size); // allocate the chrom name + read_chunk(names[i].data(), name_size); // read the chrom name } // allocate then read the starts vector - starts = std::vector(n_chroms + 1); - in.read(reinterpret_cast(starts.data()), - sizeof(std::uint32_t) * (n_chroms + 1)); + const auto n_sizes = n_chroms + 1; // one more size than chroms + starts.resize(n_sizes); + read_chunk(starts.data(), sizeof(std::uint32_t) * n_sizes); return in; } @@ -1231,20 +1232,19 @@ ChromLookup::read(FILE *in) { // get each chrom name for (std::size_t i = 0; i < n_chroms; ++i) { std::uint32_t name_size = 0; - // get size of chrom name - if (fread_var(in, name_size)) + if (fread_var(in, name_size)) // get size of chrom name throw std::runtime_error(error_msg); - // allocate the chrom name - names[i].resize(name_size); + names[i].resize(name_size); // allocate the chrom name // read the chrom name if (std::fread(names[i].data(), 1, name_size, in) != name_size) throw std::runtime_error(error_msg); } // allocate then read the starts vector - starts = std::vector(n_chroms + 1); - if (std::fread(starts.data(), sizeof(std::uint32_t), n_chroms + 1, in) != - n_chroms + 1) + const auto n_starts = n_chroms + 1; + starts = std::vector(n_starts); + if (std::fread(starts.data(), sizeof(std::uint32_t), n_starts, in) != + n_starts) throw std::runtime_error(error_msg); return in; @@ -1266,7 +1266,7 @@ operator<<(std::ostream &out, const ChromLookup &cl) { std::string ChromLookup::tostring() const { std::ostringstream iss; - for (auto i = 0u; i < names.size(); ++i) + for (auto i = 0u; i < std::size(names); ++i) iss << i << '\t' << names[i] << '\t' << starts[i] << '\t' << starts[i + 1] << '\n'; return iss.str(); @@ -1277,7 +1277,6 @@ ChromLookup::get_chrom_idx_and_offset(const std::uint32_t pos, std::int32_t &chrom_idx, std::uint32_t &offset) const { auto idx = upper_bound(std::cbegin(starts), std::cend(starts), pos); - assert(idx != std::cbegin(starts)); --idx; @@ -1290,9 +1289,9 @@ ChromLookup::get_chrom_idx_and_offset(const std::uint32_t pos, std::uint32_t ChromLookup::get_pos(const std::string &chrom, const std::uint32_t offset) const { - const auto itr = find(std::cbegin(names), std::cend(names), chrom); + const auto itr = std::find(std::cbegin(names), std::cend(names), chrom); return itr == std::cend(names) - ? num_lim::max() + ? std::numeric_limits::max() : starts[std::distance(std::cbegin(names), itr)] + offset; } @@ -1302,7 +1301,6 @@ ChromLookup::get_chrom_idx_and_offset(const std::uint32_t pos, std::int32_t &chrom_idx, std::uint32_t &offset) const { auto idx = upper_bound(std::cbegin(starts), std::cend(starts), pos); - if (idx == std::cbegin(starts)) return false; // read is before any chrom @@ -1339,19 +1337,19 @@ load_genome_impl(const std::string &genome_file, G &genome, ChromLookup &cl) { std::copy(std::cbegin(line), std::cend(line), g_ins); else { cl.names.push_back(line.substr(1, line.find_first_of(" \t") - 1)); - cl.starts.push_back(genome.size()); + cl.starts.push_back(std::size(genome)); } - if (cl.names.size() < 2) + if (std::size(cl.names) < 2) throw std::runtime_error("no names found in genome file"); // now pad the end of the concatenated sequence cl.names.push_back("pad_end"); - cl.starts.push_back(genome.size()); + cl.starts.push_back(std::size(genome)); std::fill_n(g_ins, seed::padding_size, 'N'); // this one additional "start" is the end of all chroms - cl.starts.push_back(genome.size()); + cl.starts.push_back(std::size(genome)); } void @@ -1366,4 +1364,4 @@ load_genome(const std::string &genome_file, std::vector &genome, load_genome_impl(genome_file, genome, cl); } -// NOLINTEND(*-unix.Stream,*cert-err33-c,*-avoid-const-or-ref-data-members,*-avoid-magic-numbers,*-narrowing-conversions,*-owning-memory,*-constant-array-index,*-pro-type-reinterpret-cast) +// NOLINTEND(*-unix.Stream,*-avoid-magic-numbers,*-narrowing-conversions) diff --git a/src/AbismalIndex.hpp b/src/AbismalIndex.hpp index 0e00d98..5efc7c4 100644 --- a/src/AbismalIndex.hpp +++ b/src/AbismalIndex.hpp @@ -15,13 +15,12 @@ * details. */ -#ifndef ABISMAL_INDEX_HPP -#define ABISMAL_INDEX_HPP +#ifndef SRC_ABISMAL_INDEX_HPP_ +#define SRC_ABISMAL_INDEX_HPP_ -#include "dna_four_bit.hpp" +#include "dna_four_bit_bisulfite.hpp" #include -#include #include #include #include @@ -35,60 +34,63 @@ using element_t = std::size_t; using Genome = std::vector; using two_letter_t = bool; -using three_letter_t = uint8_t; +using three_letter_t = std::uint8_t; struct random_base_generator { - // ADS: other RNG behaves differently across systems (e.g. macos vs - // ubuntu) and caused problems for comparing results when testing + // ADS: other RNG behaves differently across systems (e.g. macos vs ubuntu) + // and caused problems for comparing results when testing + static random_base_generator & - init() { + init() noexcept { static random_base_generator r; return r; } - random_base_generator(const random_base_generator &) = delete; - random_base_generator(random_base_generator &&) = delete; - char + constexpr char operator()() { constexpr auto m = 0x7fffffffu; // 2^31 constexpr auto a = 1103515245u; constexpr auto c = 12345u; x = (a * x + c) & m; - // low order bits empicially confirmed to suck - return "ACGT"[x & 3]; // do this: (x >> 15); + // low order bits empicially confirmed to suck (do this: (x >> 15);) + return "ACGT"[x & 3]; // NOLINT(*-constant-array-index) } private: - random_base_generator() {} + random_base_generator() noexcept {} std::uint32_t x{1}; }; +// NOLINTNEXTLINE(*-avoid-non-const-global-variables) static random_base_generator &random_base = random_base_generator::init(); namespace seed { // number of positions in the hashed portion of the seed -static const std::uint32_t key_weight = 25u; -static const std::uint32_t key_weight_three = 16u; +static constexpr std::uint32_t key_weight = 25u; +static constexpr std::uint32_t key_weight_three = 16u; -// window in which we select the best k-mer. The longer it is, -// the longer the minimum read length that guarantees an exact -// match will be mapped +// window in which we select the best k-mer. The longer it is, the longer the +// minimum read length that guarantees an exact match will be mapped #ifdef ENABLE_SHORT -static const std::uint32_t window_size = 12u; +static constexpr std::uint32_t window_size = 12u; #else -static const std::uint32_t window_size = 20u; +static constexpr std::uint32_t window_size = 20u; #endif // number of positions to sort within buckets -static const std::uint32_t n_sorting_positions = 256u; +static constexpr std::uint32_t n_sorting_positions = 256u; + +static constexpr std::size_t hash_mask = (1ull << seed::key_weight) - 1; -static const std::size_t hash_mask = (1ull << seed::key_weight) - 1; -static const std::size_t hash_mask_three = pow(3, key_weight_three); +[[nodiscard]] static constexpr std::size_t +ipow(const std::size_t b, const std::size_t e) { + return e == 0 ? 1 : (e & 1 ? b : 1) * ipow(b * b, e >> 1); +} +static constexpr std::size_t hash_mask_three = ipow(3, key_weight_three); -// the purpose of padding the left and right ends of the -// concatenated genome is so that later we can avoid having to check -// the (unlikely) case that a read maps partly off either end of the -// genome. -static const std::size_t padding_size = std::numeric_limits::max(); +// the purpose of padding the left and right ends of the concatenated genome +// is so that later we can avoid having to check the (unlikely) case that a +// read maps partly off either end of the genome. +static constexpr std::size_t padding_size = std::numeric_limits::max(); void read(FILE *in); @@ -103,6 +105,7 @@ struct ChromLookup { void get_chrom_idx_and_offset(const std::uint32_t pos, std::int32_t &chrom_idx, std::uint32_t &offset) const; + bool get_chrom_idx_and_offset(const std::uint32_t pos, const std::uint32_t readlen, std::int32_t &chrom_idx, @@ -110,6 +113,7 @@ struct ChromLookup { std::uint32_t get_pos(const std::string &chrom, const std::uint32_t offset) const; + std::uint32_t get_genome_size() const { return starts.back(); @@ -117,15 +121,19 @@ struct ChromLookup { FILE * read(FILE *in); + std::istream & read(std::istream &in); + void read(const std::string &infile); FILE * write(FILE *out) const; + std::ostream & write(std::ostream &out) const; + void write(const std::string &outfile) const; @@ -144,21 +152,21 @@ load_genome(const std::string &genome_file, std::string &genome, std::ostream & operator<<(std::ostream &out, const ChromLookup &cl); -enum three_conv_type { c_to_t, g_to_a }; +enum three_conv_type : std::uint8_t { + c_to_t, + g_to_a, +}; struct AbismalIndex { - static bool VERBOSE; - - static const std::uint32_t max_n_count = 256ul; static std::size_t n_threads_global; - std::uint32_t max_candidates{100u}; + std::uint32_t max_candidates{default_max_candidates}; std::size_t counter_size{}; // number of kmers indexed - std::size_t counter_size_three{}; // number of kmers indexed + std::size_t counter_size_three{}; // (3 letters) std::size_t index_size{}; // number of genome positions indexed - std::size_t index_size_three{}; // number of genome positions indexed + std::size_t index_size_three{}; // (3 letters) std::vector index; // genome positions for each k-mer std::vector index_t; // genome positions for each k-mer @@ -170,7 +178,7 @@ struct AbismalIndex { // a vector indicating whether each position goes into two- // or three-letter encoding - std::vector is_two_let; + std::vector is_two_letter; std::vector keep; std::vector> exclude{}; std::vector> @@ -187,6 +195,7 @@ struct AbismalIndex { void create_index(const std::string &genome_file); + void create_index(const std::string &target_filename, const std::string &genome_file); @@ -195,9 +204,11 @@ struct AbismalIndex { template void initialize_bucket_sizes(); + template void get_bucket_sizes_two(); + template void get_bucket_sizes_three(); @@ -233,31 +244,36 @@ struct AbismalIndex { static constexpr auto internal_identifier = "AbismalIndex"; static constexpr auto internal_identifier_size = 12; + static constexpr auto default_max_candidates{100u}; + static constexpr auto max_n_count = 256ul; + AbismalIndex() {} }; // A/T nucleotide to 1-bit value (0100 | 0001 = 5) is for A or G. -inline two_letter_t +constexpr two_letter_t get_bit(const std::uint8_t nt) { - return (nt & 5) == 0; + return (nt & 5) == 0; // NOLINT(*-avoid-magic-numbers) } template -inline three_letter_t +constexpr three_letter_t get_three_letter_num(const std::uint8_t nt) { // the_conv = c_to_t: C=T=0, A=1, G=2 // the_conv = g_to_a: A=G=0, C=1, T=2 + // NOLINTBEGIN(*-avoid-magic-numbers) return the_conv == c_to_t ? (((nt & 4) != 0) << 1) | ((nt & 1) != 0) : (((nt & 8) != 0) << 1) | ((nt & 2) != 0); + // NOLINTEND(*-avoid-magic-numbers) } -inline void +constexpr void shift_hash_key(const std::uint8_t c, std::uint32_t &hash_key) { hash_key = ((hash_key << 1) | get_bit(c)) & seed::hash_mask; } template -inline void +constexpr void shift_three_key(const std::uint8_t c, std::uint32_t &hash_key) { hash_key = (hash_key * 3 + get_three_letter_num(c)) % seed::hash_mask_three; @@ -287,4 +303,4 @@ get_base_3_hash(T r, std::uint32_t &k) { } } -#endif // ABISMAL_INDEX_HPP +#endif // SRC_ABISMAL_INDEX_HPP_ diff --git a/src/abismal.cpp b/src/abismal.cpp index 2f21e4e..98025f2 100644 --- a/src/abismal.cpp +++ b/src/abismal.cpp @@ -19,7 +19,6 @@ #include "AbismalAlign.hpp" #include "AbismalIndex.hpp" #include "common.hpp" -#include "dna_four_bit.hpp" #include "dna_four_bit_bisulfite.hpp" #include "popcnt.hpp" @@ -91,7 +90,7 @@ using AbismalAlignSimple = typedef std::uint16_t flags_t; // every bit is a flag typedef std::int16_t score_t; // aln score, edit distance, hamming distance -typedef std::vector Read; // 4-bit encoding of reads +typedef std::vector Read; // 4-bit encoding of reads typedef std::vector PackedRead; // 4-bit encoding of reads namespace abismal_concurrency { @@ -129,8 +128,8 @@ flip_conv(const conversion_type conv) { static constexpr flags_t get_strand_code(const char strand, const conversion_type conv) { - return (((strand == '-') ? samflags::read_rc : 0) | - ((conv == a_rich) ? bsflags::read_is_a_rich : 0)); + return (strand == '-' ? samflags::read_rc : 0) | + (conv == a_rich ? bsflags::read_is_a_rich : 0); } struct read_holder { @@ -204,12 +203,10 @@ struct ReadLoader { std::string filename; bamxx::bgzf_file in; - static const std::size_t batch_size; + static constexpr std::size_t batch_size{1000}; static const std::uint32_t min_read_length; }; -const std::size_t ReadLoader::batch_size = 1000; - // GS: minimum length which an exact match can be guaranteed to map const std::uint32_t ReadLoader::min_read_length = seed::key_weight + seed::window_size - 1; @@ -223,15 +220,21 @@ get_max_read_length(const std::vector &r) { return std::accumulate(std::cbegin(r), std::cend(r), 0ul, acc); } -struct se_element { // size = 8 +struct se_element { // size = 8 + static constexpr auto valid_frac_default = 0.1; + static double valid_frac; + // a liberal number of mismatches accepted to align a read downstream + static constexpr auto invalid_hit_frac{0.4}; + static constexpr score_t MAX_DIFFS{std::numeric_limits::max()}; + score_t diffs{}; // 2 bytes flags_t flags{}; // 2 bytes std::uint32_t pos{}; // 4 bytes - se_element() : diffs(MAX_DIFFS) {} + se_element() : diffs{MAX_DIFFS} {} - se_element(const score_t d, const flags_t f, const std::uint32_t p) : - diffs(d), flags(f), pos(p) {} + se_element(const score_t diffs, const flags_t flags, + const std::uint32_t pos) : diffs{diffs}, flags{flags}, pos{pos} {} bool operator==(const se_element &rhs) const { @@ -244,42 +247,42 @@ struct se_element { // size = 8 } // this is used to keep PE candidates sorted in the max heap - bool + [[nodiscard]] bool operator<(const se_element &rhs) const { return diffs < rhs.diffs; } - inline bool + [[nodiscard]] bool rc() const { return samflags::check(flags, samflags::read_rc); } - inline bool + [[nodiscard]] bool elem_is_a_rich() const { return samflags::check(flags, bsflags::read_is_a_rich); } - inline bool + [[nodiscard]] bool ambig() const { return samflags::check(flags, samflags::secondary_aln); } - inline void + void set_ambig() { samflags::set(flags, samflags::secondary_aln); } - inline bool + [[nodiscard]] bool empty() const { return pos == 0; } - inline bool + [[nodiscard]] bool sure_ambig() const { return ambig() && diffs == 0; } - inline void + void reset() { pos = 0; diffs = MAX_DIFFS; @@ -290,19 +293,9 @@ struct se_element { // size = 8 reset(); diffs = static_cast(invalid_hit_frac * readlen); } - - static double valid_frac; - static const double invalid_hit_frac; - static const score_t MAX_DIFFS; }; -static constexpr auto valid_frac_default = 0.1; -double se_element::valid_frac = valid_frac_default; -const score_t se_element::MAX_DIFFS = std::numeric_limits::max() - 1; - -// a liberal number of mismatches accepted to align a read downstream -static constexpr auto invalid_hit_frac_default = 0.4; -const double se_element::invalid_hit_frac = invalid_hit_frac_default; +double se_element::valid_frac = se_element::valid_frac_default; static inline score_t valid_diffs_cutoff(const std::uint32_t readlen, const double cutoff) { @@ -337,7 +330,7 @@ max16(const T x, const T y) { } struct se_candidates { - se_candidates() : sz{1}, best{}, v{std::vector(max_size)} {} + se_candidates() : sz{1}, v{std::vector(max_size)} {} inline bool full() const { @@ -434,10 +427,10 @@ struct se_candidates { // in SE reads, we sort to exclude duplicates void prepare_for_alignments() { - std::sort(std::begin(v), - std::begin(v) + sz, // no sort_heap here as heapify used "diffs" + // no sort_heap here as heapify used "diffs" + std::sort(std::begin(v), std::begin(v) + sz, [](const se_element &a, const se_element &b) { - return (a.pos < b.pos) || (a.pos == b.pos && a.flags < b.flags); + return a.pos < b.pos || (a.pos == b.pos && a.flags < b.flags); }); sz = std::distance(std::begin(v), std::unique(std::begin(v), std::begin(v) + sz)); @@ -448,13 +441,11 @@ struct se_candidates { score_t cutoff{}; std::uint32_t sz{}; se_element best{}; - std::vector v; + std::vector v{}; - static const std::uint32_t max_size; + static constexpr std::uint32_t max_size{50u}; }; -const std::uint32_t se_candidates::max_size = 50u; - [[nodiscard]] static inline bool cigar_eats_ref(const std::uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 2; @@ -462,11 +453,10 @@ cigar_eats_ref(const std::uint32_t c) { [[nodiscard]] static inline std::uint32_t cigar_rseq_ops(const bam_cigar_t &cig) { - return std::accumulate(std::cbegin(cig), std::cend(cig), 0u, - [](const std::uint32_t total, const std::uint32_t x) { - return total + - (cigar_eats_ref(x) ? bam_cigar_oplen(x) : 0); - }); + const auto acc = [](const std::uint32_t t, const std::uint32_t x) { + return t + (cigar_eats_ref(x) ? bam_cigar_oplen(x) : 0); + }; + return std::accumulate(std::cbegin(cig), std::cend(cig), 0u, acc); } [[nodiscard]] static inline bool @@ -524,7 +514,7 @@ format_se(const bool allow_ambig, const se_element &res, const ChromLookup &cl, flag, // uint16_t flag, chrom_idx - 1, // int32_t tid (-1 for padding) ref_s, // hts_pos_t pos, - mapq_max_val, // uint8_t mapq, + mapq_max_val, // std::uint8_t mapq, std::size(r.cig), // size_t n_cigar, r.cig.data(), // const uint32_t *cigar, -1, // int32_t mtid, @@ -542,10 +532,10 @@ format_se(const bool allow_ambig, const se_element &res, const ChromLookup &cl, if (ret < 0) throw std::runtime_error("bam_aux_update_int"); - // cppcheck-suppress-begin cstyleCast - ret = bam_aux_append(r.rec.b, "CV", 'A', 1, - (std::uint8_t *)(res.elem_is_a_rich() ? "A" : "T")); - // cppcheck-suppress-end cstyleCast + ret = bam_aux_append( + r.rec.b, "CV", 'A', 1, + // NOLINTNEXTLINE(*-reinterpret-cast) + reinterpret_cast(res.elem_is_a_rich() ? "A" : "T")); if (ret < 0) throw std::runtime_error("bam_aux_append"); @@ -594,24 +584,24 @@ struct pe_element { } // GS: used to decide whether ends should be mapped as SE independently - inline bool + [[nodiscard]] inline bool should_report(const bool allow_ambig) const { return !empty() && (allow_ambig || !ambig()); } - inline bool + [[nodiscard]] inline bool ambig() const { return r1.ambig(); } - inline bool + [[nodiscard]] inline bool empty() const { return r1.empty(); } - inline bool + [[nodiscard]] inline bool sure_ambig() const { - return ambig() && (aln_score == max_aln_score); + return ambig() && aln_score == max_aln_score; } score_t aln_score{}; @@ -619,14 +609,14 @@ struct pe_element { se_element r1; se_element r2; + static constexpr std::uint32_t min_dist_default{32}; + static constexpr std::uint32_t max_dist_default{3000}; static std::uint32_t min_dist; static std::uint32_t max_dist; }; -// NOLINTBEGIN(*-avoid-magic-numbers) -std::uint32_t pe_element::min_dist = 32; -std::uint32_t pe_element::max_dist = 3000; -// NOLINTEND(*-avoid-magic-numbers) +std::uint32_t pe_element::min_dist = min_dist_default; +std::uint32_t pe_element::max_dist = max_dist_default; static inline bool valid_pair(const pe_element &best, const std::uint32_t readlen1, @@ -659,7 +649,7 @@ format_pe( read_holder &r1, read_holder &r2) { static constexpr auto mapq_max_val = 255u; static constexpr auto aux_len = 16u; - static const std::array cv = { + static const std::array cv = { 'T', 'A', }; @@ -722,7 +712,7 @@ format_pe( flag1, // uint16_t flag, chr1 - 1, // (-1 for padding) int32_t tid r_s1, // hts_pos_t pos, - mapq_max_val, // uint8_t mapq, + mapq_max_val, // std::uint8_t mapq, std::size(r1.cig), // size_t n_cigar, r1.cig.data(), // const uint32_t *cigar, chr2 - 1, // (-1 for padding) int32_t mtid, @@ -753,7 +743,7 @@ format_pe( flag2, // uint16_t flag, chr2 - 1, // (-1 for padding) int32_t tid r_s2, // hts_pos_t pos, - mapq_max_val, // uint8_t mapq, + mapq_max_val, // std::uint8_t mapq, std::size(r2.cig), // size_t n_cigar, r2.cig.data(), // const uint32_t *cigar, chr1 - 1, // (-1 for padding) int32_t mtid, @@ -1201,7 +1191,7 @@ find_candidates(const std::uint32_t max_candidates, template static inline three_letter_t -get_three_letter_num_fast(const uint8_t nt) { +get_three_letter_num_fast(const std::uint8_t nt) { // NOLINTBEGIN(*-avoid-magic-numbers) return (the_conv == c_to_t) ? nt & 5 : // C=T=0, A=1, G=4 nt & 10; // A=G=0, C=2, T=8 @@ -1391,7 +1381,7 @@ prep_read(const std::string &r, Read &pread) { // NOLINTEND(*-constant-array-index) } -/* GS: this function simply converts the vector pread to a +/* GS: this function simply converts the vector pread to a * vector by putting 16 bases in each element of the packed read. If * the read length does not divide 16, we add 1111s to the remaining positions * so it divides 16. The remaining bases match all bases in the reference @@ -1995,15 +1985,13 @@ map_paired_ended(const bool show_progress, const bool allow_ambig, bests[i].reset(); if (!bests[i].should_report(allow_ambig)) { - // NOLINTBEGIN(*-avoid-magic-numbers) align_se_candidates(pread1, pread1_rc, pread1, pread1_rc, - se_element::valid_frac / 2.0, res_se1, bests_se1[i], + se_element::valid_frac / 2, res_se1, bests_se1[i], r1[i].cig, aln); align_se_candidates(pread2, pread2_rc, pread2, pread2_rc, - se_element::valid_frac / 2.0, res_se2, bests_se2[i], + se_element::valid_frac / 2, res_se2, bests_se2[i], r2[i].cig, aln); - // NOLINTEND(*-avoid-magic-numbers) } select_output(allow_ambig, abismal_index.cl, r1[i], r2[i], bests[i], bests_se1[i], bests_se2[i]); @@ -2208,9 +2196,8 @@ struct runner { template void - run_paired_ended(const std::string &reads_file1, - const std::string &reads_file2, const bamxx::bam_header &hdr, - bamxx::bam_out &out) { + paired_ended(const std::string &reads_file1, const std::string &reads_file2, + const bamxx::bam_header &hdr, bamxx::bam_out &out) { ReadLoader rl1(reads_file1); ReadLoader rl2(reads_file2); progress_bar progress(std::filesystem::file_size(reads_file1), @@ -2241,8 +2228,8 @@ struct runner { template void - run_single_ended(const std::string &reads_file, const bamxx::bam_header &hdr, - bamxx::bam_out &out) { + single_ended(const std::string &reads_file, const bamxx::bam_header &hdr, + bamxx::bam_out &out) { ReadLoader rl(reads_file); progress_bar progress(std::filesystem::file_size(reads_file), "mapping reads"); @@ -2403,7 +2390,7 @@ abismal(int argc, char *argv[]) { // NOLINT(*-c-arrays) } /****************** END COMMAND LINE OPTIONS *****************/ - /****************** BEGIN THREAD VALIDATION *****************/ + // thread validation const auto n_cores = std::thread::hardware_concurrency(); abismal_concurrency::n_threads = std::min(abismal_concurrency::n_threads, n_cores); @@ -2415,9 +2402,8 @@ abismal(int argc, char *argv[]) { // NOLINT(*-c-arrays) "this device"); if (verbose) - log_msg("using " + std::to_string(abismal_concurrency::n_threads) + - " threads to map reads"); - /****************** END THREAD VALIDATION *****************/ + log_msg("threads to map reads: " + + std::to_string(abismal_concurrency::n_threads)); const bool show_progress = verbose && isatty(fileno(stderr)); @@ -2473,34 +2459,34 @@ abismal(int argc, char *argv[]) { // NOLINT(*-c-arrays) if (!out.write(hdr)) throw std::runtime_error("error writing header"); - runner r{show_progress, allow_ambig, random_pbat, abismal_index}; + runner run{show_progress, allow_ambig, random_pbat, abismal_index}; if (reads_file2.empty()) { if (g_to_a_conversion || pbat_mode) - r.run_single_ended(reads_file, hdr, out); + run.single_ended(reads_file, hdr, out); else if (random_pbat) - r.run_single_ended(reads_file, hdr, out); + run.single_ended(reads_file, hdr, out); else - r.run_single_ended(reads_file, hdr, out); + run.single_ended(reads_file, hdr, out); } else { if (pbat_mode) - r.run_paired_ended(reads_file, reads_file2, hdr, out); + run.paired_ended(reads_file, reads_file2, hdr, out); else if (random_pbat) - r.run_paired_ended(reads_file, reads_file2, hdr, out); + run.paired_ended(reads_file, reads_file2, hdr, out); else - r.run_paired_ended(reads_file, reads_file2, hdr, out); + run.paired_ended(reads_file, reads_file2, hdr, out); } if (!stats_outfile.empty()) { - std::ofstream stats_of(stats_outfile); - if (stats_of) { + std::ofstream statout(stats_outfile); + if (statout) { if (stats_as_json) - stats_of << (reads_file2.empty() ? nlohmann::json(r.se_stats) - : nlohmann::json(r.pe_stats)); + statout << (reads_file2.empty() ? nlohmann::json(run.se_stats) + : nlohmann::json(run.pe_stats)); else - stats_of << (reads_file2.empty() ? r.se_stats.tostring("read1") - : r.pe_stats.tostring(allow_ambig)); + statout << (reads_file2.empty() ? run.se_stats.tostring("read1") + : run.pe_stats.tostring(allow_ambig)); } else std::cerr << "failed to open stats out file: " << stats_outfile << '\n'; diff --git a/src/abismal.hpp b/src/abismal.hpp index 8a6b171..ad20ad5 100644 --- a/src/abismal.hpp +++ b/src/abismal.hpp @@ -19,6 +19,6 @@ #define SRC_ABISMAL_HPP_ int -abismal(int argc, char *argv[]); +abismal(int argc, char *argv[]); // NOLINT(*-avoid-c-arrays) #endif // SRC_ABISMAL_HPP_ diff --git a/src/abismalidx.hpp b/src/abismalidx.hpp index 17bdae6..c3bfc17 100644 --- a/src/abismalidx.hpp +++ b/src/abismalidx.hpp @@ -19,6 +19,6 @@ #define SRC_ABISMALIDX_HPP_ int -abismalidx(int argc, char *argv[]); +abismalidx(int argc, char *argv[]); // NOLINT(*-avoid-c-arrays) #endif // SRC_ABISMALIDX_HPP_ diff --git a/src/common.hpp b/src/common.hpp index d4eaa7d..1c5a73b 100644 --- a/src/common.hpp +++ b/src/common.hpp @@ -11,7 +11,11 @@ * more details. */ +#ifndef SRC_COMMON_HPP_ +#define SRC_COMMON_HPP_ + #include +#include #include #include #include @@ -41,24 +45,33 @@ revcomp(T &s) -> T { struct progress_bar { progress_bar(const std::size_t total, - const std::string message = "completion") : + const std::string &message = "completion") : total{total}, mid_tag{message} { - // the 3 below is for the default left_tag and right_tag printed width and - // the 5 is for the width of the percent (up to 100) plus two pipes ('|') - bar_width = max_bar_width - std::size(message) - 3 - 5; + // NOLINTNEXTLINE(*-prefer-member-initializer) + bar_width = max_bar_width - std::size(message) - tag_size - pcnt_and_pipes; bar = std::string(bar_width, ' '); } bool time_to_report(const std::size_t i) const { - return std::round((100.0 * std::min(i, total)) / total) > prev; + static constexpr auto one_hundred = 100.0; + // NOLINTBEGIN(*-narrowing-conversions) + return std::round((one_hundred * std::min(i, total)) / + static_cast(total)) > prev; + // NOLINTEND(*-narrowing-conversions) } void report(std::ostream &out, const std::size_t i) { - prev = std::round((100.0 * std::min(i, total)) / total); + static constexpr auto one_hundred = 100.0; + // NOLINTBEGIN(*-narrowing-conversions) + prev = std::round((one_hundred * std::min(i, total)) / + static_cast(total)); const std::size_t x = - std::min(static_cast(bar_width * (prev / 100.0)), bar_width); + std::min(static_cast( + bar_width * (static_cast(prev) / one_hundred)), + bar_width); + // NOLINTEND(*-narrowing-conversions) std::fill_n(std::begin(bar), x, '='); out << left_tag << mid_tag << "|" << bar << "|" << std::setw(3) << prev << right_tag; @@ -66,15 +79,18 @@ struct progress_bar { out << '\n'; } + static constexpr auto pcnt_and_pipes = 5; + static constexpr auto tag_size = 3; + static constexpr auto left_tag = "\r["; + static constexpr auto right_tag = "%]"; + std::size_t total{}; std::size_t prev{}; std::size_t bar_width{}; - std::string left_tag = "\r["; std::string mid_tag; std::string bar; - std::string right_tag = "%]"; - static const std::size_t max_bar_width = 72; + static constexpr std::size_t max_bar_width{72}; }; // from 30 April 2020 SAM documentation @@ -93,28 +109,34 @@ struct progress_bar { namespace samflags { // ADS: names of flags adjusted to how we typically interpret -static const std::uint16_t read_paired = 0x1; -static const std::uint16_t read_pair_mapped = 0x2; -static const std::uint16_t read_unmapped = 0x4; -static const std::uint16_t mate_unmapped = 0x8; -static const std::uint16_t read_rc = 0x10; -static const std::uint16_t mate_rc = 0x20; -static const std::uint16_t template_first = 0x40; -static const std::uint16_t template_last = 0x80; -static const std::uint16_t secondary_aln = 0x100; -static const std::uint16_t below_quality = 0x200; -static const std::uint16_t pcr_duplicate = 0x400; -static const std::uint16_t supplementary_aln = 0x800; +static constexpr std::uint16_t read_paired = 0x1; +static constexpr std::uint16_t read_pair_mapped = 0x2; +static constexpr std::uint16_t read_unmapped = 0x4; +static constexpr std::uint16_t mate_unmapped = 0x8; +static constexpr std::uint16_t read_rc = 0x10; +static constexpr std::uint16_t mate_rc = 0x20; +static constexpr std::uint16_t template_first = 0x40; +static constexpr std::uint16_t template_last = 0x80; +static constexpr std::uint16_t secondary_aln = 0x100; +static constexpr std::uint16_t below_quality = 0x200; +static constexpr std::uint16_t pcr_duplicate = 0x400; +static constexpr std::uint16_t supplementary_aln = 0x800; + constexpr auto check(const std::uint16_t to_check, const std::uint16_t &f) -> bool { return to_check & f; } + constexpr auto set(std::uint16_t &to_set, const std::uint16_t f) { to_set |= f; } + constexpr auto unset(std::uint16_t &to_unset, const std::uint16_t f) { to_unset &= ~f; } + } // namespace samflags + +#endif // SRC_COMMON_HPP_ diff --git a/src/dna_four_bit_bisulfite.hpp b/src/dna_four_bit_bisulfite.hpp index da0c099..009c940 100644 --- a/src/dna_four_bit_bisulfite.hpp +++ b/src/dna_four_bit_bisulfite.hpp @@ -1,50 +1,43 @@ /* Copyright (C) 2018-2025 Andrew D. Smith - * - * Authors: Andrew D. Smith * * This file is part of ABISMAL. * - * ABISMAL is free software: you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. + * ABISMAL is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. * - * ABISMAL is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. + * ABISMAL is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. */ #ifndef DNA_FOUR_BIT_BISULFITE_HPP #define DNA_FOUR_BIT_BISULFITE_HPP #include +#include #include +#include // IWYU pragma: keep +#include +#include -// clang-format off /* encoding of ASCII characters into T-rich bases, used * in encoding reads. * A: 0001 = 1 * C: 0010 = 2 * G: 0100 = 4 * T: 1010 = 10 */ -constexpr std::array encode_base_t_rich = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //0 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //17 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //33 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //49 - 0, 1, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, //@,A-O - 0, 0, 0, 0, 10,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //P-Z - 0, 1, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, //`,a-o - 0, 0, 0, 0, 10,0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //p-z - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +constexpr auto encode_base_t_rich = std::array{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 17 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 33 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 49 + 0, 1, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, //@,A-O + 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // P-Z + 0, 1, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, //`,a-o + 0, 0, 0, 0, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // p-z }; /* encoding of ASCII characters into A-rich bases @@ -52,25 +45,221 @@ constexpr std::array encode_base_t_rich = { * C: 0010 = 2 * G: 0100 = 4 * T: 1000 = 8 */ -constexpr std::array encode_base_a_rich = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //0 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //17 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //33 - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //49 - 0, 5, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, //@,A-O - 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //P-Z - 0, 5, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, //`,a-o - 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, //p-z - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +constexpr auto encode_base_a_rich = std::array{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 17 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 33 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 49 + 0, 5, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, //@,A-O + 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // P-Z + 0, 5, 0, 2, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0, //`,a-o + 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // p-z +}; + +// clang-format off +constexpr auto dna_four_bit_decoding = std::array{ + 'Z', // = 0000 = 0 = {} = Zero bases + 'A', // = 0001 = 1 = {A} = Adenine + 'C', // = 0010 = 2 = {C} = Cytosine + 'M', // = 0011 = 3 = {C,A} = aMino + 'G', // = 0100 = 4 = {G} = Guanine + 'R', // = 0101 = 5 = {G,A} = puRine + 'S', // = 0110 = 6 = {G,C} = Strong + 'V', // = 0111 = 7 = {G,C,A} = not T + 'T', // = 1000 = 8 = {T} = Thymine + 'W', // = 1001 = 9 = {T,A} = Weak + 'Y', // = 1010 = 10 = {T,C} = pYramidine + 'H', // = 1011 = 11 = {T,C,A} = not G + 'K', // = 1100 = 12 = {T,G} = Keto + 'D', // = 1101 = 13 = {T,G,A} = not C + 'B', // = 1110 = 14 = {T,G,C} = not A + 'N' // = 1111 = 15 = {T,G,C,A} = aNything +}; + +enum base_in_byte : std::uint8_t { + left, + right, }; +static constexpr auto nibble_mask = 15; +static constexpr auto nibble_per_word = 16; + +template +constexpr auto +get_nibble(const uint_type x, const std::size_t offset) -> uint_type { + return (x >> (4 * offset)) & nibble_mask; +} + +template +constexpr auto +decode_dna_four_bit(const uint_type x, const std::size_t offset) -> char { + return dna_four_bit_decoding[get_nibble(x, offset)]; +} + +template +auto +decode_dna_four_bit(InputItr first, InputItr last, OutputIt d_first) -> OutputIt { + // ADS: assume destination has enough space + for (; first != last; ++first) + for (std::size_t offset = 0; offset < nibble_per_word; ++offset) + *d_first++ = decode_dna_four_bit(*first, offset); + // if original sequence length is odd and encoding not padded at the front, + // then the final element in dest will be 'Z' + return d_first; +} + +template +void +decode_dna_four_bit(const InCtr &source, OutCtr &dest) { + // expand out the bytes as pairs (do this backwards in case source == dest) + const std::size_t source_size = std::size(source); + dest.resize(nibble_per_word * source_size); + std::size_t i = source_size; + std::size_t j = std::size(dest); + while (i > 0) { + dest[--j] = source[--i]; + dest[--j] = source[i]; + } + for (i = 0; i < std::size(dest); i += nibble_per_word) + for (std::size_t offset = 0; offset < nibble_per_word; ++offset) + dest[i + offset] = decode_dna_four_bit(dest[i], offset); +} + +/* Sorted by letter + A = 0001 = 1 = {A} = Adenine + B = 1110 = 14 = {T,G,C} = not A + C = 0010 = 2 = {C} = Cytosine + D = 1101 = 13 = {T,G,A} = not C + E = 15 = + F = 15 = + G = 0100 = 4 = {G} = Guanine + H = 1011 = 11 = {T,C,A} = not G + I = 15 = + J = 15 = + K = 1100 = 12 = {T,G} = Keto + L = 15 = + M = 0011 = 3 = {C,A} = aMino + N = 1111 = 15 = {T,G,C,A} = aNything + O = 15 = + P = 15 = + Q = 15 = + R = 0101 = 5 = {G,A} = puRine + S = 0110 = 6 = {G,C} = Strong + T = 1000 = 8 = {T} = Thymine + U = 15 = + V = 0111 = 7 = {G,C,A} = not T + W = 1001 = 9 = {T,A} = Weak + X = 15 = + Y = 1010 = 10 = {T,C} = pYramidine + Z = 0000 = 0 = {} = Zero +*/ +constexpr auto dna_four_bit_encoding = std::array{ + /* 0*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 15*/ + /* 16*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 31*/ + /* 32*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 47*/ + /* 48*/ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 63*/ + /* 64*/ 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3, 0, 0, /* 79*/ + /* 80*/ 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0, /* 95*/ + /* 96*/ 0, 1,14, 2,13, 0, 0, 4,11, 0, 0,12, 0, 3, 0, 0, /*111*/ + /*112*/ 0, 0, 5, 6, 8, 0, 7, 9, 0,10, 0, 0, 0, 0, 0, 0, /*127*/ +}; +// . A B C D . . G H . . K . M N . +// . . R S T . V W . Y Z // clang-format on +template +constexpr auto +encode_dna_four_bit(const uint_type x, + const std::size_t offset) -> std::size_t { + // NOLINTBEGIN(*-constant-array-index) + return (static_cast( + dna_four_bit_encoding[static_cast(x)])) + << (4 * offset); + // NOLINTEND(*-constant-array-index) +} + +template +auto +encode_dna_four_bit(InputItr first, InputItr last, + OutputIt d_first) -> OutputIt { + for (; first != last; ++d_first) { + *d_first = 0; + for (std::size_t i = 0; i < nibble_per_word && first != last; ++i) + *d_first |= encode_dna_four_bit(std::move(*first++), i); + } + return d_first; +} + +// GS: intended to be used as pointer to 4-bit encoding of DNA within a vector +// of std::size_t values +struct genome_four_bit_itr { + explicit genome_four_bit_itr( + const std::vector::const_iterator itr, const int offset = 0) : + itr{itr}, offset{offset} {} + + auto + operator*() const -> std::size_t { + return (*itr >> (offset << 2)) & nibble_mask; + } + + auto + operator++() -> genome_four_bit_itr & { + offset = (offset + 1) & nibble_mask; + itr += (offset == 0); + return *this; + } + + auto + operator++(int) -> genome_four_bit_itr { + genome_four_bit_itr tmp(*this); + offset = (offset + 1) & nibble_mask; + itr += (offset == 0); + return tmp; + } + + auto + operator--() -> genome_four_bit_itr & { + itr -= (offset == 0); + offset = (offset - 1) & nibble_mask; + return *this; + } + + auto + operator--(int) -> genome_four_bit_itr { + genome_four_bit_itr tmp(*this); + itr -= (offset == 0); + offset = (offset - 1) & nibble_mask; + return tmp; + } + + auto + operator+(const std::size_t step) const -> genome_four_bit_itr { + // check if the sum of offsets is >= 16 + const bool high_nibble = + ((offset + (static_cast(step) & nibble_mask)) & nibble_per_word) >> + 4; + const int new_offset = (offset + static_cast(step)) & nibble_mask; + return genome_four_bit_itr( + itr + static_cast(step) / nibble_per_word + high_nibble, new_offset); + } + + auto + operator!=(const genome_four_bit_itr &rhs) const -> bool { + return itr != rhs.itr || offset != rhs.offset; + } + + auto + operator<(const genome_four_bit_itr &rhs) const -> bool { + return itr < rhs.itr || (itr == rhs.itr && offset < rhs.offset); + } + + auto + operator<=(const genome_four_bit_itr &rhs) const -> bool { + return itr < rhs.itr || (itr == rhs.itr && offset <= rhs.offset); + } + + std::vector::const_iterator itr; + int offset{}; +}; + #endif // DNA_FOUR_BIT_BISULFITE_HPP diff --git a/src/simreads.hpp b/src/simreads.hpp index 08879f1..d9967e8 100644 --- a/src/simreads.hpp +++ b/src/simreads.hpp @@ -19,6 +19,6 @@ #define SRC_SIMREADS_HPP_ int -simreads(int argc, char *argv[]); +simreads(int argc, char *argv[]); // NOLINT(*-avoid-c-arrays) #endif // SRC_SIMREADS_HPP_