From 019823801a76f8b12faa63735290da4ab4d3d79a Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 10 Jan 2026 17:01:49 -0800 Subject: [PATCH 01/10] src/analysis/nanopore.cpp: truncated size 256 lookup array to 96 for ACGT and change c array to std::array --- src/analysis/nanopore.cpp | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index e15529bb..7b6eeaeb 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -54,23 +54,13 @@ // NOLINTBEGIN(*-narrowing-conversions) // clang-format off -static constexpr std::array encoding = { +static constexpr std::array encoding = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 16 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 32 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 48 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 64 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 80 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 96 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 112 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 128 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 144 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 160 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 176 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 192 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 208 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 224 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 240 - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 // 256 }; // clang-format on @@ -150,8 +140,8 @@ get_basecall_model(const bamxx::bam_header &hdr) { return {}; } -// ADS: here the std::uint16_t allows for up to 256 reads, each contributing up -// to 256 "counts" in the probability encoding. +// ADS: here the std::uint16_t allows for up to 256 reads, each contributing +// up to 256 "counts" in the probability encoding. typedef std::uint16_t count_type; [[nodiscard]] static inline bool @@ -268,15 +258,13 @@ enum class missing_code : std::uint8_t { unknown = 1, }; -// NOLINTBEGIN(*-avoid-c-arrays) -static const char *const tag_values[] = { +static constexpr auto tag_values = std::array{ "CpG", // 0 "CHH", // 1 "CXG", // 2 "CCG", // 3 "N", // 4 }; -// NOLINTEND(*-avoid-c-arrays) struct mod_prob_buffer { static constexpr auto init_capacity{128 * 1024}; From 352f03372a471fc0090b49e92627007fc1aed6e5 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 10 Jan 2026 17:04:02 -0800 Subject: [PATCH 02/10] src/analysis/nanopore.cpp: adding 'about' --- src/analysis/nanopore.cpp | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index 7b6eeaeb..36e05cc8 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -1,7 +1,4 @@ -/* nanocount: methylation counts for nanopore data; see docs for 'counts' - * command for details. - * - * Copyright (C) 2025 Andrew D. Smith +/* Copyright (C) 2025 Andrew D. Smith * * Author: Andrew D. Smith * @@ -18,12 +15,17 @@ #ifdef BUILD_NANOPORE +[[maybe_unused]] static constexpr auto about = R"( +nanocount: methylation counts for nanopore data; see docs for 'counts' command +for details. +)"; + +#include "bam_record_utils.hpp" #include "counts_header.hpp" #include "OptionParser.hpp" -#include "bam_record_utils.hpp" -#include "bamxx.hpp" +#include #include "nlohmann/json.hpp" From 7e6d8946cfa3acdb207f08b3814d261fdba42bf6 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 10 Jan 2026 17:04:32 -0800 Subject: [PATCH 03/10] src/analysis/nanopore.cpp: source formatting --- src/analysis/nanopore.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index 36e05cc8..101a7c84 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -156,12 +156,11 @@ eats_query(const std::uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 1; } -/* The three functions below here should probably be moved into - bsutils.hpp. I am not sure if the DDG function is needed, but it - seems like if one considers strand, and the CHH is not symmetric, - then one needs this. Also, Qiang should be consulted on this - because he spent much time thinking about it in the context of - plants. */ +/* The three functions below here should probably be moved into bsutils.hpp. I + am not sure if the DDG function is needed, but it seems like if one + considers strand, and the CHH is not symmetric, then one needs this. Also, + Qiang should be consulted on this because he spent much time thinking about + it in the context of plants. */ [[nodiscard]] static inline bool is_chh(const std::string &s, const std::size_t i) { return i + 2 < std::size(s) && is_cytosine(s[i]) && !is_guanine(s[i + 1]) && @@ -180,11 +179,10 @@ is_c_at_g(const std::string &s, const std::size_t i) { !is_guanine(s[i + 1]) && is_guanine(s[i + 2]); } -/* Right now the CountSet objects below are much larger than they need - to be, for the things we are computing. However, it's not clear - that the minimum information would really put the memory - requirement of the program into a more reasonable range, so keeping - all the information seems reasonable. */ +/* Right now the CountSet objects below are much larger than they need to be, + for the things we are computing. However, it's not clear that the minimum + information would really put the memory requirement of the program into a + more reasonable range, so keeping all the information seems reasonable. */ struct CountSet { static constexpr auto max_prob_repr = 255.0; // ADS: accepting int16_t because of using -1 for unknown prob vs. 0 prob. From 10946559f997d0310a314b75cf77036bfd0bf2ce Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 10 Jan 2026 17:05:00 -0800 Subject: [PATCH 04/10] src/analysis/metagene.cpp: source formatting and modernization --- src/analysis/metagene.cpp | 44 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/src/analysis/metagene.cpp b/src/analysis/metagene.cpp index f4e928f2..b2c2a0c5 100644 --- a/src/analysis/metagene.cpp +++ b/src/analysis/metagene.cpp @@ -18,6 +18,18 @@ metagene (tsscpgplot): get data to plot methylation level around a TSS )"; +[[maybe_unused]] constexpr auto description = R"( +Compute the information needed for metagene plots of DNA methylation +levels. The columns in the output correspond to the fields calculated +globally by the `levels` and per-region by the `roi` command. Input +for features is in BED format, and when present the 6th column is used +to indicate strand. For features of non-zero width (where the 2nd and +3rd columns are not identical) the negative strand will indicate that +3rd column should be used. This means, for example, if the features are +genes, and the promoters are of interest, the strand will be used +correctly. +)"; + #include "Interval6.hpp" #include "LevelsCounter.hpp" #include "MSite.hpp" @@ -39,8 +51,8 @@ metagene (tsscpgplot): get data to plot methylation level around a TSS #include #include -static MSite -tss_from_gene(const Interval6 &r) { +static auto +tss_from_gene(const Interval6 &r) -> MSite { MSite s; s.chrom = r.chrom; s.pos = r.strand == '+' ? r.start : r.stop; @@ -54,7 +66,9 @@ process_chrom(const std::uint32_t region_size, const std::pair &bounds, const std::vector &sites, std::vector &levels) { - const auto cmp = [](const MSite &a, const MSite &b) { return a.pos < b.pos; }; + constexpr auto cmp = [](const MSite &a, const MSite &b) { + return a.pos < b.pos; + }; const std::uint32_t twice_rs = 2 * region_size; for (auto i = bounds.first; i < bounds.second; ++i) { @@ -88,20 +102,8 @@ collapse_bins(const std::uint32_t bin_size, std::vector &v) { v.swap(vv); } -int -metagene(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) - constexpr auto description = - R"( -Compute the information needed for metagene plots of DNA methylation -levels. The columns in the output correspond to the fields calculated -globally by the `levels` and per-region by the `roi` command. Input -for features is in BED format, and when present the 6th column is used -to indicate strand. For features of non-zero width (where the 2nd and -3rd columns are not identical) the negative strand will indicate that -3rd column should be used. This means, for example, if the features are -genes, and the promoters are of interest, the strand will be used -correctly. -)"; +auto +metagene(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays) try { std::string outfile; std::uint32_t region_size = 5000; // NOLINT(*-avoid-magic-numbers) @@ -185,10 +187,9 @@ correctly. const auto bounds = lookup.find(chrom_name); if (bounds != std::cend(lookup)) process_chrom(region_size, features, bounds->second, sites, levels); - const auto n_features = pair_diff(bounds); if (show_progress) std::cerr << "[sites=" << std::size(sites) - << " features=" << n_features << "]" << '\n'; + << " features=" << pair_diff(bounds) << "]" << '\n'; sites.clear(); } if (show_progress) @@ -202,10 +203,9 @@ correctly. const auto bounds = lookup.find(chrom_name); if (bounds != std::cend(lookup)) process_chrom(region_size, features, bounds->second, sites, levels); - const auto n_features = pair_diff(bounds); if (show_progress) - std::cerr << "[sites=" << std::size(sites) << " features=" << n_features - << "]\n"; + std::cerr << "[sites=" << std::size(sites) + << " features=" << pair_diff(bounds) << "]\n"; } collapse_bins(bin_size, levels); From a987ec7864d0a6f48a8f7c5b6a7d726432dc8edf Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 10 Jan 2026 17:08:04 -0800 Subject: [PATCH 05/10] src/utils/guessprotocol.cpp: making outout file required, reformatting an array of nucleotide lookups, modernizing and linting --- src/utils/guessprotocol.cpp | 221 +++++++++++++++--------------------- 1 file changed, 93 insertions(+), 128 deletions(-) diff --git a/src/utils/guessprotocol.cpp b/src/utils/guessprotocol.cpp index 85a3e5ff..4281eb80 100644 --- a/src/utils/guessprotocol.cpp +++ b/src/utils/guessprotocol.cpp @@ -1,24 +1,28 @@ -/* guessprotocol: a program for guessing whether a wgbs protocol is - * wgbs, pbat or random pbat +/* Copyright (C) 2019-2023 Andrew D. Smith * - * Copyright (C) 2019-2023 Andrew D. Smith + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. * - * Authors: Andrew D. Smith + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. * - * This program is free software: you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . */ -#include "OptionParser.hpp" +[[maybe_unused]] static constexpr auto about = R"( +guessprotocol: a tool for guessing whether the protocol for generating +whole-genome bisulfite sequencing was wgbs, pbat or random pbat. +)"; + #include "numerical_utils.hpp" +#include "OptionParser.hpp" + #include #include @@ -28,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -38,28 +41,14 @@ #include #include -// NOLINTBEGIN(*-pointer-arithmetic,*-constant-array-index,*-narrowing-conversions,cert-err09-cpp,cert-err61-cpp) - -// clang-format off -constexpr std::array nuc_to_idx = { - /* 0*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 16*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 32*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 48*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 64*/ 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - /* 80*/ 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /* 96*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /*112*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - /*128*/ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, +static constexpr std::array nuc_to_idx = { + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 16 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 32 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 48 + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 64 + 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 80 + 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 96 }; -// clang-format on struct nucleotide_model { std::vector pr{}; @@ -76,27 +65,28 @@ struct nucleotide_model { pr[nuc_from] *= (1.0 - bisulfite_conversion_rate); assert(std::reduce(std::cbegin(pr), std::cend(pr), 0.0) == 1.0); lpr.resize(std::size(pr)); - transform(std::cbegin(pr), std::cend(pr), std::begin(lpr), - [](const double x) { return log(x); }); + std::transform(std::cbegin(pr), std::cend(pr), std::begin(lpr), + [](const double x) { return std::log(x); }); } - double - operator()(const std::string &s) const { - return std::accumulate(std::cbegin(s), std::cend(s), 0.0, - [&](const double x, const char c) { - const auto i = nuc_to_idx[static_cast(c)]; - return i == 4 ? x : x + lpr[i]; - }); + auto + operator()(const std::string &s) const -> double { + static const auto acc = [&](const double x, const char c) { + // NOLINTNEXTLINE(*-constant-array-index) + const auto i = nuc_to_idx[static_cast(c)]; + return i == 4 ? x : x + lpr[i]; + }; + return std::accumulate(std::cbegin(s), std::cend(s), 0.0, acc); }; - std::string - tostring() const { + [[nodiscard]] auto + tostring() const -> std::string { std::ostringstream oss; oss << "pr:\n"; - for (auto i : pr) + for (const auto i : pr) oss << i << '\n'; oss << "log pr:\n"; - for (auto i : lpr) + for (const auto i : lpr) oss << i << '\n'; oss << bisulfite_conversion_rate << '\n' << is_t_rich; return oss.str(); @@ -114,23 +104,29 @@ struct guessprotocol_summary { // protocol is the guessed protocol (wgbs, pbat or rpbat) based on the // content of the reads. std::string protocol; + // confidence indicates the level of confidence in the guess for the // protocol. std::string confidence; + // layout indicates whether the reads are paired or single-ended. std::string layout; + // n_reads_wgbs is the average number of reads (for single-ended reads) or // read pairs (for paired reads) where read1 is T-rich. double n_reads_wgbs{}; + // n_reads is the number of evaluated reads or read pairs. - uint64_t n_reads{}; + std::uint64_t n_reads{}; + // wgbs_fraction is the probability that a read (for single-ended reads) or // the read1 of a read pair (for paired reads) is T-rich. double wgbs_fraction{}; void evaluate() { - const auto frac = n_reads_wgbs / n_reads; + const auto frac = + static_cast(n_reads_wgbs) / static_cast(n_reads); // assigning wgbs (near one) if (frac > wgbs_cutoff_confident) { @@ -164,8 +160,8 @@ struct guessprotocol_summary { wgbs_fraction = frac; } - std::string - tostring() const { + [[nodiscard]] auto + tostring() const -> std::string { std::ostringstream oss; oss << "protocol: " << protocol << '\n' << "confidence: " << confidence << '\n' @@ -177,73 +173,52 @@ struct guessprotocol_summary { }; // store each read from one end -struct FASTQRecord { +struct fastq_record { std::string name; std::string seq; }; // see if two reads from two ends match to each other (they should // have the same name) -static bool -mates(const size_t to_ignore_at_end, // in case names have #0/1 name ends - const FASTQRecord &a, const FASTQRecord &b) { +static auto +mates(const std::size_t to_ignore_at_end, // in case names have #0/1 name ends + const fastq_record &a, const fastq_record &b) -> bool { assert(to_ignore_at_end < std::size(a.name)); - return equal(std::cbegin(a.name), std::cend(a.name) - to_ignore_at_end, - std::cbegin(b.name)); + const auto name_end = + std::cend(a.name) - to_ignore_at_end; // NOLINT(*-narrowing-conversions) + return std::equal(std::cbegin(a.name), name_end, std::cbegin(b.name)); } -// Read 4 lines one time from fastq and fill in the FASTQRecord structure -static bamxx::bgzf_file & -operator>>(bamxx::bgzf_file &s, FASTQRecord &r) { - static constexpr auto n_error_codes = 5u; - - enum err_code : std::uint8_t { - none, - bad_name, - bad_seq, - bad_plus, - bad_qual, - }; - - static const std::array error_msg = { - std::runtime_error(""), - std::runtime_error("failed to parse fastq name line"), - std::runtime_error("failed to parse fastq sequence line"), - std::runtime_error("failed to parse fastq plus line"), - std::runtime_error("failed to parse fastq qual line"), - }; - - err_code ec = err_code::none; - +// Read 4 lines one time from fastq and fill in the fastq_record structure +static auto +operator>>(bamxx::bgzf_file &s, fastq_record &r) -> bamxx::bgzf_file & { if (!getline(s, r.name)) return s; if (r.name.empty() || r.name[0] != '@') - ec = err_code::bad_name; + throw std::runtime_error("failed to parse fastq name line"); const auto nm_end = r.name.find_first_of(" \t"); - const auto nm_sz = (nm_end == std::string::npos ? r.name.size() : nm_end) - 1; + const auto nm_sz = + (nm_end == std::string::npos ? std::size(r.name) : nm_end) - 1; r.name.erase(std::copy_n(std::cbegin(r.name) + 1, nm_sz, std::begin(r.name)), std::cend(r.name)); if (!getline(s, r.seq)) - ec = err_code::bad_seq; + throw std::runtime_error("failed to parse fastq sequence line"); std::string tmp; if (!getline(s, tmp)) - ec = err_code::bad_plus; + throw std::runtime_error("failed to parse fastq plus line"); if (!getline(s, tmp)) - ec = err_code::bad_qual; - - if (ec != err_code::none) - throw error_msg[ec]; // NOLINT(runtime/arrays) + throw std::runtime_error("failed to parse fastq qual line"); return s; } -int -main_guessprotocol(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) +auto +main_guessprotocol(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays) try { static const std::vector human_base_comp = { 0.295, @@ -257,32 +232,28 @@ main_guessprotocol(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) 0.25, 0.25, }; + static constexpr auto description = + "guess bisulfite protocol for a library"; - constexpr auto description = "guess bisulfite protocol for a library"; - - bool verbose = false; - bool use_human = false; + bool verbose{}; + bool use_human{}; std::string outfile; - size_t reads_to_check = 1000000; // NOLINT(*-avoid-magic-numbers) - size_t name_suffix_len = 0; - double bisulfite_conversion_rate = 0.98; // NOLINT(*-avoid-magic-numbers) - - namespace fs = std::filesystem; - const std::string cmd_name = std::filesystem::path(argv[0]).filename(); + std::size_t reads_to_check{1000000}; // NOLINT(*-avoid-magic-numbers) + std::size_t name_suffix_len{}; + double bisulfite_conversion_rate{0.98}; // NOLINT(*-avoid-magic-numbers) /****************** COMMAND LINE OPTIONS ********************/ - OptionParser opt_parse(cmd_name, description, - " []"); + OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) + description, " []"); opt_parse.add_opt("nreads", 'n', "number of reads in initial check", false, reads_to_check); opt_parse.add_opt("ignore", 'i', - "length of read name suffix " - "to ignore when matching", + "length of read name suffix to ignore when matching", false, name_suffix_len); opt_parse.add_opt("bisulfite", 'b', "bisulfite conversion rate", false, bisulfite_conversion_rate); opt_parse.add_opt("human", 'H', "assume human genome", false, use_human); - opt_parse.add_opt("output", 'o', "output file name", false, outfile); + opt_parse.add_opt("output", 'o', "output file name", true, outfile); opt_parse.add_opt("verbose", 'v', "report available information during the run", false, verbose); @@ -297,25 +268,25 @@ main_guessprotocol(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) std::cerr << opt_parse.option_missing_message() << '\n'; return EXIT_SUCCESS; } - if (opt_parse.about_requested() || leftover_args.size() > 2) { + if (opt_parse.about_requested() || std::size(leftover_args) > 2) { std::cerr << opt_parse.about_message() << '\n'; return EXIT_SUCCESS; } const std::vector reads_files(leftover_args); /****************** END COMMAND LINE OPTIONS *****************/ - auto base_comp = flat_base_comp; - if (use_human) - base_comp = human_base_comp; + const auto base_comp = use_human ? human_base_comp : flat_base_comp; + // if (use_human) + // base_comp = human_base_comp; nucleotide_model t_rich_model(base_comp, bisulfite_conversion_rate, true); nucleotide_model a_rich_model(base_comp, bisulfite_conversion_rate, false); guessprotocol_summary summary; - summary.layout = reads_files.size() == 2 ? "paired" : "single"; + summary.layout = std::size(reads_files) == 2 ? "paired" : "single"; if (verbose) { - if (reads_files.size() == 2) + if (std::size(reads_files) == 2) std::cerr << "data layout: " << "paired" << '\n' << "read1 file: " << reads_files.front() << '\n' @@ -330,7 +301,7 @@ main_guessprotocol(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) << '\n'; } - if (reads_files.size() == 2) { + if (std::size(reads_files) == 2) { // input: paired-end reads with end1 and end2 bamxx::bgzf_file in1(reads_files.front(), "r"); if (!in1) @@ -340,7 +311,7 @@ main_guessprotocol(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) if (!in2) throw std::runtime_error("cannot open file: " + reads_files.back()); - FASTQRecord r1, r2; + fastq_record r1, r2; while (in1 >> r1 && in2 >> r2 && summary.n_reads < reads_to_check) { summary.n_reads++; @@ -351,7 +322,7 @@ main_guessprotocol(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) const double ta = t_rich_model(r1.seq) + a_rich_model(r2.seq); const double at = a_rich_model(r1.seq) + t_rich_model(r2.seq); - const auto prob_read1_t_rich = exp(ta - log_sum_log(ta, at)); + const auto prob_read1_t_rich = std::exp(ta - log_sum_log(ta, at)); summary.n_reads_wgbs += prob_read1_t_rich; } } @@ -361,28 +332,24 @@ main_guessprotocol(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) if (!in) throw std::runtime_error("cannot open file: " + reads_files.front()); - FASTQRecord r; + fastq_record r; while (in >> r && summary.n_reads < reads_to_check) { summary.n_reads++; const double t = t_rich_model(r.seq); const double a = a_rich_model(r.seq); - const auto prob_t_rich = exp(t - log_sum_log(t, a)); + const auto prob_t_rich = std::exp(t - log_sum_log(t, a)); summary.n_reads_wgbs += prob_t_rich; } } summary.evaluate(); - if (!outfile.empty()) { - std::ofstream out(outfile); - if (!out) - throw std::runtime_error("failed to open: " + outfile); - out << summary.tostring() << '\n'; - } - else - std::cout << summary.tostring() << '\n'; + std::ofstream out(outfile); + if (!out) + throw std::runtime_error("failed to open: " + outfile); + out << summary.tostring() << '\n'; } catch (const std::exception &e) { std::cerr << e.what() << '\n'; @@ -390,5 +357,3 @@ main_guessprotocol(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) } return EXIT_SUCCESS; } - -// NOLINTEND(*-pointer-arithmetic,*-constant-array-index,*-narrowing-conversions,cert-err09-cpp,cert-err61-cpp) From e54724af90edcf5a048f7e134bcb3e078ba08323 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 10 Jan 2026 17:08:19 -0800 Subject: [PATCH 06/10] src/analysis/nanopore.cpp: modernizing --- src/analysis/nanopore.cpp | 123 +++++++++++++++++++------------------- 1 file changed, 60 insertions(+), 63 deletions(-) diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index 101a7c84..ddbaff3a 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -55,7 +55,6 @@ for details. // NOLINTBEGIN(*-narrowing-conversions) -// clang-format off static constexpr std::array encoding = { 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 16 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 32 @@ -64,22 +63,21 @@ static constexpr std::array encoding = { 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 80 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 96 }; -// clang-format on static constexpr auto n_nucs = 4u; -[[nodiscard]] inline bool -is_cytosine(const char c) { +[[nodiscard]] inline auto +is_cytosine(const char c) -> bool { return c == 'c' || c == 'C'; } -[[nodiscard]] inline bool -is_guanine(const char c) { +[[nodiscard]] inline auto +is_guanine(const char c) -> bool { return c == 'g' || c == 'G'; } -[[nodiscard]] inline bool -is_cpg(const std::string &s, const std::size_t i) { +[[nodiscard]] inline auto +is_cpg(const std::string &s, const std::size_t i) -> bool { return i + 1 < std::size(s) && is_cytosine(s[i]) && is_guanine(s[i + 1]); } @@ -108,8 +106,8 @@ read_fasta_file(const std::string &filename, std::vector &names, } } -[[nodiscard]] static std::string -get_basecall_model(const bamxx::bam_header &hdr) { +[[nodiscard]] static auto +get_basecall_model(const bamxx::bam_header &hdr) -> std::string { kstring_t ks{}; ks = {0, 0, nullptr}; @@ -144,15 +142,15 @@ get_basecall_model(const bamxx::bam_header &hdr) { // ADS: here the std::uint16_t allows for up to 256 reads, each contributing // up to 256 "counts" in the probability encoding. -typedef std::uint16_t count_type; +using count_type = std::uint16_t; -[[nodiscard]] static inline bool -eats_ref(const std::uint32_t c) { +[[nodiscard]] static inline auto +eats_ref(const std::uint32_t c) -> bool { return bam_cigar_type(bam_cigar_op(c)) & 2; } -[[nodiscard]] static inline bool -eats_query(const std::uint32_t c) { +[[nodiscard]] static inline auto +eats_query(const std::uint32_t c) -> bool { return bam_cigar_type(bam_cigar_op(c)) & 1; } @@ -161,20 +159,20 @@ eats_query(const std::uint32_t c) { considers strand, and the CHH is not symmetric, then one needs this. Also, Qiang should be consulted on this because he spent much time thinking about it in the context of plants. */ -[[nodiscard]] static inline bool -is_chh(const std::string &s, const std::size_t i) { +[[nodiscard]] static inline auto +is_chh(const std::string &s, const std::size_t i) -> bool { return i + 2 < std::size(s) && is_cytosine(s[i]) && !is_guanine(s[i + 1]) && !is_guanine(s[i + 2]); } -[[nodiscard]] static inline bool -is_ddg(const std::string &s, const std::size_t i) { +[[nodiscard]] static inline auto +is_ddg(const std::string &s, const std::size_t i) -> bool { return i + 2 < std::size(s) && !is_cytosine(s[i]) && !is_cytosine(s[i + 1]) && is_guanine(s[i + 2]); } -[[nodiscard]] static inline bool -is_c_at_g(const std::string &s, const std::size_t i) { +[[nodiscard]] static inline auto +is_c_at_g(const std::string &s, const std::size_t i) -> bool { return i + 2 < std::size(s) && is_cytosine(s[i]) && !is_cytosine(s[i + 1]) && !is_guanine(s[i + 1]) && is_guanine(s[i + 2]); } @@ -198,20 +196,20 @@ struct CountSet { methyl_rev += static_cast(m); ++n_reads_rev; } - [[nodiscard]] double - get_hydroxy(const bool is_c) const { + [[nodiscard]] auto + get_hydroxy(const bool is_c) const -> double { return (is_c ? hydroxy_fwd : hydroxy_rev) / max_prob_repr; } - [[nodiscard]] double - get_methyl(const bool is_c) const { + [[nodiscard]] auto + get_methyl(const bool is_c) const -> double { return (is_c ? methyl_fwd : methyl_rev) / max_prob_repr; } - [[nodiscard]] double - get_mods(const bool is_c) const { + [[nodiscard]] auto + get_mods(const bool is_c) const -> double { return get_hydroxy(is_c) + get_methyl(is_c); } - [[nodiscard]] double - get_n_reads(const bool is_c) const { + [[nodiscard]] auto + get_n_reads(const bool is_c) const -> double { return is_c ? n_reads_fwd : n_reads_rev; } count_type hydroxy_fwd{0}; @@ -222,14 +220,14 @@ struct CountSet { count_type n_reads_rev{0}; }; -/* The "tag" returned by this function should be exclusive, so that - * the order of checking conditions doesn't matter. There is also a - * bit of a hack in that the unsigned "pos" could wrap, but this still - * works as long as the chromosome size is not the maximum size of a - * std::size_t. +/* The "tag" returned by this function should be exclusive, so that the order + * of checking conditions doesn't matter. There is also a bit of a hack in + * that the unsigned "pos" could wrap, but this still works as long as the + * chromosome size is not the maximum size of a std::size_t. */ -[[nodiscard]] static std::uint32_t -get_tag_from_genome(const std::string &s, const std::size_t pos) { +[[nodiscard]] static auto +get_tag_from_genome(const std::string &s, + const std::size_t pos) -> std::uint32_t { if (is_cytosine(s[pos])) { if (is_cpg(s, pos)) return 0; @@ -281,8 +279,8 @@ struct mod_prob_buffer { hydroxy_probs.reserve(init_capacity); } - [[nodiscard]] bool - set_probs(const bamxx::bam_rec &aln) { + [[nodiscard]] auto + set_probs(const bamxx::bam_rec &aln) -> bool { static constexpr auto h_idx = 0; static constexpr auto m_idx = 1; const auto qlen = get_l_qseq(aln); @@ -323,8 +321,8 @@ struct mod_prob_buffer { static void count_states_fwd(const bamxx::bam_rec &aln, std::vector &counts, mod_prob_buffer &mod_buf, const std::string &chrom) { - /* Move through cigar, reference and read positions without - inflating cigar or read sequence */ + // Move through cigar, reference and read positions without inflating cigar + // or read sequence const auto beg_cig = bam_get_cigar(aln); const auto end_cig = beg_cig + get_n_cigar(aln); // NOLINT(*-pointer-arithmetic) @@ -363,10 +361,10 @@ count_states_fwd(const bamxx::bam_rec &aln, std::vector &counts, ref_itr += n; } } - // ADS: somehow previous code included a correction for rpos going - // past the end of the chromosome; this should result at least in a - // soft-clip by any mapper. I'm not checking it here as even if it - // happens I don't want to terminate. + // ADS: somehow previous code included a correction for rpos going past the + // end of the chromosome; this should result at least in a soft-clip by any + // mapper. I'm not checking it here as even if it happens I don't want to + // terminate. assert(qpos == get_l_qseq(aln)); } @@ -421,16 +419,15 @@ count_states_rev(const bamxx::bam_rec &aln, std::vector &counts, assert(qpos == 0); } -[[nodiscard]] static std::tuple, - std::set> -get_tid_to_idx( - const bamxx::bam_header &hdr, - const std::unordered_map &name_to_idx) { +[[nodiscard]] static auto +get_tid_to_idx(const bamxx::bam_header &hdr, + const std::unordered_map &name_to_idx) + -> std::tuple, std::set> { std::set missing_tids; std::map tid_to_idx; for (std::int32_t i = 0; i < get_n_targets(hdr); ++i) { - // "curr_name" gives a "tid_to_name" mapping allowing to jump - // through "name_to_idx" and get "tid_to_idx" + // "curr_name" gives a "tid_to_name" mapping allowing to jump through + // "name_to_idx" and get "tid_to_idx" // NOLINTNEXTLINE(*-pointer-arithmetic) const std::string curr_name(hdr.h->target_name[i]); const auto name_itr(name_to_idx.find(curr_name)); @@ -443,11 +440,11 @@ get_tid_to_idx( std::set>{tid_to_idx, missing_tids}; } -[[nodiscard]] static bool +[[nodiscard]] static auto consistent_targets(const bamxx::bam_header &hdr, const std::map &tid_to_idx, const std::vector &names, - const std::vector &sizes) { + const std::vector &sizes) -> bool { const std::size_t n_targets = get_n_targets(hdr); if (n_targets != std::size(names)) return false; @@ -465,12 +462,12 @@ consistent_targets(const bamxx::bam_header &hdr, return true; } -[[nodiscard]] static bool +[[nodiscard]] static auto consistent_existing_targets( const bamxx::bam_header &hdr, const std::map &tid_to_idx, const std::vector &names, - const std::vector &sizes) { + const std::vector &sizes) -> bool { const std::size_t n_targets = get_n_targets(hdr); for (std::size_t tid = 0; tid < n_targets; ++tid) { const auto idx_itr = tid_to_idx.find(tid); @@ -561,8 +558,8 @@ struct read_processor { int strand{}; std::string expected_basecall_model{}; - [[nodiscard]] std::string - tostring() const { + [[nodiscard]] auto + tostring() const -> std::string { const auto strand_str = strand == 0 ? "both" : strand == 1 ? "fwd" : "rev"; std::ostringstream oss; oss << std::boolalpha << "[verbose: " << verbose << "]\n" @@ -582,8 +579,8 @@ struct read_processor { read_processor() : expected_basecall_model{default_expected_basecall_model} {} - [[nodiscard]] std::string - expected_basecall_model_str() const { + [[nodiscard]] auto + expected_basecall_model_str() const -> std::string { return expected_basecall_model.empty() ? "NA" : expected_basecall_model; } @@ -752,9 +749,9 @@ struct read_processor { write_output_all(hdr, out, tid, chrom, counts); } - [[nodiscard]] mod_prob_stats + [[nodiscard]] auto operator()(const std::string &infile, const std::string &outfile, - const std::string &chroms_file) const { + const std::string &chroms_file) const -> mod_prob_stats { // first get the chromosome names and sequences from the FASTA file std::vector chroms, names; read_fasta_file(chroms_file, names, chroms); @@ -1003,8 +1000,8 @@ check_modification_sites(const std::string &infile, return only_cpgs_counter == reads_processed; } -int -main_nanocount(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) +auto +main_nanocount(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays) static constexpr auto n_reads_to_check = 1000; try { read_processor rp; From 51b7fa53f61dfead0030a076542a115d494bc1c1 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 10 Jan 2026 17:09:03 -0800 Subject: [PATCH 07/10] src/analysis/nanopore.cpp: removing a commented out line --- src/analysis/nanopore.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index ddbaff3a..cf07fa09 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -1087,7 +1087,6 @@ main_nanocount(int argc, char *argv[]) -> int { // NOLINT(*-avoid-c-arrays) << "[mods only at CpGs: " << mods_at_cpgs << "]\n" << rp.tostring(); - // const auto [mc, mps] = rp(mapped_reads_file, outfile, chroms_file); const auto mps = rp(mapped_reads_file, outfile, chroms_file); if (!stats_file.empty()) { std::ofstream stats_out(stats_file); From 9923e242c8c198324cf83165ee4735baf4cfbd82 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 10 Jan 2026 17:09:16 -0800 Subject: [PATCH 08/10] src/analysis/nanopore.cpp: tweak --- src/analysis/nanopore.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index cf07fa09..b56f8891 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -1,6 +1,4 @@ /* Copyright (C) 2025 Andrew D. Smith - * - * Author: Andrew D. Smith * * This program is free software: you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free From 37e68785e9b37c65933c5903cbbf1ac9de5499a2 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 10 Jan 2026 17:10:09 -0800 Subject: [PATCH 09/10] src/analysis/nanopore.cpp: fixing copyright header --- src/analysis/nanopore.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/analysis/nanopore.cpp b/src/analysis/nanopore.cpp index b56f8891..44753b43 100644 --- a/src/analysis/nanopore.cpp +++ b/src/analysis/nanopore.cpp @@ -9,6 +9,9 @@ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. + * + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . */ #ifdef BUILD_NANOPORE From 7e63d5739eba1da1255976637119f95ab6b1e528 Mon Sep 17 00:00:00 2001 From: Andrew D Smith Date: Sat, 10 Jan 2026 18:02:25 -0800 Subject: [PATCH 10/10] src/common/bam_record_utils.hcpp: linting --- src/common/bam_record_utils.cpp | 442 +++++++++++++++----------------- src/common/bam_record_utils.hpp | 83 +++--- 2 files changed, 248 insertions(+), 277 deletions(-) diff --git a/src/common/bam_record_utils.cpp b/src/common/bam_record_utils.cpp index 4cdf3d32..e88db70f 100644 --- a/src/common/bam_record_utils.cpp +++ b/src/common/bam_record_utils.cpp @@ -1,16 +1,17 @@ -/* Copyright (C) 2020-2023 Masaru Nakajima and Andrew D. Smith +/* Copyright (C) 2020-2026 Andrew D. Smith and Masaru Nakajima * - * Authors: Masaru Nakajima and Andrew D. Smith + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. * - * This program is free software: you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. + * You should have received a copy of the GNU General Public License along + * with this program. If not, see . */ #include "bam_record_utils.hpp" @@ -21,6 +22,7 @@ #include #include +#include #include #include #include @@ -30,40 +32,32 @@ #include #include -using std::runtime_error; -using std::string; -using std::stringstream; -using std::to_string; -using std::vector; - -using bamxx::bam_header; -using bamxx::bam_rec; - -// NOLINTBEGIN(*-pointer-arithmetic,*-avoid-magic-numbers,*-type-reinterpret-cast,*-owning-memory,*-no-malloc,*-narrowing-conversions,*-avoid-c-arrays,*-constant-array-index) +// NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions,*-pointer-arithmetic) /// functions in place of undefd macro -static inline bool +[[nodiscard]] static inline bool bam_is_rev(const bam1_t *b) { return (b->core.flag & BAM_FREVERSE) != 0; } -static inline char * +[[nodiscard]] static inline char * bam_get_qname(const bam1_t *b) { - return reinterpret_cast(b->data); + return reinterpret_cast(b->data); // NOLINT(*-type-reinterpret-cast) } -static inline uint32_t * +[[nodiscard]] static inline std::uint32_t * bam_get_cigar(const bam1_t *b) { - return reinterpret_cast(b->data + b->core.l_qname); + // NOLINTNEXTLINE(*-type-reinterpret-cast) + return reinterpret_cast(b->data + b->core.l_qname); } -static inline uint8_t * +[[nodiscard]] static inline std::uint8_t * bam_get_seq(const bam1_t *b) { // start of data + bytes for query/read name+ bytes for cigar return b->data + b->core.l_qname + (b->core.n_cigar << 2); } -static inline uint8_t * +[[nodiscard]] static inline std::uint8_t * bam_get_qual(const bam1_t *b) { return b->data + // start of data b->core.l_qname + // bytes for query name @@ -71,28 +65,17 @@ bam_get_qual(const bam1_t *b) { ((b->core.l_qseq + 1) >> 1); // bytes for packed query/read } -static inline uint8_t * +[[nodiscard]] static inline std::uint8_t * bam_get_aux(const bam1_t *b) { return b->data + b->core.l_qname + (b->core.n_cigar << 2) + ((b->core.l_qseq + 1) >> 1) + b->core.l_qseq; } -// static inline int -// bam_get_l_aux(const bam1_t *b) { -// return b->l_data - (b->core.l_qname + (b->core.n_cigar << 2) + -// ((b->core.l_qseq + 1) >> 1) + b->core.l_qseq); -// } - -// static inline bool -// bam_same_orientation(const bam1_t *a, const bam1_t *b) { -// return ((a->core.flag ^ b->core.flag) & BAM_FREVERSE) != 0; -// } - static void -roundup_to_power_of_2(uint32_t &x) { - bool k_high_bit_set = (x >> (sizeof(uint32_t) * 8 - 1)) & 1; +roundup_to_power_of_2(std::uint32_t &x) { + bool k_high_bit_set = (x >> (sizeof(std::uint32_t) * 8 - 1)) & 1; if (x > 0) { - uint8_t size = sizeof(uint32_t); + std::uint8_t size = sizeof(std::uint32_t); --x; x |= x >> (size / 4); x |= x >> (size / 2); @@ -106,29 +89,32 @@ roundup_to_power_of_2(uint32_t &x) { } static int -sam_realloc_bam_data(bam1_t *b, size_t desired) { - uint32_t new_m_data = desired; +sam_realloc_bam_data(bam1_t *b, std::size_t desired) { + std::uint32_t new_m_data = desired; roundup_to_power_of_2(new_m_data); if (new_m_data < desired) { errno = ENOMEM; // (from sam.c) Not strictly true but we can't // store the size return -1; } - uint8_t *new_data = nullptr; + // NOLINTBEGIN(*-owning-memory,*-no-malloc) + std::uint8_t *new_data = nullptr; if ((bam_get_mempolicy(b) & BAM_USER_OWNS_DATA) == 0) { - new_data = static_cast(realloc(b->data, new_m_data)); + new_data = static_cast(realloc(b->data, new_m_data)); } else { - new_data = static_cast(malloc(new_m_data)); + new_data = static_cast(malloc(new_m_data)); if (new_data != nullptr) { if (b->l_data > 0) std::copy_n(b->data, - (static_cast(b->l_data) < b->m_data) ? b->l_data - : b->m_data, + (static_cast(b->l_data) < b->m_data) + ? b->l_data + : b->m_data, new_data); bam_set_mempolicy(b, bam_get_mempolicy(b) & (~BAM_USER_OWNS_DATA)); } } + // NOLINTEND(*-owning-memory,*-no-malloc) if (!new_data) return -1; b->data = new_data; @@ -137,13 +123,13 @@ sam_realloc_bam_data(bam1_t *b, size_t desired) { } // static int -// sam_realloc_bam_data(bam1_t *b, size_t desired) { +// sam_realloc_bam_data(bam1_t *b, std::size_t desired) { // /* returns flag: either 0 for success or -1 for error (unable to // allocate desired memory) */ -// uint32_t new_m_data = desired; +// std::uint32_t new_m_data = desired; // roundup_to_power_of_2(new_m_data); // if (new_m_data < desired) return -1; -// uint8_t *new_data = (uint8_t *)realloc(b->data, new_m_data); +// std::uint8_t *new_data = (std::uint8_t *)realloc(b->data, new_m_data); // if (!new_data) return -1; // // ADS: what would be the state of members below if -1 was returned? // b->data = new_data; @@ -170,17 +156,17 @@ sam_realloc_bam_data(bam1_t *b, size_t desired) { // } static inline void -bam_set1_core(bam1_core_t &core, const size_t l_qname, const uint16_t flag, - const int32_t tid, const hts_pos_t pos, const uint8_t mapq, - const size_t n_cigar, const int32_t mtid, const hts_pos_t mpos, - const hts_pos_t isize, const size_t l_seq, - const size_t qname_nuls) { +bam_set1_core(bam1_core_t &core, const std::size_t l_qname, + const std::uint16_t flag, const int32_t tid, const hts_pos_t pos, + const std::uint8_t mapq, const std::size_t n_cigar, + const int32_t mtid, const hts_pos_t mpos, const hts_pos_t isize, + const std::size_t l_seq, const std::size_t qname_nuls) { /* ADS: These are used in `hts_reg2bin` from `htslib/hts.h` and likely mean "region to bin" for indexing */ /* MN: hts_reg2bin categorizes the size of the reference region. Here, we use the numbers used in htslib/cram/cram_samtools.h */ - static const int min_shift = 14; - static const int n_lvls = 5; + static constexpr auto min_shift = 14; + static constexpr auto n_lvls = 5; core.pos = pos; core.tid = tid; @@ -197,12 +183,13 @@ bam_set1_core(bam1_core_t &core, const size_t l_qname, const uint16_t flag, } static inline int -bam_set1_wrapper(bam1_t *bam, const size_t l_qname, const char *qname, - const uint16_t flag, const int32_t tid, const hts_pos_t pos, - const uint8_t mapq, const size_t n_cigar, - const uint32_t *cigar, const int32_t mtid, - const hts_pos_t mpos, const hts_pos_t isize, - const size_t l_seq, const size_t l_aux) { +bam_set1_wrapper(bam1_t *bam, const std::size_t l_qname, const char *qname, + const std::uint16_t flag, const int32_t tid, + const hts_pos_t pos, const std::uint8_t mapq, + const std::size_t n_cigar, const std::uint32_t *cigar, + const int32_t mtid, const hts_pos_t mpos, + const hts_pos_t isize, const std::size_t l_seq, + const std::size_t l_aux) { /* This is based on how assignment is done in the `bam_set1` function defined in `sam.c` from htslib */ @@ -231,12 +218,13 @@ bam_set1_wrapper(bam1_t *bam, const size_t l_qname, const char *qname, // `qname_nuls` below is the number of '\0' to use to pad the qname // so that the cigar has 4-byte alignment. - const size_t qname_nuls = 4 - l_qname % 4; + const std::size_t qname_nuls = 4 - l_qname % 4; bam_set1_core(bam->core, l_qname, flag, tid, pos, mapq, n_cigar, mtid, mpos, isize, l_seq, qname_nuls); - const size_t data_len = (l_qname + qname_nuls + n_cigar * sizeof(uint32_t) + - (l_seq + 1) / 2 + l_seq); + const std::size_t data_len = l_qname + qname_nuls + + n_cigar * sizeof(std::uint32_t) + + (l_seq + 1) / 2 + l_seq; bam->l_data = data_len; if (data_len + l_aux > bam->m_data) { @@ -251,9 +239,10 @@ bam_set1_wrapper(bam1_t *bam, const size_t l_qname, const char *qname, data_iter += l_qname + qname_nuls; // ADS: reinterpret here because we know the cigar is originally an - // array of uint32_t and has been aligned for efficiency - std::copy_n(cigar, n_cigar, reinterpret_cast(data_iter)); - data_iter += n_cigar * sizeof(uint32_t); + // array of std::uint32_t and has been aligned for efficiency + // NOLINTNEXTLINE(*-type-reinterpret-cast) + std::copy_n(cigar, n_cigar, reinterpret_cast(data_iter)); + data_iter += n_cigar * sizeof(std::uint32_t); // skipping sequece assignment data_iter += (l_seq + 1) / 2; @@ -263,18 +252,13 @@ bam_set1_wrapper(bam1_t *bam, const size_t l_qname, const char *qname, return static_cast(data_len); } -// static inline size_t -// bam_get_n_cigar(const bam1_t *b) { -// return b->core.n_cigar; -// } - -static inline uint32_t -to_insertion(const uint32_t x) { +static inline std::uint32_t +to_insertion(const std::uint32_t x) { return (x & ~BAM_CIGAR_MASK) | BAM_CINS; } static inline void -fix_internal_softclip(const size_t n_cigar, uint32_t *cigar) { +fix_internal_softclip(const std::size_t n_cigar, std::uint32_t *cigar) { if (n_cigar < 3) return; // find first non-softclip @@ -296,13 +280,13 @@ fix_internal_softclip(const size_t n_cigar, uint32_t *cigar) { *c_itr = to_insertion(*c_itr); } -static inline uint32_t -to_softclip(const uint32_t x) { +static inline std::uint32_t +to_softclip(const std::uint32_t x) { return (x & ~BAM_CIGAR_MASK) | BAM_CSOFT_CLIP; } static inline void -fix_external_insertion(const size_t n_cigar, uint32_t *cigar) { +fix_external_insertion(const std::size_t n_cigar, std::uint32_t *cigar) { if (n_cigar < 2) return; @@ -320,8 +304,8 @@ fix_external_insertion(const size_t n_cigar, uint32_t *cigar) { *c_itr = to_softclip(*c_itr); } -static inline size_t -merge_cigar_ops(const size_t n_cigar, uint32_t *cigar) { +static inline std::size_t +merge_cigar_ops(const std::size_t n_cigar, std::uint32_t *cigar) { if (n_cigar < 2) return n_cigar; auto c_itr1 = cigar; @@ -345,7 +329,7 @@ merge_cigar_ops(const size_t n_cigar, uint32_t *cigar) { return std::distance(cigar, c_itr1); } -static inline size_t +static inline std::size_t correct_cigar(bam1_t *b) { /* This function will change external insertions into soft clip operations. Not sure why those would be present. It will also @@ -355,26 +339,26 @@ correct_cigar(bam1_t *b) { identical operations. None of this impacts the seq/qual/aux which get moved as a block */ - uint32_t *cigar = bam_get_cigar(b); - size_t n_cigar = b->core.n_cigar; + std::uint32_t *cigar = bam_get_cigar(b); + std::size_t n_cigar = b->core.n_cigar; fix_external_insertion(n_cigar, cigar); fix_internal_softclip(n_cigar, cigar); // merge identical adjacent cigar ops and get new number of ops n_cigar = merge_cigar_ops(n_cigar, cigar); // difference in bytes to shift the internal data - const size_t delta = (b->core.n_cigar - n_cigar) * sizeof(uint32_t); + const std::size_t delta = (b->core.n_cigar - n_cigar) * sizeof(std::uint32_t); if (delta > 0) { // if there is a difference; do the shift - const auto data_end = - b->data + b->l_data; // bam_get_aux(b) + bam_get_l_aux(b); + // bam_get_aux(b) + bam_get_l_aux(b); + const auto data_end = b->data + b->l_data; std::copy(bam_get_seq(b), data_end, bam_get_seq(b) - delta); b->core.n_cigar = n_cigar; // and update number of cigar ops } return delta; } -size_t -correct_cigar(bam_rec &b) { +std::size_t +correct_cigar(bamxx::bam_rec &b) { return (b.b) ? correct_cigar(b.b) : 0ul; } @@ -383,61 +367,41 @@ get_rlen(const bam1_t *b) { // less tedious return bam_cigar2rlen(b->core.n_cigar, bam_get_cigar(b)); } -static inline size_t +static inline std::size_t get_l_qseq(const bam1_t *b) { return b->core.l_qseq; } -// static inline void -// complement_seq(char *first, char *last) { -// for (; first != last; ++first) { -// assert(valid_base(*first)); -// *first = complement(*first); -// } -// } - -// static inline void -// reverse(char *a, char *b) { -// char *p1, *p2; -// for (p1 = a, p2 = b - 1; p2 > p1; ++p1, --p2) { -// *p1 ^= *p2; -// *p2 ^= *p1; -// *p1 ^= *p2; -// assert(valid_base(*p1) && valid_base(*p2)); -// } -// } - -// return value is the number of cigar ops that are fully consumed in -// order to read n_ref, while "partial_oplen" is the number of bases -// that could be taken from the next operation, which might be merged -// with the other read. -static inline uint32_t -get_full_and_partial_ops(const uint32_t *cig_in, const uint32_t in_ops, - const uint32_t n_ref_full, uint32_t *partial_oplen) { +// return value is the number of cigar ops that are fully consumed in order to +// read n_ref, while "partial_oplen" is the number of bases that could be +// taken from the next operation, which might be merged with the other read. +static inline std::uint32_t +get_full_and_partial_ops(const std::uint32_t *cig_in, + const std::uint32_t in_ops, + const std::uint32_t n_ref_full, + std::uint32_t *partial_oplen) { // assume: n_ops <= size(cig_in) <= size(cig_out) - size_t rlen = 0; - uint32_t i = 0; - for (i = 0; i < in_ops; ++i) { + std::size_t rlen = 0; + std::uint32_t i = 0; + for (i = 0; i < in_ops; ++i) if (cigar_eats_ref(cig_in[i])) { if (rlen + bam_cigar_oplen(cig_in[i]) > n_ref_full) break; rlen += bam_cigar_oplen(cig_in[i]); } - } *partial_oplen = n_ref_full - rlen; return i; } -/* This table converts 2 bases packed in a byte to their reverse - * complement. The input is therefore a unit8_t representing 2 bases. - * It is assumed that the input uint8_t value is of form "xx" or "x-", - * where 'x' a 4-bit number representing either A, C, G, T, or N and - * '-' is 0000. For example, the ouptut for "AG" is "CT". The format - * "x-" is often used at the end of an odd-length sequence. The - * output of "A-" is "-T", and the output of "C-" is "-G", and so - * forth. The user must handle this case separately. +/* This table converts 2 bases packed in a byte to their reverse complement. + * The input is therefore a unit8_t representing 2 bases. It is assumed that + * the input std::uint8_t value is of form "xx" or "x-", where 'x' a 4-bit + * number representing either A, C, G, T, or N and '-' is 0000. For example, + * the ouptut for "AG" is "CT". The format "x-" is often used at the end of an + * odd-length sequence. The output of "A-" is "-T", and the output of "C-" is + * "-G", and so forth. The user must handle this case separately. */ -const uint8_t byte_revcomp_table[] = { +static constexpr auto byte_revcomp_table = std::array{ // clang-format off 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 136, 72, 0, 40, 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 248, 4, 132, 68, 0, @@ -453,12 +417,13 @@ const uint8_t byte_revcomp_table[] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 15, 143, 79, 0, 47, 0, 0, 0, 31, 0, 0, 0, - 0, 0, 0, 255 + 0, 0, 0, 255, // clang-format on }; static inline void revcomp_byte_then_reverse(unsigned char *const a, unsigned char *const b) { + // NOLINTBEGIN(*-constant-array-index) unsigned char *p1{a}, *p2{b}; for (p2 -= 1; p2 > p1; ++p1, --p2) { *p1 = byte_revcomp_table[*p1]; @@ -469,19 +434,20 @@ revcomp_byte_then_reverse(unsigned char *const a, unsigned char *const b) { } if (p1 == p2) *p1 = byte_revcomp_table[*p1]; + // NOLINTEND(*-constant-array-index) } -/// Take an alignment in bam1_t format and reverse complement the sequence -/// directly by manipulating the bytes in the binary encoding. +// Take an alignment in bam1_t format and reverse complement the sequence +// directly by manipulating the bytes in the binary encoding. static inline void revcomp_seq_by_byte_impl(bam1_t *const aln) { - const size_t l_qseq = get_l_qseq(aln); + const std::size_t l_qseq = get_l_qseq(aln); auto seq = bam_get_seq(aln); - const size_t num_bytes = (l_qseq + 1) / 2; // integer ceil / 2 + const std::size_t num_bytes = (l_qseq + 1) / 2; // integer ceil / 2 auto seq_end = seq + num_bytes; revcomp_byte_then_reverse(seq, seq_end); if (l_qseq % 2 == 1) { // for odd-length sequences - for (size_t i = 0; i < num_bytes - 1; i++) { + for (std::size_t i = 0; i < num_bytes - 1; i++) { // swap 4-bit chunks within consecutive bytes like this: // (----aaaa bbbbcccc dddd....) => (aaaabbbb ccccdddd ....) seq[i] = (seq[i] << 4) | (seq[i + 1] >> 4); @@ -491,37 +457,38 @@ revcomp_seq_by_byte_impl(bam1_t *const aln) { } void -revcomp_qseq(bam_rec &aln) { +revcomp_qseq(bamxx::bam_rec &aln) { revcomp_seq_by_byte_impl(aln.b); } -// places seq of b at the end of seq of c -// assumes 0 < c_seq_len - b_seq_len <= a_seq_len -// also assumes that c_seq_len has been figured out -// Also assumes the number of bytes allocated to sequence potion of c->data -// has been set to ceil((a_used_len + b_seq_len) / 2.0) where -// a_used_len = c_seq_len - b_seq_len +// Places seq of b at the end of seq of c. Assumes 0 < c_seq_len - b_seq_len +// <= a_seq_len. Also assumes that c_seq_len has been figured out. Also +// assumes the number of bytes allocated to sequence potion of c->data has +// been set to ceil((a_used_len + b_seq_len) / 2.0) where a_used_len = +// c_seq_len - b_seq_len static inline void merge_by_byte(bam1_t const *const a, bam1_t const *const b, bam1_t *const c) { // ADS: (todo) need some functions for int_ceil and is_odd - const size_t b_seq_len = get_l_qseq(b); - const size_t c_seq_len = get_l_qseq(c); - const size_t a_used_len = c_seq_len - b_seq_len; + const std::size_t b_seq_len = get_l_qseq(b); + const std::size_t c_seq_len = get_l_qseq(c); + const std::size_t a_used_len = c_seq_len - b_seq_len; const bool is_a_odd = a_used_len % 2 == 1; const bool is_b_odd = b_seq_len % 2 == 1; const bool is_c_odd = c_seq_len % 2 == 1; - const size_t a_num_bytes = (a_used_len + 1) / 2; // integer ceil / 2 - const size_t b_num_bytes = (b_seq_len + 1) / 2; // integer ceil / 2 + const std::size_t a_num_bytes = (a_used_len + 1) / 2; // integer ceil / 2 + const std::size_t b_num_bytes = (b_seq_len + 1) / 2; // integer ceil / 2 - const size_t b_offset = is_a_odd && is_b_odd; + const std::size_t b_offset = is_a_odd && is_b_odd; const auto a_seq = bam_get_seq(a); const auto b_seq = bam_get_seq(b); auto c_seq = bam_get_seq(c); std::copy_n(a_seq, a_num_bytes, c_seq); + + // NOLINTBEGIN(*-constant-array-index) if (is_a_odd) { // c_seq looks like either [ aa aa aa aa ] // or [ aa aa aa a- ] @@ -533,7 +500,7 @@ merge_by_byte(bam1_t const *const a, bam1_t const *const b, bam1_t *const c) { if (is_c_odd) { // c_seq looks like either [ aa aa aa aa ] // or [ aa aa aa ab ] - for (size_t i = 0; i < b_num_bytes - 1; i++) { + for (std::size_t i = 0; i < b_num_bytes - 1; i++) { c_seq[a_num_bytes + i] = (byte_revcomp_table[b_seq[b_num_bytes - i - 1]] << 4) | (byte_revcomp_table[b_seq[b_num_bytes - i - 2]] >> 4); @@ -543,13 +510,14 @@ merge_by_byte(bam1_t const *const a, bam1_t const *const b, bam1_t *const c) { // or [ aa aa aa ab bb bb bb b- ] (a odd; b odd) } else { - for (size_t i = 0; i < b_num_bytes - b_offset; i++) { + for (std::size_t i = 0; i < b_num_bytes - b_offset; i++) { c_seq[a_num_bytes + i] = byte_revcomp_table[b_seq[b_num_bytes - i - 1 - b_offset]]; } // Here, c_seq is either [ aa aa aa aa bb bb bb bb ] (a even and b even) // or [ aa aa aa ab bb bb bb ] (a odd and b odd) } + // NOLINTEND(*-constant-array-index) } static inline void @@ -566,7 +534,7 @@ flip_conversion(bam1_t *aln) { } void -flip_conversion(bam_rec &aln) { +flip_conversion(bamxx::bam_rec &aln) { flip_conversion(aln.b); } @@ -581,20 +549,20 @@ are_mates(const bam1_t *one, const bam1_t *two) { } static inline int -truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) { - const uint32_t *a_cig = bam_get_cigar(a); - const uint32_t a_ops = a->core.n_cigar; +truncate_overlap(const bam1_t *a, const std::uint32_t overlap, bam1_t *c) { + const std::uint32_t *a_cig = bam_get_cigar(a); + const std::uint32_t a_ops = a->core.n_cigar; - uint32_t part_op = 0; - const uint32_t c_cur = + std::uint32_t part_op = 0; + const std::uint32_t c_cur = get_full_and_partial_ops(a_cig, a_ops, overlap, &part_op); // ADS: hack here because the get_full_and_partial_ops doesn't do // exactly what is needed for this. const bool use_partial = (c_cur < a->core.n_cigar && part_op > 0); - const uint32_t c_ops = c_cur + use_partial; - vector c_cig(c_ops, 0); + const std::uint32_t c_ops = c_cur + use_partial; + std::vector c_cig(c_ops, 0); // ADS: replace this with a std::copy auto c_cig_itr = std::copy(a_cig, a_cig + c_cur, begin(c_cig)); @@ -602,15 +570,16 @@ truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) { // used below would make no sense. if (use_partial) *c_cig_itr = bam_cigar_gen(part_op, bam_cigar_op(a_cig[c_cur])); - /* after this point the cigar is set and should decide everything */ + // after this point the cigar is set and should decide everything - const uint32_t c_seq_len = bam_cigar2qlen(c_ops, c_cig.data()); + const std::uint32_t c_seq_len = bam_cigar2qlen(c_ops, c_cig.data()); const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig.data()); // flag only needs to worry about strand and single-end stuff - const uint16_t flag = a->core.flag & (BAM_FREAD1 | BAM_FREAD2 | BAM_FREVERSE); + const std::uint16_t flag = + a->core.flag & (BAM_FREAD1 | BAM_FREAD2 | BAM_FREVERSE); - const size_t a_qname_len = a->core.l_qname - (a->core.l_extranul + 1); + const std::size_t a_qname_len = a->core.l_qname - (a->core.l_extranul + 1); int ret = bam_set1_wrapper(c, a_qname_len, bam_get_qname(a), flag, // flags (SR and revcomp info) a->core.tid, a->core.pos, a->core.qual, @@ -624,17 +593,17 @@ truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) { if (ret < 0) throw dnmt_error(ret, "bam_set1_wrapper"); - const size_t n_bytes_to_copy = (c_seq_len + 1) / 2; // compression + const std::size_t n_bytes_to_copy = (c_seq_len + 1) / 2; // compression std::copy_n(bam_get_seq(a), n_bytes_to_copy, bam_get_seq(c)); - /* add the tags */ - const int64_t nm = bam_aux2i(bam_aux_get(a, "NM")); // ADS: do better here! + // add the tags + const std::int64_t nm = bam_aux2i(bam_aux_get(a, "NM")); // ADS: do better // "_udpate" for "int" because it determines the right size ret = bam_aux_update_int(c, "NM", nm); if (ret < 0) throw dnmt_error(ret, "bam_aux_update_int"); - const uint8_t conversion = bam_aux2A(bam_aux_get(a, "CV")); + const std::uint8_t conversion = bam_aux2A(bam_aux_get(a, "CV")); // "_append" for "char" because there is no corresponding update ret = bam_aux_append(c, "CV", 'A', 1, &conversion); if (ret < 0) @@ -644,25 +613,26 @@ truncate_overlap(const bam1_t *a, const uint32_t overlap, bam1_t *c) { } int -truncate_overlap(const bam_rec &a, const uint32_t overlap, bam_rec &c) { +truncate_overlap(const bamxx::bam_rec &a, const std::uint32_t overlap, + bamxx::bam_rec &c) { if (c.b == nullptr) c.b = bam_init1(); return truncate_overlap(a.b, overlap, c.b); } int -merge_overlap(const bam1_t *a, const bam1_t *b, const uint32_t head, +merge_overlap(const bam1_t *a, const bam1_t *b, const std::uint32_t head, bam1_t *c) { assert(head > 0); - const uint32_t *a_cig = bam_get_cigar(a); - const uint32_t a_ops = a->core.n_cigar; + const std::uint32_t *a_cig = bam_get_cigar(a); + const std::uint32_t a_ops = a->core.n_cigar; - const uint32_t *b_cig = bam_get_cigar(b); - const uint32_t b_ops = b->core.n_cigar; + const std::uint32_t *b_cig = bam_get_cigar(b); + const std::uint32_t b_ops = b->core.n_cigar; - uint32_t part_op = 0; - uint32_t c_cur = get_full_and_partial_ops(a_cig, a_ops, head, &part_op); + std::uint32_t part_op = 0; + std::uint32_t c_cur = get_full_and_partial_ops(a_cig, a_ops, head, &part_op); // ADS: hack here because the get_full_and_partial_ops doesn't do // exactly what is needed for this. const bool use_partial = (c_cur < a->core.n_cigar && part_op > 0); @@ -676,8 +646,8 @@ merge_overlap(const bam1_t *a, const bam1_t *b, const uint32_t head, // c_ops: include the prefix of a_cig we need; then add for the // partial op; subtract for the identical op in the middle; finally // add the rest of b_cig. - const uint32_t c_ops = c_cur + use_partial - merge_mid + b_ops; - vector c_cig(c_ops, 0); + const std::uint32_t c_ops = c_cur + use_partial - merge_mid + b_ops; + std::vector c_cig(c_ops, 0); std::copy(a_cig, a_cig + c_cur, begin(c_cig)); if (use_partial) { @@ -688,8 +658,8 @@ merge_overlap(const bam1_t *a, const bam1_t *b, const uint32_t head, // sequence before the possibility of merging the last entry with // the first entry in b's cigar. This is done with the cigar, so // everything depends on the "use_partial" - const size_t a_seq_len = bam_cigar2qlen(c_cur, c_cig.data()); - /* ADS: above the return type of bam_cigar2qlen is uint64_t, but + const std::size_t a_seq_len = bam_cigar2qlen(c_cur, c_cig.data()); + /* ADS: above the return type of bam_cigar2qlen is std::uint64_t, but according to the source as of 05/2023 it cannot become negative; no possible error code returned */ @@ -701,15 +671,16 @@ merge_overlap(const bam1_t *a, const bam1_t *b, const uint32_t head, std::copy(b_cig + merge_mid, b_cig + b_ops, begin(c_cig) + c_cur); /* done with cigar here */ - const uint32_t c_seq_len = a_seq_len + b->core.l_qseq; + const std::uint32_t c_seq_len = a_seq_len + b->core.l_qseq; // get the template length const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig.data()); // flag only needs to worry about strand and single-end stuff - const uint16_t flag = a->core.flag & (BAM_FREAD1 | BAM_FREAD2 | BAM_FREVERSE); + const std::uint16_t flag = + a->core.flag & (BAM_FREAD1 | BAM_FREAD2 | BAM_FREVERSE); - const size_t a_qname_len = a->core.l_qname - (a->core.l_extranul + 1); + const std::size_t a_qname_len = a->core.l_qname - (a->core.l_extranul + 1); int ret = bam_set1_wrapper(c, a_qname_len, bam_get_qname(a), flag, // (no PE; revcomp info) a->core.tid, a->core.pos, @@ -727,14 +698,14 @@ merge_overlap(const bam1_t *a, const bam1_t *b, const uint32_t head, merge_by_byte(a, b, c); // add the tag for mismatches - const int64_t nm = + const std::int64_t nm = (bam_aux2i(bam_aux_get(a, "NM")) + bam_aux2i(bam_aux_get(b, "NM"))); ret = bam_aux_update_int(c, "NM", nm); if (ret < 0) throw dnmt_error(ret, "bam_aux_update_int in merge_overlap"); // add the tag for conversion - const uint8_t cv = bam_aux2A(bam_aux_get(a, "CV")); + const std::uint8_t cv = bam_aux2A(bam_aux_get(a, "CV")); ret = bam_aux_append(c, "CV", 'A', 1, &cv); if (ret < 0) throw dnmt_error(ret, "bam_aux_append in merge_overlap"); @@ -743,26 +714,26 @@ merge_overlap(const bam1_t *a, const bam1_t *b, const uint32_t head, } int -merge_overlap(const bam_rec &a, const bam_rec &b, const uint32_t head, - bam_rec &c) { +merge_overlap(const bamxx::bam_rec &a, const bamxx::bam_rec &b, + const std::uint32_t head, bamxx::bam_rec &c) { if (c.b == nullptr) c.b = bam_init1(); return merge_overlap(a.b, b.b, head, c.b); } static inline int -merge_non_overlap(const bam1_t *a, const bam1_t *b, const uint32_t spacer, +merge_non_overlap(const bam1_t *a, const bam1_t *b, const std::uint32_t spacer, bam1_t *c) { /* make the cigar string */ // collect info about the cigar strings - const uint32_t *a_cig = bam_get_cigar(a); - const uint32_t a_ops = a->core.n_cigar; - const uint32_t *b_cig = bam_get_cigar(b); - const uint32_t b_ops = b->core.n_cigar; + const std::uint32_t *a_cig = bam_get_cigar(a); + const std::uint32_t a_ops = a->core.n_cigar; + const std::uint32_t *b_cig = bam_get_cigar(b); + const std::uint32_t b_ops = b->core.n_cigar; // allocate the new cigar string - const uint32_t c_ops = a_ops + b_ops + 1; - vector c_cig(c_ops, 0); + const std::uint32_t c_ops = a_ops + b_ops + 1; + std::vector c_cig(c_ops, 0); // concatenate the new cigar strings with a "skip" in the middle auto c_cig_itr = std::copy(a_cig, a_cig + a_ops, begin(c_cig)); @@ -770,17 +741,18 @@ merge_non_overlap(const bam1_t *a, const bam1_t *b, const uint32_t spacer, std::copy(b_cig, b_cig + b_ops, c_cig_itr); /* done with cigars */ - const size_t a_seq_len = get_l_qseq(a); - const size_t b_seq_len = get_l_qseq(b); - const size_t c_seq_len = a_seq_len + b_seq_len; + const std::size_t a_seq_len = get_l_qseq(a); + const std::size_t b_seq_len = get_l_qseq(b); + const std::size_t c_seq_len = a_seq_len + b_seq_len; // get the template length from the cigar const hts_pos_t isize = bam_cigar2rlen(c_ops, c_cig.data()); // flag: only need to keep strand and single-end info - const uint16_t flag = a->core.flag & (BAM_FREAD1 | BAM_FREAD2 | BAM_FREVERSE); + const std::uint16_t flag = + a->core.flag & (BAM_FREAD1 | BAM_FREAD2 | BAM_FREVERSE); - const size_t a_qname_len = a->core.l_qname - (a->core.l_extranul + 1); + const std::size_t a_qname_len = a->core.l_qname - (a->core.l_extranul + 1); int ret = bam_set1_wrapper(c, a_qname_len, bam_get_qname(a), flag, // flags (no PE; revcomp info) a->core.tid, a->core.pos, @@ -798,14 +770,14 @@ merge_non_overlap(const bam1_t *a, const bam1_t *b, const uint32_t spacer, merge_by_byte(a, b, c); /* add the tags */ - const int64_t nm = + const std::int64_t nm = (bam_aux2i(bam_aux_get(a, "NM")) + bam_aux2i(bam_aux_get(b, "NM"))); // "udpate" for "int" because it determines the right size ret = bam_aux_update_int(c, "NM", nm); if (ret < 0) throw dnmt_error(ret, "merge_non_overlap:bam_aux_update_int"); - const uint8_t cv = bam_aux2A(bam_aux_get(a, "CV")); + const std::uint8_t cv = bam_aux2A(bam_aux_get(a, "CV")); // "append" for "char" because there is no corresponding update ret = bam_aux_append(c, "CV", 'A', 1, &cv); if (ret < 0) @@ -815,8 +787,8 @@ merge_non_overlap(const bam1_t *a, const bam1_t *b, const uint32_t spacer, } int -merge_non_overlap(const bam_rec &a, const bam_rec &b, const uint32_t spacer, - bam_rec &c) { +merge_non_overlap(const bamxx::bam_rec &a, const bamxx::bam_rec &b, + const std::uint32_t spacer, bamxx::bam_rec &c) { if (c.b == nullptr) c.b = bam_init1(); return merge_non_overlap(a.b, b.b, spacer, c.b); @@ -836,7 +808,8 @@ keep_better_end(const bam1_t *a, const bam1_t *b, bam1_t *c) { } int -keep_better_end(const bam_rec &a, const bam_rec &b, bam_rec &c) { +keep_better_end(const bamxx::bam_rec &a, const bamxx::bam_rec &b, + bamxx::bam_rec &c) { if (c.b == nullptr) c.b = bam_init1(); return keep_better_end(a.b, b.b, c.b); @@ -844,7 +817,7 @@ keep_better_end(const bam_rec &a, const bam_rec &b, bam_rec &c) { // ADS: will move to using this function once it is written static inline void -standardize_format(const string &input_format, bam1_t *aln) { +standardize_format(const std::string &input_format, bam1_t *aln) { int err_code{}; if (input_format == "abismal" || input_format == "walt") @@ -856,17 +829,17 @@ standardize_format(const string &input_format, bam1_t *aln) { if (!zs_tag) throw dnmt_error("bam_aux_get for ZS (invalid bsmap)"); // ADS: test for errors on the line below - const auto zs_tag_value = string(bam_aux2Z(zs_tag)); + const auto zs_tag_value = std::string(bam_aux2Z(zs_tag)); if (zs_tag_value.empty()) throw dnmt_error("empty ZS tag in bsmap format"); if (zs_tag_value[0] != '-' && zs_tag_value[0] != '+') throw dnmt_error("invalid ZS tag in bsmap format"); - const uint8_t cv = zs_tag_value[1] == '-' ? 'A' : 'T'; + const std::uint8_t cv = zs_tag_value[1] == '-' ? 'A' : 'T'; // get the "mismatches" tag const auto nm_tag = bam_aux_get(aln, "NM"); if (!nm_tag) throw dnmt_error("invalid NM tag in bsmap format"); - const int64_t nm = bam_aux2i(nm_tag); + const std::int64_t nm = bam_aux2i(nm_tag); // ADS: this should delete the aux data by truncating the used // range within the bam1_t while avoiding resizing memory @@ -897,12 +870,12 @@ standardize_format(const string &input_format, bam1_t *aln) { auto xr_tag = bam_aux_get(aln, "XR"); if (!xr_tag) throw dnmt_error("bam_aux_get for XR (invalid bismark)"); - const uint8_t cv = string(bam_aux2Z(xr_tag)) == "GA" ? 'A' : 'T'; + const std::uint8_t cv = std::string(bam_aux2Z(xr_tag)) == "GA" ? 'A' : 'T'; // get the "mismatches" tag auto nm_tag = bam_aux_get(aln, "NM"); if (!nm_tag) throw dnmt_error("bam_aux_get for NM (invalid bismark)"); - const int64_t nm = bam_aux2i(nm_tag); + const std::int64_t nm = bam_aux2i(nm_tag); aln->l_data = bam_get_aux(aln) - aln->data; // del aux (no data resize) @@ -923,7 +896,7 @@ standardize_format(const string &input_format, bam1_t *aln) { // ADS: the condition below should be checked much earlier, ideally // before the output file is created else - throw runtime_error("incorrect format specified: " + input_format); + throw std::runtime_error("incorrect format specified: " + input_format); // Be sure this doesn't depend on mapper! Removes the "qual" part of // the data in a bam1_t struct but does not change its uncompressed @@ -933,18 +906,19 @@ standardize_format(const string &input_format, bam1_t *aln) { } void -standardize_format(const string &input_format, - bam_rec &aln) { // cppcheck-suppress constParameterReference +standardize_format( + const std::string &input_format, + bamxx::bam_rec &aln) { // cppcheck-suppress constParameterReference standardize_format(input_format, aln.b); } // used in methstates void -apply_cigar(const bam_rec &aln, string &to_inflate, +apply_cigar(const bamxx::bam_rec &aln, std::string &to_inflate, const char inflation_symbol) { - string inflated_seq; - stringstream ss_cigar; - size_t i = 0; + std::string inflated_seq; + std::stringstream ss_cigar; + std::size_t i = 0; auto to_inflate_beg = std::begin(to_inflate); const auto beg_cig = bam_get_cigar(aln); @@ -970,34 +944,34 @@ apply_cigar(const bam_rec &aln, string &to_inflate, } // sum of total M/I/S/=/X/N operations must equal length of seq - const size_t orig_len = to_inflate.length(); + const std::size_t orig_len = std::size(to_inflate); if (i != orig_len) - throw runtime_error( + throw std::runtime_error( "inconsistent number of qseq ops in cigar: " + to_inflate + " " + - ss_cigar.str() + " " + to_string(i) + " " + to_string(orig_len)); + ss_cigar.str() + " " + std::to_string(i) + " " + + std::to_string(orig_len)); to_inflate.swap(inflated_seq); } void -get_seq_str(const bam_rec &aln, string &seq_str) { - size_t qlen = static_cast(get_l_qseq(aln)); +get_seq_str(const bamxx::bam_rec &aln, std::string &seq_str) { + const std::size_t qlen = static_cast(get_l_qseq(aln)); seq_str.resize(qlen); auto seq = bam_get_seq(aln); - for (size_t i = 0; i < qlen; i++) { + for (std::size_t i = 0; i < qlen; ++i) seq_str[i] = seq_nt16_str[bam_seqi(seq, i)]; - } } -string -to_string(const bam_header &hdr, const bam_rec &aln) { - kstring_t ks = {0, 0, nullptr}; - int ret = sam_format1(hdr.h, aln.b, &ks); - if (ret < 0) { - throw runtime_error("Can't format record: " + to_string(hdr, aln)); - } - const std::string s = string(ks.s); +std::string +to_string(const bamxx::bam_header &hdr, const bamxx::bam_rec &aln) { + kstring_t ks = KS_INITIALIZE; + const int ret = sam_format1(hdr.h, aln.b, &ks); + if (ret < 0) + throw std::runtime_error("Can't format record: " + + std::string(bam_get_qname(aln.b))); + std::string s{ks.s}; ks_free(&ks); return s; } -// NOLINTEND(*-pointer-arithmetic,*-avoid-magic-numbers,*-type-reinterpret-cast,*-owning-memory,*-no-malloc,*-narrowing-conversions,*-avoid-c-arrays,*-constant-array-index) +// NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions,*-pointer-arithmetic) diff --git a/src/common/bam_record_utils.hpp b/src/common/bam_record_utils.hpp index bfc77228..e36c2096 100644 --- a/src/common/bam_record_utils.hpp +++ b/src/common/bam_record_utils.hpp @@ -1,6 +1,4 @@ -/* Copyright (C) 2020-2023 Masaru Nakajima and Andrew D. Smith - * - * Authors: Masaru Nakajima and Andrew D. Smith +/* Copyright (C) 2020-2026 Andrew D. Smith and Masaru Nakajima * * This program is free software: you can redistribute it and/or * modify it under the terms of the GNU General Public License as @@ -16,15 +14,14 @@ #ifndef BAM_RECORD_UTILS_HPP #define BAM_RECORD_UTILS_HPP -/* ADS: need to control all the macros from HTSlib pollution. For - functions maybe: +/* ADS: need to control macros from HTSlib. For functions maybe: $ gcc -dM -E sam.h | grep "define [a-z]" | awk '{print $2}' |\ grep "[(]" | awk -v FS="(" '{print "#undef",$1}' - This gives about 65 symbols that need to be deleted. For the others - I don't know what to do because some of them have "#define _" which - means they should be system symbols. + This gives about 65 symbols that need to be deleted. For the others I don't + know what to do because some of them have "#define _" which means they + should be system symbols. */ #include @@ -39,7 +36,7 @@ #undef bam_is_rev #endif -inline bool +[[nodiscard]] inline bool bam_is_rev(const bamxx::bam_rec &b) { return (b.b->core.flag & BAM_FREVERSE) != 0; } @@ -57,7 +54,7 @@ bam_is_mrev(const bamxx::bam_rec &b) { #undef bam_get_qname #endif -inline char * +[[nodiscard]] inline char * bam_get_qname(const bamxx::bam_rec &b) { return reinterpret_cast(b.b->data); } @@ -66,7 +63,7 @@ bam_get_qname(const bamxx::bam_rec &b) { #undef bam_get_cigar #endif -inline std::uint32_t * +[[nodiscard]] inline std::uint32_t * bam_get_cigar(const bamxx::bam_rec &b) { // start of data + bytes for query/read name return reinterpret_cast(b.b->data + b.b->core.l_qname); @@ -76,7 +73,7 @@ bam_get_cigar(const bamxx::bam_rec &b) { #undef bam_get_seq #endif -inline uint8_t * +[[nodiscard]] inline uint8_t * bam_get_seq(const bamxx::bam_rec &b) { // start of data + bytes for cigar + bytes for query/read name return b.b->data + b.b->core.l_qname + (b.b->core.n_cigar << 2); @@ -86,7 +83,7 @@ bam_get_seq(const bamxx::bam_rec &b) { #undef bam_get_qual #endif -inline uint8_t * +[[nodiscard]] inline uint8_t * bam_get_qual(const bamxx::bam_rec &b) { return b.b->data + // start of data b.b->core.l_qname + // bytes for query name @@ -98,7 +95,7 @@ bam_get_qual(const bamxx::bam_rec &b) { #undef bam_get_aux #endif -inline uint8_t * +[[nodiscard]] inline uint8_t * bam_get_aux(const bamxx::bam_rec &b) { return b.b->data + b.b->core.l_qname + (b.b->core.n_cigar << 2) + ((b.b->core.l_qseq + 1) >> 1) + b.b->core.l_qseq; @@ -108,7 +105,7 @@ bam_get_aux(const bamxx::bam_rec &b) { #undef bam_get_l_aux #endif -inline int +[[nodiscard]] inline int bam_get_l_aux(const bamxx::bam_rec &b) { return b.b->l_data - (b.b->core.l_qname + (b.b->core.n_cigar << 2) + ((b.b->core.l_qseq + 1) >> 1) + b.b->core.l_qseq); @@ -118,7 +115,7 @@ bam_get_l_aux(const bamxx::bam_rec &b) { #undef bam_cigar_op #endif -inline std::uint32_t +[[nodiscard]] inline std::uint32_t bam_cigar_op(const std::uint32_t c) { return c & BAM_CIGAR_MASK; } @@ -127,12 +124,12 @@ bam_cigar_op(const std::uint32_t c) { #undef bam_cigar_oplen #endif -inline std::uint32_t +[[nodiscard]] inline std::uint32_t bam_cigar_oplen(const std::uint32_t c) { return c >> BAM_CIGAR_SHIFT; } -inline bool +[[nodiscard]] inline bool bam_same_orientation(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { return ((a.b->core.flag ^ b.b->core.flag) & BAM_FREVERSE) != 0; } @@ -159,7 +156,7 @@ correct_cigar(bamxx::bam_rec &b); void flip_conversion(bamxx::bam_rec &aln); -inline bool +[[nodiscard]] inline bool is_a_rich(const bamxx::bam_rec &b) { return bam_aux2A(bam_aux_get(b.b, "CV")) == 'A'; } @@ -174,7 +171,7 @@ apply_cigar(const bamxx::bam_rec &aln, std::string &to_inflate, void get_seq_str(const bamxx::bam_rec &aln, std::string &seq_str); -inline bool +[[nodiscard]] inline bool are_mates(const bamxx::bam_rec &one, const bamxx::bam_rec &two) { return one.b->core.mtid == two.b->core.tid && one.b->core.mpos == two.b->core.pos && bam_same_orientation(one, two); @@ -184,73 +181,73 @@ are_mates(const bamxx::bam_rec &one, const bamxx::bam_rec &two) { two->core.mpos == one->core.pos; */ } -inline std::int32_t +[[nodiscard]] inline std::int32_t get_l_qseq(const bamxx::bam_rec &b) { return b.b->core.l_qseq; } -inline std::int32_t +[[nodiscard]] inline std::int32_t get_n_targets(const bamxx::bam_header &bh) { return bh.h->n_targets; } -inline std::string +[[nodiscard]] inline std::string get_qname(const bamxx::bam_rec &b) { return bam_get_qname(b); } -inline std::int32_t +[[nodiscard]] inline constexpr std::int32_t get_tid(const bamxx::bam_rec &b) { return b.b->core.tid; } -inline hts_pos_t +[[nodiscard]] inline constexpr hts_pos_t get_pos(const bamxx::bam_rec &b) { return b.b->core.pos; } -inline std::int32_t +[[nodiscard]] inline constexpr std::int32_t get_mtid(const bamxx::bam_rec &b) { return b.b->core.mtid; } -inline hts_pos_t +[[nodiscard]] inline hts_pos_t get_mpos(const bamxx::bam_rec &b) { return b.b->core.mpos; } -inline std::uint32_t +[[nodiscard]] inline std::uint32_t get_n_cigar(const bamxx::bam_rec &b) { return b.b->core.n_cigar; } -inline hts_pos_t +[[nodiscard]] inline hts_pos_t get_endpos(const bamxx::bam_rec &b) { return bam_endpos(b.b); } -inline bool +[[nodiscard]] inline bool cigar_eats_ref(const std::uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 2; } -inline bool +[[nodiscard]] inline bool cigar_eats_query(const std::uint32_t c) { return bam_cigar_type(bam_cigar_op(c)) & 1; } -inline bool +[[nodiscard]] inline bool cigar_eats_frag(const std::uint32_t c) { return bam_cigar_op(c) == BAM_CREF_SKIP; } -inline bool +[[nodiscard]] inline bool precedes_by_start(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { // assumes a.get_tid() <= b.get_tid() return get_tid(a) == get_tid(b) && get_pos(a) < get_pos(b); } -inline bool +[[nodiscard]] inline bool precedes_by_end_and_strand(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { const auto end_a = bam_endpos(a.b); const auto end_b = bam_endpos(b.b); @@ -258,46 +255,46 @@ precedes_by_end_and_strand(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { (end_a == end_b && bam_is_rev(a) == false && bam_is_rev(b) == true); } -inline bool +[[nodiscard]] inline bool equivalent_chrom_and_start(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { return a.b->core.pos == b.b->core.pos && a.b->core.tid == b.b->core.tid; } -inline bool +[[nodiscard]] inline bool equivalent_end_and_strand(const bamxx::bam_rec &a, const bamxx::bam_rec &b) { return bam_endpos(a.b) == bam_endpos(b.b) && bam_is_rev(a) == bam_is_rev(b); } template -int +[[nodiscard]] int bam_aux_update_int(bamxx::bam_rec &b, const char tag[2], T val) { return bam_aux_update_int(b.b, tag, val); } -inline std::string +[[nodiscard]] inline std::string sam_hdr_tid2name(const bamxx::bam_header &hdr, const std::int32_t tid) { return std::string(sam_hdr_tid2name(hdr.h, tid)); } -inline const char * +[[nodiscard]] inline const char * sam_hdr_tid2name_ptr(const bamxx::bam_header &hdr, const std::int32_t tid) { return sam_hdr_tid2name(hdr.h, tid); } -inline std::uint32_t +[[nodiscard]] inline std::uint32_t sam_hdr_tid2len(const bamxx::bam_header &hdr, const std::int32_t tid) { return sam_hdr_tid2len(hdr.h, tid); } -inline std::string +[[nodiscard]] inline std::string sam_hdr_tid2name(const bamxx::bam_header &hdr, const bamxx::bam_rec &aln) { return std::string(sam_hdr_tid2name(hdr.h, aln.b->core.tid)); } -std::string +[[nodiscard]] std::string to_string(const bamxx::bam_header &hdr, const bamxx::bam_rec &aln); -inline std::size_t +[[nodiscard]] inline std::size_t rlen_from_cigar(const bamxx::bam_rec &aln) { return bam_cigar2rlen(get_n_cigar(aln), bam_get_cigar(aln)); }