From 13ee56cb20fdee5b08e4beaf7082606be48e988d Mon Sep 17 00:00:00 2001 From: Valeriu Ohan Date: Mon, 18 Nov 2019 14:44:55 +0000 Subject: [PATCH] Fix sam_hdr_dup to cope with long refs. The h->sdict hash is used to track references that are > 4Gb in size. The dup code didn't copy this. This manifested itself as CRAM SQ headers being truncated (read SAM hdr, dup, write as CRAM hdr). Fix by making sam_hdr_dup copy h0->sdict entries for non-parsed headers; and by making sam_hdr_update_target_arrays() update bh->sdict for parsed ones. Co-Authored-By: Rob Davies Co-Authored-By: James Bonfield --- header.c | 29 +++++++++++++++++++++++++++++ sam.c | 31 +++++++++++++++++++++++++++++++ test/sam.c | 50 ++++++++++++++++++++++++++++++++++++++++---------- test/test.pl | 6 ++++++ 4 files changed, 106 insertions(+), 10 deletions(-) diff --git a/header.c b/header.c index dbc01c1ba..c7fe84b30 100644 --- a/header.c +++ b/header.c @@ -916,6 +916,8 @@ int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs, // hrecs->refs_changed is the first ref that has been updated, so ones // before that can be skipped. int i; + khint_t k; + khash_t(s2i) *long_refs = (khash_t(s2i) *) bh->sdict; for (i = refs_changed; i < hrecs->nref; i++) { if (i >= bh->n_targets || strcmp(bh->target_name[i], hrecs->ref[i].name) != 0) { @@ -927,13 +929,40 @@ int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs, } if (hrecs->ref[i].len < UINT32_MAX) { bh->target_len[i] = hrecs->ref[i].len; + + if (!long_refs) + continue; + + // Check if we have an old length, if so remove it. + k = kh_get(s2i, long_refs, bh->target_name[i]); + if (k < kh_end(long_refs)) + kh_del(s2i, long_refs, k); } else { bh->target_len[i] = UINT32_MAX; + if (bh->hrecs != hrecs) { + // Called from sam_hdr_dup; need to add sdict entries + if (!long_refs) { + if (!(bh->sdict = long_refs = kh_init(s2i))) + return -1; + } + + // Add / update length + int absent; + k = kh_put(s2i, long_refs, bh->target_name[i], &absent); + if (absent < 0) + return -1; + kh_val(long_refs, k) = hrecs->ref[i].len; + } } } // Free up any names that have been removed for (; i < bh->n_targets; i++) { + if (long_refs) { + k = kh_get(s2i, long_refs, bh->target_name[i]); + if (k < kh_end(long_refs)) + kh_del(s2i, long_refs, k); + } free(bh->target_name[i]); } diff --git a/sam.c b/sam.c index e4b5c6205..688147633 100644 --- a/sam.c +++ b/sam.c @@ -131,6 +131,33 @@ void sam_hdr_destroy(sam_hdr_t *bh) free(bh); } +// Copy the sam_hdr_t::sdict hash, used to store the real lengths of long +// references before sam_hdr_t::hrecs is populated +int sam_hdr_dup_sdict(const sam_hdr_t *h0, sam_hdr_t *h) +{ + const khash_t(s2i) *src_long_refs = (khash_t(s2i) *) h0->sdict; + khash_t(s2i) *dest_long_refs = kh_init(s2i); + int i; + if (!dest_long_refs) return -1; + + for (i = 0; i < h->n_targets; i++) { + int ret; + khiter_t ksrc, kdest; + if (h->target_len[i] < UINT32_MAX) continue; + ksrc = kh_get(s2i, src_long_refs, h->target_name[i]); + if (ksrc == kh_end(src_long_refs)) continue; + kdest = kh_put(s2i, dest_long_refs, h->target_name[i], &ret); + if (ret < 0) { + kh_destroy(s2i, dest_long_refs); + return -1; + } + kh_val(dest_long_refs, kdest) = kh_val(src_long_refs, ksrc); + } + + h->sdict = dest_long_refs; + return 0; +} + sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0) { if (h0 == NULL) return NULL; @@ -157,6 +184,10 @@ sam_hdr_t *sam_hdr_dup(const sam_hdr_t *h0) } h->n_targets = i; if (i < h0->n_targets) goto fail; + + if (h0->sdict) { + if (sam_hdr_dup_sdict(h0, h) < 0) goto fail; + } } if (h0->hrecs) { diff --git a/test/sam.c b/test/sam.c index 3b798a91f..b8936ee0c 100644 --- a/test/sam.c +++ b/test/sam.c @@ -1226,6 +1226,23 @@ static void samrecord_layout(void) "test/sam_alignment.tmp.sam_", "w", NULL); } +static int check_ref_lengths(const sam_hdr_t *header, + const hts_pos_t *expected_lengths, + int num_refs, const char *hdr_name) +{ + int i; + for (i = 0; i < num_refs; i++) { + hts_pos_t ln = sam_hdr_tid2len(header, i); + if (ln != expected_lengths[i]) { + fail("Wrong length for %s ref %d : " + "expected %"PRIhts_pos" got %"PRIhts_pos"\n", + hdr_name, i, expected_lengths[i], ln); + return -1; + } + } + return 0; +} + static void check_big_ref(int parse_header) { static const char sam_text[] = "data:," @@ -1257,7 +1274,7 @@ static void check_big_ref(int parse_header) -1, -1, -1, 9223372034707292150LL - 1, 1LL - 1, -1 }; samFile *in = NULL, *out = NULL; - sam_hdr_t *header = NULL; + sam_hdr_t *header = NULL, *dup_header = NULL; bam1_t *aln = bam_init1(); const int num_refs = sizeof(expected_lengths) / sizeof(expected_lengths[0]); const int num_align = sizeof(expected_tids) / sizeof(expected_tids[0]); @@ -1288,21 +1305,33 @@ static void check_big_ref(int parse_header) goto cleanup; } if (parse_header) { - // This will force the reader to be parsed + // This will force the header to be parsed if (sam_hdr_count_lines(header, "SQ") != num_refs) { fail("Wrong number of SQ lines in header"); goto cleanup; } } - for (i = 0; i < num_refs; i++) { - hts_pos_t ln = sam_hdr_tid2len(header, i); - if (ln != expected_lengths[i]) { - fail("Wrong length for ref %d : " - "expected %"PRIhts_pos" got %"PRIhts_pos"\n", - i, expected_lengths[i], ln); - goto cleanup; - } + if (check_ref_lengths(header, expected_lengths, num_refs, "header") < 0) + goto cleanup; + + dup_header = sam_hdr_dup(header); + if (!dup_header) { + fail("Failed to duplicate header"); + } + + if (check_ref_lengths(dup_header, expected_lengths, + num_refs, "duplicate header") < 0) + goto cleanup; + + if (sam_hdr_count_lines(dup_header, "SQ") != num_refs) { + fail("Wrong number of SQ lines in duplicate header"); + goto cleanup; } + + if (check_ref_lengths(dup_header, expected_lengths, + num_refs, "parsed duplicate header") < 0) + goto cleanup; + if (sam_hdr_write(out, header) < 0) { fail("Failed to write SAM header"); goto cleanup; @@ -1379,6 +1408,7 @@ static void check_big_ref(int parse_header) cleanup: bam_destroy1(aln); sam_hdr_destroy(header); + sam_hdr_destroy(dup_header); if (in) sam_close(in); if (out) sam_close(out); if (inf) fclose(inf); diff --git a/test/test.pl b/test/test.pl index fe71d58c4..3e3081c5b 100755 --- a/test/test.pl +++ b/test/test.pl @@ -645,6 +645,12 @@ sub test_view testv $opts, "./test_view $tv_args -p longrefs/longref.tmp.sam_ longrefs/longref.tmp.sam.gz"; testv $opts, "./compare_sam.pl longrefs/longref.sam longrefs/longref.tmp.sam_"; + # CRAM disabled for now as the positions cannot be 32-bit. (These tests are useful for + # checking SQ headers only.) + # testv $opts, "./test_view $tv_args -C -o no_ref -p longrefs/longref.tmp.cram longrefs/longref.sam"; + # testv $opts, "./test_view $tv_args -p longrefs/longref.tmp.sam_ longrefs/longref.tmp.cram"; + # testv $opts, "./compare_sam.pl longrefs/longref.sam longrefs/longref.tmp.sam_"; + # Build index and compare with on-the-fly one made earlier. test_compare $opts, "$$opts{path}/test_index -c longrefs/longref.tmp.sam.gz", "longrefs/longref.tmp.sam.gz.csi.otf", "longrefs/longref.tmp.sam.gz.csi", gz=>1;