Skip to content

Commit 3ea377d

Browse files
committed
- Rename mpileup --edlib to --indels-cns
The main benefit of the new indel caller is the use of alignment against diploid consensus generation instead of simply patching the reference with candidate indel types. This greatly reduces false positives and incorrect allele alignment (leading to wrong genotype calls). This was the earlier PR #1679, but has since acquired edlib as an alternative to BAQ for the indel alignment. However this is primarily a speed benefit (with some minor removal of false-negatives due to quality smashing), and isn't the main thing users should think about when choosing an indel caller. Also tidied up the usage statement and added an explicit "-X list" to print the profile parameters. - Add extra debugging defines. GLF_DEBUG reports GLF calculation in bam2bcf.c. ALIGN_DEBUG uses edlib EDLIB_TASK_PATH to report sequence alignment. NB to use this you need to link against edlib itself rather than the cutdown version in this repository. Also fix the edlib heuristics used in bcf_call_glfgen. We don't want to change the call (b=5) as this affects AD. Instead we change the quality so poor calls get filtered by QUAL rather than simply being removed. - Tweak edlib tuning for SeqQ/qual. Add quality value assessment into soft-clip recovery. Use /500 instead of /111 in indelQ assignment, and skew indel-bias accordingly. This gives better separation of FP/GT/FN generally. - Added --seqq-offset parameter so we can use it in tunables per profile. This is used as a limit on the seqQ reduction in the "VAL-5*MIN(20,depth)" formula, used for favouring data over seqQ scores when depth is sufficient. Experimentation showed no single value that worked for all platforms, but the default is in the middle. - Tidy up to cull ifdefed and commented out code. - Add test for indels-cns. It's minimal, but the whole indel calling has minimal testing. I think we have under 10 indels in total with develop (all short read and mostly duplications of each other), and no testing of indels-2.0. This tests 4 indels with indels-cns. - Added documentation for the new --indels-2.0 options - Cull more unused parts of edlib.c. This avoids clang warnings (which become errors with -Werror). We're only including the bits we need here for mpileup. If you want the whole thing, link against the upstream source library instead.
1 parent 0e45073 commit 3ea377d

File tree

8 files changed

+655
-1201
lines changed

8 files changed

+655
-1201
lines changed

bam2bcf.c

Lines changed: 62 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* bam2bcf.c -- variant calling.
22
33
Copyright (C) 2010-2012 Broad Institute.
4-
Copyright (C) 2012-2022 Genome Research Ltd.
4+
Copyright (C) 2012-2024 Genome Research Ltd.
55
66
Author: Heng Li <lh3@sanger.ac.uk>
77
@@ -249,6 +249,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
249249
{
250250
int i, n, ref4, is_indel, ori_depth = 0;
251251

252+
#ifdef GLF_DEBUG
253+
fprintf(stderr, "Call GLFGEN\n");
254+
#endif
255+
252256
// clean from previous run
253257
r->ori_depth = 0;
254258
r->mq0 = 0;
@@ -351,6 +355,10 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
351355
}
352356
}
353357

358+
#ifdef GLF_DEBUG
359+
fprintf(stderr, "GLF %s\t%d\t%d\n", bam_get_qname(p->b),
360+
bca->indel_types[b], q);
361+
#endif
354362
if (q < bca->min_baseQ)
355363
{
356364
if (!p->indel && b < 4) // not an indel read
@@ -363,29 +371,50 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
363371
continue;
364372
}
365373

366-
// FIXME: CHECK if this is still needed with edlib mode
367-
// It's a slight variant on the one above guarded by --indels-2.0
374+
#ifndef MIN
375+
#define MIN(a,b) ((a)<(b)?(a):(b))
376+
#endif
377+
378+
#ifndef MAX
379+
#define MAX(a,b) ((a)>(b)?(a):(b))
380+
#endif
381+
382+
#if 1 // TEST 6
368383
if (bca->edlib) {
369-
if (indel_in_sample && p->indel == 0 && (q < _n/2 || _n > 20)) {
370-
// high quality indel calls without p->indel set aren't
371-
// particularly indicative of being a good REF match either,
372-
// at least not in low coverage. So require solid coverage
373-
// before we start utilising such quals.
374-
if (b != 0)
375-
b = 5;
376-
q = (int)bam_get_qual(p->b)[p->qpos];
377-
seqQ = (3*seqQ + 2*q)/8;
384+
// Deeper data should rely more heavily on counts of data
385+
// than quality, as quality can be unreliable and prone to
386+
// miscalculations through BAQ, STR analysis, etc.
387+
// So we put a cap on how good seqQ can be.
388+
//
389+
// Is it simply the equivalent of increasing -F filter?
390+
// Not quite, as the latter removes many real variants upfront.
391+
// This calls them and then post-adjusts quality, potentially
392+
// dropping it later or changing genotype. So we still get
393+
// calls, but lower qual.
394+
seqQ = MIN(seqQ, bca->seqQ_offset-(MIN(20,_n)*5));
395+
396+
if (indel_in_sample && p->indel == 0 && b != 0) {
397+
// This read doesn't contain an indel in CIGAR, but it
398+
// is assigned to an indel now (b != 0), These are
399+
// reads we've corrected with realignment, but they're
400+
// also enriched for FPs so at high depth we reduce their
401+
// confidence and let the depth do the talking. If it's
402+
// real and deep, then we don't need every read aligning.
403+
// We also reduce base quality too to reflect the
404+
// chance of our realignment being incorrect.
405+
406+
seqQ = MIN(seqQ, seqQ/2 + 5); // q2p5
407+
408+
// Finally reduce indel quality.
409+
// This is a blend of indelQ and base QUAL.
410+
q = MIN((int)bam_get_qual(p->b)[p->qpos]/4+10, q/4+1);
378411
}
379-
if (_n > 20 && seqQ > 40) seqQ = 40;
380412
}
413+
#endif
381414

382415
// Note baseQ changes some output fields such as I16, but has no
383416
// significant affect on "call".
384417
baseQ = p->aux>>8&0xff;
385-
386-
// Can we reuse baseQ as indelQ1 instead of indelQ?
387-
// So we can distinguish between likelihood of any indel vs
388-
// likelihood of this specific indel?
389418
}
390419
else
391420
{
@@ -419,6 +448,11 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
419448
}
420449
mapQ = p->b->core.qual < 255? p->b->core.qual : DEF_MAPQ; // special case for mapQ==255
421450
if ( !mapQ ) r->mq0++;
451+
#ifdef GLF_DEBUG
452+
fprintf(stderr, "GLF2 %s\t%d\t%d\t%d,%d\n",
453+
bam_get_qname(p->b), b, q,
454+
seqQ, mapQ);
455+
#endif
422456
if (q > seqQ) q = seqQ;
423457
mapQ = mapQ < bca->capQ? mapQ : bca->capQ;
424458
if (q > mapQ) q = mapQ;
@@ -1202,25 +1236,29 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
12021236
{
12031237
bcf_update_info_flag(hdr, rec, "INDEL", NULL, 1);
12041238
uint32_t idv = bca->max_support;
1205-
if ( fmt_flag&B2B_INFO_IMF ) {
1239+
if ( fmt_flag&B2B_INFO_IMF) {
12061240
float max_frac;
1207-
if (bc->ADF && bc->ADR) {
1208-
int max_ad = 0, tot_ad = bc->ADF[0] + bc->ADR[0];
1241+
// Recompute IDV and IMF based on alignment results for more
1242+
// accurate counts, but only when in new "--indels-cns" mode.
1243+
if (bc->ADF && bc->ADR && bca->edlib) {
1244+
int max_ad = 0;
12091245
for (int k = 1; k < rec->n_allele; k++) {
12101246
if (max_ad < bc->ADF[k] + bc->ADR[k])
12111247
max_ad = bc->ADF[k] + bc->ADR[k];
1212-
tot_ad += bc->ADF[k] + bc->ADR[k];
12131248
}
12141249
max_frac = (double)(max_ad) / bc->ori_depth;
1215-
//max_frac = (double)(max_ad) / tot_ad;
12161250
idv = max_ad;
12171251
} else {
12181252
max_frac = bca->max_frac;
12191253
}
1254+
// Copied here to maintain order for consistency of "make check"
1255+
if ( fmt_flag&B2B_INFO_IDV )
1256+
bcf_update_info_int32(hdr, rec, "IDV", &idv, 1);
12201257
bcf_update_info_float(hdr, rec, "IMF", &max_frac, 1);
1258+
} else {
1259+
if ( fmt_flag&B2B_INFO_IDV )
1260+
bcf_update_info_int32(hdr, rec, "IDV", &idv, 1);
12211261
}
1222-
if ( fmt_flag&B2B_INFO_IDV )
1223-
bcf_update_info_int32(hdr, rec, "IDV", &idv, 1);
12241262
}
12251263
bcf_update_info_int32(hdr, rec, "DP", &bc->ori_depth, 1);
12261264
if ( fmt_flag&B2B_INFO_ADF )

bam2bcf.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ typedef struct __bcf_callaux_t {
123123
int max_bases;
124124
int indel_types[4]; // indel lengths
125125
int indel_win_size, indels_v20, edlib;
126+
int seqQ_offset; // edlib mode, seqQ=MIN(seqQ, offset - MIN(20,depth)*5);
126127
int maxins, indelreg, poly_mqual;
127128
int read_len;
128129
char *inscns;

0 commit comments

Comments
 (0)