Skip to content

Commit 15d4139

Browse files
committed
WIP: "edlib-12".
Tweak edlib tuning for SeqQ/qual. Add quality value assessment into soft-clip recovery. Use /500 instead of /111 in indelQ assignment, and skew indel-bias accordingly. This gives better separation of FP/GT/FN generally. Added --seqq-offset parameter so we can use it in tunables per profile. This is used as a limit on the seqQ reduction in the "VAL-5*MIN(20,depth)" formula, used for favouring data over seqQ scores when depth is sufficient. Experimentation showed no single value that worked for all platforms, but the default is in the middle.
1 parent 1e3ed22 commit 15d4139

File tree

4 files changed

+138
-34
lines changed

4 files changed

+138
-34
lines changed

bam2bcf.c

Lines changed: 67 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -395,7 +395,11 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
395395
#define MIN(a,b) ((a)<(b)?(a):(b))
396396
#endif
397397

398-
if (bca->edlib) {
398+
#ifndef MAX
399+
#define MAX(a,b) ((a)>(b)?(a):(b))
400+
#endif
401+
402+
if (1 && bca->edlib) {
399403
// Deeper data should rely more heavily on counts of data
400404
// than quality, as quality can be unreliable and prone to
401405
// miscalculations through BAQ, STR analysis, etc.
@@ -406,19 +410,68 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
406410
// This calls them and then post-adjusts quality, potentially
407411
// dropping it later or changing genotype. So we still get
408412
// calls, but lower qual.
409-
seqQ = MIN(seqQ, 100-(MIN(20,_n)*3));
410-
411-
if (indel_in_sample && p->indel == 0) {
412-
// High quality indel calls when the read wasn't initially
413-
// containing an indel are enriched for FPs.
414-
// Dont change call as that goes into AD, but do change the
415-
// quality of the call instead.
416-
if (b != 0) {
417-
// Reduce qual, with lower-bound reduction to Q10
418-
seqQ = MIN(seqQ, seqQ/4 + 10);
419-
//q = (int)bam_get_qual(p->b)[p->qpos]/4 + 20;
420-
//q += 10;
421-
}
413+
//seqQ = MIN(seqQ, 100-(MIN(20,_n)*3)); // orig
414+
//seqQ = MIN(seqQ, 100-(MIN(30,_n)*2.5)); // m30.25
415+
//seqQ = MIN(seqQ, 100-(MIN(25,_n)*3)); // m25
416+
//seqQ = MIN(seqQ, 100-(MIN(20,_n)*4)); // m20.40; vgood BGI
417+
//seqQ = MIN(seqQ, 115-(MIN(20,_n)*5));
418+
//seqQ = MAX(15, 130-20*sqrt(_n));
419+
//seqQ = MAX(15, 100-15*sqrt(_n));
420+
//seqQ = MIN(seqQ, 110-(MIN(15,_n)*6));
421+
//seqQ = MIN(seqQ, 140-(MIN(20,_n)*6));
422+
seqQ = MIN(seqQ, bca->seqQ_offset-(MIN(20,_n)*5));
423+
//seqQ = MIN(seqQ, 25 + 2*(30-MIN(30,_n)));
424+
//seqQ = MIN(seqQ, 30 + MAX(-10, 2*(30-_n)));
425+
//seqQ = MIN(seqQ, 120-(MIN(20,_n)*5));
426+
//seqQ = MIN(seqQ, 120-(MIN(25,_n)*4)); // m25b.40; good BGI
427+
//seqQ = MIN(seqQ, 110-(MIN(20,_n)*4)); // m20b.40; poor
428+
//seqQ = MIN(seqQ, 130-(MIN(30,_n)*4)); // m30b.40; BAD!
429+
//seqQ = MIN(seqQ, 170-(MIN(30,_n)*5)); // m30b.50; poor BGI
430+
431+
// Use base quality in there too?
432+
433+
if (1 && indel_in_sample && p->indel == 0 && b != 0) {
434+
// This read doesn't contain an indel in CIGAR, but it
435+
// is assigned to an indel now (b != 0), These are
436+
// reads we've corrected with realignment, but they're
437+
// also enriched for FPs so at high depth we reduce their
438+
// confidence and let the depth do the talking. If it's
439+
// real and deep, then we don't need every read aligning.
440+
// We also reduce base quality too to reflect the
441+
// chance of our realignment being incorrect.
442+
443+
//seqQ = MIN(seqQ, seqQ/4 + 15); // q4p15
444+
//seqQ = MIN(seqQ, seqQ/4 + 10); // orig, q4p10
445+
seqQ = MIN(seqQ, seqQ/2 + 5); // q2p5
446+
//q = (int)bam_get_qual(p->b)[p->qpos]/4 + 20;
447+
//q += 10;
448+
449+
// Finally reduce base quality
450+
// With qual it's ...+10[cd]
451+
// Without qual it's ...+10i[cd]
452+
// Best without qual!
453+
// q -= q>>1; // qdiv2c for non-qual q (indelQ?)
454+
// q -= q>>1; // qdiv2d for non-qual q (indelQ?)
455+
// // Best without +10?
456+
// q += 10;
457+
458+
// Mix of base qual and indel qual
459+
460+
// int bq = 999, i;
461+
// uint8_t *qual = bam_get_qual(p->b);
462+
// for (i = MAX(0, p->qpos-5); i < MIN(p->b->core.l_qseq, p->qpos+5); i++)
463+
// if (bq < qual[i])
464+
// bq = qual[i];
465+
// q = MAX(bq/4+10, q/4+1); // x
466+
467+
// PB Old: this is all of very minor impact
468+
q = MIN((int)bam_get_qual(p->b)[p->qpos]/4+10, q/4+1); // x
469+
470+
// int bq = (int)bam_get_qual(p->b)[p->qpos];
471+
// if (bq > bca->max_baseQ)
472+
// bq = bca->max_baseQ;
473+
// q = MIN(bq/2, q/4+1);
474+
// also for low mapQ?
422475
}
423476
}
424477

bam2bcf.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ typedef struct __bcf_callaux_t {
123123
int max_bases;
124124
int indel_types[4]; // indel lengths
125125
int indel_win_size, indels_v20, edlib;
126+
int seqQ_offset; // edlib mode, seqQ=MIN(seqQ, offset - MIN(20,depth)*5);
126127
int maxins, indelreg, poly_mqual;
127128
int read_len;
128129
char *inscns;

bam2bcf_edlib.c

Lines changed: 47 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1098,7 +1098,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
10981098
l = .5*(100. * sc2 / (qend - qbeg) + .499);
10991099
l += iscore*(qavg/(m2min+1.0) + qavg/m2);
11001100

1101-
*score = (sc2<<8) | (int)MIN(255, l * bca->indel_bias);
1101+
*score = (sc2<<8) | (int)MIN(255, l * bca->indel_bias * .5);
11021102

11031103
free(qq);
11041104

@@ -1303,8 +1303,9 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp,
13031303

13041304
// So we lower qual in some, but raise the average to keep FN/FP
13051305
// ratios up.
1306-
indelQ /= bca->indel_bias;
1307-
indelQ1 /= bca->indel_bias;
1306+
// Is this key diff for PacBio old vs new HiFi?
1307+
indelQ /= bca->indel_bias*0.5;
1308+
indelQ1 /= bca->indel_bias*0.5;
13081309

13091310
// Or maybe just *2 if bca->poly_mqual and be done with it?
13101311
// Or perhaps adjust the MIN(qavg/20, ...) to MIN(qavg/10) ?
@@ -1366,8 +1367,17 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp,
13661367
// low normalised scores leave indelQ unmodified
13671368
// high normalised scores set indelQ to 0
13681369
// inbetween scores have a linear scale from indelQ to 0
1369-
indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499);
1370-
indelQ1= tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1+ .499);
1370+
// Alitering the MAGIC value below (originally 111, but chosen for unknown
1371+
// reasons) is comparable to altering --indel-bias.
1372+
//#define TMP_MAGIC 111.0
1373+
#define TMP_MAGIC 255.0
1374+
//#define TMP_MAGIC 500.0
1375+
1376+
indelQ = tmp > TMP_MAGIC? 0 : (int)((1. - tmp/TMP_MAGIC) * indelQ + .499);
1377+
indelQ1= tmp > TMP_MAGIC? 0 : (int)((1. - tmp/TMP_MAGIC) * indelQ1+ .499);
1378+
1379+
indelQ = MIN(indelQ, 255);
1380+
indelQ1 = MIN(indelQ1, 255);
13711381

13721382
// Doesn't really help accuracy, but permits -h to take
13731383
// affect still.
@@ -1774,14 +1784,39 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
17741784
0, &tend) - qbeg;
17751785
qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b),
17761786
right2, 1, &tend);
1777-
// if (qbeg-4 >= 0)
1778-
// qbeg-=4;
1779-
if (p->b->core.l_qseq > qend) {
1780-
// FIXME: use quality here...
1781-
int n = MIN(6, p->b->core.l_qseq - qend);
1782-
// fprintf(stderr, "CLIPS2 %d\n", n);
1783-
qend+=n;
1787+
#if 1
1788+
// Unhide up to 6bp of soft-clipped seq to aid realignment
1789+
{
1790+
// Adjusting left clip doesn't help.
1791+
// int i = qbeg;
1792+
// uint8_t *qual = bam_get_qual(p->b);
1793+
// int qsum = 0, qbest = 0, qbesti = qbeg, qdist = 0;
1794+
//
1795+
// while (qdist < 6 && qsum > -20 && --i >= 0) {
1796+
// qsum += qual[i]-(qavg*.5);
1797+
// if (qbest < qsum) {
1798+
// qbest = qsum;
1799+
// qbesti = i;
1800+
// }
1801+
// qdist++;
1802+
// }
1803+
// qbeg = qbesti;
1804+
1805+
int i = qend;
1806+
uint8_t *qual = bam_get_qual(p->b);
1807+
int qsum = 0, qbest = 0, qbesti = qend, qdist = 0;
1808+
while (qdist < 6 && qsum >-20 && ++i < p->b->core.l_qseq) {
1809+
// Best run of at least 50% of qavg
1810+
qsum += qual[i]-(qavg*.5);
1811+
if (qbest < qsum) {
1812+
qbest = qsum;
1813+
qbesti = i;
1814+
}
1815+
qdist++;
1816+
}
1817+
qend = qbesti;
17841818
}
1819+
#endif
17851820

17861821
int old_tend = tend;
17871822
int old_tbeg = tbeg;

mpileup.c

Lines changed: 23 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ typedef struct {
7272
uint32_t fmt_flag;
7373
int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type;
7474
int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels
75+
int seqQ_offset;
7576
double min_frac; // for indels
7677
double indel_bias, poly_mqual;
7778
double del_bias; // compensate for diff deletion vs insertion error rates
@@ -878,6 +879,7 @@ static int mpileup(mplp_conf_t *conf)
878879
conf->bca->indel_win_size = conf->indel_win_size;
879880
conf->bca->indels_v20 = conf->indels_v20;
880881
conf->bca->edlib = conf->edlib;
882+
conf->bca->seqQ_offset = conf->seqQ_offset;
881883
conf->bca->poly_mqual = conf->poly_mqual;
882884
conf->bca->vs_ref = conf->vs_ref;
883885

@@ -1288,6 +1290,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
12881290
fprintf(fp,
12891291
" --indels-2.0 New EXPERIMENTAL indel calling model (diploid reference consensus)\n"
12901292
" --indels-cns New EXPERIMENTAL indel calling model with edlib\n"
1293+
" --seqq-offset Indel-cns tuning for indel seq-qual scores [120]\n"
12911294
" --no-indels-cns Disable CNS mode, to use after a -X profile\n"
12921295
" --poly-mqual (Edlib mode) Use minimum quality within homopolymers\n");
12931296
fprintf(fp,"\n");
@@ -1364,6 +1367,7 @@ int main_mpileup(int argc, char *argv[])
13641367
mplp.ambig_reads = B2B_DROP;
13651368
mplp.indel_win_size = 80;
13661369
mplp.poly_mqual = 0;
1370+
mplp.seqQ_offset = 120;
13671371
mplp.clevel = -1;
13681372
mplp.del_bias = 0; // even insertion and deletion likelhoods.
13691373
hts_srand48(0);
@@ -1443,6 +1447,7 @@ int main_mpileup(int argc, char *argv[])
14431447
{"poly-mqual", no_argument, NULL, 24},
14441448
{"no-poly-mqual", no_argument, NULL, 26},
14451449
{"score-vs-ref",required_argument, NULL, 27},
1450+
{"seqq-offset", required_argument, NULL, 28},
14461451
{NULL, 0, NULL, 0}
14471452
};
14481453
while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
@@ -1573,6 +1578,13 @@ int main_mpileup(int argc, char *argv[])
15731578
case 21: mplp.write_index = 1; break;
15741579
case 22: mplp.edlib = 1; mplp.indels_v20 = 0; break;
15751580
case 25: mplp.edlib = 0; break;
1581+
case 28:
1582+
mplp.seqQ_offset = atoi(optarg);
1583+
if (mplp.seqQ_offset < 100)
1584+
mplp.seqQ_offset = 100;
1585+
if (mplp.seqQ_offset > 200)
1586+
mplp.seqQ_offset = 200;
1587+
break;
15761588
case 23: mplp.del_bias = atof(optarg); break;
15771589
case 24: mplp.poly_mqual = 1; break;
15781590
case 26: mplp.poly_mqual = 0; break;
@@ -1612,7 +1624,8 @@ int main_mpileup(int argc, char *argv[])
16121624
mplp.extQ = 1;
16131625
mplp.flag &= ~MPLP_REALN;
16141626
mplp.del_bias = 0.4;
1615-
mplp.indel_bias = 1/1.2;
1627+
mplp.indel_bias = 1/.9;
1628+
mplp.seqQ_offset = 118;
16161629
mplp.poly_mqual = 1;
16171630
mplp.edlib = 1;
16181631
mplp.vs_ref = 0.7;
@@ -1638,13 +1651,10 @@ int main_mpileup(int argc, char *argv[])
16381651
mplp.del_bias = 0.4;
16391652
mplp.poly_mqual = 1;
16401653
mplp.edlib = 1;
1641-
// Good trade-offs of homopolymer score vs indel-bias.
1642-
// --indel-bias 0.7 -h 100-110
1643-
// --indel-bias 0.8 -h 110, or maybe 120
1644-
// --indel-bias 0.9 -h 120
1645-
// --indel-bias 1.0 -h 120, or maybe 130
1654+
// If we increase -h then we can increase bias denominator too
16461655
mplp.tandemQ = 110;
1647-
mplp.indel_bias = 1/0.8;
1656+
mplp.indel_bias = 1/0.7;
1657+
mplp.seqQ_offset = 130;
16481658

16491659
} else if (strcasecmp(optarg, "ultima") == 0 ||
16501660
strcasecmp(optarg, "ultima-1.20") == 0) {
@@ -1659,6 +1669,8 @@ int main_mpileup(int argc, char *argv[])
16591669
mplp.del_bias = 0.3;
16601670
mplp.poly_mqual = 1;
16611671
mplp.edlib = 1;
1672+
mplp.indel_bias = 1/0.7;
1673+
mplp.seqQ_offset = 140;
16621674
mplp.vs_ref = 0.3;
16631675

16641676
} else if (strcasecmp(optarg, "1.12") == 0) {
@@ -1679,12 +1691,15 @@ int main_mpileup(int argc, char *argv[])
16791691
mplp.edlib = 1;
16801692
mplp.indel_win_size = 110;
16811693
mplp.flag |= MPLP_REALN_PARTIAL;
1694+
mplp.indel_bias = 1;
1695+
mplp.seqQ_offset = 125;
16821696

16831697
} else if (strcasecmp(optarg, "bgi") == 0 ||
16841698
strcasecmp(optarg, "bgi-1.20") == 0) {
16851699
mplp.min_frac = 0.1;
16861700
mplp.edlib = 1;
1687-
mplp.indel_bias = 1/0.9;
1701+
mplp.indel_bias = 1;
1702+
mplp.seqQ_offset = 120;
16881703
mplp.flag |= MPLP_REALN_PARTIAL;
16891704

16901705
} else if (strcasecmp(optarg, "list") == 0 ||

0 commit comments

Comments
 (0)