WIP: "edlib-12".

jkbonfield · jkbonfield · commit 15d4139d9aad · 2024-01-12T17:23:52.000Z
Tweak edlib tuning for SeqQ/qual.
Add quality value assessment into soft-clip recovery.
Use /255 instead of /111 in indelQ assignment (TMP_MAGIC; /500 was also
trialled), and scale indel-bias by 0.5 where it is applied to compensate.
This gives better separation of FP/GT/FN generally.

Added --seqq-offset parameter so we can use it in tunables per
profile.  This is the VAL in the "VAL-5*MIN(20,depth)" formula, which
caps seqQ so that data is favoured over seqQ scores when depth is
sufficient.  Experimentation showed no single value that worked for
all platforms, but the default (120) is in the middle.
diff --git a/bam2bcf.c b/bam2bcf.c
@@ -395,7 +395,11 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
 #define MIN(a,b) ((a)<(b)?(a):(b))
 #endif
 
-            if (bca->edlib) {
+#ifndef MAX
+#define MAX(a,b) ((a)>(b)?(a):(b))
+#endif
+
+            if (1 && bca->edlib) {
                 // Deeper data should rely more heavily on counts of data
                 // than quality, as quality can be unreliable and prone to
                 // miscalculations through BAQ, STR analysis, etc.
@@ -406,19 +410,68 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
                 // This calls them and then post-adjusts quality, potentially
                 // dropping it later or changing genotype. So we still get
                 // calls, but lower qual.
-                seqQ = MIN(seqQ, 100-(MIN(20,_n)*3));
-
-                if (indel_in_sample && p->indel == 0) {
-                    // High quality indel calls when the read wasn't initially
-                    // containing an indel are enriched for FPs.
-                    // Dont change call as that goes into AD, but do change the
-                    // quality of the call instead.
-                    if (b != 0) {
-                        // Reduce qual, with lower-bound reduction to Q10
-                        seqQ = MIN(seqQ, seqQ/4 + 10);
-                        //q = (int)bam_get_qual(p->b)[p->qpos]/4 + 20;
-                        //q += 10;
-                    }
+                //seqQ = MIN(seqQ, 100-(MIN(20,_n)*3)); // orig
+                //seqQ = MIN(seqQ, 100-(MIN(30,_n)*2.5)); // m30.25
+                //seqQ = MIN(seqQ, 100-(MIN(25,_n)*3)); // m25
+                //seqQ = MIN(seqQ, 100-(MIN(20,_n)*4)); // m20.40; vgood BGI
+                //seqQ = MIN(seqQ, 115-(MIN(20,_n)*5));
+                //seqQ = MAX(15, 130-20*sqrt(_n));
+                //seqQ = MAX(15, 100-15*sqrt(_n));
+                //seqQ = MIN(seqQ, 110-(MIN(15,_n)*6));
+                //seqQ = MIN(seqQ, 140-(MIN(20,_n)*6));
+                seqQ = MIN(seqQ, bca->seqQ_offset-(MIN(20,_n)*5));
+                //seqQ = MIN(seqQ, 25 + 2*(30-MIN(30,_n)));
+                //seqQ = MIN(seqQ, 30 + MAX(-10, 2*(30-_n)));
+                //seqQ = MIN(seqQ, 120-(MIN(20,_n)*5));
+                //seqQ = MIN(seqQ, 120-(MIN(25,_n)*4)); // m25b.40; good BGI
+                //seqQ = MIN(seqQ, 110-(MIN(20,_n)*4)); // m20b.40; poor
+                //seqQ = MIN(seqQ, 130-(MIN(30,_n)*4)); // m30b.40; BAD!
+                //seqQ = MIN(seqQ, 170-(MIN(30,_n)*5)); // m30b.50; poor BGI
+
+                // Use base quality in there too?
+
+                if (1 && indel_in_sample && p->indel == 0 && b != 0) {
+                    // This read doesn't contain an indel in CIGAR, but it
+                    // is assigned to an indel now (b != 0).  These are
+                    // reads we've corrected with realignment, but they're
+                    // also enriched for FPs so at high depth we reduce their
+                    // confidence and let the depth do the talking.  If it's
+                    // real and deep, then we don't need every read aligning.
+                    // We also reduce base quality too to reflect the
+                    // chance of our realignment being incorrect.
+
+                    //seqQ = MIN(seqQ, seqQ/4 + 15); // q4p15
+                    //seqQ = MIN(seqQ, seqQ/4 + 10); // orig, q4p10
+                    seqQ = MIN(seqQ, seqQ/2 + 5); // q2p5
+                    //q = (int)bam_get_qual(p->b)[p->qpos]/4 + 20;
+                    //q += 10;
+
+                    // Finally reduce base quality
+                    // With qual it's ...+10[cd]
+                    // Without qual it's ...+10i[cd]
+                    // Best without qual!
+//                    q -= q>>1; // qdiv2c for non-qual q (indelQ?)
+//                    q -= q>>1; // qdiv2d for non-qual q (indelQ?)
+//                    // Best without +10?
+//                    q += 10;
+
+                    // Mix of base qual and indel qual
+
+                    // int bq = 999, i;
+                    // uint8_t *qual = bam_get_qual(p->b);
+                    // for (i = MAX(0, p->qpos-5); i < MIN(p->b->core.l_qseq, p->qpos+5); i++)
+                    //     if (bq < qual[i])
+                    //         bq = qual[i];
+                    // q = MAX(bq/4+10, q/4+1); // x
+
+                    // PB Old: this is all of very minor impact
+                    q = MIN((int)bam_get_qual(p->b)[p->qpos]/4+10, q/4+1); // x
+
+//                    int bq = (int)bam_get_qual(p->b)[p->qpos];
+//                    if (bq > bca->max_baseQ)
+//                        bq = bca->max_baseQ;
+//                    q = MIN(bq/2, q/4+1);
+                    // also for low mapQ?
                 }
             }
 
diff --git a/bam2bcf.h b/bam2bcf.h
@@ -123,6 +123,7 @@ typedef struct __bcf_callaux_t {
     int max_bases;
     int indel_types[4];     // indel lengths
     int indel_win_size, indels_v20, edlib;
+    int seqQ_offset; // edlib mode, seqQ=MIN(seqQ, offset - MIN(20,depth)*5);
     int maxins, indelreg, poly_mqual;
     int read_len;
     char *inscns;
diff --git a/bam2bcf_edlib.c b/bam2bcf_edlib.c
@@ -1098,7 +1098,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca,
     l = .5*(100. * sc2 / (qend - qbeg) + .499);
     l += iscore*(qavg/(m2min+1.0) + qavg/m2);
 
-    *score = (sc2<<8) | (int)MIN(255, l * bca->indel_bias);
+    *score = (sc2<<8) | (int)MIN(255, l * bca->indel_bias * .5);
 
     free(qq);
 
@@ -1303,8 +1303,9 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp,
 
             // So we lower qual in some, but raise the average to keep FN/FP
             // ratios up.
-            indelQ /= bca->indel_bias;
-            indelQ1 /= bca->indel_bias;
+            // Is this the key difference for PacBio old vs new HiFi?
+            indelQ  /= bca->indel_bias*0.5;
+            indelQ1 /= bca->indel_bias*0.5;
 
             // Or maybe just *2 if bca->poly_mqual and be done with it?
             // Or perhaps adjust the MIN(qavg/20, ...) to MIN(qavg/10) ?
@@ -1366,8 +1367,17 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp,
             // low normalised scores leave indelQ unmodified
             // high normalised scores set indelQ to 0
             // inbetween scores have a linear scale from indelQ to 0
-            indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499);
-            indelQ1= tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ1+ .499);
+// Altering the MAGIC value below (originally 111, but chosen for unknown
+// reasons) is comparable to altering --indel-bias.
+//#define TMP_MAGIC 111.0
+#define TMP_MAGIC 255.0
+//#define TMP_MAGIC 500.0
+
+            indelQ = tmp > TMP_MAGIC? 0 : (int)((1. - tmp/TMP_MAGIC) * indelQ + .499);
+            indelQ1= tmp > TMP_MAGIC? 0 : (int)((1. - tmp/TMP_MAGIC) * indelQ1+ .499);
+
+            indelQ  = MIN(indelQ,  255);
+            indelQ1 = MIN(indelQ1, 255);
 
             // Doesn't really help accuracy, but permits -h to take
             // affect still.
@@ -1774,14 +1784,39 @@ int bcf_edlib_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos,
                                      0, &tend) - qbeg;
                 qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b),
                                  right2, 1, &tend);
-//                if (qbeg-4 >= 0)
-//                    qbeg-=4;
-                if (p->b->core.l_qseq > qend) {
-                    // FIXME: use quality here...
-                    int n = MIN(6, p->b->core.l_qseq - qend);
-//                    fprintf(stderr, "CLIPS2 %d\n", n);
-                    qend+=n;
+#if 1
+                // Unhide up to 6bp of soft-clipped seq to aid realignment
+                {
+// Adjusting left clip doesn't help.
+//                    int i = qbeg;
+//                    uint8_t *qual = bam_get_qual(p->b);
+//                    int qsum = 0, qbest = 0, qbesti = qbeg, qdist = 0;
+//
+//                    while (qdist < 6 && qsum > -20 && --i >= 0) {
+//                        qsum += qual[i]-(qavg*.5);
+//                        if (qbest < qsum) {
+//                            qbest = qsum;
+//                            qbesti = i;
+//                        }
+//                        qdist++;
+//                    }
+//                    qbeg = qbesti;
+
+                    int i = qend;
+                    uint8_t *qual = bam_get_qual(p->b);
+                    int qsum = 0, qbest = 0, qbesti = qend, qdist = 0;
+                    while (qdist < 6 && qsum >-20 && ++i < p->b->core.l_qseq) {
+                        // Best run of at least 50% of qavg
+                        qsum += qual[i]-(qavg*.5);
+                        if (qbest < qsum) {
+                            qbest = qsum;
+                            qbesti = i;
+                        }
+                        qdist++;
+                    }
+                    qend = qbesti;
                 }
+#endif
 
                 int old_tend = tend;
                 int old_tbeg = tbeg;
diff --git a/mpileup.c b/mpileup.c
@@ -72,6 +72,7 @@ typedef struct {
     uint32_t fmt_flag;
     int rflag_skip_any_unset, rflag_skip_all_unset, rflag_skip_any_set, rflag_skip_all_set, output_type;
     int openQ, extQ, tandemQ, min_support, indel_win_size; // for indels
+    int seqQ_offset;
     double min_frac; // for indels
     double indel_bias, poly_mqual;
     double del_bias; // compensate for diff deletion vs insertion error rates
@@ -878,6 +879,7 @@ static int mpileup(mplp_conf_t *conf)
     conf->bca->indel_win_size = conf->indel_win_size;
     conf->bca->indels_v20 = conf->indels_v20;
     conf->bca->edlib = conf->edlib;
+    conf->bca->seqQ_offset = conf->seqQ_offset;
     conf->bca->poly_mqual = conf->poly_mqual;
     conf->bca->vs_ref = conf->vs_ref;
 
@@ -1288,6 +1290,7 @@ static void print_usage(FILE *fp, const mplp_conf_t *mplp)
     fprintf(fp,
         "      --indels-2.0        New EXPERIMENTAL indel calling model (diploid reference consensus)\n"
         "      --indels-cns        New EXPERIMENTAL indel calling model with edlib\n"
+        "      --seqq-offset       Indel-cns tuning for indel seq-qual scores [120]\n"
         "      --no-indels-cns     Disable CNS mode, to use after a -X profile\n"
         "      --poly-mqual        (Edlib mode) Use minimum quality within homopolymers\n");
     fprintf(fp,"\n");
@@ -1364,6 +1367,7 @@ int main_mpileup(int argc, char *argv[])
     mplp.ambig_reads = B2B_DROP;
     mplp.indel_win_size = 80;
     mplp.poly_mqual = 0;
+    mplp.seqQ_offset = 120;
     mplp.clevel = -1;
     mplp.del_bias = 0; // even insertion and deletion likelhoods.
     hts_srand48(0);
@@ -1443,6 +1447,7 @@ int main_mpileup(int argc, char *argv[])
         {"poly-mqual", no_argument, NULL, 24},
         {"no-poly-mqual", no_argument, NULL, 26},
         {"score-vs-ref",required_argument, NULL, 27},
+        {"seqq-offset", required_argument, NULL, 28},
         {NULL, 0, NULL, 0}
     };
     while ((c = getopt_long(argc, argv, "Ag:f:r:R:q:Q:C:BDd:L:b:P:po:e:h:Im:F:EG:6O:xa:s:S:t:T:M:X:U",lopts,NULL)) >= 0) {
@@ -1573,6 +1578,13 @@ int main_mpileup(int argc, char *argv[])
         case  21: mplp.write_index = 1; break;
         case  22: mplp.edlib = 1; mplp.indels_v20 = 0; break;
         case  25: mplp.edlib = 0; break;
+        case  28:
+            mplp.seqQ_offset = atoi(optarg);
+            if (mplp.seqQ_offset < 100)
+                mplp.seqQ_offset = 100;
+            if (mplp.seqQ_offset > 200)
+                mplp.seqQ_offset = 200;
+            break;
         case  23: mplp.del_bias = atof(optarg); break;
         case  24: mplp.poly_mqual = 1; break;
         case  26: mplp.poly_mqual = 0; break;
@@ -1612,7 +1624,8 @@ int main_mpileup(int argc, char *argv[])
                 mplp.extQ = 1;
                 mplp.flag &= ~MPLP_REALN;
                 mplp.del_bias = 0.4;
-                mplp.indel_bias = 1/1.2;
+                mplp.indel_bias = 1/.9;
+                mplp.seqQ_offset = 118;
                 mplp.poly_mqual = 1;
                 mplp.edlib = 1;
                 mplp.vs_ref = 0.7;
@@ -1638,13 +1651,10 @@ int main_mpileup(int argc, char *argv[])
                 mplp.del_bias = 0.4;
                 mplp.poly_mqual = 1;
                 mplp.edlib = 1;
-                // Good trade-offs of homopolymer score vs indel-bias.
-                // --indel-bias 0.7 -h 100-110
-                // --indel-bias 0.8 -h 110, or maybe 120
-                // --indel-bias 0.9 -h 120
-                // --indel-bias 1.0 -h 120, or maybe 130
+                // If we increase -h then we can increase bias denominator too
                 mplp.tandemQ = 110;
-                mplp.indel_bias = 1/0.8;
+                mplp.indel_bias = 1/0.7;
+                mplp.seqQ_offset = 130;
 
             } else if (strcasecmp(optarg, "ultima") == 0 ||
                        strcasecmp(optarg, "ultima-1.20") == 0) {
@@ -1659,6 +1669,8 @@ int main_mpileup(int argc, char *argv[])
                 mplp.del_bias = 0.3;
                 mplp.poly_mqual = 1;
                 mplp.edlib = 1;
+                mplp.indel_bias = 1/0.7;
+                mplp.seqQ_offset = 140;
                 mplp.vs_ref = 0.3;
 
             } else if (strcasecmp(optarg, "1.12") == 0) {
@@ -1679,12 +1691,15 @@ int main_mpileup(int argc, char *argv[])
                 mplp.edlib = 1;
                 mplp.indel_win_size = 110;
                 mplp.flag |= MPLP_REALN_PARTIAL;
+                mplp.indel_bias = 1;
+                mplp.seqQ_offset = 125;
 
             } else if (strcasecmp(optarg, "bgi") == 0 ||
                        strcasecmp(optarg, "bgi-1.20") == 0) {
                 mplp.min_frac = 0.1;
                 mplp.edlib = 1;
-                mplp.indel_bias = 1/0.9;
+                mplp.indel_bias = 1;
+                mplp.seqQ_offset = 120;
                 mplp.flag |= MPLP_REALN_PARTIAL;
 
             } else if (strcasecmp(optarg, "list") == 0 ||