From bf0cbf26bbf425dae098acfb0707fd68d2ccc262 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 11 Feb 2022 10:01:24 +0000 Subject: [PATCH 01/31] Attempt 1 to fix #1459; move left/right based on STR presence. The fixed +/- indel_win_size is still used in construction of the type[] array, but then we reduce that size down before doing the alignments and evaluating them. This means, where appropriate (nice unique data without lots of STRs) we can assess over a smaller window and avoid the negative impact of other nearby indels. It cures the example covid19 problem, but also reduces recall elsewhere as if we *do* still get other nearby indels (eg due to low complexity data between our candidate indel and a neighbouring one) then we're now paying a much larger normalised per-base hit as the length is smaller. --- bam2bcf_indel.c | 138 +++++++++++++++++++++++++++++++++++++++++++++--- mpileup.c | 2 +- 2 files changed, 132 insertions(+), 8 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 108d50557..2813afa8a 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -482,6 +482,10 @@ static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp, # define MIN(a,b) ((a)<(b)?(a):(b)) #endif +#ifndef MAX +# define MAX(a,b) ((a)>(b)?(a):(b)) +#endif + // Part of bcf_call_gap_prep. // // Realign using BAQ to get an alignment score of a single read vs @@ -542,6 +546,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, // used for adjusting indelQ below l = (int)(100. * sc / (qend - qbeg) + .499) * bca->indel_bias; *score = sc<<8 | MIN(255, l); + //fprintf(stderr, "score = %d, qend-qbeg = %d, => adj score %d\n", sc, qend-qbeg, l); rep_ele *reps, *elt, *tmp; uint8_t *seg = ref2 + tbeg - left; @@ -601,6 +606,10 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, for (s = K = 0; s < n; ++s) { for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; + // Labelling is confusing here. 
+ // sct is short for score. + // sc is score + t(type) + // Why aren't these variable names reversed? int *sct = &score[K*n_types], seqQ, indelQ; for (t = 0; t < n_types; ++t) sc[t] = sct[t]<<6 | t; for (t = 1; t < n_types; ++t) // insertion sort @@ -614,6 +623,8 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, * compromise for multi-allelic indels. */ if ((sc[0]&0x3f) == ref_type) { + // sc >> 14 is the total score. It's been shifted by 8 + // from normalised score and 6 from type. indelQ = (sc[1]>>14) - (sc[0]>>14); seqQ = est_seqQ(bca, types[sc[1]&0x3f], l_run); } else { @@ -622,8 +633,14 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, indelQ = (sc[t]>>14) - (sc[0]>>14); seqQ = est_seqQ(bca, types[sc[0]&0x3f], l_run); } - tmp = sc[0]>>6 & 0xff; + + tmp = sc[0]>>6 & 0xff; // normalised score + // reduce indelQ + // high score = bad, low score = good. + // low normalised scores leave indelQ unmodified + // high normalised scores set indelQ to 0 + // inbetween scores have a linear scale from indelQ to 0 indelQ = tmp > 111? 0 : (int)((1. - tmp/111.) * indelQ + .499); // Doesn't really help accuracy, but permits -h to take @@ -632,8 +649,8 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, if (indelQ > 255) indelQ = 255; if (seqQ > 255) seqQ = 255; p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total - sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; - // fprintf(stderr, "pos=%d read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", pos, s, i, bam1_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); + sumq[sc[0]&0x3f] += indelQ < seqQ? 
indelQ : seqQ; // FIXME: redundant; always indelQ + // fprintf(stderr, " read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", s, i, bam_get_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); } } // determine bca->indel_types[] and bca->inscns @@ -721,13 +738,118 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // calculate left and right boundary left = pos > bca->indel_win_size ? pos - bca->indel_win_size : 0; right = pos + bca->indel_win_size; - if (types[0] < 0) right -= types[0]; + int del_size = types[0]<0 ? -types[0] : 0; + int ins_size = types[0]>0 ? types[0] : 0; + right += del_size; // in case the alignments stand out the reference for (i = pos; i < right; ++i) if (ref[i] == 0) break; right = i; + // FIXME: move to own function: STR_adj_left_right? + if (1) { + rep_ele *reps, *elt, *tmp; + + // Convert ASCII to 0,1,2,3 seq for find_STR usage + int j; + char ref4[1024]; // FIXME, check! + if (right > left+1024) + right = left+1024; + for (j = 0, i = left; i < right; i++, j++) { + switch(ref[i]) { + case 'A': ref4[j] = 0; break; + case 'C': ref4[j] = 1; break; + case 'G': ref4[j] = 2; break; + case 'T': ref4[j] = 3; break; + default: ref4[j] = j%4; break; // mix N across all 4 types + } + } + reps = find_STR(ref4, right-left, 0); + + int over_l = pos-1; + int over_r = pos+del_size+1; + int adjusted = 1; + + //fprintf(stderr, "\nRef at %d: %.*s\n", left, right-left, ref+left); + +#if 0 + DL_FOREACH_SAFE(reps, elt, tmp) { + //fprintf(stderr, "rep %d..%d: %.*s\n", elt->start, elt->end, + // elt->end-elt->start+1, ref+left+elt->start); + if (elt->start + left < over_l && elt->end + left >= pos-1) { + over_l = elt->start + left; + //fprintf(stderr, "Adj left\n"); + adjusted=1; + } + if (elt->end + left > over_r && elt->start + left <= pos+1) { + over_r = elt->end + left; + //fprintf(stderr, "Adj right\n"); + adjusted=1; + } + //DL_DELETE(reps, elt); + //free(elt); + } + + // 2nd pass, adjusting to next STR so require 2 STRs out + if (adjusted) { 
+ int pos_l = over_l; + int pos_r = over_r; + DL_FOREACH_SAFE(reps, elt, tmp) { + if (elt->start + left < over_l && elt->end + left >= pos_l-1) + over_l = elt->start + left; + if (elt->end + left > over_r && elt->start + left <= pos_r+1) + over_r = elt->end + left; + DL_DELETE(reps, elt); + free(elt); + } + } + //fprintf(stderr, "STR overlap = %d..(%d)..%d\n", over_l, pos, over_r); + + // FIXME adjustable param + over_l = pos - (pos-over_l)*2; + over_r = pos + (over_r-pos)*2; + //over_l -= 5+del_size+ins_size; + //over_r += 5+del_size+ins_size; + + over_l -= 5+3*(del_size+ins_size); + over_r += 5+3*(del_size+ins_size); + //fprintf(stderr, "=> overlap = %d..(%d)..%d\n", over_l, pos, over_r); + if (left < over_l) + left = over_l; + if (right > over_r) + right = over_r; +#else + // Too many FNs, but OK otherwise. + char str[1024] = {0}; + const int n = 3; + DL_FOREACH_SAFE(reps, elt, tmp) { + int i, i_start = MAX(elt->start-n, 0), i_end = MIN(elt->end+n, 1024); +// fprintf(stderr, "rep %d..%d: %.*s\n", elt->start, elt->end, +// elt->end-elt->start+1, ref+left+elt->start); + for (i = i_start; i < i_end; i++) + str[i]=1; + DL_DELETE(reps, elt); + free(elt); + } + int score; + for (score = 3, i = pos; i > left && score; i--) + score -= str[i-left]==0; + int left_new = i; + + for (score = 3, i = pos; i < right && score; i++) + score -= str[i-left]==0; + int right_new = i; + +// fprintf(stderr, "left %d, %d, pos %d, %d, right %d\n", +// left, left_new, pos, right_new, right); + + left = left_new; + right = right_new; +#endif + } + +// fprintf(stderr, "=== POS %d, left/right = len %d\n", pos, right-left); /* The following call fixes a long-existing flaw in the INDEL * calling model: the interference of nearby SNPs. However, it also @@ -895,7 +1017,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // RG platform field. int long_read = p->b->core.l_qseq > 1000; - // do realignment; this is the bottleneck + // do realignment; this is the bottleneck. 
+ // + // Note low score = good, high score = bad. if (tend > tbeg) { if (bcf_cgp_align_score(p, bca, types[t], (uint8_t *)ref2 + left2-left, @@ -920,9 +1044,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, fputc("ACGTN"[(int)query[l]], stderr); fputc('\n', stderr); fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s " - "qbeg=%d tbeg=%d score=%d\n", + "qbeg=%d tbeg=%d score=%d,%d\n", pos, types[t], s, i, bam_get_qname(p->b), - qbeg, tbeg, sc); + qbeg, tbeg, score[K*n_types + t]>>8, score[K*n_types + t]&0xff); #endif } } diff --git a/mpileup.c b/mpileup.c index fd5aa510e..3106ac51a 100644 --- a/mpileup.c +++ b/mpileup.c @@ -1412,7 +1412,7 @@ int main_mpileup(int argc, char *argv[]) if ( *tmp ) error("Could not parse argument: --indel-size %s\n", optarg); if ( mplp.indel_win_size < 110 ) { - mplp.indel_win_size = 110; + //mplp.indel_win_size = 110; fprintf(stderr,"Warning: running with --indel-size %d, the requested value is too small\n",mplp.indel_win_size); } } From 7710d96d2d9b9cc83b49ad4674ce577ab3cc0d4b Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 18 Feb 2022 16:02:18 +0000 Subject: [PATCH 02/31] WIP2. "Shift7-RF1c50-2ref" method so far. First 30MB of SynDip chr1: SNP Q>0 / Q>=30 / Filtered InDel TP 4915 / 4817 / 4796 InDel FP 334 / 266 / 226 InDel GT 265 / 249 / 245 InDel FN 1528 / 1626 / 1647 vs devel: InDel TP 4910 / 4828 / 4807 InDel FP 266 / 192 / 169 InDel GT 240 / 230 / 226 InDel FN 1533 / 1615 / 1636 So not a win apparently. However it's a starting point, and many of the FP/FN are related to differing calls at correct locations. --- INSTALL | 19 ++ bam2bcf_indel.c | 611 ++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 609 insertions(+), 21 deletions(-) diff --git a/INSTALL b/INSTALL index bcdd2f4e3..b02782c3e 100644 --- a/INSTALL +++ b/INSTALL @@ -266,3 +266,22 @@ mingw-w64-x86_64-tools-git (The last is only needed for building libraries compatible with MSVC.) 
+Windows MSYS2/MINGW64 +--------------------- + +The configure script must be used as without it the compilation will +likely fail. + +Follow MSYS2 installation instructions at +https://www.msys2.org/wiki/MSYS2-installation/ + +Then relaunch to MSYS2 shell using the "MSYS2 MinGW x64" executable. +Once in that environment (check $MSYSTEM equals "MINGW64") install the +compilers using pacman -S and the following package list: + +base-devel mingw-w64-x86_64-toolchain +mingw-w64-x86_64-libdeflate mingw-w64-x86_64-zlib mingw-w64-x86_64-bzip2 +mingw-w64-x86_64-xz mingw-w64-x86_64-curl mingw-w64-x86_64-autotools +mingw-w64-x86_64-tools-git + +(The last is only needed for building libraries compatible with MSVC.) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 2813afa8a..b1213e8c0 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -277,8 +277,9 @@ static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp, // or NULL on failure. static char **bcf_cgp_ref_sample(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, - int left, int right) { + int left, int right, int max_ref2) { int i, k, s, L = right - left + 1, max_i, max2_i; + if (L < max_ref2) L = max_ref2; char **ref_sample; // returned uint32_t *cns = NULL, max, max2; char *ref0 = NULL, *r; @@ -407,6 +408,438 @@ static char **bcf_cgp_ref_sample(int n, int *n_plp, bam_pileup1_t **plp, return NULL; } +// Increment ins["str"] and freq["str"] +#define NI 10 // number of alternative insertion sequences +// Could use a hash table too, but expectation is a tiny number of alternatives +typedef struct { + char *str[NI]; + int len[NI]; + int freq[NI]; +} str_freq; + +static int bcf_cgp_append_cons(str_freq *sf, char *str, int len) { + int j; + + for (j = 0; j < NI && sf->str[j]; j++) { + if (sf->len[j] == len && memcmp(sf->str[j], str, len) == 0) + break; + } + if (j >= NI) + return 0; // too many choices; discard + + sf->freq[j]++; + if (!sf->str[j]) { + // new insertion + if 
(!(sf->str[j] = malloc(len+1))) + return -1; + memcpy(sf->str[j], str, len); + sf->len[j] = len; + } + + return 0; +} + +/* + * Compute the consensus for a specific indel type at pos. + * + * left_shift is the number of inserted(+) or deleted(-) bases added to + * the consensus before we get to pos. This is necessary so the alignment + * band is correct as it's expected to start at left/right edges in + * sync + * + * We accumulate into several buffers for counting base types: + * cons_base - consensus of data with p->indel == type, bases or gap + * ref_base - consensus of data with p->indel != type, bases or gap + * cons_ins - consensus of data with p->indel == type, insertions + * + * The purpose of cons_ins vs cons_base is if we have very low + * coverage due to nearly all reads being another type, then we can + * still get a robust consensus using the other data. If we don't + * have shallow data, then we'll not use as much of ref_base as we may + * have correlated variants. + * + * Eg: + * REF: AGCTATGAGGCTGATA + * SEQ: AGGTAGGAGGGTGATA (x1) + * SEQ: AGCTACGAGG*TGATA (x24) + * SEQ: AGCTACTAGG*TGATA (x24) + * + * Cons for no-del is Cs not Gs: + * CON: AGCTACNAGGGTGATA + */ +static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, + int pos, bcf_callaux_t *bca, const char *ref, + char *ref_sample, + int left, int right, int sample, int type, + int *left_shift, int *right_shift) { + int (*cons_base)[6] = calloc(right - left + 1, sizeof(*cons_base));// single base or del + str_freq *cons_ins = calloc(right - left + 1, sizeof(*cons_ins)); // multi-base insertions + + // non-indel ref for all reads on this sample, rather than those just + // matching type. We use this for handling the case where we have a + // homozygous deletion being studied, but with 1 or 2 reads misaligned + // and containing a base there. + // + // Eg if the type[]=0 consensus is made up of a very small sample size, + // which is also enriched for highly error prone data. 
We can use + // the other reads from type[] != 0 to flesh out the consensus and + // improve accuracy. + int (*ref_base)[6] = calloc(right - left + 1, sizeof(*ref_base)); + + int i, j, k, s = sample; + + // Seed cons_base with ref so lack of data still aligns against the ref. + // fprintf(stderr, "ref=%.*s\n", right-left, ref+left); + + // FIXME: do this at end, and use ref_sample instead of ref. + // We add a figure based on the total depth. + // Eg if we added 50 out of 100 then our consensus is probably fine. + // If we added 2 out of 100 then they're probably erroneous, so we + // don't want to start calling them or Ns. Use ref_sample instead. + // So the amount we add should be a proportion of the amount we didn't + // include in our consensus. Eg MAX(0, depth - Nused*2). + + // FIXME: maybe this no longer matters now we have ref_base[] +#define REF_SEED 1 + for (i = left; i < right; i++) { + switch(ref[i]) { + case 'A': cons_base[i-left][0]+=REF_SEED; break; + case 'C': cons_base[i-left][1]+=REF_SEED; break; + case 'G': cons_base[i-left][2]+=REF_SEED; break; + case 'T': cons_base[i-left][3]+=REF_SEED; break; + default: cons_base[i-left][4]+=REF_SEED; break; + } + } + + // Accumulate sequences into cons_base and cons_ins arrays + int last_base_ins = 0; + for (i = 0; i < n_plp[s]; i++) { + const bam_pileup1_t *p = plp[s] + i; +// if (p->indel != type) +// continue; + + // fprintf(stderr, "p=%d\t%d/%d: Seq %3d of %3d\t", p->b->core.pos, s, type, i, n_plp[s]); + + bam1_t *b = p->b; + int x = b->core.pos; // ref coordinate + int y = 0; // seq coordinate + uint32_t *cigar = bam_get_cigar(b); + uint8_t *seq = bam_get_seq(b); + + last_base_ins = 0; + for (k = 0; k < b->core.n_cigar; ++k) { + int op = cigar[k] & BAM_CIGAR_MASK; + int len = cigar[k] >> BAM_CIGAR_SHIFT; + int base; + + switch(op) { + case BAM_CSOFT_CLIP: + y += len; + break; + + case BAM_CMATCH: + case BAM_CEQUAL: + case BAM_CDIFF: { + int L[16] = { + // 1,2,4,8 to 0,1,2,3 plus 4 for N/ambig (and 5 for 
gap) + 4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4 + }; + + // Can short-cut this with j_start and j_end based on x+len and left,right + for (j = 0; j < len; j++, x++, y++) { + if (x < left) continue; + if (x >= right) break; + if (last_base_ins) { + last_base_ins = 0; + continue; + } + base = bam_seqi(seq, y); + if (p->indel == type) + cons_base[x-left][L[base]]++; + //else + ref_base[x-left][L[base]]++; + // fputc(seq_nt16_str[base], stderr); + } + break; + } + + case BAM_CINS: { + if (p->indel != type) + break; + + char ins[1024]; + for (j = 0; j < len; j++, y++) { + if (x < left) continue; + if (x >= right) break; + base = bam_seqi(seq, y); + if (j < 1024) + ins[j] = seq_nt16_str[base]; + } + + // Insertions come before a ref match. + // 5I 5M is IIIIIM M M M M events, not + // {IIIII,M} M M M M choice. So we need to include the + // next match in our sequence when choosing the consensus. + if (y < b->core.l_qseq) { + base = bam_seqi(seq, y); + if (j < 1024) + ins[j++] = seq_nt16_str[base]; + } + last_base_ins = 1; + + // fprintf(stderr, "<+%.*s>", j<1024?j:1024, ins); + if (x >= left && x < right) + bcf_cgp_append_cons(&cons_ins[x-left], ins, j<1024?j:1024); + break; + } + + case BAM_CDEL: + // FIXME, not perfect for I/D combos, but likely sufficient. + last_base_ins = 0; + for (j = 0; j < len; j++, x++) { + if (x < left) continue; + if (x >= right) break; + // fputc('-', stderr); + if (p->indel == type) + cons_base[x-left][5]++; + //else + ref_base[x-left][5]++; + } + break; + } + } + // fprintf(stderr, " %s\n", bam_get_qname(p->b)); + } + + // Expand cons_base to include depth from ref_sample. 
+ // Caveat: except at pos itself, where true ref is used if type != 0 + for (i = 0; i < right-left; i++) { + // Total observed depth + int t = cons_base[i][0] + cons_base[i][1] + cons_base[i][2] + + cons_base[i][3] + cons_base[i][4] + cons_base[i][5]; + for (j = 0; j < NI; j++) { + if (!cons_ins[i].str[j]) + break; + t += cons_ins[i].freq[j]; + } + + int r = ref_base[i][0] + ref_base[i][1] + ref_base[i][2] + + ref_base[i][3] + ref_base[i][4] + ref_base[i][5]; + + double rfract = (r - t*2)*.75 / (r+1); + if (rfract > 0) { // && !(type == 0 && i+left == pos)) { + if (i+left >= pos+1 && i+left <= pos+1-(type<0?type+1:0)) { + int rem = rfract * n_plp[s]; + fprintf(stderr, "rfract=%f rem=%d type=%d, t=%d r=%d\n", rfract, rem, type, t, r); +// switch(ref_sample[i]) { +// case 1: cons_base[i][0] += rem; break; // A +// case 2: cons_base[i][1] += rem; break; // C +// case 4: cons_base[i][2] += rem; break; // G +// case 8: cons_base[i][3] += rem; break; // T +// default:cons_base[i][4] += rem; break; // N +// } + } else { + cons_base[i][0] += rfract * ref_base[i][0]; + cons_base[i][1] += rfract * ref_base[i][1]; + cons_base[i][2] += rfract * ref_base[i][2]; + cons_base[i][3] += rfract * ref_base[i][3]; + cons_base[i][4] += rfract * ref_base[i][4]; + cons_base[i][5] += rfract * ref_base[i][5]; + } + } + +// // A portion of what's left copied from ref_sample +// int rem = (n_plp[s] - t*2)*.75; +// if (rem > 0) { +// rem = REF_SEED; // FUDGE; minimal count to block N +// // Add in the full "rem" amount and we get many more FN again +// // (but low low FP). Assume this is the del off-target being +// // turned back into bases? +// // +// // We could use the ref_sample construction code which adds to +// // cns[] as depth to track base vs gap. Or write a newer +// // ref_sample creation code. Do it right here infact... 
+// switch(ref_sample[i]) { +// case 1: cons_base[i][0] += rem; break; // A +// case 2: cons_base[i][1] += rem; break; // C +// case 4: cons_base[i][2] += rem; break; // G +// case 8: cons_base[i][3] += rem; break; // T +// default:cons_base[i][4] += rem; break; // N +// } +// } + } + + // Allocate consensus buffer, to worst case length + int max_len = right-left; + for (i = 0; i < right-left; i++) { + if (!cons_ins[i].str[0]) + continue; + + int ins = 0; + for (j = 0; j < NI; j++) { + if (!cons_ins[i].str[j]) + break; + if (cons_ins[i].str[j] && ins < cons_ins[i].len[j]) + ins = cons_ins[i].len[j]; + } + max_len += ins; + } + char *cons = malloc(max_len+1); + + // FIXME: helps sometimes, harms others + + // Merge insertions where they are the same length but different + // sequences. + // NB: we could just index by length and have accumulators for each, + // instead of storing separately and merging later (here). + // Ie str_freq.str is [NI][5] instead. + for (i = 0; i < right-left; i++) { + int ins[1024][5]; + for (j = 0; j < NI; j++) { + if (!cons_ins[i].str[j]) + break; + + if (cons_ins[i].freq[j] == 0) + continue; // already merged + + int l; + for (l = 0; l < cons_ins[i].len[j]; l++) { + // FIXME! optimise this + ins[l][0] = ins[l][1] = ins[l][2] = ins[l][3] = ins[l][4] = 0; + switch(cons_ins[i].str[j][l]) { + case 'A': ins[l][0] = cons_ins[i].freq[j]; break; + case 'C': ins[l][1] = cons_ins[i].freq[j]; break; + case 'G': ins[l][2] = cons_ins[i].freq[j]; break; + case 'T': ins[l][3] = cons_ins[i].freq[j]; break; + default: ins[l][4] = cons_ins[i].freq[j]; break; + } + } + + // Merge other insertions of the same length to ins[] counters + for (k = j+1; k < NI; k++) { + if (!cons_ins[i].str[k]) + break; + if (cons_ins[i].len[k] != cons_ins[i].len[j]) + continue; + if (cons_ins[i].freq[k] == 0) + continue; // redundant? + + // Merge str[j] and str[k] + for (l = 0; l < cons_ins[i].len[k]; l++) { + // FIXME! 
optimise this + switch(cons_ins[i].str[k][l]) { + case 'A': ins[l][0]+=cons_ins[i].freq[k]; break; + case 'C': ins[l][1]+=cons_ins[i].freq[k]; break; + case 'G': ins[l][2]+=cons_ins[i].freq[k]; break; + case 'T': ins[l][3]+=cons_ins[i].freq[k]; break; + default: ins[l][4]+=cons_ins[i].freq[k]; break; + } + } + cons_ins[i].freq[j] += cons_ins[i].freq[k]; + cons_ins[i].freq[k] = 0; + } + + // Now replace ins[j] with the consensus insertion of this len. + for (l = 0; l < cons_ins[i].len[j]; l++) { + int max_v = 0, base = 0; + int tot = ins[l][0] + ins[l][1] + ins[l][2] + + ins[l][3] + ins[l][4]; + if (max_v < ins[l][0]) max_v = ins[l][0], base = 0; + if (max_v < ins[l][1]) max_v = ins[l][1], base = 1; + if (max_v < ins[l][2]) max_v = ins[l][2], base = 2; + if (max_v < ins[l][3]) max_v = ins[l][3], base = 3; + if (max_v < ins[l][4]) max_v = ins[l][4], base = 4; + + cons_ins[i].str[j][l] = (max_v > 0.8*tot) ?"ACGTN"[base] :'N'; + } + } + } + +#define CONS_CUTOFF .40 // 40% needed for base vs N +#define CONS_CUTOFF_INC .30 // 30% to include any insertion. 
+#define CONS_CUTOFF_INS .60 // and then 60% needed for it to be bases vs N + // Walk through the frequency arrays to call the consensus + *left_shift = 0; + *right_shift = 0; + for (i = k = 0; i < right-left; i++) { + // fprintf(stderr, "%d\t", i); + int max_v = 0, max_j = 4, tot = 0; + for (j = 0; j < 6; j++) { + if (max_v < cons_base[i][j]) + max_v = cons_base[i][j], max_j = j; + tot += cons_base[i][j]; +// if (cons_base[i][j]) +// fprintf(stderr, "%c%d ", "ACGTN*"[j], cons_base[i][j]); + } + + // +INS + int max_v_ins = 0, max_j_ins = 0; + int tot_ins = 0; + for (j = 0; j < NI; j++) { + if (!cons_ins[i].str[j]) + break; + if (cons_ins[i].freq[j] == 0) + continue; // previously merged + + if (max_v_ins < cons_ins[i].freq[j]) + max_v_ins = cons_ins[i].freq[j], max_j_ins = j; + tot_ins += cons_ins[i].freq[j]; + +// fprintf(stderr, "%.*s%d ", cons_ins[i].len[j], cons_ins[i].str[j], +// cons_ins[i].freq[j]); + } + if (max_v_ins > CONS_CUTOFF_INC*(tot+tot_ins)) { + if (max_v_ins > CONS_CUTOFF_INS*tot_ins) { + // Insert bases + for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) { + // FIXME: commented out to deliberately get consensus shift. + // Need to know how to get aligner working properly in that + // scenario, as it'll happen sometimes! 
+ if (k < pos-left+*left_shift) + (*left_shift)++; + else + (*right_shift)++; + cons[k++] = cons_ins[i].str[max_j_ins][j]; + } + } else { + for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) + cons[k++] = 'N'; + } + continue; // don't call next base as included in insertion + // NB causes debugging output missing newlines, but meh + } + + // Call + if (max_v > CONS_CUTOFF*tot) { + if (max_j != 5) // gap + cons[k++] = "ACGTN*"[max_j]; + else if (k < pos-left+*left_shift) + (*left_shift)--; + else + (*right_shift)++; + } else { + cons[k++] = 'N'; + } + + // fprintf(stderr, "\n"); + } + cons[k++] = '\0'; + + // fprintf(stderr, "Cons: %s\n", cons); + free(cons_base); + free(ref_base); + + for (i = 0; i < right-left; i++) { + for (j = 0; j < NI; j++) + // FIXME: replace by string pool + if (cons_ins[i].str[j]) + free(cons_ins[i].str[j]); + } + free(cons_ins); + + return cons; +} + // The length of the homopolymer run around the current position static int bcf_cgp_l_run(const char *ref, int pos) { int i, l_run; @@ -489,13 +922,44 @@ static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp, // Part of bcf_call_gap_prep. // // Realign using BAQ to get an alignment score of a single read vs -// a haplotype consensus. +// a haplotype consensus. TODO: replace BAQ with something more robust. +// +// There are many coordinates, so let's explain them. +// - left, right, tbeg, tend, r_start and r_end are in aligned reference +// coordinates. +// left/right start from pos +/- indel_win_size. +// r_start/r_end are the BAM first and last mapped coord on the reference. +// tbeg and tend are the intersection of the two. +// - qbeg and qend are in BAM sequence coordinates +// - qpos is in sequence coordinates, relative to qbeg. +// +// To see what this means, we have illustrations with coordinates +// above the seqs in reference space and below the seqs in BAM seq space. 
+// +// Overlap left: +// tbeg tend +// r_start left pos r_end right +// REF :..............|--------------------#------:--------------|... +// SEQ :..............|--------------------#------| +// 0 qbeg qpos qend +// +// Overlap right: +// r_start tend +// left tbeg pos right r_end +// REF ...|--------------:-----#---------------------|...........: +// SEQ |-----#---------------------|...........: +// qbeg qpos qend +// 0 +// +// The "-" sequence is the bit passed in. +// Ie ref2 spans left..right and query spans qbeg..qend. +// We need to adjust ref2 therefore to tbeg..tend. // // Fills out score // Returns 0 on success, // <0 on error -static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, - int type, uint8_t *ref2, uint8_t *query, +static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, + uint8_t *ref1, uint8_t *ref2, uint8_t *query, int r_start, int r_end, int long_read, int tbeg, int tend, int left, int right, @@ -514,8 +978,8 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, } type = abs(type); - apf.bw = type + 3; - int l, sc; + apf.bw = type + 3; // apf.bw=100; + int l, sc1, sc2; const uint8_t *qual = bam_get_qual(p->b), *bq; uint8_t *qq; @@ -535,17 +999,32 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, // The bottom 8 bits are length-normalised score while // the top bits are unnormalised. - sc = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type, - query, qend - qbeg, qq, &apf, 0, 0); - if (sc < 0) { + // + // Try original cons and new cons and pick best. + // This doesn't remove FN much (in fact maybe adds very slightly), + // but it does reduce GT errors and some slight reduction to FP. 
+ sc1 = probaln_glocal(ref1 + tbeg - left, tend - tbeg + type, + query, qend - qbeg, qq, &apf, 0, 0); + sc2 = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type, + query, qend - qbeg, qq, &apf, 0, 0); + if (sc1 < 0 && sc2 < 0) { *score = 0xffffff; free(qq); return 0; } + if (sc1 < 0) { + // sc2 is already correct + } else if (sc2 < 0) { + sc2 = sc1; + } else { + // sc1 and sc2 both pass, so use best + if (sc2 > sc1) + sc2 = sc1; + } // used for adjusting indelQ below - l = (int)(100. * sc / (qend - qbeg) + .499) * bca->indel_bias; - *score = sc<<8 | MIN(255, l); + l = (int)((100. * sc2 / (qend - qbeg) + .499) * bca->indel_bias); + *score = sc2<<8 | MIN(255, l); //fprintf(stderr, "score = %d, qend-qbeg = %d, => adj score %d\n", sc, qend-qbeg, l); rep_ele *reps, *elt, *tmp; @@ -716,7 +1195,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins; int *score, max_ref2; int N, K, l_run, ref_type, n_alt; - char *inscns = 0, *ref2, *query, **ref_sample; + char *inscns = 0, *ref1, *ref2, *query, **ref_sample; + + // FIXME: Does 2 references help? // determine if there is a gap for (s = N = 0; s < n; ++s) { @@ -748,7 +1229,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, right = i; // FIXME: move to own function: STR_adj_left_right? - if (1) { + if (0) { rep_ele *reps, *elt, *tmp; // Convert ASCII to 0,1,2,3 seq for find_STR usage @@ -851,6 +1332,12 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // fprintf(stderr, "=== POS %d, left/right = len %d\n", pos, right-left); + // compute the likelihood given each type of indel for each read + max_ins = types[n_types - 1]; // max_ins is at least 0 + max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? 
max_ins : -types[0]); + // FIXME: add fudge to permit some extra neighbouring indels + max_ref2 += 50; + /* The following call fixes a long-existing flaw in the INDEL * calling model: the interference of nearby SNPs. However, it also * reduces the power because sometimes, substitutions caused by @@ -859,13 +1346,13 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, * * Masks mismatches present in at least 70% of the reads with 'N'. */ - ref_sample = bcf_cgp_ref_sample(n, n_plp, plp, pos, bca, ref, left, right); + ref_sample = bcf_cgp_ref_sample(n, n_plp, plp, pos, bca, ref, left, right, + max_ref2); // The length of the homopolymer run around the current position l_run = bcf_cgp_l_run(ref, pos); // construct the consensus sequence (minus indels, which are added later) - max_ins = types[n_types - 1]; // max_ins is at least 0 if (max_ins > 0) { inscns = bcf_cgp_calc_cons(n, n_plp, plp, pos, types, n_types, max_ins, s); @@ -873,14 +1360,24 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, return -1; } - // compute the likelihood given each type of indel for each read - max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]); + ref1 = (char*) calloc(max_ref2, 1); ref2 = (char*) calloc(max_ref2, 1); query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1); score = (int*) calloc(N * n_types, sizeof(int)); bca->indelreg = 0; double nqual_over_60 = bca->nqual / 60.0; + // FIXME: need additional types, or rather to amend the type 0 case? + // + // We have types matching indel, plus type 0 which is ref. + // What about type 0 which matches consensus? + // Eg we have a small (wrong) 1bp insertion at current location, + // and a larger (correct) homozygous insertion say 10 bp away. + // + // We don't want the alignment of seqs vs wrong indel-hypothesis to be + // scoring higher than against ref. So need a consensus with the large + // insertion and no small hypothesised one. 
+ for (t = 0; t < n_types; ++t) { int l, ir; @@ -907,6 +1404,27 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // Realignment score, computed via BAQ for (s = K = 0; s < n; ++s) { + char *tcons, *cp; + int left_shift, right_shift; + tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, + ref_sample[s], + left, right, s, types[t], + &left_shift, &right_shift); + fprintf(stderr, "Cons (%2d) %d/%d %s\n", left_shift, t, s, tcons); + // FIXME: map from ascii to 0,1,2,3,4. + // This is only needed because bcf_cgp_consensus is reporting in ASCII + // currently, for ease of debugging. + for (cp = tcons; *cp; cp++) { + switch(*cp) { + case 'A': *cp = 0; break; + case 'C': *cp = 1; break; + case 'G': *cp = 2; break; + case 'T': *cp = 3; break; + default : *cp = 4; break; + } + } + int tcon_len = cp-tcons; + // Construct ref2 from ref_sample, inscns and indels. // This is now the true sample consensus (possibly prepended // and appended with reference if sample data doesn't span @@ -920,7 +1438,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, for (l = 0; l < types[t]; ++l) ref2[k++] = inscns[t*max_ins + l]; - for (; j < right && ref[j]; ++j) + for (; j < right && ref[j] && k < right-left; ++j) ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; for (; k < max_ref2; ++k) ref2[k] = 4; @@ -928,6 +1446,41 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, if (right > j) right = j; + memcpy(ref1, ref2, right-left); // original consensus method + fprintf(stderr, "Type %d = %2d\t", t, types[t]); + for (j = 0; j < right-left; j++) + putc("ACGTN"[ref2[j]], stderr); + putc('\n', stderr); + + // Our computed consensus may start/end in slightly different + // positions due to indels. + // We pad it out with Ns so sequences overlapping don't + // carry penalties. (Ideally we'd pad with the reference, but + // this suffices and it's tricky to track.) 
+ int ref2_pos = 0; + int rright = left + tcon_len; // ref left/right + if (left_shift > 0) { + memset(ref2, 4/*N*/, MIN(left_shift, max_ref2)); + ref2_pos += MIN(left_shift, max_ref2); +// rright += MIN(left_shift, max_ref2); +// if (rright-left > max_ref2) +// rright = left+max_ref2; + } + memcpy(ref2 + ref2_pos, tcons, MIN(tcon_len, max_ref2-ref2_pos)); + ref2_pos += MIN(tcon_len, max_ref2-ref2_pos); + if (right_shift > 0) { + memset(ref2 + ref2_pos, 4/*N*/, + MIN(right_shift, max_ref2-ref2_pos)); +// rright += MIN(right_shift, max_ref2-ref2_pos); + } + //memcpy(ref2, tcons, tcon_len); + free(tcons); + + fprintf(stderr, "TYPE %d = %2d\t", t, types[t]); + for (j = 0; j < right-left && j < max_ref2; j++) + putc("ACGTN"[ref2[j]], stderr); + putc('\n', stderr); + // align each read to ref2 for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; @@ -1009,6 +1562,17 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, tbeg = tbeg - l > left? tbeg - l : left; } + // FIXME: Why +20? tbeg-left_shift to tend+right_shift + // is still insufficient. Why? Check tpos2qpos maybe? + if (left_shift+20 > 0) + tbeg = tbeg - (left_shift+20) > left + ? tbeg - (left_shift+20) + : left; + if (right_shift+20 > 0) + tend = tend + right_shift+20 < rright + ? tend + right_shift+20 + : rright; + // write the query sequence for (l = qbeg; l < qend; ++l) query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)]; @@ -1022,10 +1586,11 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // Note low score = good, high score = bad. 
if (tend > tbeg) { if (bcf_cgp_align_score(p, bca, types[t], + (uint8_t *)ref1 + left2-left, (uint8_t *)ref2 + left2-left, (uint8_t *)query, r_start, r_end, long_read, - tbeg, tend, left2, right2, + tbeg, tend, left2, rright, qbeg, qend, qpos, max_deletion, &score[K*n_types + t]) < 0) { score[K*n_types + t] = 0xffffff; @@ -1036,9 +1601,12 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // region entirely within a deletion (thus tend < tbeg). score[K*n_types + t] = 0xffffff; } -#if 0 - for (l = 0; l < tend - tbeg + abs(types[t]); ++l) +#if 1 + for (l = 0; l < tend - tbeg + abs(types[t]); ++l) { + if (tbeg-left+l >= max_ref2) + break; fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr); + } fputc('\n', stderr); for (l = 0; l < qend - qbeg; ++l) fputc("ACGTN"[(int)query[l]], stderr); @@ -1057,6 +1625,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, ref_type, types, n_types, score); // free + free(ref1); free(ref2); free(query); free(score); From 0e050ceb9eb3310c5cb77aa8a29f75861617dbab Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 21 Feb 2022 14:29:54 +0000 Subject: [PATCH 03/31] Fixes to consensus. Multiple del types need to use ref_cons based on the biggest del not the current one, so we're not incorporating other bits of deletion in the current "run". Bug fix to seq-pos ("y") when doing cons_base vs ref_base on INS. 
--- bam2bcf_indel.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index b1213e8c0..b96892ab3 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -469,12 +469,14 @@ static int bcf_cgp_append_cons(str_freq *sf, char *str, int len) { */ static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, - char *ref_sample, - int left, int right, int sample, int type, + char *ref_sample, int left, int right, + int sample, int type, int biggest_del, int *left_shift, int *right_shift) { int (*cons_base)[6] = calloc(right - left + 1, sizeof(*cons_base));// single base or del str_freq *cons_ins = calloc(right - left + 1, sizeof(*cons_ins)); // multi-base insertions + biggest_del = biggest_del<0?biggest_del+1:0; + // non-indel ref for all reads on this sample, rather than those just // matching type. We use this for handling the case where we have a // homozygous deletion being studied, but with 1 or 2 reads misaligned @@ -564,8 +566,10 @@ static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } case BAM_CINS: { - if (p->indel != type) + if (p->indel != type) { + y += len; // for when adding to ref_base break; + } char ins[1024]; for (j = 0; j < len; j++, y++) { @@ -628,7 +632,8 @@ static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, double rfract = (r - t*2)*.75 / (r+1); if (rfract > 0) { // && !(type == 0 && i+left == pos)) { - if (i+left >= pos+1 && i+left <= pos+1-(type<0?type+1:0)) { + //if (i+left >= pos+1 && i+left <= pos+1-(type<0?type+1:0)) { + if (i+left >= pos+1 && i+left <= pos+1-biggest_del) { int rem = rfract * n_plp[s]; fprintf(stderr, "rfract=%f rem=%d type=%d, t=%d r=%d\n", rfract, rem, type, t, r); // switch(ref_sample[i]) { @@ -755,9 +760,9 @@ static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } } -#define CONS_CUTOFF .40 // 70% needed for base vs N +#define CONS_CUTOFF .40 
// 40% needed for base vs N #define CONS_CUTOFF_INC .30 // 30% to include any insertion. -#define CONS_CUTOFF_INS .60 // and then 70% needed for it to be bases vs N +#define CONS_CUTOFF_INS .60 // and then 60% needed for it to be bases vs N // Walk through the frequency arrays to call the consensus *left_shift = 0; *right_shift = 0; @@ -1378,6 +1383,11 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // scoring higher than against ref. So need a consensus with the large // insertion and no small hypothesised one. + int biggest_del = 0; + for (t = 0; t < n_types; t++) + if (biggest_del > types[t]) + biggest_del = types[t]; + for (t = 0; t < n_types; ++t) { int l, ir; @@ -1408,7 +1418,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, int left_shift, right_shift; tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, ref_sample[s], - left, right, s, types[t], + left, right, s, types[t], biggest_del, &left_shift, &right_shift); fprintf(stderr, "Cons (%2d) %d/%d %s\n", left_shift, t, s, tcons); // FIXME: map from ascii to 0,1,2,3,4. From d826490547aa053305cb384f95e56251806f6a85 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 22 Feb 2022 11:58:28 +0000 Subject: [PATCH 04/31] Fix display of type/TYPE cons. Debug change only --- bam2bcf_indel.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index b96892ab3..1da63f576 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -760,8 +760,8 @@ static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } } -#define CONS_CUTOFF .40 // 40% needed for base vs N -#define CONS_CUTOFF_INC .30 // 30% to include any insertion. +#define CONS_CUTOFF .30 // 40% needed for base vs N +#define CONS_CUTOFF_INC .20 // 30% to include any insertion. 
#define CONS_CUTOFF_INS .60 // and then 60% needed for it to be bases vs N // Walk through the frequency arrays to call the consensus *left_shift = 0; @@ -983,7 +983,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, } type = abs(type); - apf.bw = type + 3; // apf.bw=100; + apf.bw = type + 3; int l, sc1, sc2; const uint8_t *qual = bam_get_qual(p->b), *bq; uint8_t *qq; @@ -1012,6 +1012,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, query, qend - qbeg, qq, &apf, 0, 0); sc2 = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type, query, qend - qbeg, qq, &apf, 0, 0); + // sc1 = INT_MAX; // disable for now if (sc1 < 0 && sc2 < 0) { *score = 0xffffff; free(qq); @@ -1456,9 +1457,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, if (right > j) right = j; - memcpy(ref1, ref2, right-left); // original consensus method + // original consensus method + memcpy(ref1, ref2, right-left+(types[t]>0?types[t]:0)); fprintf(stderr, "Type %d = %2d\t", t, types[t]); - for (j = 0; j < right-left; j++) + for (j = 0; j < right-left+(types[t]>0?types[t]:0); j++) putc("ACGTN"[ref2[j]], stderr); putc('\n', stderr); @@ -1487,7 +1489,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, free(tcons); fprintf(stderr, "TYPE %d = %2d\t", t, types[t]); - for (j = 0; j < right-left && j < max_ref2; j++) + for (j = 0; j < rright-left && j < max_ref2; j++) putc("ACGTN"[ref2[j]], stderr); putc('\n', stderr); From 7db37bbb9051aa8b2f3b82bb9cab45f0f5073532 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 22 Feb 2022 15:28:24 +0000 Subject: [PATCH 05/31] Apply IDV per type, not just across all indels as a whole. 
--- bam2bcf_indel.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 1da63f576..f0189ab3d 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -251,12 +251,27 @@ static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp, return NULL; } t = 0; - types[t++] = aux[0] - MINUS_CONST; - for (i = 1; i < m; ++i) - if (aux[i] != aux[i-1]) - types[t++] = aux[i] - MINUS_CONST; + for (i = 0; i < m; ++i) { + int sz = (int32_t)(aux[i] - MINUS_CONST); + int j; + for (j = i+1; j < m; j++) + if (aux[j] != aux[i]) + break; + + if (sz == 0 + || j-i >= bca->min_support + // Note, doesn't handle bca->per_sample_flt yet + || bca->per_sample_flt + || (double)(j-i) / n_tot >= bca->min_frac) + types[t++] = sz; + i = j-1; + } free(aux); + if (t <= 1) + return NULL; + n_types = t; + // Find reference type; types[?] == 0) for (t = 0; t < n_types; ++t) if (types[t] == 0) break; @@ -551,12 +566,16 @@ static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, for (j = 0; j < len; j++, x++, y++) { if (x < left) continue; if (x >= right) break; + + // FIXME: don't want this, but alternative isn't + // working yet. if (last_base_ins) { last_base_ins = 0; continue; } base = bam_seqi(seq, y); if (p->indel == type) +// if (p->indel == type || p->indel >= 0) // alternative cons_base[x-left][L[base]]++; //else ref_base[x-left][L[base]]++; @@ -584,7 +603,7 @@ static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // 5I 5M is IIIIIM M M M M events, not // {IIIII,M} M M M M choice. So we need to include the // next match in our sequence when choosing the consensus. 
- if (y < b->core.l_qseq) { + if (y < b->core.l_qseq) { // alternative is to comment out base = bam_seqi(seq, y); if (j < 1024) ins[j++] = seq_nt16_str[base]; @@ -810,8 +829,8 @@ static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) cons[k++] = 'N'; } - continue; // don't call next base as included in insertion - // NB causes debugging output missing newlines, but meh + // don't call next base as included in insertion + continue; // alternative is to comment out. } // Call From 9ac3c72f17eac4b9274b198cae48ba9c4754e33d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 22 Feb 2022 17:33:02 +0000 Subject: [PATCH 06/31] Tweaks to try and improve on insertion calling. The define INS_PLUS_BASE is enabled, which is how the old code worked so no change. This includes the next base matching call in the last base of the insertion, so it's *either* cons_base *or* cons_ins. However we also have an easier to understand alternative, with cons_ins being the inserted bases and cons_base always being used for the matching bases. (This corrects some defects for reads that end on an insertion with no flanking M, although that is rare.) Unfortunately, for reasons unknown, it is very slightly higher on FP/GT.
(Hence we still use the old method) --- bam2bcf_indel.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index f0189ab3d..f811f4da1 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -528,6 +528,11 @@ static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } } +// cons_ins sequence is the insertion seq followed by the +// next match base +#define INS_PLUS_BASE + + // Accumulate sequences into cons_base and cons_ins arrays int last_base_ins = 0; for (i = 0; i < n_plp[s]; i++) { @@ -567,15 +572,18 @@ static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (x < left) continue; if (x >= right) break; - // FIXME: don't want this, but alternative isn't - // working yet. +#ifdef INS_PLUS_BASE if (last_base_ins) { last_base_ins = 0; continue; } +#endif base = bam_seqi(seq, y); +#ifdef INS_PLUS_BASE if (p->indel == type) -// if (p->indel == type || p->indel >= 0) // alternative +#else + if (p->indel == type || p->indel > 0) // alternative +#endif cons_base[x-left][L[base]]++; //else ref_base[x-left][L[base]]++; @@ -603,12 +611,14 @@ static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // 5I 5M is IIIIIM M M M M events, not // {IIIII,M} M M M M choice. So we need to include the // next match in our sequence when choosing the consensus. - if (y < b->core.l_qseq) { // alternative is to comment out +#ifdef INS_PLUS_BASE + if (y < b->core.l_qseq) { base = bam_seqi(seq, y); if (j < 1024) ins[j++] = seq_nt16_str[base]; } last_base_ins = 1; +#endif // fprintf(stderr, "<+%.*s>", j<1024?j:1024, ins); if (x >= left && x < right) @@ -812,7 +822,13 @@ static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // fprintf(stderr, "%.*s%d ", cons_ins[i].len[j], cons_ins[i].str[j], // cons_ins[i].freq[j]); } + // NB: tot is based on next matching base, so it includes + // everything with or without the insertion. 
+#ifdef INS_PLUS_BASE if (max_v_ins > CONS_CUTOFF_INC*(tot+tot_ins)) { +#else + if (max_v_ins > CONS_CUTOFF_INC*tot) { +#endif if (max_v_ins > CONS_CUTOFF_INS*tot_ins) { // Insert bases for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) { @@ -830,7 +846,9 @@ static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, cons[k++] = 'N'; } // don't call next base as included in insertion - continue; // alternative is to comment out. +#ifdef INS_PLUS_BASE + continue; +#endif } // Call From c50d25b77c53f7c602d9b659460338729349e617 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 24 Feb 2022 10:08:34 +0000 Subject: [PATCH 07/31] First stab at returning dual consensus. We have the most common consensus plus a sub-consensus for when neighbouring heterozygous indels are present. This is acting as an alternative to using the old ref_sample method. --- bam2bcf_indel.c | 254 +++++++++++++++++++++++++++++++----------------- 1 file changed, 164 insertions(+), 90 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index f811f4da1..61e4f487e 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -482,11 +482,11 @@ static int bcf_cgp_append_cons(str_freq *sf, char *str, int len) { * Cons for no-del is Cs not Gs: * CON: AGCTACNAGGGTGATA */ -static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, - int pos, bcf_callaux_t *bca, const char *ref, - char *ref_sample, int left, int right, - int sample, int type, int biggest_del, - int *left_shift, int *right_shift) { +static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, + int pos, bcf_callaux_t *bca, const char *ref, + char *ref_sample, int left, int right, + int sample, int type, int biggest_del, + int *left_shift, int *right_shift) { int (*cons_base)[6] = calloc(right - left + 1, sizeof(*cons_base));// single base or del str_freq *cons_ins = calloc(right - left + 1, sizeof(*cons_ins)); // multi-base insertions @@ -718,7 +718,10 @@ static char *bcf_cgp_consensus(int n, int *n_plp, 
bam_pileup1_t **plp, } max_len += ins; } - char *cons = malloc(max_len+1); + char **cons = malloc((max_len+1)*2 + sizeof(char *)*2); + cons[0] = (char *)&cons[2]; + cons[1] = cons[0] + max_len+1; +// char *cons = malloc(max_len+1); // FIXME: helps sometimes, harms others @@ -789,84 +792,120 @@ static char *bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } } -#define CONS_CUTOFF .30 // 40% needed for base vs N -#define CONS_CUTOFF_INC .20 // 30% to include any insertion. -#define CONS_CUTOFF_INS .60 // and then 60% needed for it to be bases vs N - // Walk through the frequency arrays to call the consensus +#define CONS_CUTOFF .30 // 30% needed for base vs N +#define CONS_CUTOFF2 .80 // 80% needed for gap in cons[1] +#define CONS_CUTOFF_INC .20 // 20% to include any insertion cons[0] +#define CONS_CUTOFF_INC2 .80 // 80% to include any insertion cons[1] +#define CONS_CUTOFF_INS .60 // and then 60% needed for it to be bases vs N + // Walk through the frequency arrays to call the consensus. + // We produce cons[0] and cons[1]. Both include strongly + // homozygous indels. Both also include the indel at 'pos'. + // However for heterozygous indels we call the most likely event + // for cons[0] and the less-likely alternative in cons[1]. + // TODO: a proper phase analysis so multiple events end up + // combining together into the correct consensus. 
*left_shift = 0; *right_shift = 0; - for (i = k = 0; i < right-left; i++) { - // fprintf(stderr, "%d\t", i); - int max_v = 0, max_j = 4, tot = 0; - for (j = 0; j < 6; j++) { - if (max_v < cons_base[i][j]) - max_v = cons_base[i][j], max_j = j; - tot += cons_base[i][j]; -// if (cons_base[i][j]) -// fprintf(stderr, "%c%d ", "ACGTN*"[j], cons_base[i][j]); - } + int cnum; + for (cnum = 0; cnum < 2; cnum++) { + for (i = k = 0; i < right-left; i++) { + // fprintf(stderr, "%d\t", i); + int max_v = 0, max_v2 = 0, max_j = 4, max_j2 = 4, tot = 0; + for (j = 0; j < 6; j++) { + // Top 2 consensus calls + if (max_v < cons_base[i][j]) { + max_v2 = max_v, max_j2 = j; + max_v = cons_base[i][j], max_j = j; + } else if (max_v2 < cons_base[i][j]) { + max_v2 = cons_base[i][j], max_j2 = j; + } + tot += cons_base[i][j]; + // if (cons_base[i][j]) + // fprintf(stderr, "%c%d ", "ACGTN*"[j], cons_base[i][j]); + } - // +INS - int max_v_ins = 0, max_j_ins = 0; - int tot_ins = 0; - for (j = 0; j < NI; j++) { - if (!cons_ins[i].str[j]) - break; - if (cons_ins[i].freq[j] == 0) - continue; // previously merged + // +INS + int max_v_ins = 0, max_j_ins = 0; + int tot_ins = 0; + for (j = 0; j < NI; j++) { + if (!cons_ins[i].str[j]) + break; + if (cons_ins[i].freq[j] == 0) + continue; // previously merged - if (max_v_ins < cons_ins[i].freq[j]) - max_v_ins = cons_ins[i].freq[j], max_j_ins = j; - tot_ins += cons_ins[i].freq[j]; + if (max_v_ins < cons_ins[i].freq[j]) + max_v_ins = cons_ins[i].freq[j], max_j_ins = j; + tot_ins += cons_ins[i].freq[j]; -// fprintf(stderr, "%.*s%d ", cons_ins[i].len[j], cons_ins[i].str[j], -// cons_ins[i].freq[j]); - } - // NB: tot is based on next matching base, so it includes - // everything with or without the insertion. + // fprintf(stderr, "%.*s%d ", cons_ins[i].len[j], cons_ins[i].str[j], + // cons_ins[i].freq[j]); + } + // NB: tot is based on next matching base, so it includes + // everything with or without the insertion. 
#ifdef INS_PLUS_BASE - if (max_v_ins > CONS_CUTOFF_INC*(tot+tot_ins)) { + if (max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins) && (cnum==0 || + max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins) || + i == pos-left+1)) { #else - if (max_v_ins > CONS_CUTOFF_INC*tot) { + if (max_v_ins > CONS_CUTOFF_INC *tot && (cnum==0 || + max_v_ins > CONS_CUTOFF_INC2*tot || + i == pos-left+1)) { #endif - if (max_v_ins > CONS_CUTOFF_INS*tot_ins) { - // Insert bases - for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) { - // FIXME: commented out to deliberate get consensus shift. - // Need to know how to get aligner working properly in that - // scenario, as it'll happen sometimes! - if (k < pos-left+*left_shift) - (*left_shift)++; + if (max_v_ins > CONS_CUTOFF_INS*tot_ins) { + // Insert bases + for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) { + // FIXME: commented out to deliberate get consensus shift. + // Need to know how to get aligner working properly in that + // scenario, as it'll happen sometimes! + if (cnum == 0) { + if (k < pos-left+*left_shift) + (*left_shift)++; + else + (*right_shift)++; + } + cons[cnum][k++] = cons_ins[i].str[max_j_ins][j]; + } + } else { + for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) + cons[cnum][k++] = 'N'; + } + // don't call next base as included in insertion +#ifdef INS_PLUS_BASE + continue; +#endif + } + + // Call + if (cnum == 0) { + if (max_v > CONS_CUTOFF*tot) { + if (max_j != 5) // gap + cons[cnum][k++] = "ACGTN*"[max_j]; + else if (k < pos-left+*left_shift) + (*left_shift)--; else (*right_shift)++; - cons[k++] = cons_ins[i].str[max_j_ins][j]; + } else { + cons[cnum][k++] = 'N'; } } else { - for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) - cons[k++] = 'N'; + if (max_j == 5) { + if (max_v > CONS_CUTOFF2*tot) + ; // no need to output "*" + else + max_j = max_j2, max_v = max_v2; + } + if (max_j != 5) { + if (max_v > CONS_CUTOFF*tot) + cons[cnum][k++] = "ACGTN*"[max_j]; + else + cons[cnum][k++] = 'N'; + } } - // don't call next base as included in 
insertion -#ifdef INS_PLUS_BASE - continue; -#endif - } - // Call - if (max_v > CONS_CUTOFF*tot) { - if (max_j != 5) // gap - cons[k++] = "ACGTN*"[max_j]; - else if (k < pos-left+*left_shift) - (*left_shift)--; - else - (*right_shift)++; - } else { - cons[k++] = 'N'; + // fprintf(stderr, "\n"); } - - // fprintf(stderr, "\n"); + cons[cnum][k++] = '\0'; } - cons[k++] = '\0'; - // fprintf(stderr, "Cons: %s\n", cons); free(cons_base); free(ref_base); @@ -1037,6 +1076,29 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, if (qval < 7) qval = 7; qq[l - qbeg] = qval; + + // Skew qq at qpos to be higher than background and qq at + // other regions to be lower. This means the alignment of + // indel we are currently assessing takes precedence over + // alignment of flanking regions. + // + // Ins; type = +ve + // Ref AGCTAG---CTGA + // Qry AGCTAGGGGCTGA (qpos..qpos+type) + // + // Del; type = -ve + // Ref AGCTAGGGGCTGA + // Qry AGCTAG---CTGA (qpos..qpos) + +// // Tests over 1-47MB +// // shift8b FP/GT/FN = 290/296/2310 +// // develop = 264/326/2282 +// if (l >= qpos-2 && l <= qpos+2+(type>0?type:0)) +// //qq[l-qbeg] += 15; //qq2 = 282/312/2334 +// qq[l-qbeg] *= 1.5; //qq3 = 284/305/2326 +// //qq[l-qbeg] *= 0.75;//qq4 = 287/333/2347 +//// else +//// qq[l-qbeg] *= 0.67; // qq = 269/343/2413 (qq3 with else clause) } // The bottom 8 bits are length-normalised score while @@ -1049,7 +1111,11 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, query, qend - qbeg, qq, &apf, 0, 0); sc2 = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type, query, qend - qbeg, qq, &apf, 0, 0); + fprintf(stderr, "sc1=%x, sc2=%x\n", sc1, sc2); // sc1 = INT_MAX; // disable for now + // Correct solution is to get ref1 being second consensus rather + // than ref_sample, and to pick top two alleles by monitoring + // indel locations. 
(See google docs details) if (sc1 < 0 && sc2 < 0) { *score = 0xffffff; free(qq); @@ -1263,7 +1329,6 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, left = pos > bca->indel_win_size ? pos - bca->indel_win_size : 0; right = pos + bca->indel_win_size; int del_size = types[0]<0 ? -types[0] : 0; - int ins_size = types[0]>0 ? types[0] : 0; right += del_size; // in case the alignments stand out the reference @@ -1291,13 +1356,13 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, } reps = find_STR(ref4, right-left, 0); - int over_l = pos-1; - int over_r = pos+del_size+1; - int adjusted = 1; - //fprintf(stderr, "\nRef at %d: %.*s\n", left, right-left, ref+left); #if 0 + int adjusted = 1; + int over_l = pos-1; + int over_r = pos+del_size+1; + int ins_size = types[0]>0 ? types[0] : 0; DL_FOREACH_SAFE(reps, elt, tmp) { //fprintf(stderr, "rep %d..%d: %.*s\n", elt->start, elt->end, // elt->end-elt->start+1, ref+left+elt->start); @@ -1452,26 +1517,30 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // Realignment score, computed via BAQ for (s = K = 0; s < n; ++s) { - char *tcons, *cp; + char **tcons, *cp; int left_shift, right_shift; tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, ref_sample[s], left, right, s, types[t], biggest_del, &left_shift, &right_shift); - fprintf(stderr, "Cons (%2d) %d/%d %s\n", left_shift, t, s, tcons); + fprintf(stderr, "Cons0 (%2d) %d/%d %s\n", left_shift, t, s, tcons[0]); + fprintf(stderr, "Cons1 (%2d) %d/%d %s\n", left_shift, t, s, tcons[1]); // FIXME: map from ascii to 0,1,2,3,4. // This is only needed because bcf_cgp_consensus is reporting in ASCII // currently, for ease of debugging. 
- for (cp = tcons; *cp; cp++) { - switch(*cp) { - case 'A': *cp = 0; break; - case 'C': *cp = 1; break; - case 'G': *cp = 2; break; - case 'T': *cp = 3; break; - default : *cp = 4; break; + int tcon_len[2], cnum; + for (cnum = 0; cnum < 2; cnum++) { + for (cp = tcons[cnum]; *cp; cp++) { + switch(*cp) { + case 'A': *cp = 0; break; + case 'C': *cp = 1; break; + case 'G': *cp = 2; break; + case 'T': *cp = 3; break; + default : *cp = 4; break; + } } + tcon_len[cnum] = cp-tcons[cnum]; } - int tcon_len = cp-tcons; // Construct ref2 from ref_sample, inscns and indels. // This is now the true sample consensus (possibly prepended @@ -1494,11 +1563,17 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, if (right > j) right = j; + fprintf(stderr, "ConsR (##) ?/? "); + for (i = 0; i < right-left+(types[t]>0?types[t]:0); i++) + putc("ACGTN"[(uint8_t)ref2[i]], stderr); + putc('\n', stderr); + // original consensus method - memcpy(ref1, ref2, right-left+(types[t]>0?types[t]:0)); + //memcpy(ref1, ref2, right-left+(types[t]>0?types[t]:0)); + memcpy(ref1, tcons[1], tcon_len[1]); fprintf(stderr, "Type %d = %2d\t", t, types[t]); for (j = 0; j < right-left+(types[t]>0?types[t]:0); j++) - putc("ACGTN"[ref2[j]], stderr); + putc("ACGTN"[(uint8_t)ref2[j]], stderr); putc('\n', stderr); // Our computed consensus may start/end in slightly different @@ -1507,7 +1582,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // carry penalties. (Ideally we'd pad with the reference, but // this suffices and it's tricky to track.) 
int ref2_pos = 0; - int rright = left + tcon_len; // ref left/right + int rright = left + tcon_len[0]; // ref left/right if (left_shift > 0) { memset(ref2, 4/*N*/, MIN(left_shift, max_ref2)); ref2_pos += MIN(left_shift, max_ref2); @@ -1515,19 +1590,18 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // if (rright-left > max_ref2) // rright = left+max_ref2; } - memcpy(ref2 + ref2_pos, tcons, MIN(tcon_len, max_ref2-ref2_pos)); - ref2_pos += MIN(tcon_len, max_ref2-ref2_pos); + memcpy(ref2 + ref2_pos, tcons[0], MIN(tcon_len[0], max_ref2-ref2_pos)); + ref2_pos += MIN(tcon_len[0], max_ref2-ref2_pos); if (right_shift > 0) { memset(ref2 + ref2_pos, 4/*N*/, MIN(right_shift, max_ref2-ref2_pos)); // rright += MIN(right_shift, max_ref2-ref2_pos); } - //memcpy(ref2, tcons, tcon_len); free(tcons); fprintf(stderr, "TYPE %d = %2d\t", t, types[t]); for (j = 0; j < rright-left && j < max_ref2; j++) - putc("ACGTN"[ref2[j]], stderr); + putc("ACGTN"[(uint8_t)ref2[j]], stderr); putc('\n', stderr); // align each read to ref2 From bfc951f8a4f116485e7133e8e56af77ccef8148d Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 24 Feb 2022 12:56:13 +0000 Subject: [PATCH 08/31] Improvements to 2nd consensus generation --- bam2bcf_indel.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 61e4f487e..d15031051 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -834,6 +834,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, continue; // previously merged if (max_v_ins < cons_ins[i].freq[j]) + //if (i != pos-left+1 || cons_ins[i].len[j] == type) max_v_ins = cons_ins[i].freq[j], max_j_ins = j; tot_ins += cons_ins[i].freq[j]; @@ -843,13 +844,18 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // NB: tot is based on next matching base, so it includes // everything with or without the insertion. 
#ifdef INS_PLUS_BASE - if (max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins) && (cnum==0 || - max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins) || - i == pos-left+1)) { +// if (max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins) && (cnum==0 || +// max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins) || +// i == pos-left+1)) { + if ((i == pos-left+1 && type) || // current 'type' at pos + max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins) || // HOM + (max_v_ins > bca->min_support && + (cnum != 0) ^ max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins))) { // HET #else - if (max_v_ins > CONS_CUTOFF_INC *tot && (cnum==0 || - max_v_ins > CONS_CUTOFF_INC2*tot || - i == pos-left+1)) { + if ((i == pos-left+1 && type) || // current 'type' at pos + max_v_ins > CONS_CUTOFF_INC2*tot || // HOM + (max_v_ins > bca->min_support && + (cnum != 0) ^ max_v_ins > CONS_CUTOFF_INC*tot)) { // HET #endif if (max_v_ins > CONS_CUTOFF_INS*tot_ins) { // Insert bases @@ -877,7 +883,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // Call if (cnum == 0) { - if (max_v > CONS_CUTOFF*tot) { + if (max_v > CONS_CUTOFF*tot) { // HET or HOM if (max_j != 5) // gap cons[cnum][k++] = "ACGTN*"[max_j]; else if (k < pos-left+*left_shift) @@ -889,7 +895,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } } else { if (max_j == 5) { - if (max_v > CONS_CUTOFF2*tot) + if (max_v > CONS_CUTOFF2*tot) // HOM ; // no need to output "*" else max_j = max_j2, max_v = max_v2; @@ -1570,7 +1576,11 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // original consensus method //memcpy(ref1, ref2, right-left+(types[t]>0?types[t]:0)); - memcpy(ref1, tcons[1], tcon_len[1]); + memcpy(ref1, tcons[1], right-left+(types[t]>0?types[t]:0)); + if (tcon_len[1] < right-left+(types[t]>0?types[t]:0)) { + memset(ref1+tcon_len[1], 4, + right-left+(types[t]>0?types[t]:0) - tcon_len[1]); + } fprintf(stderr, "Type %d = %2d\t", t, types[t]); for (j = 0; j < right-left+(types[t]>0?types[t]:0); j++) 
putc("ACGTN"[(uint8_t)ref2[j]], stderr); From 72b88f7f27565e2fc0454ac6fc928dd7d91362b2 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 24 Feb 2022 15:30:12 +0000 Subject: [PATCH 09/31] Correct consensus to match type, even if data disagrees. Also tweak ins vs N merge threshold. --- bam2bcf_indel.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index d15031051..c96a9d570 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -787,7 +787,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (max_v < ins[l][3]) max_v = ins[l][3], base = 3; if (max_v < ins[l][4]) max_v = ins[l][4], base = 4; - cons_ins[i].str[j][l] = (max_v > 0.8*tot) ?"ACGTN"[base] :'N'; + cons_ins[i].str[j][l] = (max_v > 0.6*tot) ?"ACGTN"[base] :'N'; } } } @@ -882,6 +882,13 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } // Call + if (type < 0 && i > pos-left && i <= pos-left-type) { + if (max_j != 5) + fprintf(stderr, "pos %d i %d pos-left %d type %d, max_j %d\n", + pos, i, pos-left, type, max_j); + max_v = cons_base[i][max_j = 5]; + } + if (cnum == 0) { if (max_v > CONS_CUTOFF*tot) { // HET or HOM if (max_j != 5) // gap From bae483dd1d1d298e6d2f9c0db9f46f9522b9c3f6 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 25 Feb 2022 12:42:48 +0000 Subject: [PATCH 10/31] Removed ref_sample usage, also fix type call. - We previously started our consensus with 1 depth copy of ref_sample. This brings in data from the overall consensus for where one specific indel 'type' doesn't have data spanning the entire region. We now ensure a minimum of 1-fold depth is copied from the ref_base data instead. The results are still not quite as good, but they are good enough, and this paves the way for removal of a lot of existing code. - The bcf_call_glfgen code had a heuristic to change type calls to 0 (REF) where the read doesn't have an indel at that location, under the assumption our assessment was bad.
This may have been helpful in the past, but with better consensus calculation our type calls are now more robust. Instead we change such data to be unclassified types so they don't count towards AD and PL. This is still a fudge. It may be preferable to leave them as-is, and/or filter by some other means such as whether they span STRs. We get a 7% rise in FP value, but a 2% drop in FN and 16% drop in GT assignment errors. TODO: study those new FPs in more detail. --- bam2bcf.c | 3 ++- bam2bcf_indel.c | 40 +++++++++++++++++++++++----------------- 2 files changed, 25 insertions(+), 18 deletions(-) diff --git a/bam2bcf.c b/bam2bcf.c index 76a0d439b..bd8bc5154 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -234,7 +234,8 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t // particularly indicative of being a good REF match either, // at least not in low coverage. So require solid coverage // before we start utilising such quals. - b = 0; + if (b != 0) + b = 5; q = (int)bam_get_qual(p->b)[p->qpos]; seqQ = (3*seqQ + 2*q)/8; } diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index c96a9d570..980f64ad0 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -516,17 +516,21 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // So the amount we add should be a proportion of the amount we didn't // include in our consensus. Eg MAX(0, depth - Nused*2). - // FIXME: maybe this no longer matters now we have ref_base[] -#define REF_SEED 1 - for (i = left; i < right; i++) { - switch(ref[i]) { - case 'A': cons_base[i-left][0]+=REF_SEED; break; - case 'C': cons_base[i-left][1]+=REF_SEED; break; - case 'G': cons_base[i-left][2]+=REF_SEED; break; - case 'T': cons_base[i-left][3]+=REF_SEED; break; - default: cons_base[i-left][4]+=REF_SEED; break; - } - } + // FIXME: maybe this no longer matters now we have ref_base[]. + // ~20MB of chr1 showed 0.6% more FN (vs FN not FN+TP) and + // identical FP/GT errs.
Maybe not worth the effort of retaining + // ref_sample creation, but explore why this is and if tweaking + // elsewhere helps instead, eg the proportion of ref_base (rfract). +//#define REF_SEED 1 +// for (i = left; i < right; i++) { +// switch(ref[i]) { +// case 'A': cons_base[i-left][0]+=REF_SEED; break; +// case 'C': cons_base[i-left][1]+=REF_SEED; break; +// case 'G': cons_base[i-left][2]+=REF_SEED; break; +// case 'T': cons_base[i-left][3]+=REF_SEED; break; +// default: cons_base[i-left][4]+=REF_SEED; break; +// } +// } // cons_ins sequence is the insertion seq followed by the // next match base @@ -660,6 +664,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, ref_base[i][3] + ref_base[i][4] + ref_base[i][5]; double rfract = (r - t*2)*.75 / (r+1); + rfract += 1.01 / (r+1e-10); // compensate for REF_SEED=0 above: RF0b if (rfract > 0) { // && !(type == 0 && i+left == pos)) { //if (i+left >= pos+1 && i+left <= pos+1-(type<0?type+1:0)) { if (i+left >= pos+1 && i+left <= pos+1-biggest_del) { @@ -792,10 +797,11 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } } -#define CONS_CUTOFF .30 // 30% needed for base vs N +// TODO CUTOFF .4, INC .4 (test .5) +#define CONS_CUTOFF .40 // 40% needed for base vs N #define CONS_CUTOFF2 .80 // 80% needed for gap in cons[1] -#define CONS_CUTOFF_INC .20 // 20% to include any insertion cons[0] -#define CONS_CUTOFF_INC2 .80 // 80% to include any insertion cons[1] +#define CONS_CUTOFF_INC .40 // 40% to include any insertion cons[0] +#define CONS_CUTOFF_INC2 .80 // 80% to include any insertion cons[1] HOM #define CONS_CUTOFF_INS .60 // and then 60% needed for it to be bases vs N // Walk through the frequency arrays to call the consensus. // We produce cons[0] and cons[1]. 
Both include strongly @@ -1251,7 +1257,7 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, if (seqQ > 255) seqQ = 255; p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; // FIXME: redunctant; always indelQ - // fprintf(stderr, " read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", s, i, bam_get_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); + fprintf(stderr, " read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", s, i, bam_get_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); } } // determine bca->indel_types[] and bca->inscns @@ -1443,8 +1449,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, score -= str[i-left]==0; int right_new = i; -// fprintf(stderr, "left %d, %d, pos %d, %d, right %d\n", -// left, left_new, pos, right_new, right); + fprintf(stderr, "left %d, %d, pos %d, %d, right %d\n", + left, left_new, pos, right_new, right); left = left_new; right = right_new; From 76b542b436e66bd10a88863b856790f8e8f2e9fd Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 28 Feb 2022 14:12:56 +0000 Subject: [PATCH 11/31] Change default indel-size and lower limit --- mpileup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mpileup.c b/mpileup.c index 3106ac51a..33e832b0f 100644 --- a/mpileup.c +++ b/mpileup.c @@ -1224,7 +1224,7 @@ int main_mpileup(int argc, char *argv[]) mplp.fmt_flag = B2B_INFO_VDB|B2B_INFO_RPB|B2B_INFO_SCB|B2B_INFO_ZSCORE; mplp.max_read_len = 500; mplp.ambig_reads = B2B_DROP; - mplp.indel_win_size = 110; + mplp.indel_win_size = 80; mplp.clevel = -1; hts_srand48(0); @@ -1410,9 +1410,9 @@ int main_mpileup(int argc, char *argv[]) char *tmp; mplp.indel_win_size = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --indel-size %s\n", optarg); - if ( mplp.indel_win_size < 110 ) + if ( mplp.indel_win_size < 20 ) { - //mplp.indel_win_size = 110; + mplp.indel_win_size = 20; fprintf(stderr,"Warning: running 
with --indel-size %d, the requested value is too small\n",mplp.indel_win_size); } } From 1295ec14b1bb2064a18f5a5ad0ea2e64e5f22904 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 28 Feb 2022 16:14:33 +0000 Subject: [PATCH 12/31] Remove debugging --- bam2bcf_indel.c | 52 ++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 980f64ad0..7c528f70d 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -668,8 +668,8 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (rfract > 0) { // && !(type == 0 && i+left == pos)) { //if (i+left >= pos+1 && i+left <= pos+1-(type<0?type+1:0)) { if (i+left >= pos+1 && i+left <= pos+1-biggest_del) { - int rem = rfract * n_plp[s]; - fprintf(stderr, "rfract=%f rem=%d type=%d, t=%d r=%d\n", rfract, rem, type, t, r); +// int rem = rfract * n_plp[s]; +// fprintf(stderr, "rfract=%f rem=%d type=%d, t=%d r=%d\n", rfract, rem, type, t, r); // switch(ref_sample[i]) { // case 1: cons_base[i][0] += rem; break; // A // case 2: cons_base[i][1] += rem; break; // C @@ -856,7 +856,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if ((i == pos-left+1 && type) || // current 'type' at pos max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins) || // HOM (max_v_ins > bca->min_support && - (cnum != 0) ^ max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins))) { // HET + (cnum != 0) ^ (max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins)))) { // HET #else if ((i == pos-left+1 && type) || // current 'type' at pos max_v_ins > CONS_CUTOFF_INC2*tot || // HOM @@ -1126,15 +1126,15 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, // Try original cons and new cons and pick best. // This doesn't removed FN much (infact maybe adds very slightly), // but it does reduce GT errors and some slight reduction to FP. 
- sc1 = probaln_glocal(ref1 + tbeg - left, tend - tbeg + type, - query, qend - qbeg, qq, &apf, 0, 0); sc2 = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type, query, qend - qbeg, qq, &apf, 0, 0); - fprintf(stderr, "sc1=%x, sc2=%x\n", sc1, sc2); - // sc1 = INT_MAX; // disable for now - // Correct solution is to get ref1 being second consensus rather - // than ref_sample, and to pick top two alleles by monitoring - // indel locations. (See google docs details) + if (memcmp((char *)ref1 + tbeg - left, (char *)ref2 + tbeg - left, + tend - tbeg + type) != 0) + sc1 = probaln_glocal(ref1 + tbeg - left, tend - tbeg + type, + query, qend - qbeg, qq, &apf, 0, 0); + else + sc1 = INT_MAX; // skip + if (sc1 < 0 && sc2 < 0) { *score = 0xffffff; free(qq); @@ -1257,7 +1257,7 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, if (seqQ > 255) seqQ = 255; p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total sumq[sc[0]&0x3f] += indelQ < seqQ? indelQ : seqQ; // FIXME: redunctant; always indelQ - fprintf(stderr, " read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", s, i, bam_get_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); +// fprintf(stderr, " read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", s, i, bam_get_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); } } // determine bca->indel_types[] and bca->inscns @@ -1542,8 +1542,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, ref_sample[s], left, right, s, types[t], biggest_del, &left_shift, &right_shift); - fprintf(stderr, "Cons0 (%2d) %d/%d %s\n", left_shift, t, s, tcons[0]); - fprintf(stderr, "Cons1 (%2d) %d/%d %s\n", left_shift, t, s, tcons[1]); +// fprintf(stderr, "Cons0 (%2d) %d/%d %s\n", left_shift, t, s, tcons[0]); +// fprintf(stderr, "Cons1 (%2d) %d/%d %s\n", left_shift, t, s, tcons[1]); // FIXME: map from ascii to 0,1,2,3,4. // This is only needed because bcf_cgp_consensus is reporting in ASCII // currently, for ease of debugging. 
@@ -1582,10 +1582,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, if (right > j) right = j; - fprintf(stderr, "ConsR (##) ?/? "); - for (i = 0; i < right-left+(types[t]>0?types[t]:0); i++) - putc("ACGTN"[(uint8_t)ref2[i]], stderr); - putc('\n', stderr); +// fprintf(stderr, "ConsR (##) ?/? "); +// for (i = 0; i < right-left+(types[t]>0?types[t]:0); i++) +// putc("ACGTN"[(uint8_t)ref2[i]], stderr); +// putc('\n', stderr); // original consensus method //memcpy(ref1, ref2, right-left+(types[t]>0?types[t]:0)); @@ -1594,10 +1594,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, memset(ref1+tcon_len[1], 4, right-left+(types[t]>0?types[t]:0) - tcon_len[1]); } - fprintf(stderr, "Type %d = %2d\t", t, types[t]); - for (j = 0; j < right-left+(types[t]>0?types[t]:0); j++) - putc("ACGTN"[(uint8_t)ref2[j]], stderr); - putc('\n', stderr); +// fprintf(stderr, "Type %d = %2d\t", t, types[t]); +// for (j = 0; j < right-left+(types[t]>0?types[t]:0); j++) +// putc("ACGTN"[(uint8_t)ref2[j]], stderr); +// putc('\n', stderr); // Our computed consensus may start/end in slightly different // positions due to indels. @@ -1622,10 +1622,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, } free(tcons); - fprintf(stderr, "TYPE %d = %2d\t", t, types[t]); - for (j = 0; j < rright-left && j < max_ref2; j++) - putc("ACGTN"[(uint8_t)ref2[j]], stderr); - putc('\n', stderr); +// fprintf(stderr, "TYPE %d = %2d\t", t, types[t]); +// for (j = 0; j < rright-left && j < max_ref2; j++) +// putc("ACGTN"[(uint8_t)ref2[j]], stderr); +// putc('\n', stderr); // align each read to ref2 for (i = 0; i < n_plp[s]; ++i, ++K) { @@ -1747,7 +1747,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // region entirely within a deletion (thus tend < tbeg). 
score[K*n_types + t] = 0xffffff; } -#if 1 +#if 0 for (l = 0; l < tend - tbeg + abs(types[t]); ++l) { if (tbeg-left+l >= max_ref2) break; From b1a2a1074cf1ad044f207c4203bf3a7493f6813a Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 1 Mar 2022 19:29:11 +0000 Subject: [PATCH 13/31] Improvements to consensus generation --- bam2bcf_indel.c | 128 ++++++++++++++++++++++++++++++++++-------------- 1 file changed, 90 insertions(+), 38 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 7c528f70d..a9695af17 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -432,7 +432,7 @@ typedef struct { int freq[NI]; } str_freq; -static int bcf_cgp_append_cons(str_freq *sf, char *str, int len) { +static int bcf_cgp_append_cons(str_freq *sf, char *str, int len, int freq) { int j; for (j = 0; j < NI && sf->str[j]; j++) { @@ -442,7 +442,7 @@ static int bcf_cgp_append_cons(str_freq *sf, char *str, int len) { if (j >= NI) return 0; // too many choices; discard - sf->freq[j]++; + sf->freq[j]+=freq; if (!sf->str[j]) { // new insertion if (!(sf->str[j] = malloc(len+1))) @@ -466,6 +466,7 @@ static int bcf_cgp_append_cons(str_freq *sf, char *str, int len) { * cons_base - consensus of data with p->indel == type, bases or gap * ref_base - consensus of data with p->indel != type, bases or gap * cons_ins - consensus of data with p->indel == type, insertions + * ref_ins - consensus of data with p->indel == type, bases or gap * * The purpose of cons_ins vs cons_base is if we have very low * coverage due to nearly all reads being another type, then we can @@ -502,6 +503,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // the other reads from type[] != 0 to flesh out the consensus and // improve accuracy. 
int (*ref_base)[6] = calloc(right - left + 1, sizeof(*ref_base)); + str_freq *ref_ins = calloc(right - left + 1, sizeof(*ref_ins)); int i, j, k, s = sample; @@ -577,6 +579,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (x >= right) break; #ifdef INS_PLUS_BASE + // FIXME: need last_base_ins_type and last_base_ins_ref? if (last_base_ins) { last_base_ins = 0; continue; @@ -589,18 +592,19 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (p->indel == type || p->indel > 0) // alternative #endif cons_base[x-left][L[base]]++; - //else + else if (x != pos+1) // indel being assessed question ref_base[x-left][L[base]]++; // fputc(seq_nt16_str[base], stderr); + // else last_base_ins=0? } break; } case BAM_CINS: { - if (p->indel != type) { - y += len; // for when adding to ref_base - break; - } +// if (p->indel != type) { +// y += len; // for when adding to ref_base +// break; +// } char ins[1024]; for (j = 0; j < len; j++, y++) { @@ -621,12 +625,21 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (j < 1024) ins[j++] = seq_nt16_str[base]; } - last_base_ins = 1; + //last_base_ins = 1; #endif // fprintf(stderr, "<+%.*s>", j<1024?j:1024, ins); - if (x >= left && x < right) - bcf_cgp_append_cons(&cons_ins[x-left], ins, j<1024?j:1024); + if (x >= left && x < right) { + int ilen = j<1024?j:1024; + if (p->indel == type) { + bcf_cgp_append_cons(&cons_ins[x-left], ins, ilen, 1); + } else if (x != pos+1){ + bcf_cgp_append_cons(&ref_ins[x-left], ins, ilen, 1); + } +#ifdef INS_PLUS_BASE + last_base_ins = 1; +#endif + } break; } @@ -638,8 +651,9 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (x >= right) break; // fputc('-', stderr); if (p->indel == type) + // fixme: not p->indel==type but x==pos+1 cons_base[x-left][5]++; - //else + else ref_base[x-left][5]++; } break; @@ -659,11 +673,16 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, break; t 
+= cons_ins[i].freq[j]; } - int r = ref_base[i][0] + ref_base[i][1] + ref_base[i][2] + ref_base[i][3] + ref_base[i][4] + ref_base[i][5]; + for (j = 0; j < NI; j++) { + if (!ref_ins[i].str[j]) + break; + r += ref_ins[i].freq[j]; + } double rfract = (r - t*2)*.75 / (r+1); + if (rfract<0) rfract=0; // with or without? TEST rfract += 1.01 / (r+1e-10); // compensate for REF_SEED=0 above: RF0b if (rfract > 0) { // && !(type == 0 && i+left == pos)) { //if (i+left >= pos+1 && i+left <= pos+1-(type<0?type+1:0)) { @@ -685,27 +704,15 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, cons_base[i][4] += rfract * ref_base[i][4]; cons_base[i][5] += rfract * ref_base[i][5]; } - } -// // A portion of what's left copied from ref_sample -// int rem = (n_plp[s] - t*2)*.75; -// if (rem > 0) { -// rem = REF_SEED; // FUDGE; minimal count to block N -// // Add in the full "rem" amount and we get many more FN again -// // (but low low FP). Assume this is the del off-target being -// // turned back into bases? -// // -// // We could use the ref_sample construction code which adds to -// // cns[] as depth to track base vs gap. Or write a newer -// // ref_sample creation code. Do it right here infact... -// switch(ref_sample[i]) { -// case 1: cons_base[i][0] += rem; break; // A -// case 2: cons_base[i][1] += rem; break; // C -// case 4: cons_base[i][2] += rem; break; // G -// case 8: cons_base[i][3] += rem; break; // T -// default:cons_base[i][4] += rem; break; // N -// } -// } + for (j = 0; j < NI; j++) { + if (!ref_ins[i].str[j]) + break; + bcf_cgp_append_cons(&cons_ins[i], + ref_ins[i].str[j], ref_ins[i].len[j], + rfract * ref_ins[i].freq[j]); + } + } } // Allocate consensus buffer, to worst case length @@ -813,6 +820,11 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, *left_shift = 0; *right_shift = 0; int cnum; + + // Het call filled out in cnum==0 (+ve or -ve) + // Used in cnum==1 to do the opposite of whichever way we did before. 
+ int het[1024] = {0}; + for (cnum = 0; cnum < 2; cnum++) { for (i = k = 0; i < right-left; i++) { // fprintf(stderr, "%d\t", i); @@ -853,10 +865,28 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // if (max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins) && (cnum==0 || // max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins) || // i == pos-left+1)) { - if ((i == pos-left+1 && type) || // current 'type' at pos - max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins) || // HOM - (max_v_ins > bca->min_support && - (cnum != 0) ^ (max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins)))) { // HET + int always_ins = + (i == pos-left+1 && type) || // current eval + max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins);// HOM + int het_ins = 0; + if (!always_ins && max_v_ins >= bca->min_support) { + // Candidate HET ins. + if (cnum == 0) { + het_ins = max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins); + if (i < 1024) het[i] = het_ins + ? 1 + : (max_v_ins > .2*(tot+tot_ins) ? -1:0); + } else { + het_ins = (het[i] == -1); // HET but uncalled before + } + } +// if (max_v_ins) +// fprintf(stderr, "Cons @ %d: type %d cnum %d always %d het_ins %d // max_v %d vs %d+%d\n", i, type, cnum, always_ins, het_ins, max_v_ins, tot, tot_ins); + if (always_ins || het_ins) { +// if ((i == pos-left+1 && type) || // current 'type' at pos +// max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins) || // HOM +// (max_v_ins > bca->min_support && +// (cnum != 0) ^ (max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins)))) { // HET #else if ((i == pos-left+1 && type) || // current 'type' at pos max_v_ins > CONS_CUTOFF_INC2*tot || // HOM @@ -930,12 +960,16 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, free(ref_base); for (i = 0; i < right-left; i++) { - for (j = 0; j < NI; j++) + for (j = 0; j < NI; j++) { // FIXME: replace by string pool if (cons_ins[i].str[j]) free(cons_ins[i].str[j]); + if (ref_ins[i].str[j]) + free(ref_ins[i].str[j]); + } } free(cons_ins); + free(ref_ins); return cons; } @@ -1128,6 +1162,7 @@ static int 
bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, // but it does reduce GT errors and some slight reduction to FP. sc2 = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type, query, qend - qbeg, qq, &apf, 0, 0); + if (memcmp((char *)ref1 + tbeg - left, (char *)ref2 + tbeg - left, tend - tbeg + type) != 0) sc1 = probaln_glocal(ref1 + tbeg - left, tend - tbeg + type, @@ -1135,6 +1170,22 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, else sc1 = INT_MAX; // skip +#if 0 + fprintf(stderr, "\nref1: "); + for (int j = 0; j < tend-tbeg+type; j++) + putc("ACGTN"[(uint8_t)ref1[j+tbeg-left]], stderr); + putc('\n', stderr); + fprintf(stderr, "ref2: "); + for (int j = 0; j < tend-tbeg+type; j++) + putc("ACGTN"[(uint8_t)ref2[j+tbeg-left]], stderr); + putc('\n', stderr); + fprintf(stderr, "qury: "); + for (int j = 0; j < qend-qbeg; j++) + putc("ACGTN"[(uint8_t)query[j]], stderr); + putc('\n', stderr); + fprintf(stderr, "sc1=%d sc2=%d\n\n", sc1, sc2); +#endif + if (sc1 < 0 && sc2 < 0) { *score = 0xffffff; free(qq); @@ -1544,6 +1595,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, &left_shift, &right_shift); // fprintf(stderr, "Cons0 (%2d) %d/%d %s\n", left_shift, t, s, tcons[0]); // fprintf(stderr, "Cons1 (%2d) %d/%d %s\n", left_shift, t, s, tcons[1]); + // FIXME: map from ascii to 0,1,2,3,4. // This is only needed because bcf_cgp_consensus is reporting in ASCII // currently, for ease of debugging. 
@@ -1589,7 +1641,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // original consensus method //memcpy(ref1, ref2, right-left+(types[t]>0?types[t]:0)); - memcpy(ref1, tcons[1], right-left+(types[t]>0?types[t]:0)); + memcpy(ref1, tcons[1], MIN(tcon_len[1], max_ref2)); if (tcon_len[1] < right-left+(types[t]>0?types[t]:0)) { memset(ref1+tcon_len[1], 4, right-left+(types[t]>0?types[t]:0) - tcon_len[1]); From 98f4b74ebacc5b15ed1bef49831f7c8228ed535b Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 2 Mar 2022 17:50:14 +0000 Subject: [PATCH 14/31] Adjustments to the ref vs query region for alignments. Also a small correction to consensus rfract so it's a minimum rather than additive amendment. --- bam2bcf_indel.c | 55 +++++++++++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 20 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index a9695af17..6effe8e29 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -682,9 +682,11 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } double rfract = (r - t*2)*.75 / (r+1); - if (rfract<0) rfract=0; // with or without? TEST - rfract += 1.01 / (r+1e-10); // compensate for REF_SEED=0 above: RF0b - if (rfract > 0) { // && !(type == 0 && i+left == pos)) { + //rfract*=.5; // -FN +FP/GT. Which poison do we want? + if (rfract < 1.01 / (r+1e-10))// compensate for REF_SEED=0 above: RF0b + rfract = 1.01 / (r+1e-10); + + if (1 || rfract > 0) { // && !(type == 0 && i+left == pos)) { //if (i+left >= pos+1 && i+left <= pos+1-(type<0?type+1:0)) { if (i+left >= pos+1 && i+left <= pos+1-biggest_del) { // int rem = rfract * n_plp[s]; @@ -804,7 +806,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } } -// TODO CUTOFF .4, INC .4 (test .5) +// TODO: try CONS_CUTOFF higher, eg .6, to force more Ns? 
#define CONS_CUTOFF .40 // 40% needed for base vs N #define CONS_CUTOFF2 .80 // 80% needed for gap in cons[1] #define CONS_CUTOFF_INC .40 // 40% to include any insertion cons[0] @@ -937,6 +939,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, cons[cnum][k++] = 'N'; } } else { + // FIXME: use the same het[] array logic as for ins above if (max_j == 5) { if (max_v > CONS_CUTOFF2*tot) // HOM ; // no need to output "*" @@ -1095,7 +1098,7 @@ static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp, static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, uint8_t *ref1, uint8_t *ref2, uint8_t *query, int r_start, int r_end, int long_read, - int tbeg, int tend, + int tbeg, int tend1, int tend2, int left, int right, int qbeg, int qend, int qpos, int max_deletion, @@ -1160,23 +1163,24 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, // Try original cons and new cons and pick best. // This doesn't removed FN much (infact maybe adds very slightly), // but it does reduce GT errors and some slight reduction to FP. 
- sc2 = probaln_glocal(ref2 + tbeg - left, tend - tbeg + type, + sc2 = probaln_glocal(ref2 + tbeg - left, tend2 - tbeg, query, qend - qbeg, qq, &apf, 0, 0); - if (memcmp((char *)ref1 + tbeg - left, (char *)ref2 + tbeg - left, - tend - tbeg + type) != 0) - sc1 = probaln_glocal(ref1 + tbeg - left, tend - tbeg + type, + if (tend1 != tend2 || + memcmp((char *)ref1 + tbeg - left, (char *)ref2 + tbeg - left, + tend1 - tbeg + type) != 0) + sc1 = probaln_glocal(ref1 + tbeg - left, tend1 - tbeg, query, qend - qbeg, qq, &apf, 0, 0); else sc1 = INT_MAX; // skip -#if 0 +#if 1 fprintf(stderr, "\nref1: "); - for (int j = 0; j < tend-tbeg+type; j++) + for (int j = 0; j < tend1-tbeg; j++) putc("ACGTN"[(uint8_t)ref1[j+tbeg-left]], stderr); putc('\n', stderr); fprintf(stderr, "ref2: "); - for (int j = 0; j < tend-tbeg+type; j++) + for (int j = 0; j < tend2-tbeg; j++) putc("ACGTN"[(uint8_t)ref2[j+tbeg-left]], stderr); putc('\n', stderr); fprintf(stderr, "qury: "); @@ -1208,7 +1212,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, rep_ele *reps, *elt, *tmp; uint8_t *seg = ref2 + tbeg - left; - int seg_len = tend - tbeg + type; + int seg_len = tend2 - tbeg + type; // Note: although seg moves (tbeg varies), ref2 is reused many times // so we could factor out some find_STR calls. However it's not the @@ -1590,11 +1594,11 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, char **tcons, *cp; int left_shift, right_shift; tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, - ref_sample[s], + NULL,//ref_sample[s], left, right, s, types[t], biggest_del, &left_shift, &right_shift); -// fprintf(stderr, "Cons0 (%2d) %d/%d %s\n", left_shift, t, s, tcons[0]); -// fprintf(stderr, "Cons1 (%2d) %d/%d %s\n", left_shift, t, s, tcons[1]); + fprintf(stderr, "Cons0 (%2d) %d/%d %s\n", left_shift, t, s, tcons[0]); + fprintf(stderr, "Cons1 (%2d) %d/%d %s\n", left_shift, t, s, tcons[1]); // FIXME: map from ascii to 0,1,2,3,4. 
// This is only needed because bcf_cgp_consensus is reporting in ASCII @@ -1672,7 +1676,6 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, MIN(right_shift, max_ref2-ref2_pos)); // rright += MIN(right_shift, max_ref2-ref2_pos); } - free(tcons); // fprintf(stderr, "TYPE %d = %2d\t", t, types[t]); // for (j = 0; j < rright-left && j < max_ref2; j++) @@ -1779,16 +1782,27 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // RG platform field. int long_read = p->b->core.l_qseq > 1000; + // FIXME: we can improve (see above). + // Maybe use tbeg/tend as before, but with adjustment for + // difference between right-left and tcon_len. + // For now we just brute force it and do full ref range. + // It doesn't seem to impact on band at all. *Why?* + int tend1 = left + tcon_len[0] - (left2-left); + int tend2 = left + tcon_len[1] - (left2-left); + + // do realignment; this is the bottleneck. // // Note low score = good, high score = bad. if (tend > tbeg) { if (bcf_cgp_align_score(p, bca, types[t], - (uint8_t *)ref1 + left2-left, - (uint8_t *)ref2 + left2-left, + //(uint8_t *)ref1 + left2-left, + //(uint8_t *)ref2 + left2-left, + (uint8_t *)tcons[0] + left2-left, + (uint8_t *)tcons[1] + left2-left, (uint8_t *)query, r_start, r_end, long_read, - tbeg, tend, left2, rright, + tbeg, tend1, tend2, left2, rright, qbeg, qend, qpos, max_deletion, &score[K*n_types + t]) < 0) { score[K*n_types + t] = 0xffffff; @@ -1815,6 +1829,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, qbeg, tbeg, score[K*n_types + t]>>8, score[K*n_types + t]&0xff); #endif } + free(tcons); } } From 36b972ba7daf6e322e1acb05b09f58c30aefb468 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 3 Mar 2022 15:37:44 +0000 Subject: [PATCH 15/31] Code tidyup --- bam2bcf_indel.c | 356 +++++++++++++++--------------------------------- 1 file changed, 107 insertions(+), 249 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 
6effe8e29..a8e1d3c24 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -1,3 +1,24 @@ +/* + +TODO: + +- Reevaluate the two STR indel-size adjusting modes. + Maybe no longer relevant + +- Write deletion test perl script and evaluate consensus construction + +- Explore INS_PLUS_BASE again. Prefer to disable this as it's hard to + understand and doesn't work properly on reads ending on an + insertion. + +- Explore indelQ and the effect of STR at boundaries. I'm not + convined our quality calculation is correct. Certainly QUAL appears + to have little reality with actual indel likelihood! + +- Consider limiting fract to never add more than current depth, so we + change cons to Ns but not to another base type entirely. +*/ + /* bam2bcf_indel.c -- indel caller. Copyright (C) 2010, 2011 Broad Institute. @@ -40,6 +61,18 @@ KSORT_INIT_GENERIC(uint32_t) #define MAX_TYPES 64 +#ifndef MIN +# define MIN(a,b) ((a)<(b)?(a):(b)) +#endif + +#ifndef ABS +# define ABS(a) ((a)<0?-(a):(a)) +#endif + +#ifndef MAX +# define MAX(a,b) ((a)>(b)?(a):(b)) +#endif + // Take a reference position tpos and convert to a query position (returned). // This uses the CIGAR string plus alignment c->pos to do the mapping. // @@ -284,145 +317,6 @@ static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp, return types; } -// Part of bcf_call_gap_prep. -// -// Construct per-sample consensus. -// -// Returns an array of consensus seqs, -// or NULL on failure. 
-static char **bcf_cgp_ref_sample(int n, int *n_plp, bam_pileup1_t **plp, - int pos, bcf_callaux_t *bca, const char *ref, - int left, int right, int max_ref2) { - int i, k, s, L = right - left + 1, max_i, max2_i; - if (L < max_ref2) L = max_ref2; - char **ref_sample; // returned - uint32_t *cns = NULL, max, max2; - char *ref0 = NULL, *r; - ref_sample = (char**) calloc(n, sizeof(char*)); - cns = (uint32_t*) calloc(L, 4); - ref0 = (char*) calloc(L, 1); - if (!ref_sample || !cns || !ref0) { - n = 0; - goto err; - } - - // Convert ref ASCII to 0-15. - for (i = 0; i < right - left; ++i) - ref0[i] = seq_nt16_table[(int)ref[i+left]]; - - // NB: one consensus per sample 'n', not per indel type. - // FIXME: consider fixing this. We should compute alignments vs - // types, not vs samples? Or types/sample combined? - for (s = 0; s < n; ++s) { - r = ref_sample[s] = (char*) calloc(L, 1); - if (!r) { - n = s-1; - goto err; - } - - memset(cns, 0, sizeof(int) * L); - - // collect ref and non-ref counts in cns - for (i = 0; i < n_plp[s]; ++i) { - bam_pileup1_t *p = plp[s] + i; - bam1_t *b = p->b; - uint32_t *cigar = bam_get_cigar(b); - uint8_t *seq = bam_get_seq(b); - int x = b->core.pos, y = 0; - - // TODO: pileup exposes pileup_ind, but we also need e.g. - // pileup_len to know how much of the current CIGAR op-len - // we've used (or have remaining). If we had that, we - // could start at p->qpos without having to scan through - // the entire CIGAR string until we find it. - // - // Without it about all we could do is have a side channel - // to cache the last known coords. Messy, so punt for now. - // This is no longer the bottle neck until we get to 1000s of - // CIGAR ops. - - for (k = 0; k < b->core.n_cigar; ++k) { - int op = cigar[k]&0xf; - int j, l = cigar[k]>>4; - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - if (x + l >= left) { - j = left - x > 0 ? left - x : 0; - int j_end = right - x < l ? right - x : l; - for (; j < j_end; j++) - // Append to cns. 
Note this is ref coords, - // so insertions aren't in cns and deletions - // will have lower coverage. - - // FIXME: want true consensus (with ins) per - // type, so we can independently compare each - // seq to each consensus and see which it - // matches best, so we get proper GT analysis. - cns[x+j-left] += - (bam_seqi(seq, y+j) == ref0[x+j-left]) - ? 1 // REF - : (1<<16); // ALT - } - x += l; y += l; - } else if (op == BAM_CDEL || op == BAM_CREF_SKIP) { - x += l; - } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP) { - y += l; - } - - if (x > right) - break; - } - } - - // Determine a sample specific reference. - for (i = 0; i < right - left; ++i) - r[i] = ref0[i]; - - // Find deepest and 2nd deepest ALT region (max & max2). - max = max2 = 0; max_i = max2_i = -1; - for (i = 0; i < right - left; ++i) { - if (cns[i]>>16 >= max>>16) - max2 = max, max2_i = max_i, max = cns[i], max_i = i; - else if (cns[i]>>16 >= max2>>16) - max2 = cns[i], max2_i = i; - } - - // Masks mismatches present in at least 70% of the reads with 'N'. - // This code is nREF/(nREF+n_ALT) >= 70% for deepest region. - // The effect is that at least 30% of bases differing to REF will - // use "N" in consensus, so we don't penalise ALT or REF when - // aligning against it. (A poor man IUPAC code) - // - // Why is it only done in two loci at most? 
- if ((double)(max&0xffff) / ((max&0xffff) + (max>>16)) >= 0.7) - max_i = -1; - if ((double)(max2&0xffff) / ((max2&0xffff) + (max2>>16)) >= 0.7) - max2_i = -1; - if (max_i >= 0) r[max_i] = 15; - if (max2_i >= 0) r[max2_i] = 15; - - //for (i = 0; i < right - left; ++i) - // fputc("=ACMGRSVTWYHKDBN"[(int)r[i]], stderr); - //fputc('\n', stderr); - } - - free(ref0); - free(cns); - - return ref_sample; - - err: - free(ref0); - free(cns); - if (ref_sample) { - for (s = 0; s < n; s++) - free(ref_sample[s]); - free(ref_sample); - } - - return NULL; -} - // Increment ins["str"] and freq["str"] #define NI 10 // number of alternative insertion sequences // Could use a hash table too, but expectation is a tiny number of alternatives @@ -480,12 +374,12 @@ static int bcf_cgp_append_cons(str_freq *sf, char *str, int len, int freq) { * SEQ: AGCTACGAGG*TGATA (x24) * SEQ: AGCTACTAGG*TGATA (x24) * - * Cons for no-del is Cs not Gs: + * Cons for no-del is Cs not Gs. Cannot trust it, so use N if shallow. * CON: AGCTACNAGGGTGATA */ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, - char *ref_sample, int left, int right, + int left, int right, int sample, int type, int biggest_del, int *left_shift, int *right_shift) { int (*cons_base)[6] = calloc(right - left + 1, sizeof(*cons_base));// single base or del @@ -507,33 +401,6 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, int i, j, k, s = sample; - // Seed cons_base with ref so lack of data still aligns against the ref. - // fprintf(stderr, "ref=%.*s\n", right-left, ref+left); - - // FIXME: do this at end, and use ref_sample instead of ref. - // We add a figure based on the total depth. - // Eg if we added 50 out of 100 then our consensus is probably fine. - // If we added 2 out of 100 then they're problably erroneous, so we - // don't want to start calling them or Ns. Use ref_sample instead. 
- // So the amount we add should be a proportion of the amount we didn't - // include in our consensus. Eg MAX(0, depth - Nused*2). - - // FIXME: maybe this no longer matters now we have ref_base[]. - // ~20MB of chr1 showed 0.6% more FN (vs FN not FN+TP) and - // identical FP/GT errs. Maybe not worth the effort of retaining - // ref_sample creation, but explore why this is and if tweaking - // elsewhere helps instead, eg the proportion of ref_base (rfract). -//#define REF_SEED 1 -// for (i = left; i < right; i++) { -// switch(ref[i]) { -// case 'A': cons_base[i-left][0]+=REF_SEED; break; -// case 'C': cons_base[i-left][1]+=REF_SEED; break; -// case 'G': cons_base[i-left][2]+=REF_SEED; break; -// case 'T': cons_base[i-left][3]+=REF_SEED; break; -// default: cons_base[i-left][4]+=REF_SEED; break; -// } -// } - // cons_ins sequence is the insertion seq followed by the // next match base #define INS_PLUS_BASE @@ -662,7 +529,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // fprintf(stderr, " %s\n", bam_get_qname(p->b)); } - // Expand cons_base to include depth from ref_sample. + // Expand cons_base to include depth from ref_base/ref_ins // Caveat: except at pos itself, where true ref is used if type != 0 for (i = 0; i < right-left; i++) { // Total observed depth @@ -681,24 +548,36 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, r += ref_ins[i].freq[j]; } + // When evaluating this particular indel, we don't want to + // penalise alignments by SNP errors elsewhere. This can + // happen when we have low depth for a particular 'type'. + // + // So add in a little data from ref_base/ref_ins. double rfract = (r - t*2)*.75 / (r+1); + +// // We ensure this is at least 1 fold deep, and we try to add +// // no more than the amount of coverage in this consesnsus. +// double rfract = (MIN(r, t*3.333+1) - t*2)*.75 / (r+1); + //rfract*=.5; // -FN +FP/GT. Which poison do we want? 
- if (rfract < 1.01 / (r+1e-10))// compensate for REF_SEED=0 above: RF0b - rfract = 1.01 / (r+1e-10); + if (rfract < 1.01 / (r+1e-10)) + rfract = 1.01 / (r+1e-10); // low depth compensation + + // TODO: consider limiting rfract so we never drown out the + // signal. We want to use the remaining data only to correct + // for sequencing errors in low depth alleles. If we get + // conflicts, it's better to use N than to change a base + // incase that variant is genuine. if (1 || rfract > 0) { // && !(type == 0 && i+left == pos)) { - //if (i+left >= pos+1 && i+left <= pos+1-(type<0?type+1:0)) { if (i+left >= pos+1 && i+left <= pos+1-biggest_del) { -// int rem = rfract * n_plp[s]; -// fprintf(stderr, "rfract=%f rem=%d type=%d, t=%d r=%d\n", rfract, rem, type, t, r); -// switch(ref_sample[i]) { -// case 1: cons_base[i][0] += rem; break; // A -// case 2: cons_base[i][1] += rem; break; // C -// case 4: cons_base[i][2] += rem; break; // G -// case 8: cons_base[i][3] += rem; break; // T -// default:cons_base[i][4] += rem; break; // N -// } + // We're overlapping the current indel region, so + // we don't wish to bring in evidence from the other + // "type" data. + continue; // 10f+ } else { + // Otherwise add in a portion of other data to + // boost low population numbers. cons_base[i][0] += rfract * ref_base[i][0]; cons_base[i][1] += rfract * ref_base[i][1]; cons_base[i][2] += rfract * ref_base[i][2]; @@ -735,9 +614,6 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, char **cons = malloc((max_len+1)*2 + sizeof(char *)*2); cons[0] = (char *)&cons[2]; cons[1] = cons[0] + max_len+1; -// char *cons = malloc(max_len+1); - - // FIXME: helps sometimes, harms others // Merge insertions where they are the same length but different // sequences. 
@@ -1048,14 +924,6 @@ static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp, return inscns; } -#ifndef MIN -# define MIN(a,b) ((a)<(b)?(a):(b)) -#endif - -#ifndef MAX -# define MAX(a,b) ((a)>(b)?(a):(b)) -#endif - // Part of bcf_call_gap_prep. // // Realign using BAQ to get an alignment score of a single read vs @@ -1115,11 +983,44 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, } type = abs(type); - apf.bw = type + 3; + apf.bw = type + 3; // or abs(l_ref - l_query), so we want to keep similar int l, sc1, sc2; const uint8_t *qual = bam_get_qual(p->b), *bq; uint8_t *qq; + // Trim poly_Ns at ends of ref. + // This helps to keep len(ref) and len(query) similar, to reduce + // band size and reduce the chance of -ve BAQ scores. + + // FIXME Maybe instead of l>ABS(type) it should be l>query_len/2 ? + // TODO: no difference to result, but what difference is there to + // speed? Is this worth it? + for (l = 0; l < tend1-tbeg && l < tend2-tbeg; l++) + if (ref1[l + tbeg-left] != 4 || ref2[l + tbeg-left] != 4) + break; + if (l > ABS(type)) { + fprintf(stderr, "Prune %d N to left\n", l-ABS(type)); + tbeg += l-ABS(type); + } + + for (l = tend1-tbeg-1; l >= 0; l--) + if (ref1[l + tbeg-left] != 4) + break; + l = tend1-tbeg-1 - l; + if (l > ABS(type)) { + fprintf(stderr, "Prune %d N to right 1\n", l-ABS(type)); + tend1 -= l-ABS(type); + } + + for (l = tend2-tbeg-1; l >= 0; l--) + if (ref2[l + tbeg-left] != 4) + break; + l = tend2-tbeg-1 - l; + if (l > ABS(type)) { + fprintf(stderr, "Prune %d N to right 2\n", l-ABS(type)); + tend2 -= l-ABS(type); + } + // Get segment of quality, either ZQ tag or if absent QUAL. 
if (!(qq = (uint8_t*) calloc(qend - qbeg, 1))) return -1; @@ -1175,6 +1076,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, sc1 = INT_MAX; // skip #if 1 +#define CONS_DEBUG fprintf(stderr, "\nref1: "); for (int j = 0; j < tend1-tbeg; j++) putc("ACGTN"[(uint8_t)ref1[j+tbeg-left]], stderr); @@ -1187,7 +1089,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, for (int j = 0; j < qend-qbeg; j++) putc("ACGTN"[(uint8_t)query[j]], stderr); putc('\n', stderr); - fprintf(stderr, "sc1=%d sc2=%d\n\n", sc1, sc2); + fprintf(stderr, "sc1 %-9d sc2 %-9d ", sc1, sc2); #endif if (sc1 < 0 && sc2 < 0) { @@ -1236,8 +1138,9 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, if (elt->start <= qpos && elt->end >= qpos) { iscore += (elt->end-elt->start) / elt->rep_len; // c if (elt->start+tbeg <= r_start || - elt->end+tbeg >= r_end) + elt->end+tbeg >= r_end) { iscore += 2*(elt->end-elt->start); + } } DL_DELETE(reps, elt); @@ -1247,6 +1150,9 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, // Apply STR score to existing indelQ l = (*score&0xff)*.8 + iscore*2; *score = (*score & ~0xff) | MIN(255, l); +#ifdef CONS_DEBUG + fprintf(stderr, " iscore %-4d l %d %d..%d\n\n", iscore, l, r_start, r_end); +#endif free(qq); @@ -1375,12 +1281,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, { if (ref == 0 || bca == 0) return -1; - int i, s, j, k, t, n_types, *types, max_rd_len, left, right, max_ins; + int i, s, t, n_types, *types, max_rd_len, left, right, max_ins; int *score, max_ref2; int N, K, l_run, ref_type, n_alt; - char *inscns = 0, *ref1, *ref2, *query, **ref_sample; - - // FIXME: Does 2 references help? 
+ char *inscns = 0, *ref1, *ref2, *query; // determine if there is a gap for (s = N = 0; s < n; ++s) { @@ -1520,17 +1424,6 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // FIXME: add fudge to permit some extra neighbouring indels max_ref2 += 50; - /* The following call fixes a long-existing flaw in the INDEL - * calling model: the interference of nearby SNPs. However, it also - * reduces the power because sometimes, substitutions caused by - * indels are not distinguishable from true mutations. Multiple - * sequence realignment helps to increase the power. - * - * Masks mismatches present in at least 70% of the reads with 'N'. - */ - ref_sample = bcf_cgp_ref_sample(n, n_plp, plp, pos, bca, ref, left, right, - max_ref2); - // The length of the homopolymer run around the current position l_run = bcf_cgp_l_run(ref, pos); @@ -1594,11 +1487,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, char **tcons, *cp; int left_shift, right_shift; tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, - NULL,//ref_sample[s], left, right, s, types[t], biggest_del, &left_shift, &right_shift); - fprintf(stderr, "Cons0 (%2d) %d/%d %s\n", left_shift, t, s, tcons[0]); - fprintf(stderr, "Cons1 (%2d) %d/%d %s\n", left_shift, t, s, tcons[1]); + fprintf(stderr, "Cons0 @ %d %d %s\n", pos, types[t], tcons[0]); + fprintf(stderr, "Cons1 @ %d %d %s\n", pos, types[t], tcons[1]); // FIXME: map from ascii to 0,1,2,3,4. // This is only needed because bcf_cgp_consensus is reporting in ASCII @@ -1617,32 +1509,6 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, tcon_len[cnum] = cp-tcons[cnum]; } - // Construct ref2 from ref_sample, inscns and indels. - // This is now the true sample consensus (possibly prepended - // and appended with reference if sample data doesn't span - // the full length). 
- for (k = 0, j = left; j <= pos; ++j) - ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; - - if (types[t] <= 0) - j += -types[t]; - else - for (l = 0; l < types[t]; ++l) - ref2[k++] = inscns[t*max_ins + l]; - - for (; j < right && ref[j] && k < right-left; ++j) - ref2[k++] = seq_nt16_int[(int)ref_sample[s][j-left]]; - for (; k < max_ref2; ++k) - ref2[k] = 4; - - if (right > j) - right = j; - -// fprintf(stderr, "ConsR (##) ?/? "); -// for (i = 0; i < right-left+(types[t]>0?types[t]:0); i++) -// putc("ACGTN"[(uint8_t)ref2[i]], stderr); -// putc('\n', stderr); - // original consensus method //memcpy(ref1, ref2, right-left+(types[t]>0?types[t]:0)); memcpy(ref1, tcons[1], MIN(tcon_len[1], max_ref2)); @@ -1665,16 +1531,12 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, if (left_shift > 0) { memset(ref2, 4/*N*/, MIN(left_shift, max_ref2)); ref2_pos += MIN(left_shift, max_ref2); -// rright += MIN(left_shift, max_ref2); -// if (rright-left > max_ref2) -// rright = left+max_ref2; } memcpy(ref2 + ref2_pos, tcons[0], MIN(tcon_len[0], max_ref2-ref2_pos)); ref2_pos += MIN(tcon_len[0], max_ref2-ref2_pos); if (right_shift > 0) { memset(ref2 + ref2_pos, 4/*N*/, MIN(right_shift, max_ref2-ref2_pos)); -// rright += MIN(right_shift, max_ref2-ref2_pos); } // fprintf(stderr, "TYPE %d = %2d\t", t, types[t]); @@ -1842,12 +1704,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, free(ref2); free(query); free(score); - - for (i = 0; i < n; ++i) - free(ref_sample[i]); - - free(ref_sample); - free(types); free(inscns); + free(types); + free(inscns); return n_alt > 0? 0 : -1; } From 68485fb8d23b39e478a533c7ba47a2b1bdf0d7a6 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 4 Mar 2022 10:25:15 +0000 Subject: [PATCH 16/31] Give deletions the same treatment as insertions for consensus gen. Ie cnum==1 is always opposite of cnum==0 for hets. 
--- bam2bcf.c | 5 ++ bam2bcf_indel.c | 139 +++++++++++++++++++++++++++++++++++------------- 2 files changed, 106 insertions(+), 38 deletions(-) diff --git a/bam2bcf.c b/bam2bcf.c index bd8bc5154..160a112df 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -361,6 +361,11 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t r->ori_depth = ori_depth; // glfgen errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype + + // TODO: account for the number of unassigned reads. If depth is 50, + // but AD is 5,7 then it may look like a variant but it's probably + // should be low quality. + return n; } diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index a8e1d3c24..1f1df000b 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -5,8 +5,6 @@ - Reevaluate the two STR indel-size adjusting modes. Maybe no longer relevant -- Write deletion test perl script and evaluate consensus construction - - Explore INS_PLUS_BASE again. Prefer to disable this as it's hard to understand and doesn't work properly on reads ending on an insertion. @@ -17,6 +15,30 @@ - Consider limiting fract to never add more than current depth, so we change cons to Ns but not to another base type entirely. + +- Set BAQ band width based on maximum size of ins / del observed. Do + from *all* types, as we may realign from one type to another. + +- Trim left/right down better, as we used to. Judge this based on + summation of various types and their consensii? + +- Separate consensus het[] array into heti[] and hetd[] to cope with + varying numbers of poly-X including both + and -. + +- Consider a separate rfract for lift-over of SNPs than for indels. + SNPs is good at replacing bases with N where we're unsure on the + data. However ref_ins may cause issues with sizing? + +- Left-align indels before consensus generation. 
Eg: + + /pos being studied + AGCTGGGGGGAATCG REF + AGCT-GGGGGAATCG Seq type -1 + ACGTGGGGG-AATGCG Seq type 0 + ^ + + Type 0 cons shouldn't include the right hand del, but it's outside + of "biggest_del" window. Expand this to STR size or left-align. */ /* bam2bcf_indel.c -- indel caller. @@ -385,8 +407,6 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, int (*cons_base)[6] = calloc(right - left + 1, sizeof(*cons_base));// single base or del str_freq *cons_ins = calloc(right - left + 1, sizeof(*cons_ins)); // multi-base insertions - biggest_del = biggest_del<0?biggest_del+1:0; - // non-indel ref for all reads on this sample, rather than those just // matching type. We use this for handling the case where we have a // homozygous deletion being studied, but with 1 or 2 reads misaligned @@ -570,7 +590,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // incase that variant is genuine. if (1 || rfract > 0) { // && !(type == 0 && i+left == pos)) { - if (i+left >= pos+1 && i+left <= pos+1-biggest_del) { + if (i+left >= pos+1 && i+left < pos+1-biggest_del) { // We're overlapping the current indel region, so // we don't wish to bring in evidence from the other // "type" data. 
@@ -710,7 +730,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, for (j = 0; j < 6; j++) { // Top 2 consensus calls if (max_v < cons_base[i][j]) { - max_v2 = max_v, max_j2 = j; + max_v2 = max_v, max_j2 = max_j; max_v = cons_base[i][j], max_j = j; } else if (max_v2 < cons_base[i][j]) { max_v2 = cons_base[i][j], max_j2 = j; @@ -744,7 +764,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins) || // i == pos-left+1)) { int always_ins = - (i == pos-left+1 && type) || // current eval + (i == pos-left+1 && type>0) || // current eval max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins);// HOM int het_ins = 0; if (!always_ins && max_v_ins >= bca->min_support) { @@ -796,40 +816,79 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } // Call - if (type < 0 && i > pos-left && i <= pos-left-type) { - if (max_j != 5) - fprintf(stderr, "pos %d i %d pos-left %d type %d, max_j %d\n", - pos, i, pos-left, type, max_j); - max_v = cons_base[i][max_j = 5]; - } - - if (cnum == 0) { - if (max_v > CONS_CUTOFF*tot) { // HET or HOM - if (max_j != 5) // gap - cons[cnum][k++] = "ACGTN*"[max_j]; - else if (k < pos-left+*left_shift) - (*left_shift)--; - else - (*right_shift)++; +// if (type < 0 && i > pos-left && i <= pos-left-type) { +//// if (max_j != 5) +//// fprintf(stderr, "pos %d i %d pos-left %d type %d, max_j %d\n", +//// pos, i, pos-left, type, max_j); +// max_v = cons_base[i][max_j = 5]; +// } + + // FIXME. Sounds good, but old code is still doing better. + // Double check the biggest_del and region stuff... + + int always_del = (type < 0 && i > pos-left && i <= pos-left-type) || + cons_base[i][5] > CONS_CUTOFF2 * tot; // HOM del + int het_del = 0; + if (!always_del && cons_base[i][5] >= bca->min_support) { + // Candidate HET del. 
+ if (cnum == 0) { + het_del = cons_base[i][5] >= CONS_CUTOFF * tot; + if (i < 1024) { + if (i >= pos-left && i <= pos-left-biggest_del) + het[i] = 0; + else + het[i] = het_del + ? 1 + : (cons_base[i][5] >= .2 * tot ? -1 : 0); + } } else { - cons[cnum][k++] = 'N'; + het_del = (het[i] == -1); // HET del uncalled on cnum 0 + if (max_j == 5 && het_del == 0) { + max_v = max_v2; + max_j = max_j2; + } } + } + if (always_del || het_del) { + // Deletion + if (k < pos-left+*left_shift) + (*left_shift)--; + else + (*right_shift)++; } else { - // FIXME: use the same het[] array logic as for ins above - if (max_j == 5) { - if (max_v > CONS_CUTOFF2*tot) // HOM - ; // no need to output "*" - else - max_j = max_j2, max_v = max_v2; - } - if (max_j != 5) { - if (max_v > CONS_CUTOFF*tot) - cons[cnum][k++] = "ACGTN*"[max_j]; - else - cons[cnum][k++] = 'N'; - } + if (max_v > CONS_CUTOFF*tot) + cons[cnum][k++] = "ACGTN*"[max_j]; + else + cons[cnum][k++] = 'N'; } +// if (cnum == 0) { +// if (max_v > CONS_CUTOFF*tot) { // HET or HOM +// if (max_j != 5) // gap +// cons[cnum][k++] = "ACGTN*"[max_j]; +// else if (k < pos-left+*left_shift) +// (*left_shift)--; +// else +// (*right_shift)++; +// } else { +// cons[cnum][k++] = 'N'; +// } +// } else { +// // FIXME: use the same het[] array logic as for ins above +// if (max_j == 5) { +// if (max_v > CONS_CUTOFF2*tot) // HOM +// ; // no need to output "*" +// else +// max_j = max_j2, max_v = max_v2; +// } +// if (max_j != 5) { +// if (max_v > CONS_CUTOFF*tot) +// cons[cnum][k++] = "ACGTN*"[max_j]; +// else +// cons[cnum][k++] = 'N'; +// } +// } + // fprintf(stderr, "\n"); } cons[cnum][k++] = '\0'; @@ -995,6 +1054,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, // FIXME Maybe instead of l>ABS(type) it should be l>query_len/2 ? // TODO: no difference to result, but what difference is there to // speed? Is this worth it? 
+#if 1 for (l = 0; l < tend1-tbeg && l < tend2-tbeg; l++) if (ref1[l + tbeg-left] != 4 || ref2[l + tbeg-left] != 4) break; @@ -1020,6 +1080,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, fprintf(stderr, "Prune %d N to right 2\n", l-ABS(type)); tend2 -= l-ABS(type); } +#endif // Get segment of quality, either ZQ tag or if absent QUAL. if (!(qq = (uint8_t*) calloc(qend - qbeg, 1))) @@ -1489,8 +1550,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, left, right, s, types[t], biggest_del, &left_shift, &right_shift); - fprintf(stderr, "Cons0 @ %d %d %s\n", pos, types[t], tcons[0]); - fprintf(stderr, "Cons1 @ %d %d %s\n", pos, types[t], tcons[1]); + fprintf(stderr, "Cons0 @ %d %4d/%3d %s\n", pos, types[t], left_shift, tcons[0]); + fprintf(stderr, "Cons1 @ %d %4d/%3d %s\n", pos, types[t], left_shift, tcons[1]); // FIXME: map from ascii to 0,1,2,3,4. // This is only needed because bcf_cgp_consensus is reporting in ASCII @@ -1624,6 +1685,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, int l = -types[t]; tbeg = tbeg - l > left? tbeg - l : left; } + if (left_shift < 0) + tbeg = tbeg + left_shift > left ? tbeg + left_shift : left; // FIXME: Why +20? tbeg-left_shift to tend+right_shift // is still insufficient. Why? Check tpos2qpos maybe? From 333f203f41c81ae44e5d52fd7fea37a819bbe7b3 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 8 Mar 2022 09:31:52 +0000 Subject: [PATCH 17/31] Minor tidying plus tweak STR iscore code --- bam2bcf_indel.c | 68 ++++++++++++------------------------------------- 1 file changed, 16 insertions(+), 52 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 1f1df000b..25c6b2b71 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -12,6 +12,8 @@ - Explore indelQ and the effect of STR at boundaries. I'm not convined our quality calculation is correct. 
Certainly QUAL appears to have little reality with actual indel likelihood! + It's already there - see end of bcf_cgp_align_score. + However try tweaking this now we've got better consensus. - Consider limiting fract to never add more than current depth, so we change cons to Ns but not to another base type entirely. @@ -22,12 +24,10 @@ - Trim left/right down better, as we used to. Judge this based on summation of various types and their consensii? -- Separate consensus het[] array into heti[] and hetd[] to cope with - varying numbers of poly-X including both + and -. - - Consider a separate rfract for lift-over of SNPs than for indels. SNPs is good at replacing bases with N where we're unsure on the data. However ref_ins may cause issues with sizing? + rfract*.8 is working better (so far). Trying 0.5 too. - Left-align indels before consensus generation. Eg: @@ -721,7 +721,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // Het call filled out in cnum==0 (+ve or -ve) // Used in cnum==1 to do the opposite of whichever way we did before. - int het[1024] = {0}; + int heti[1024] = {0}, hetd[1024] = {0}; for (cnum = 0; cnum < 2; cnum++) { for (i = k = 0; i < right-left; i++) { @@ -771,11 +771,11 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // Candidate HET ins. if (cnum == 0) { het_ins = max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins); - if (i < 1024) het[i] = het_ins + if (i < 1024) heti[i] = het_ins ? 1 : (max_v_ins > .2*(tot+tot_ins) ? 
-1:0); } else { - het_ins = (het[i] == -1); // HET but uncalled before + het_ins = (heti[i] == -1); // HET but uncalled before } } // if (max_v_ins) @@ -815,19 +815,9 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, #endif } - // Call -// if (type < 0 && i > pos-left && i <= pos-left-type) { -//// if (max_j != 5) -//// fprintf(stderr, "pos %d i %d pos-left %d type %d, max_j %d\n", -//// pos, i, pos-left, type, max_j); -// max_v = cons_base[i][max_j = 5]; -// } - - // FIXME. Sounds good, but old code is still doing better. - // Double check the biggest_del and region stuff... - - int always_del = (type < 0 && i > pos-left && i <= pos-left-type) || - cons_base[i][5] > CONS_CUTOFF2 * tot; // HOM del + // Call deletions + int always_del = (type < 0 && i > pos-left && i <= pos-left-type) + || cons_base[i][5] > CONS_CUTOFF2 * tot; // HOM del int het_del = 0; if (!always_del && cons_base[i][5] >= bca->min_support) { // Candidate HET del. @@ -835,14 +825,14 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, het_del = cons_base[i][5] >= CONS_CUTOFF * tot; if (i < 1024) { if (i >= pos-left && i <= pos-left-biggest_del) - het[i] = 0; + hetd[i] = 0; else - het[i] = het_del + hetd[i] = het_del ? 1 : (cons_base[i][5] >= .2 * tot ? 
-1 : 0); } } else { - het_del = (het[i] == -1); // HET del uncalled on cnum 0 + het_del = (hetd[i] == -1); // HET del uncalled on cnum 0 if (max_j == 5 && het_del == 0) { max_v = max_v2; max_j = max_j2; @@ -856,40 +846,12 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, else (*right_shift)++; } else { + // Finally the easy case - a non-indel base or an N if (max_v > CONS_CUTOFF*tot) cons[cnum][k++] = "ACGTN*"[max_j]; else cons[cnum][k++] = 'N'; } - -// if (cnum == 0) { -// if (max_v > CONS_CUTOFF*tot) { // HET or HOM -// if (max_j != 5) // gap -// cons[cnum][k++] = "ACGTN*"[max_j]; -// else if (k < pos-left+*left_shift) -// (*left_shift)--; -// else -// (*right_shift)++; -// } else { -// cons[cnum][k++] = 'N'; -// } -// } else { -// // FIXME: use the same het[] array logic as for ins above -// if (max_j == 5) { -// if (max_v > CONS_CUTOFF2*tot) // HOM -// ; // no need to output "*" -// else -// max_j = max_j2, max_v = max_v2; -// } -// if (max_j != 5) { -// if (max_v > CONS_CUTOFF*tot) -// cons[cnum][k++] = "ACGTN*"[max_j]; -// else -// cons[cnum][k++] = 'N'; -// } -// } - - // fprintf(stderr, "\n"); } cons[cnum][k++] = '\0'; } @@ -1200,7 +1162,9 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, iscore += (elt->end-elt->start) / elt->rep_len; // c if (elt->start+tbeg <= r_start || elt->end+tbeg >= r_end) { - iscore += 2*(elt->end-elt->start); + //iscore += 2*(elt->end-elt->start); //h5 (STR2) + //iscore += 4*(elt->end-elt->start); //h5STR4 + iscore += (elt->end-elt->start); //h5STR1 } } From c450882ce87e3a5cca5c17b7c0bb281320e6fae7 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 8 Mar 2022 12:52:04 +0000 Subject: [PATCH 18/31] Disable INS_PLUS_BASE, plus a start on better band calc. The INS_PLUS_BASE define is complicated code, and validation now shows the differences are very marginal. So disabled for now and will cull soon so we only have the simpler variant left. 
--- bam2bcf_indel.c | 54 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 25c6b2b71..a09db3591 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -403,7 +403,8 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, int left, int right, int sample, int type, int biggest_del, - int *left_shift, int *right_shift) { + int *left_shift, int *right_shift, + int *band) { int (*cons_base)[6] = calloc(right - left + 1, sizeof(*cons_base));// single base or del str_freq *cons_ins = calloc(right - left + 1, sizeof(*cons_ins)); // multi-base insertions @@ -423,11 +424,12 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // cons_ins sequence is the insertion seq followed by the // next match base -#define INS_PLUS_BASE +//#define INS_PLUS_BASE + int last_base_ins = 0; // Accumulate sequences into cons_base and cons_ins arrays - int last_base_ins = 0; + int local_band_max = 0; // maximum absolute deviation from diagonal for (i = 0; i < n_plp[s]; i++) { const bam_pileup1_t *p = plp[s] + i; // if (p->indel != type) @@ -442,6 +444,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, uint8_t *seq = bam_get_seq(b); last_base_ins = 0; + int local_band = 0; // current deviation from diagonal for (k = 0; k < b->core.n_cigar; ++k) { int op = cigar[k] & BAM_CIGAR_MASK; int len = cigar[k] >> BAM_CIGAR_SHIFT; @@ -488,6 +491,9 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } case BAM_CINS: { + local_band += p->indel; + if (local_band_max < local_band) + local_band_max = local_band; // if (p->indel != type) { // y += len; // for when adding to ref_base // break; @@ -531,6 +537,10 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } case BAM_CDEL: + local_band += p->indel; + if (local_band_max < -local_band) + local_band_max 
= -local_band; + // FIXME, not perfect for I/D combos, but likely sufficient. last_base_ins = 0; for (j = 0; j < len; j++, x++) { @@ -546,6 +556,9 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, break; } } + // Also track the biggest deviation +/- from diagonal + if (*band < local_band_max) + *band = local_band_max; // fprintf(stderr, " %s\n", bam_get_qname(p->b)); } @@ -760,20 +773,24 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // NB: tot is based on next matching base, so it includes // everything with or without the insertion. #ifdef INS_PLUS_BASE + int tot_sum = tot+tot_ins; +#else + int tot_sum = tot; +#endif // if (max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins) && (cnum==0 || // max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins) || // i == pos-left+1)) { int always_ins = (i == pos-left+1 && type>0) || // current eval - max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins);// HOM + max_v_ins > CONS_CUTOFF_INC2*tot_sum;// HOM int het_ins = 0; if (!always_ins && max_v_ins >= bca->min_support) { // Candidate HET ins. if (cnum == 0) { - het_ins = max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins); + het_ins = max_v_ins > CONS_CUTOFF_INC * tot_sum; if (i < 1024) heti[i] = het_ins ? 1 - : (max_v_ins > .2*(tot+tot_ins) ? -1:0); + : (max_v_ins > .2*tot_sum ? 
-1:0); } else { het_ins = (heti[i] == -1); // HET but uncalled before } @@ -781,16 +798,12 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // if (max_v_ins) // fprintf(stderr, "Cons @ %d: type %d cnum %d always %d het_ins %d // max_v %d vs %d+%d\n", i, type, cnum, always_ins, het_ins, max_v_ins, tot, tot_ins); if (always_ins || het_ins) { -// if ((i == pos-left+1 && type) || // current 'type' at pos -// max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins) || // HOM -// (max_v_ins > bca->min_support && -// (cnum != 0) ^ (max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins)))) { // HET -#else - if ((i == pos-left+1 && type) || // current 'type' at pos - max_v_ins > CONS_CUTOFF_INC2*tot || // HOM - (max_v_ins > bca->min_support && - (cnum != 0) ^ max_v_ins > CONS_CUTOFF_INC*tot)) { // HET -#endif +// #else +// if ((i == pos-left+1 && type) || // current 'type' at pos +// max_v_ins > CONS_CUTOFF_INC2*tot || // HOM +// (max_v_ins > bca->min_support && +// (cnum != 0) ^ max_v_ins > CONS_CUTOFF_INC*tot)) { // HET +// #endif if (max_v_ins > CONS_CUTOFF_INS*tot_ins) { // Insert bases for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) { @@ -1479,9 +1492,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // insertion and no small hypothesised one. 
int biggest_del = 0; - for (t = 0; t < n_types; t++) + int biggest_ins = 0; + for (t = 0; t < n_types; t++) { if (biggest_del > types[t]) biggest_del = types[t]; + if (biggest_ins < types[t]) + biggest_ins = types[t]; + } + int band = biggest_ins - biggest_del; // NB del is -ve for (t = 0; t < n_types; ++t) { int l, ir; @@ -1513,7 +1531,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, int left_shift, right_shift; tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, left, right, s, types[t], biggest_del, - &left_shift, &right_shift); + &left_shift, &right_shift, &band); fprintf(stderr, "Cons0 @ %d %4d/%3d %s\n", pos, types[t], left_shift, tcons[0]); fprintf(stderr, "Cons1 @ %d %4d/%3d %s\n", pos, types[t], left_shift, tcons[1]); From bc9cad07774993f80673ee288e68384c7cc89552 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 9 Mar 2022 16:46:54 +0000 Subject: [PATCH 19/31] Improve STR finder to cope with longer repeat units --- str_finder.c | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/str_finder.c b/str_finder.c index 800cbfef9..79bd31c35 100644 --- a/str_finder.c +++ b/str_finder.c @@ -1,7 +1,7 @@ /* str_finder.c -- Short Tandem Repeat finder. Originally from Crumble (https://github.com/jkbonfield/crumble) - Copyright (C) 2015-2016, 2021 Genome Research Ltd. + Copyright (C) 2015-2016, 2021-2022 Genome Research Ltd. 
Author: James Bonfield @@ -139,10 +139,10 @@ static void add_rep(rep_ele **list, char *cons, int clen, int pos, int rlen, */ rep_ele *find_STR(char *cons, int len, int lower_only) { int i, j; - uint32_t w = 0; + uint64_t w = 0; rep_ele *reps = NULL; - for (i = j = 0; i < len && j < 15; i++) { + for (i = j = 0; i < len && j < 26; i++) { if (cons[i] == '*') continue; w <<= 2; @@ -162,6 +162,18 @@ rep_ele *find_STR(char *cons, int len, int lower_only) { add_rep(&reps, cons, len, i, 6, lower_only, w); if (j>=13 && (w&0x3fff) == ((w>>14)&0x3fff)) add_rep(&reps, cons, len, i, 7, lower_only, w); + if (j>=15 && (w&0xffff) == ((w>>16)&0xffff)) + add_rep(&reps, cons, len, i, 8, lower_only, w); + if (j>=17 && (w&0x003ffff) == ((w>>18)&0x003ffff)) + add_rep(&reps, cons, len, i, 9, lower_only, w); + if (j>=19 && (w&0x00fffff) == ((w>>20)&0x00fffff)) + add_rep(&reps, cons, len, i,10, lower_only, w); + if (j>=21 && (w&0x03fffff) == ((w>>22)&0x03fffff)) + add_rep(&reps, cons, len, i,11, lower_only, w); + if (j>=23 && (w&0x0ffffff) == ((w>>24)&0x0ffffff)) + add_rep(&reps, cons, len, i,12, lower_only, w); + if (j>=24 && (w&0x3ffffff) == ((w>>26)&0x3ffffff)) + add_rep(&reps, cons, len, i,13, lower_only, w); j++; } @@ -172,7 +184,19 @@ rep_ele *find_STR(char *cons, int len, int lower_only) { w <<= 2; w |= cons[i]; //printf("%3d %c w=%08x\n", i, cons[i], w); - if ((w&0xffff) == ((w>>16)&0xffff)) + if ((w&0xfffffff) == ((w>>28)&0xfffffff)) + add_rep(&reps, cons, len, i, 14, lower_only, w); + else if ((w&0x3ffffff) == ((w>>26)&0x3ffffff)) + add_rep(&reps, cons, len, i, 13, lower_only, w); + else if ((w&0x0ffffff) == ((w>>24)&0x0ffffff)) + add_rep(&reps, cons, len, i, 12, lower_only, w); + else if ((w&0x03fffff) == ((w>>22)&0x03fffff)) + add_rep(&reps, cons, len, i, 11, lower_only, w); + else if ((w&0x00fffff) == ((w>>20)&0x00fffff)) + add_rep(&reps, cons, len, i, 10, lower_only, w); + else if ((w&0x003ffff) == ((w>>18)&0x003ffff)) + add_rep(&reps, cons, len, i, 9, lower_only, w); + else 
if ((w&0xffff) == ((w>>16)&0xffff)) add_rep(&reps, cons, len, i, 8, lower_only, w); else if ((w&0x3fff) == ((w>>14)&0x3fff)) add_rep(&reps, cons, len, i, 7, lower_only, w); From fa8145d5b10682df404a0920beebd4186ceecefa Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 9 Mar 2022 16:47:43 +0000 Subject: [PATCH 20/31] Improvements for long reads. - local_band is computed only over left..right part of alignment instead of from the entire CIGAR string. - Fix for p->indel == type matching. This isn't sufficient to filter out when we're in the deletion being studied if it starts eg 1 base earlier, where p->indel is now zero, but p->is_del is true. - Use band size computed from all observed indels and STRs, not "type". - Replace bca->indel_win_size/2 for long reads with an indel/STR aware limit. - Replace the tbeg/tend calculations with an indel/STR aware setting. --- bam2bcf_indel.c | 196 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 152 insertions(+), 44 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index a09db3591..9106c6794 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -1,3 +1,5 @@ +#define CONS_DEBUG + /* TODO: @@ -39,6 +41,12 @@ Type 0 cons shouldn't include the right hand del, but it's outside of "biggest_del" window. Expand this to STR size or left-align. + +- Long reads cause multiple scans of CIGAR to compute consensus. + We need a way of caching CIGAR/seq start coords for pos p=left so at + pos P where P>p we can start at p and continue instead of from the + start each time. + */ /* bam2bcf_indel.c -- indel caller. 
@@ -491,9 +499,11 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } case BAM_CINS: { - local_band += p->indel; - if (local_band_max < local_band) - local_band_max = local_band; + if (x >= left && x < right) { + local_band += p->indel; + if (local_band_max < local_band) + local_band_max = local_band; + } // if (p->indel != type) { // y += len; // for when adding to ref_base // break; @@ -537,9 +547,11 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } case BAM_CDEL: - local_band += p->indel; - if (local_band_max < -local_band) - local_band_max = -local_band; + if (x >= left && x < right) { + local_band += p->indel; + if (local_band_max < -local_band) + local_band_max = -local_band; + } // FIXME, not perfect for I/D combos, but likely sufficient. last_base_ins = 0; @@ -547,7 +559,9 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (x < left) continue; if (x >= right) break; // fputc('-', stderr); - if (p->indel == type) + //if (p->indel == type) + if ((p->indel == type && !p->is_del) || // starts here + (p->indel == 0 && p->is_del && len == type)) // to left // fixme: not p->indel==type but x==pos+1 cons_base[x-left][5]++; else @@ -822,8 +836,8 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) cons[cnum][k++] = 'N'; } - // don't call next base as included in insertion #ifdef INS_PLUS_BASE + // don't call next base as included in insertion continue; #endif } @@ -862,8 +876,10 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // Finally the easy case - a non-indel base or an N if (max_v > CONS_CUTOFF*tot) cons[cnum][k++] = "ACGTN*"[max_j]; - else + else if (max_v > 0) cons[cnum][k++] = 'N'; + else + cons[cnum][k++] = ref[left+k]; } } cons[cnum][k++] = '\0'; @@ -997,7 +1013,8 @@ static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp, // Fills out score // Returns 0 on success, 
// <0 on error -static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, +static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, + int type, int band, uint8_t *ref1, uint8_t *ref2, uint8_t *query, int r_start, int r_end, int long_read, int tbeg, int tend1, int tend2, @@ -1017,7 +1034,10 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, } type = abs(type); - apf.bw = type + 3; // or abs(l_ref - l_query), so we want to keep similar + if (band > (qend-qbeg)/2-3) + band = (qend-qbeg)/2-3; + apf.bw = band + 3; // or abs(l_ref - l_query), so we want to keep similar + int l, sc1, sc2; const uint8_t *qual = bam_get_qual(p->b), *bq; uint8_t *qq; @@ -1034,7 +1054,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, if (ref1[l + tbeg-left] != 4 || ref2[l + tbeg-left] != 4) break; if (l > ABS(type)) { - fprintf(stderr, "Prune %d N to left\n", l-ABS(type)); +// fprintf(stderr, "Prune %d N to left\n", l-ABS(type)); tbeg += l-ABS(type); } @@ -1043,7 +1063,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, break; l = tend1-tbeg-1 - l; if (l > ABS(type)) { - fprintf(stderr, "Prune %d N to right 1\n", l-ABS(type)); +// fprintf(stderr, "Prune %d N to right 1\n", l-ABS(type)); tend1 -= l-ABS(type); } @@ -1052,7 +1072,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, break; l = tend2-tbeg-1 - l; if (l > ABS(type)) { - fprintf(stderr, "Prune %d N to right 2\n", l-ABS(type)); +// fprintf(stderr, "Prune %d N to right 2\n", l-ABS(type)); tend2 -= l-ABS(type); } #endif @@ -1111,13 +1131,16 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int type, else sc1 = INT_MAX; // skip -#if 1 -#define CONS_DEBUG - fprintf(stderr, "\nref1: "); +#ifdef CONS_DEBUG + fprintf(stderr, "\nref1"); + fprintf(stderr, "%c ", + memcmp(ref1+tbeg-left, query, qend-qbeg)?':':'='); for (int j = 0; j < tend1-tbeg; j++) 
putc("ACGTN"[(uint8_t)ref1[j+tbeg-left]], stderr); putc('\n', stderr); - fprintf(stderr, "ref2: "); + fprintf(stderr, "ref2"); + fprintf(stderr, "%c ", + memcmp(ref2+tbeg-left, query, qend-qbeg)?':':'='); for (int j = 0; j < tend2-tbeg; j++) putc("ACGTN"[(uint8_t)ref2[j+tbeg-left]], stderr); putc('\n', stderr); @@ -1517,11 +1540,14 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // Identify max deletion length int max_deletion = 0; + int max_insertion = 0; for (s = 0; s < n; ++s) { for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; if (max_deletion < -p->indel) max_deletion = -p->indel; + if (max_insertion < p->indel) + max_insertion = p->indel; } } @@ -1532,8 +1558,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, left, right, s, types[t], biggest_del, &left_shift, &right_shift, &band); +#ifdef CONS_DEBUG fprintf(stderr, "Cons0 @ %d %4d/%3d %s\n", pos, types[t], left_shift, tcons[0]); fprintf(stderr, "Cons1 @ %d %4d/%3d %s\n", pos, types[t], left_shift, tcons[1]); +#endif // FIXME: map from ascii to 0,1,2,3,4. // This is only needed because bcf_cgp_consensus is reporting in ASCII @@ -1641,14 +1669,48 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // determine the start and end of sequences for alignment // FIXME: loops over CIGAR multiple times int left2 = left, right2 = right; + int min_win_size = MAX(-biggest_del, biggest_ins); + min_win_size += ABS(left_shift) + ABS(right_shift); + { + rep_ele *reps, *elt, *tmp; + reps = find_STR(tcons[0], tcon_len[0], 0); + int max_str = 0, tot_str = 0; + DL_FOREACH_SAFE(reps, elt, tmp) { + if (max_str < elt->end - elt->start) + max_str = elt->end - elt->start; + tot_str += elt->end - elt->start; + DL_DELETE(reps, elt); + free(elt); + } + + // Max_str should be enough, but it's still not + // sufficient in longer range some repeats. 
+ //min_win_size += max_str; + min_win_size += tot_str; + + fprintf(stderr, "BAND=%d STR %d %d INDEL %d %d sh %d %d", band, max_str, tot_str, biggest_del, biggest_ins, left_shift, right_shift); + } + min_win_size += 10; + fprintf(stderr, " => %d", min_win_size); if (p->b->core.l_qseq > 1000) { // long read data needs less context. It also tends to // have many more candidate indels to investigate so // speed here matters more. - if (pos - left >= bca->indel_win_size) - left2 += bca->indel_win_size/2; - if (right-pos >= bca->indel_win_size) - right2 -= bca->indel_win_size/2; +// if (pos - left >= bca->indel_win_size) +// left2 += bca->indel_win_size/2; +// if (right-pos >= bca->indel_win_size) +// right2 -= bca->indel_win_size/2; + if (pos - left >= min_win_size) + left2 = MAX(left2, pos - min_win_size); + if (right-pos >= min_win_size) + right2 = MIN(right2, pos + min_win_size); + +// if (pos - left >= min_win_size) +// left2 += bca->indel_win_size - min_win_size; +// if (right-pos >= min_win_size) +// right2 -= bca->indel_win_size - min_win_size; + + fprintf(stderr, " LR = %d / %d / %d", left2, pos, right2); } int r_start = p->b->core.pos; @@ -1656,30 +1718,33 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bam_get_cigar(p->b)) -1 + r_start; - qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left2, - 0, &tbeg); + qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), + left2/*+biggest_ins*/, 0, &tbeg); qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos, 0, &tend) - qbeg; - qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), right2, - 1, &tend); + qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), + right2/*-biggest_ins*/, 1, &tend); - if (types[t] < 0) { - int l = -types[t]; - tbeg = tbeg - l > left? tbeg - l : left; - } - if (left_shift < 0) - tbeg = tbeg + left_shift > left ? tbeg + left_shift : left; - - // FIXME: Why +20? tbeg-left_shift to tend+right_shift - // is still insufficient. Why? Check tpos2qpos maybe? 
- if (left_shift+20 > 0) - tbeg = tbeg - (left_shift+20) > left - ? tbeg - (left_shift+20) - : left; - if (right_shift+20 > 0) - tend = tend + right_shift+20 < rright - ? tend + right_shift+20 - : rright; + int old_tend = tend; + int old_tbeg = tbeg; + +// if (types[t] < 0) { +// int l = -types[t]; +// tbeg = tbeg - l > left? tbeg - l : left; +// } +// if (left_shift < 0) +// tbeg = tbeg + left_shift > left ? tbeg + left_shift : left; +// +// // FIXME: Why +20? tbeg-left_shift to tend+right_shift +// // is still insufficient. Why? Check tpos2qpos maybe? +// if (left_shift+20 > 0) +// tbeg = tbeg - (left_shift+20) > left +// ? tbeg - (left_shift+20) +// : left; +// if (right_shift+20 > 0) +// tend = tend + right_shift+20 < rright +// ? tend + right_shift+20 +// : rright; // write the query sequence for (l = qbeg; l < qend; ++l) @@ -1697,12 +1762,55 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, int tend1 = left + tcon_len[0] - (left2-left); int tend2 = left + tcon_len[1] - (left2-left); +// fprintf(stderr, "Type %d REG %d+%d / %d+%d tbeg %d+%d %d+%d/%d q %d+%d band %d\n", +// types[t], left, right-left, left2, rright-left2, +// old_tbeg, old_tend-old_tbeg, +// tbeg, tend1-tbeg, tend2-tbeg, +// r_start+qbeg, qend-qbeg, band); + + // tbeg and tend are the genomic locations equivalent + // to qbeg and qend on the sequence. + // These may being entirely within our left/right + // coordinates over which we've computed the + // consensus, or overlapping to left/right. + // + // We know an estimation of band, plus biggest indel, + // so we can trim tbeg/tend to a smaller region if we + // wish here. This speeds up BAQ scoring. + + // band+MAX(...) = h6 + // band+MAX(...)+10 = h6b + // band+MAX(...)*2+10 = h6c + // band+MAX(...)*2+20 = h6d + + // TODO: check 10h5STR1-i.30x vs 10h6.30x to look for + // differences and see what band, max_ins/del, etc are. 
+ // Can we rescue these without discarding the others, + // or is it fundamentally not a possibility? + //int wband = band + MAX(max_deletion, max_insertion) + 10; + int wband = band + MAX(max_deletion, max_insertion)*2 + 20; +// if (tend1 > old_tend + wband) +// tend1 = MIN(right2, old_tend + wband); +// if (tend2 > old_tend + wband) +// tend2 = MIN(right2, old_tend + wband); + +#if 1 + tend1 = MIN(tend1, old_tend + wband); + tend2 = MIN(tend2, old_tend + wband); + tbeg = MAX(left2, old_tbeg - wband); +#endif + + fprintf(stderr, "\nNew: %d REG %d+%d / %d+%d tbeg %d+%d %d+%d/%d q %d+%d band %d", + types[t], left, right-left, left2, rright-left2, + old_tbeg, old_tend-old_tbeg, + tbeg, tend1-tbeg, tend2-tbeg, + r_start+qbeg, qend-qbeg, band); // do realignment; this is the bottleneck. // // Note low score = good, high score = bad. if (tend > tbeg) { - if (bcf_cgp_align_score(p, bca, types[t], + if (bcf_cgp_align_score(p, bca, types[t], band, //(uint8_t *)ref1 + left2-left, //(uint8_t *)ref2 + left2-left, (uint8_t *)tcons[0] + left2-left, From e07fe35ad603a7b72bcce547a68667be0a04cfb3 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 10 Mar 2022 15:07:10 +0000 Subject: [PATCH 21/31] More tweaking of consensus. - Fixed (?) out by one error in candidate del caller: Was "i >= pos-left" and now "i > pos-left". This was incorrectly not marking the last base in a het del as used when immediately followed by an insertion (or maybe it was vice versa). - The same line as above decided to add to either cons_base vs ref_base using "biggest_del", but this fails when we have nested deletions present. Eg -10 at pos 105 and -1 at pos 107. We now track containing deletions and set a skip_to coordinate instead of relying on "biggest_del". - Alter the fraction at which we label a heterozygous indel as heterozygous from .2 to .3. 
(Shift10h7j in notes) --- bam2bcf_indel.c | 50 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 9106c6794..7fa0df75d 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -406,6 +406,11 @@ static int bcf_cgp_append_cons(str_freq *sf, char *str, int len, int freq) { * * Cons for no-del is Cs not Gs. Cannot trust it, so use N if shallow. * CON: AGCTACNAGGGTGATA + * + * There are still some problems in cons_ins vs ref_ins assignment. + * We sometimes seem multiple similar-length insertions added at + * different locations. Ideally we'd like to consider these as all + * the same insertion if the size is the same and it's comparable seq. */ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, @@ -457,6 +462,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, int op = cigar[k] & BAM_CIGAR_MASK; int len = cigar[k] >> BAM_CIGAR_SHIFT; int base; + int skip_to = 0; switch(op) { case BAM_CSOFT_CLIP: @@ -487,7 +493,10 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, #ifdef INS_PLUS_BASE if (p->indel == type) #else - if (p->indel == type || p->indel > 0) // alternative + // FIXME: or is_del && len==type as below + if (p->indel == type) // 7g + //if (p->indel == type || p->indel > 0) // 7f + //if (p->indel == type || (p->indel > 0 && x == pos+1) #endif cons_base[x-left][L[base]]++; else if (x != pos+1) // indel being assessed question @@ -534,7 +543,9 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // fprintf(stderr, "<+%.*s>", j<1024?j:1024, ins); if (x >= left && x < right) { int ilen = j<1024?j:1024; - if (p->indel == type) { + if (p->indel == type /*&& x == pos+1*/) { + // Assume any ins of the same size is the same ins. + // (This rescues misaligned insertions.) 
bcf_cgp_append_cons(&cons_ins[x-left], ins, ilen, 1); } else if (x != pos+1){ bcf_cgp_append_cons(&ref_ins[x-left], ins, ilen, 1); @@ -562,10 +573,24 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, //if (p->indel == type) if ((p->indel == type && !p->is_del) || // starts here (p->indel == 0 && p->is_del && len == type)) // to left - // fixme: not p->indel==type but x==pos+1 + // FIXME: len == -type? cons_base[x-left][5]++; - else +// else // 7h +// ref_base[x-left][5]++; + + // 7i + else if (x+len <= pos+1 || (skip_to && x > skip_to)) ref_base[x-left][5]++; + else if (x <= pos && x+len > pos+1) { + // we have a deletion which overlaps pos, but + // isn't the same "type". We don't wish to + // include these as they may bias the + // evaluation by confirming against a + // secondary consensus produced with the other + // deletion. We set a marker for how long to + // skip adding to ref_base. + skip_to = x+len; + } } break; } @@ -804,7 +829,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, het_ins = max_v_ins > CONS_CUTOFF_INC * tot_sum; if (i < 1024) heti[i] = het_ins ? 1 - : (max_v_ins > .2*tot_sum ? -1:0); + : (max_v_ins > .3*tot_sum ? -1:0); } else { het_ins = (heti[i] == -1); // HET but uncalled before } @@ -851,12 +876,12 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (cnum == 0) { het_del = cons_base[i][5] >= CONS_CUTOFF * tot; if (i < 1024) { - if (i >= pos-left && i <= pos-left-biggest_del) + if (i > pos-left && i <= pos-left-biggest_del) hetd[i] = 0; else hetd[i] = het_del ? 1 - : (cons_base[i][5] >= .2 * tot ? -1 : 0); + : (cons_base[i][5] >= .3 * tot ? 
-1 : 0); } } else { het_del = (hetd[i] == -1); // HET del uncalled on cnum 0 @@ -1692,24 +1717,15 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, } min_win_size += 10; fprintf(stderr, " => %d", min_win_size); - if (p->b->core.l_qseq > 1000) { + if (p->b->core.l_qseq > 1000) { // ||1 for 7f-long // long read data needs less context. It also tends to // have many more candidate indels to investigate so // speed here matters more. -// if (pos - left >= bca->indel_win_size) -// left2 += bca->indel_win_size/2; -// if (right-pos >= bca->indel_win_size) -// right2 -= bca->indel_win_size/2; if (pos - left >= min_win_size) left2 = MAX(left2, pos - min_win_size); if (right-pos >= min_win_size) right2 = MIN(right2, pos + min_win_size); -// if (pos - left >= min_win_size) -// left2 += bca->indel_win_size - min_win_size; -// if (right-pos >= min_win_size) -// right2 -= bca->indel_win_size - min_win_size; - fprintf(stderr, " LR = %d / %d / %d", left2, pos, right2); } From bbfadc1286c0748371824d5e4a548c85f6c05696 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 10 Mar 2022 16:32:14 +0000 Subject: [PATCH 22/31] Lots of tidying up of consensus code. Also bug fix "len==type" check in deletion. It should be "len==-type". --- bam2bcf_indel.c | 215 ++++++++++++++---------------------------------- 1 file changed, 64 insertions(+), 151 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 7fa0df75d..790700972 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -1,4 +1,4 @@ -#define CONS_DEBUG +//#define CONS_DEBUG /* @@ -7,25 +7,9 @@ - Reevaluate the two STR indel-size adjusting modes. Maybe no longer relevant -- Explore INS_PLUS_BASE again. Prefer to disable this as it's hard to - understand and doesn't work properly on reads ending on an - insertion. - -- Explore indelQ and the effect of STR at boundaries. I'm not - convined our quality calculation is correct. Certainly QUAL appears - to have little reality with actual indel likelihood! 
- It's already there - see end of bcf_cgp_align_score. - However try tweaking this now we've got better consensus. - - Consider limiting fract to never add more than current depth, so we change cons to Ns but not to another base type entirely. -- Set BAQ band width based on maximum size of ins / del observed. Do - from *all* types, as we may realign from one type to another. - -- Trim left/right down better, as we used to. Judge this based on - summation of various types and their consensii? - - Consider a separate rfract for lift-over of SNPs than for indels. SNPs is good at replacing bases with N where we're unsure on the data. However ref_ins may cause issues with sizing? @@ -47,6 +31,11 @@ pos P where P>p we can start at p and continue instead of from the start each time. +- Improve QUAL scoring to consider AD vs DP. + Eg. AD 10,8 looks good if we have 18 sequences. High qual. + But AD 10,8 looks poor if we had 50 seqs. Why did we have to discard + 32 of them? + */ /* bam2bcf_indel.c -- indel caller. @@ -418,8 +407,10 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, int sample, int type, int biggest_del, int *left_shift, int *right_shift, int *band) { - int (*cons_base)[6] = calloc(right - left + 1, sizeof(*cons_base));// single base or del - str_freq *cons_ins = calloc(right - left + 1, sizeof(*cons_ins)); // multi-base insertions + // single base or del + int (*cons_base)[6] = calloc(right - left + 1, sizeof(*cons_base)); + // multi-base insertions + str_freq *cons_ins = calloc(right - left + 1, sizeof(*cons_ins)); // non-indel ref for all reads on this sample, rather than those just // matching type. We use this for handling the case where we have a @@ -432,31 +423,19 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // improve accuracy. 
int (*ref_base)[6] = calloc(right - left + 1, sizeof(*ref_base)); str_freq *ref_ins = calloc(right - left + 1, sizeof(*ref_ins)); - int i, j, k, s = sample; -// cons_ins sequence is the insertion seq followed by the -// next match base -//#define INS_PLUS_BASE - - int last_base_ins = 0; - + //-------------------------------------------------- // Accumulate sequences into cons_base and cons_ins arrays int local_band_max = 0; // maximum absolute deviation from diagonal for (i = 0; i < n_plp[s]; i++) { const bam_pileup1_t *p = plp[s] + i; -// if (p->indel != type) -// continue; - - // fprintf(stderr, "p=%d\t%d/%d: Seq %3d of %3d\t", p->b->core.pos, s, type, i, n_plp[s]); - bam1_t *b = p->b; int x = b->core.pos; // ref coordinate int y = 0; // seq coordinate uint32_t *cigar = bam_get_cigar(b); uint8_t *seq = bam_get_seq(b); - last_base_ins = 0; int local_band = 0; // current deviation from diagonal for (k = 0; k < b->core.n_cigar; ++k) { int op = cigar[k] & BAM_CIGAR_MASK; @@ -477,32 +456,20 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, 4,0,1,4, 2,4,4,4, 3,4,4,4, 4,4,4,4 }; - // Can short-cut this with j_start and j_end based on x+len and left,right + // Can short-cut this with j_start and j_end based on + // x+len and left,right for (j = 0; j < len; j++, x++, y++) { if (x < left) continue; if (x >= right) break; -#ifdef INS_PLUS_BASE - // FIXME: need last_base_ins_type and last_base_ins_ref? - if (last_base_ins) { - last_base_ins = 0; - continue; - } -#endif base = bam_seqi(seq, y); -#ifdef INS_PLUS_BASE - if (p->indel == type) -#else // FIXME: or is_del && len==type as below if (p->indel == type) // 7g //if (p->indel == type || p->indel > 0) // 7f //if (p->indel == type || (p->indel > 0 && x == pos+1) -#endif cons_base[x-left][L[base]]++; else if (x != pos+1) // indel being assessed question ref_base[x-left][L[base]]++; - // fputc(seq_nt16_str[base], stderr); - // else last_base_ins=0? 
} break; } @@ -513,10 +480,6 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (local_band_max < local_band) local_band_max = local_band; } -// if (p->indel != type) { -// y += len; // for when adding to ref_base -// break; -// } char ins[1024]; for (j = 0; j < len; j++, y++) { @@ -531,16 +494,6 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // 5I 5M is IIIIIM M M M M events, not // {IIIII,M} M M M M choice. So we need to include the // next match in our sequence when choosing the consensus. -#ifdef INS_PLUS_BASE - if (y < b->core.l_qseq) { - base = bam_seqi(seq, y); - if (j < 1024) - ins[j++] = seq_nt16_str[base]; - } - //last_base_ins = 1; -#endif - - // fprintf(stderr, "<+%.*s>", j<1024?j:1024, ins); if (x >= left && x < right) { int ilen = j<1024?j:1024; if (p->indel == type /*&& x == pos+1*/) { @@ -550,9 +503,6 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } else if (x != pos+1){ bcf_cgp_append_cons(&ref_ins[x-left], ins, ilen, 1); } -#ifdef INS_PLUS_BASE - last_base_ins = 1; -#endif } break; } @@ -565,20 +515,12 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } // FIXME, not perfect for I/D combos, but likely sufficient. - last_base_ins = 0; for (j = 0; j < len; j++, x++) { if (x < left) continue; if (x >= right) break; - // fputc('-', stderr); - //if (p->indel == type) if ((p->indel == type && !p->is_del) || // starts here - (p->indel == 0 && p->is_del && len == type)) // to left - // FIXME: len == -type? 
+ (p->indel == 0 && p->is_del && len == -type)) // left cons_base[x-left][5]++; -// else // 7h -// ref_base[x-left][5]++; - - // 7i else if (x+len <= pos+1 || (skip_to && x > skip_to)) ref_base[x-left][5]++; else if (x <= pos && x+len > pos+1) { @@ -595,12 +537,14 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, break; } } - // Also track the biggest deviation +/- from diagonal + + // Also track the biggest deviation +/- from diagonal. We use + // this band observation in our BAQ alignment step. if (*band < local_band_max) *band = local_band_max; - // fprintf(stderr, " %s\n", bam_get_qname(p->b)); } + //-------------------------------------------------- // Expand cons_base to include depth from ref_base/ref_ins // Caveat: except at pos itself, where true ref is used if type != 0 for (i = 0; i < right-left; i++) { @@ -612,6 +556,9 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, break; t += cons_ins[i].freq[j]; } + + // Similarly for depth on the non-ALT calls (NB: not necessarily + // REF as maybe it's other ALTs). int r = ref_base[i][0] + ref_base[i][1] + ref_base[i][2] + ref_base[i][3] + ref_base[i][4] + ref_base[i][5]; for (j = 0; j < NI; j++) { @@ -627,11 +574,6 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // So add in a little data from ref_base/ref_ins. double rfract = (r - t*2)*.75 / (r+1); -// // We ensure this is at least 1 fold deep, and we try to add -// // no more than the amount of coverage in this consesnsus. -// double rfract = (MIN(r, t*3.333+1) - t*2)*.75 / (r+1); - - //rfract*=.5; // -FN +FP/GT. Which poison do we want? if (rfract < 1.01 / (r+1e-10)) rfract = 1.01 / (r+1e-10); // low depth compensation @@ -640,34 +582,33 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // for sequencing errors in low depth alleles. If we get // conflicts, it's better to use N than to change a base // incase that variant is genuine. 
+ if (i+left >= pos+1 && i+left < pos+1-biggest_del) { + // We're overlapping the current indel region, so + // we don't wish to bring in evidence from the other + // "type" data as it'll harm calling. + continue; + } else { + // Otherwise add in a portion of other data to + // boost low population numbers. + cons_base[i][0] += rfract * ref_base[i][0]; + cons_base[i][1] += rfract * ref_base[i][1]; + cons_base[i][2] += rfract * ref_base[i][2]; + cons_base[i][3] += rfract * ref_base[i][3]; + cons_base[i][4] += rfract * ref_base[i][4]; + cons_base[i][5] += rfract * ref_base[i][5]; + } - if (1 || rfract > 0) { // && !(type == 0 && i+left == pos)) { - if (i+left >= pos+1 && i+left < pos+1-biggest_del) { - // We're overlapping the current indel region, so - // we don't wish to bring in evidence from the other - // "type" data. - continue; // 10f+ - } else { - // Otherwise add in a portion of other data to - // boost low population numbers. - cons_base[i][0] += rfract * ref_base[i][0]; - cons_base[i][1] += rfract * ref_base[i][1]; - cons_base[i][2] += rfract * ref_base[i][2]; - cons_base[i][3] += rfract * ref_base[i][3]; - cons_base[i][4] += rfract * ref_base[i][4]; - cons_base[i][5] += rfract * ref_base[i][5]; - } - - for (j = 0; j < NI; j++) { - if (!ref_ins[i].str[j]) - break; - bcf_cgp_append_cons(&cons_ins[i], - ref_ins[i].str[j], ref_ins[i].len[j], - rfract * ref_ins[i].freq[j]); - } + // Similarly for insertions too; consider a different rfract here? 
+ for (j = 0; j < NI; j++) { + if (!ref_ins[i].str[j]) + break; + bcf_cgp_append_cons(&cons_ins[i], + ref_ins[i].str[j], ref_ins[i].len[j], + rfract * ref_ins[i].freq[j]); } } + //-------------------------------------------------- // Allocate consensus buffer, to worst case length int max_len = right-left; for (i = 0; i < right-left; i++) { @@ -687,6 +628,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, cons[0] = (char *)&cons[2]; cons[1] = cons[0] + max_len+1; + //-------------------------------------------------- // Merge insertions where they are the same length but different // sequences. // NB: we could just index by length and have accumulators for each, @@ -754,12 +696,13 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } } -// TODO: try CONS_CUTOFF higher, eg .6, to force more Ns? #define CONS_CUTOFF .40 // 40% needed for base vs N #define CONS_CUTOFF2 .80 // 80% needed for gap in cons[1] #define CONS_CUTOFF_INC .40 // 40% to include any insertion cons[0] #define CONS_CUTOFF_INC2 .80 // 80% to include any insertion cons[1] HOM #define CONS_CUTOFF_INS .60 // and then 60% needed for it to be bases vs N + + //-------------------------------------------------- // Walk through the frequency arrays to call the consensus. // We produce cons[0] and cons[1]. Both include strongly // homozygous indels. Both also include the indel at 'pos'. @@ -771,13 +714,12 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, *right_shift = 0; int cnum; - // Het call filled out in cnum==0 (+ve or -ve) + // Het call filled out in cnum==0 (+ve or -ve). // Used in cnum==1 to do the opposite of whichever way we did before. 
int heti[1024] = {0}, hetd[1024] = {0}; for (cnum = 0; cnum < 2; cnum++) { for (i = k = 0; i < right-left; i++) { - // fprintf(stderr, "%d\t", i); int max_v = 0, max_v2 = 0, max_j = 4, max_j2 = 4, tot = 0; for (j = 0; j < 6; j++) { // Top 2 consensus calls @@ -788,8 +730,6 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, max_v2 = cons_base[i][j], max_j2 = j; } tot += cons_base[i][j]; - // if (cons_base[i][j]) - // fprintf(stderr, "%c%d ", "ACGTN*"[j], cons_base[i][j]); } // +INS @@ -805,22 +745,13 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, //if (i != pos-left+1 || cons_ins[i].len[j] == type) max_v_ins = cons_ins[i].freq[j], max_j_ins = j; tot_ins += cons_ins[i].freq[j]; - - // fprintf(stderr, "%.*s%d ", cons_ins[i].len[j], cons_ins[i].str[j], - // cons_ins[i].freq[j]); } + // NB: tot is based on next matching base, so it includes // everything with or without the insertion. -#ifdef INS_PLUS_BASE - int tot_sum = tot+tot_ins; -#else int tot_sum = tot; -#endif -// if (max_v_ins > CONS_CUTOFF_INC *(tot+tot_ins) && (cnum==0 || -// max_v_ins > CONS_CUTOFF_INC2*(tot+tot_ins) || -// i == pos-left+1)) { int always_ins = - (i == pos-left+1 && type>0) || // current eval + (i == pos-left+1 && type>0) || // current eval max_v_ins > CONS_CUTOFF_INC2*tot_sum;// HOM int het_ins = 0; if (!always_ins && max_v_ins >= bca->min_support) { @@ -834,21 +765,11 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, het_ins = (heti[i] == -1); // HET but uncalled before } } -// if (max_v_ins) -// fprintf(stderr, "Cons @ %d: type %d cnum %d always %d het_ins %d // max_v %d vs %d+%d\n", i, type, cnum, always_ins, het_ins, max_v_ins, tot, tot_ins); + if (always_ins || het_ins) { -// #else -// if ((i == pos-left+1 && type) || // current 'type' at pos -// max_v_ins > CONS_CUTOFF_INC2*tot || // HOM -// (max_v_ins > bca->min_support && -// (cnum != 0) ^ max_v_ins > CONS_CUTOFF_INC*tot)) { // HET -// #endif if (max_v_ins > 
CONS_CUTOFF_INS*tot_ins) { // Insert bases for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) { - // FIXME: commented out to deliberate get consensus shift. - // Need to know how to get aligner working properly in that - // scenario, as it'll happen sometimes! if (cnum == 0) { if (k < pos-left+*left_shift) (*left_shift)++; @@ -861,13 +782,9 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) cons[cnum][k++] = 'N'; } -#ifdef INS_PLUS_BASE - // don't call next base as included in insertion - continue; -#endif } - // Call deletions + // Call deletions & bases int always_del = (type < 0 && i > pos-left && i <= pos-left-type) || cons_base[i][5] > CONS_CUTOFF2 * tot; // HOM del int het_del = 0; @@ -904,24 +821,26 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, else if (max_v > 0) cons[cnum][k++] = 'N'; else - cons[cnum][k++] = ref[left+k]; + cons[cnum][k] = ref[left+k], k++; } } + + // Null termination purely for ease of printing cons[cnum][k++] = '\0'; } - // fprintf(stderr, "Cons: %s\n", cons); - free(cons_base); - free(ref_base); + // FIXME: replace by string pool for rapid tidying for (i = 0; i < right-left; i++) { for (j = 0; j < NI; j++) { - // FIXME: replace by string pool if (cons_ins[i].str[j]) free(cons_ins[i].str[j]); if (ref_ins[i].str[j]) free(ref_ins[i].str[j]); } } + + free(cons_base); + free(ref_base); free(cons_ins); free(ref_ins); @@ -1079,7 +998,6 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, if (ref1[l + tbeg-left] != 4 || ref2[l + tbeg-left] != 4) break; if (l > ABS(type)) { -// fprintf(stderr, "Prune %d N to left\n", l-ABS(type)); tbeg += l-ABS(type); } @@ -1088,7 +1006,6 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, break; l = tend1-tbeg-1 - l; if (l > ABS(type)) { -// fprintf(stderr, "Prune %d N to right 1\n", l-ABS(type)); tend1 -= l-ABS(type); } @@ -1097,7 +1014,6 @@ static int 
bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, break; l = tend2-tbeg-1 - l; if (l > ABS(type)) { -// fprintf(stderr, "Prune %d N to right 2\n", l-ABS(type)); tend2 -= l-ABS(type); } #endif @@ -1712,11 +1628,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // sufficient in longer range some repeats. //min_win_size += max_str; min_win_size += tot_str; - - fprintf(stderr, "BAND=%d STR %d %d INDEL %d %d sh %d %d", band, max_str, tot_str, biggest_del, biggest_ins, left_shift, right_shift); } min_win_size += 10; - fprintf(stderr, " => %d", min_win_size); if (p->b->core.l_qseq > 1000) { // ||1 for 7f-long // long read data needs less context. It also tends to // have many more candidate indels to investigate so @@ -1816,11 +1729,11 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, tbeg = MAX(left2, old_tbeg - wband); #endif - fprintf(stderr, "\nNew: %d REG %d+%d / %d+%d tbeg %d+%d %d+%d/%d q %d+%d band %d", - types[t], left, right-left, left2, rright-left2, - old_tbeg, old_tend-old_tbeg, - tbeg, tend1-tbeg, tend2-tbeg, - r_start+qbeg, qend-qbeg, band); +// fprintf(stderr, "\nNew: %d REG %d+%d / %d+%d tbeg %d+%d %d+%d/%d q %d+%d band %d", +// types[t], left, right-left, left2, rright-left2, +// old_tbeg, old_tend-old_tbeg, +// tbeg, tend1-tbeg, tend2-tbeg, +// r_start+qbeg, qend-qbeg, band); // do realignment; this is the bottleneck. // From 1173c62e84f7529a866e1d1687a91bfdbe604674 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 11 Mar 2022 11:21:24 +0000 Subject: [PATCH 23/31] Fix a small buffer overrun introduced by changing tbeg/tend usage. Also fixed a minor memory leak (approx 36 bytes per 1MB of genome). --- bam2bcf_indel.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 790700972..06959fd98 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -5,7 +5,7 @@ TODO: - Reevaluate the two STR indel-size adjusting modes. 
- Maybe no longer relevant + Maybe no longer relevant. (Looks poor to me) - Consider limiting fract to never add more than current depth, so we change cons to Ns but not to another base type entirely. @@ -320,8 +320,10 @@ static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp, } free(aux); - if (t <= 1) + if (t <= 1) { + free(types); return NULL; + } n_types = t; // Find reference type; types[?] == 0) @@ -1066,7 +1068,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, if (tend1 != tend2 || memcmp((char *)ref1 + tbeg - left, (char *)ref2 + tbeg - left, - tend1 - tbeg + type) != 0) + tend1 - tbeg) != 0) sc1 = probaln_glocal(ref1 + tbeg - left, tend1 - tbeg, query, qend - qbeg, qq, &apf, 0, 0); else @@ -1114,7 +1116,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, rep_ele *reps, *elt, *tmp; uint8_t *seg = ref2 + tbeg - left; - int seg_len = tend2 - tbeg + type; + int seg_len = tend2 - tbeg; // Note: although seg moves (tbeg varies), ref2 is reused many times // so we could factor out some find_STR calls. 
However it's not the @@ -1638,8 +1640,6 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, left2 = MAX(left2, pos - min_win_size); if (right-pos >= min_win_size) right2 = MIN(right2, pos + min_win_size); - - fprintf(stderr, " LR = %d / %d / %d", left2, pos, right2); } int r_start = p->b->core.pos; From ac4d267acc363e238541c41cdd3f65725c91b129 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Fri, 11 Mar 2022 16:25:43 +0000 Subject: [PATCH 24/31] Tidy the consensus code --- bam2bcf_indel.c | 388 +++++++++++------------------------------------- 1 file changed, 86 insertions(+), 302 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 06959fd98..2b53abc9c 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -408,7 +408,21 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, int left, int right, int sample, int type, int biggest_del, int *left_shift, int *right_shift, - int *band) { + int *band, int *tcon_len) { + // Map ASCII ACGTN* to 012345 + static uint8_t base6[256] = { + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4, 4,4,5,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + //A C G *^ T + 4,0,4,1,4,4,4,2, 4,4,4,4,4,4,4,4, 4,4,4,4,3,3,4,4, 4,4,4,4,4,4,4,4, + 4,0,4,1,4,4,4,2, 4,4,4,4,4,4,4,4, 4,4,4,4,3,3,4,4, 4,4,4,4,4,4,4,4, + + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, + }; + // single base or del int (*cons_base)[6] = calloc(right - left + 1, sizeof(*cons_base)); // multi-base insertions @@ -453,11 +467,6 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, case BAM_CMATCH: case BAM_CEQUAL: case BAM_CDIFF: { - int L[16] = { - // 1,2,4,8 to 0,1,2,3 plus 4 for N/ambig (and 5 for gap) - 4,0,1,4, 2,4,4,4, 3,4,4,4, 
4,4,4,4 - }; - // Can short-cut this with j_start and j_end based on // x+len and left,right for (j = 0; j < len; j++, x++, y++) { @@ -465,13 +474,11 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (x >= right) break; base = bam_seqi(seq, y); - // FIXME: or is_del && len==type as below - if (p->indel == type) // 7g - //if (p->indel == type || p->indel > 0) // 7f - //if (p->indel == type || (p->indel > 0 && x == pos+1) - cons_base[x-left][L[base]]++; + if (p->indel == type) + // Convert 4-bit base ambig code to 0,1,2,3,4 range + cons_base[x-left][seq_nt16_int[base]]++; else if (x != pos+1) // indel being assessed question - ref_base[x-left][L[base]]++; + ref_base[x-left][seq_nt16_int[base]]++; } break; } @@ -486,10 +493,11 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, char ins[1024]; for (j = 0; j < len; j++, y++) { if (x < left) continue; - if (x >= right) break; + if (x >= right) + break; base = bam_seqi(seq, y); if (j < 1024) - ins[j] = seq_nt16_str[base]; + ins[j] = seq_nt16_int[base]; } // Insertions come before a ref match. @@ -516,7 +524,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, local_band_max = -local_band; } - // FIXME, not perfect for I/D combos, but likely sufficient. + // Maybe not perfect for I/D combos, but likely sufficient. for (j = 0; j < len; j++, x++) { if (x < left) continue; if (x >= right) break; @@ -647,15 +655,10 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, int l; for (l = 0; l < cons_ins[i].len[j]; l++) { - // FIXME! 
optimise this + // Append to relevant frequency counter, zero all others ins[l][0] = ins[l][1] = ins[l][2] = ins[l][3] = ins[l][4] = 0; - switch(cons_ins[i].str[j][l]) { - case 'A': ins[l][0] = cons_ins[i].freq[j]; break; - case 'C': ins[l][1] = cons_ins[i].freq[j]; break; - case 'G': ins[l][2] = cons_ins[i].freq[j]; break; - case 'T': ins[l][3] = cons_ins[i].freq[j]; break; - default: ins[l][4] = cons_ins[i].freq[j]; break; - } + uint8_t b = cons_ins[i].str[j][l]; + ins[l][b] = cons_ins[i].freq[j]; } // Merge other insertions of the same length to ins[] counters @@ -669,14 +672,8 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // Merge str[j] and str[k] for (l = 0; l < cons_ins[i].len[k]; l++) { - // FIXME! optimise this - switch(cons_ins[i].str[k][l]) { - case 'A': ins[l][0]+=cons_ins[i].freq[k]; break; - case 'C': ins[l][1]+=cons_ins[i].freq[k]; break; - case 'G': ins[l][2]+=cons_ins[i].freq[k]; break; - case 'T': ins[l][3]+=cons_ins[i].freq[k]; break; - default: ins[l][4]+=cons_ins[i].freq[k]; break; - } + uint8_t b = cons_ins[i].str[k][l]; + ins[l][b] += cons_ins[i].freq[k]; } cons_ins[i].freq[j] += cons_ins[i].freq[k]; cons_ins[i].freq[k] = 0; @@ -693,7 +690,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (max_v < ins[l][3]) max_v = ins[l][3], base = 3; if (max_v < ins[l][4]) max_v = ins[l][4], base = 4; - cons_ins[i].str[j][l] = (max_v > 0.6*tot) ?"ACGTN"[base] :'N'; + cons_ins[i].str[j][l] = (max_v > 0.6*tot) ? 
base : 4; } } } @@ -782,7 +779,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } } else { for (j = 0; j < cons_ins[i].len[max_j_ins]; j++) - cons[cnum][k++] = 'N'; + cons[cnum][k++] = 4; // 'N'; } } @@ -818,17 +815,17 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, (*right_shift)++; } else { // Finally the easy case - a non-indel base or an N + // FIXME: make cons[] in 0,1,2,3,4,5 terms if (max_v > CONS_CUTOFF*tot) - cons[cnum][k++] = "ACGTN*"[max_j]; + cons[cnum][k++] = max_j; // "ACGTN*" else if (max_v > 0) - cons[cnum][k++] = 'N'; + cons[cnum][k++] = 4; // 'N'; else - cons[cnum][k] = ref[left+k], k++; + cons[cnum][k] = base6[(uint8_t)ref[left+k]], k++; } } - // Null termination purely for ease of printing - cons[cnum][k++] = '\0'; + tcon_len[cnum] = k; } // FIXME: replace by string pool for rapid tidying @@ -869,11 +866,11 @@ static int bcf_cgp_l_run(const char *ref, int pos) { } -// Compute the consensus for this sample 's', minus indels which -// get added later. -static char *bcf_cgp_calc_cons(int n, int *n_plp, bam_pileup1_t **plp, - int pos, int *types, int n_types, - int max_ins, int s) { +// Compute the insertion consensus for this sample 's' via a basic +// majority rule +static char *bcf_cgp_calc_ins_cons(int n, int *n_plp, bam_pileup1_t **plp, + int pos, int *types, int n_types, + int max_ins, int s) { int i, j, t, k; int *inscns_aux = (int*)calloc(5 * n_types * max_ins, sizeof(int)); if (!inscns_aux) @@ -1286,7 +1283,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, if (ref == 0 || bca == 0) return -1; int i, s, t, n_types, *types, max_rd_len, left, right, max_ins; - int *score, max_ref2; + int *score; int N, K, l_run, ref_type, n_alt; char *inscns = 0, *ref1, *ref2, *query; @@ -1318,145 +1315,28 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, if (ref[i] == 0) break; right = i; - // FIXME: move to own function: STR_adj_left_right? 
- if (0) { - rep_ele *reps, *elt, *tmp; - - // Convert ASCII to 0,1,2,3 seq for find_STR usage - int j; - char ref4[1024]; // FIXME, check! - if (right > left+1024) - right = left+1024; - for (j = 0, i = left; i < right; i++, j++) { - switch(ref[i]) { - case 'A': ref4[j] = 0; break; - case 'C': ref4[j] = 1; break; - case 'G': ref4[j] = 2; break; - case 'T': ref4[j] = 3; break; - default: ref4[j] = j%4; break; // mix N across all 4 types - } - } - reps = find_STR(ref4, right-left, 0); - - //fprintf(stderr, "\nRef at %d: %.*s\n", left, right-left, ref+left); - -#if 0 - int adjusted = 1; - int over_l = pos-1; - int over_r = pos+del_size+1; - int ins_size = types[0]>0 ? types[0] : 0; - DL_FOREACH_SAFE(reps, elt, tmp) { - //fprintf(stderr, "rep %d..%d: %.*s\n", elt->start, elt->end, - // elt->end-elt->start+1, ref+left+elt->start); - if (elt->start + left < over_l && elt->end + left >= pos-1) { - over_l = elt->start + left; - //fprintf(stderr, "Adj left\n"); - adjusted=1; - } - if (elt->end + left > over_r && elt->start + left <= pos+1) { - over_r = elt->end + left; - //fprintf(stderr, "Adj right\n"); - adjusted=1; - } - //DL_DELETE(reps, elt); - //free(elt); - } - - // 2nd pass, adjusting to next STR so require 2 STRs out - if (adjusted) { - int pos_l = over_l; - int pos_r = over_r; - DL_FOREACH_SAFE(reps, elt, tmp) { - if (elt->start + left < over_l && elt->end + left >= pos_l-1) - over_l = elt->start + left; - if (elt->end + left > over_r && elt->start + left <= pos_r+1) - over_r = elt->end + left; - DL_DELETE(reps, elt); - free(elt); - } - } - //fprintf(stderr, "STR overlap = %d..(%d)..%d\n", over_l, pos, over_r); - - // FIXME adjustable param - over_l = pos - (pos-over_l)*2; - over_r = pos + (over_r-pos)*2; - //over_l -= 5+del_size+ins_size; - //over_r += 5+del_size+ins_size; - - over_l -= 5+3*(del_size+ins_size); - over_r += 5+3*(del_size+ins_size); - //fprintf(stderr, "=> overlap = %d..(%d)..%d\n", over_l, pos, over_r); - if (left < over_l) - left = over_l; - if 
(right > over_r) - right = over_r; -#else - // Too many FNs, but OK otherwise. - char str[1024] = {0}; - const int n = 3; - DL_FOREACH_SAFE(reps, elt, tmp) { - int i, i_start = MAX(elt->start-n, 0), i_end = MIN(elt->end+n, 1024); -// fprintf(stderr, "rep %d..%d: %.*s\n", elt->start, elt->end, -// elt->end-elt->start+1, ref+left+elt->start); - for (i = i_start; i < i_end; i++) - str[i]=1; - DL_DELETE(reps, elt); - free(elt); - } - int score; - for (score = 3, i = pos; i > left && score; i--) - score -= str[i-left]==0; - int left_new = i; - - for (score = 3, i = pos; i < right && score; i++) - score -= str[i-left]==0; - int right_new = i; - - fprintf(stderr, "left %d, %d, pos %d, %d, right %d\n", - left, left_new, pos, right_new, right); - - left = left_new; - right = right_new; -#endif - } - -// fprintf(stderr, "=== POS %d, left/right = len %d\n", pos, right-left); - // compute the likelihood given each type of indel for each read max_ins = types[n_types - 1]; // max_ins is at least 0 - max_ref2 = right - left + 2 + 2 * (max_ins > -types[0]? max_ins : -types[0]); - // FIXME: add fudge to permit some extra neighbouring indels - max_ref2 += 50; // The length of the homopolymer run around the current position l_run = bcf_cgp_l_run(ref, pos); // construct the consensus sequence (minus indels, which are added later) if (max_ins > 0) { - inscns = bcf_cgp_calc_cons(n, n_plp, plp, pos, - types, n_types, max_ins, s); + // FIXME: replace filling inscns[] with calc_consensus return + // so the merges of the insertion consensus for type[t] is + // reported directly. 
(It may need adjustment to avoid N) + inscns = bcf_cgp_calc_ins_cons(n, n_plp, plp, pos, + types, n_types, max_ins, s); if (!inscns) return -1; } - ref1 = (char*) calloc(max_ref2, 1); - ref2 = (char*) calloc(max_ref2, 1); query = (char*) calloc(right - left + max_rd_len + max_ins + 2, 1); score = (int*) calloc(N * n_types, sizeof(int)); bca->indelreg = 0; double nqual_over_60 = bca->nqual / 60.0; - // FIXME: need additional types, or rather to amend the type 0 case? - // - // We have types matching indel, plus type 0 which is ref. - // What about type 0 which matches consensus? - // Eg we have a small (wrong) 1bp insertion at current location, - // and a larger (correct) homozygous insertion say 10 bp away. - // - // We don't want the alignment of seqs vs wrong indel-hypothesis to be - // scoring higher than against ref. So need a consensus with the large - // insertion and no small hypothesised one. - int biggest_del = 0; int biggest_ins = 0; for (t = 0; t < n_types; t++) { @@ -1470,7 +1350,12 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, for (t = 0; t < n_types; ++t) { int l, ir; - // compute indelreg + // Compute indelreg. This is the context in the reference. Eg: + // + // REF: AG--TTTC Inscns is "TT". + // SEQ: AGTTTTTC Indelreg is 3; next 3 "TTT" bases + // + // => GTTT GTTTTT is call. if (types[t] == 0) ir = 0; else if (types[t] > 0) @@ -1481,7 +1366,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, if (ir > bca->indelreg) bca->indelreg = ir; - // Identify max deletion length + // Identify max deletion length. + // Note these are maximum sizes in the aligned data, rather + // than the maximum sizes in the types[] array (which are + // already known in biggest_del and biggest_ins). int max_deletion = 0; int max_insertion = 0; for (s = 0; s < n; ++s) { @@ -1494,71 +1382,32 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, } } +// // FIXME: does this matter? 
Try just using +// // biggest_del/biggest_ins +// max_insertion = biggest_ins; // 8b +// max_deletion = -biggest_del; + // Realignment score, computed via BAQ for (s = K = 0; s < n; ++s) { - char **tcons, *cp; + char **tcons; int left_shift, right_shift; + int tcon_len[2]; tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, left, right, s, types[t], biggest_del, - &left_shift, &right_shift, &band); + &left_shift, &right_shift, &band, + tcon_len); #ifdef CONS_DEBUG - fprintf(stderr, "Cons0 @ %d %4d/%3d %s\n", pos, types[t], left_shift, tcons[0]); - fprintf(stderr, "Cons1 @ %d %4d/%3d %s\n", pos, types[t], left_shift, tcons[1]); -#endif - - // FIXME: map from ascii to 0,1,2,3,4. - // This is only needed because bcf_cgp_consensus is reporting in ASCII - // currently, for ease of debugging. - int tcon_len[2], cnum; - for (cnum = 0; cnum < 2; cnum++) { - for (cp = tcons[cnum]; *cp; cp++) { - switch(*cp) { - case 'A': *cp = 0; break; - case 'C': *cp = 1; break; - case 'G': *cp = 2; break; - case 'T': *cp = 3; break; - default : *cp = 4; break; - } - } - tcon_len[cnum] = cp-tcons[cnum]; - } - - // original consensus method - //memcpy(ref1, ref2, right-left+(types[t]>0?types[t]:0)); - memcpy(ref1, tcons[1], MIN(tcon_len[1], max_ref2)); - if (tcon_len[1] < right-left+(types[t]>0?types[t]:0)) { - memset(ref1+tcon_len[1], 4, - right-left+(types[t]>0?types[t]:0) - tcon_len[1]); + for (j = 0; j < 2; j++) { + int k; + fprintf(stderr, "Cons%d @ %d %4d/%3d ", + pos, types[t], left_shift); + for (k = 0; k < tcon_len[j]; k++) + putc("ACGTN"[(uint8_t)tcons[j][k]], stderr); + putc('\n', stderr); } -// fprintf(stderr, "Type %d = %2d\t", t, types[t]); -// for (j = 0; j < right-left+(types[t]>0?types[t]:0); j++) -// putc("ACGTN"[(uint8_t)ref2[j]], stderr); -// putc('\n', stderr); - - // Our computed consensus may start/end in slightly different - // positions due to indels. - // We pad it out with Ns so sequences overlapping don't - // carry penalties. 
(Ideally we'd pad with the reference, but - // this suffices and it's tricky to track.) - int ref2_pos = 0; - int rright = left + tcon_len[0]; // ref left/right - if (left_shift > 0) { - memset(ref2, 4/*N*/, MIN(left_shift, max_ref2)); - ref2_pos += MIN(left_shift, max_ref2); - } - memcpy(ref2 + ref2_pos, tcons[0], MIN(tcon_len[0], max_ref2-ref2_pos)); - ref2_pos += MIN(tcon_len[0], max_ref2-ref2_pos); - if (right_shift > 0) { - memset(ref2 + ref2_pos, 4/*N*/, - MIN(right_shift, max_ref2-ref2_pos)); - } - -// fprintf(stderr, "TYPE %d = %2d\t", t, types[t]); -// for (j = 0; j < rright-left && j < max_ref2; j++) -// putc("ACGTN"[(uint8_t)ref2[j]], stderr); -// putc('\n', stderr); +#endif - // align each read to ref2 + // align each read to consensus(es) for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; @@ -1610,7 +1459,6 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, continue; // determine the start and end of sequences for alignment - // FIXME: loops over CIGAR multiple times int left2 = left, right2 = right; int min_win_size = MAX(-biggest_del, biggest_ins); min_win_size += ABS(left_shift) + ABS(right_shift); @@ -1642,11 +1490,19 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, right2 = MIN(right2, pos + min_win_size); } + // Genomic coords for first and last base of query + // alignment. This is only used in bcf_cgp_align_score + // for computing scores by looking for the proximity + // of STRs with the end of the query alignment. int r_start = p->b->core.pos; int r_end = bam_cigar2rlen(p->b->core.n_cigar, bam_get_cigar(p->b)) -1 + r_start; + // Map left2/right2 genomic coordinates to qbeg/qend + // query coordinates. The query may not span the + // entire left/right region, so this also returns the + // equivalent genomic coords for qbeg/qend in tbeg/tend. 
qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), left2/*+biggest_ins*/, 0, &tbeg); qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos, @@ -1657,24 +1513,6 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, int old_tend = tend; int old_tbeg = tbeg; -// if (types[t] < 0) { -// int l = -types[t]; -// tbeg = tbeg - l > left? tbeg - l : left; -// } -// if (left_shift < 0) -// tbeg = tbeg + left_shift > left ? tbeg + left_shift : left; -// -// // FIXME: Why +20? tbeg-left_shift to tend+right_shift -// // is still insufficient. Why? Check tpos2qpos maybe? -// if (left_shift+20 > 0) -// tbeg = tbeg - (left_shift+20) > left -// ? tbeg - (left_shift+20) -// : left; -// if (right_shift+20 > 0) -// tend = tend + right_shift+20 < rright -// ? tend + right_shift+20 -// : rright; - // write the query sequence for (l = qbeg; l < qend; ++l) query[l - qbeg] = seq_nt16_int[bam_seqi(seq, l)]; @@ -1683,20 +1521,6 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // RG platform field. int long_read = p->b->core.l_qseq > 1000; - // FIXME: we can improve (see above). - // Maybe use tbeg/tend as before, but with adjustment for - // difference between right-left and tcon_len. - // For now we just brute force it and do full ref range. - // It doesn't seem to impact on band at all. *Why?* - int tend1 = left + tcon_len[0] - (left2-left); - int tend2 = left + tcon_len[1] - (left2-left); - -// fprintf(stderr, "Type %d REG %d+%d / %d+%d tbeg %d+%d %d+%d/%d q %d+%d band %d\n", -// types[t], left, right-left, left2, rright-left2, -// old_tbeg, old_tend-old_tbeg, -// tbeg, tend1-tbeg, tend2-tbeg, -// r_start+qbeg, qend-qbeg, band); - // tbeg and tend are the genomic locations equivalent // to qbeg and qend on the sequence. 
// These may being entirely within our left/right @@ -1706,47 +1530,24 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // We know an estimation of band, plus biggest indel, // so we can trim tbeg/tend to a smaller region if we // wish here. This speeds up BAQ scoring. - - // band+MAX(...) = h6 - // band+MAX(...)+10 = h6b - // band+MAX(...)*2+10 = h6c - // band+MAX(...)*2+20 = h6d - - // TODO: check 10h5STR1-i.30x vs 10h6.30x to look for - // differences and see what band, max_ins/del, etc are. - // Can we rescue these without discarding the others, - // or is it fundamentally not a possibility? - //int wband = band + MAX(max_deletion, max_insertion) + 10; int wband = band + MAX(max_deletion, max_insertion)*2 + 20; -// if (tend1 > old_tend + wband) -// tend1 = MIN(right2, old_tend + wband); -// if (tend2 > old_tend + wband) -// tend2 = MIN(right2, old_tend + wband); - -#if 1 + int tend1 = left + tcon_len[0] - (left2-left); + int tend2 = left + tcon_len[1] - (left2-left); tend1 = MIN(tend1, old_tend + wband); tend2 = MIN(tend2, old_tend + wband); tbeg = MAX(left2, old_tbeg - wband); -#endif - -// fprintf(stderr, "\nNew: %d REG %d+%d / %d+%d tbeg %d+%d %d+%d/%d q %d+%d band %d", -// types[t], left, right-left, left2, rright-left2, -// old_tbeg, old_tend-old_tbeg, -// tbeg, tend1-tbeg, tend2-tbeg, -// r_start+qbeg, qend-qbeg, band); // do realignment; this is the bottleneck. // // Note low score = good, high score = bad. 
if (tend > tbeg) { if (bcf_cgp_align_score(p, bca, types[t], band, - //(uint8_t *)ref1 + left2-left, - //(uint8_t *)ref2 + left2-left, (uint8_t *)tcons[0] + left2-left, (uint8_t *)tcons[1] + left2-left, (uint8_t *)query, r_start, r_end, long_read, - tbeg, tend1, tend2, left2, rright, + tbeg, tend1, tend2, + left2, left + tcon_len[0], qbeg, qend, qpos, max_deletion, &score[K*n_types + t]) < 0) { score[K*n_types + t] = 0xffffff; @@ -1757,21 +1558,6 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // region entirely within a deletion (thus tend < tbeg). score[K*n_types + t] = 0xffffff; } -#if 0 - for (l = 0; l < tend - tbeg + abs(types[t]); ++l) { - if (tbeg-left+l >= max_ref2) - break; - fputc("ACGTN"[(int)ref2[tbeg-left+l]], stderr); - } - fputc('\n', stderr); - for (l = 0; l < qend - qbeg; ++l) - fputc("ACGTN"[(int)query[l]], stderr); - fputc('\n', stderr); - fprintf(stderr, "pos=%d type=%d read=%d:%d name=%s " - "qbeg=%d tbeg=%d score=%d,%d\n", - pos, types[t], s, i, bam_get_qname(p->b), - qbeg, tbeg, score[K*n_types + t]>>8, score[K*n_types + t]&0xff); -#endif } free(tcons); } @@ -1782,8 +1568,6 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, ref_type, types, n_types, score); // free - free(ref1); - free(ref2); free(query); free(score); free(types); From 5271e9c8a17da7bd505c6cae5a7e70fa6a151748 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 14 Mar 2022 10:21:05 +0000 Subject: [PATCH 25/31] Tidy up biggest_del / max_deletion. Similarly for insertion. These aren't exact duplicates, as it's overall and per-sample, but close enough and the doubling up of work was accidental. 
--- bam2bcf_indel.c | 39 +++++++++------------------------------ 1 file changed, 9 insertions(+), 30 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 2b53abc9c..e8cd0f7ee 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -1282,10 +1282,10 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, { if (ref == 0 || bca == 0) return -1; - int i, s, t, n_types, *types, max_rd_len, left, right, max_ins; - int *score; - int N, K, l_run, ref_type, n_alt; - char *inscns = 0, *ref1, *ref2, *query; + int i, s, t, n_types, *types = NULL, max_rd_len, left, right, max_ins; + int *score = NULL; + int N, K, l_run, ref_type, n_alt = -1; + char *inscns = NULL, *query = NULL; // determine if there is a gap for (s = N = 0; s < n; ++s) { @@ -1301,7 +1301,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, types = bcf_cgp_find_types(n, n_plp, plp, pos, bca, ref, &max_rd_len, &n_types, &ref_type, &N); if (!types) - return -1; + goto err; // calculate left and right boundary @@ -1366,27 +1366,6 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, if (ir > bca->indelreg) bca->indelreg = ir; - // Identify max deletion length. - // Note these are maximum sizes in the aligned data, rather - // than the maximum sizes in the types[] array (which are - // already known in biggest_del and biggest_ins). - int max_deletion = 0; - int max_insertion = 0; - for (s = 0; s < n; ++s) { - for (i = 0; i < n_plp[s]; ++i, ++K) { - bam_pileup1_t *p = plp[s] + i; - if (max_deletion < -p->indel) - max_deletion = -p->indel; - if (max_insertion < p->indel) - max_insertion = p->indel; - } - } - -// // FIXME: does this matter? 
Try just using -// // biggest_del/biggest_ins -// max_insertion = biggest_ins; // 8b -// max_deletion = -biggest_del; - // Realignment score, computed via BAQ for (s = K = 0; s < n; ++s) { char **tcons; @@ -1530,7 +1509,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // We know an estimation of band, plus biggest indel, // so we can trim tbeg/tend to a smaller region if we // wish here. This speeds up BAQ scoring. - int wband = band + MAX(max_deletion, max_insertion)*2 + 20; + int wband = band + MAX(-biggest_del, biggest_ins)*2 + 20; int tend1 = left + tcon_len[0] - (left2-left); int tend2 = left + tcon_len[1] - (left2-left); tend1 = MIN(tend1, old_tend + wband); @@ -1548,10 +1527,9 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, r_start, r_end, long_read, tbeg, tend1, tend2, left2, left + tcon_len[0], - qbeg, qend, qpos, max_deletion, + qbeg, qend, qpos, -biggest_del, &score[K*n_types + t]) < 0) { - score[K*n_types + t] = 0xffffff; - return -1; + goto err; } } else { // place holder large cost for reads that cover the @@ -1567,6 +1545,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, n_alt = bcf_cgp_compute_indelQ(n, n_plp, plp, bca, inscns, l_run, max_ins, ref_type, types, n_types, score); + err: // free free(query); free(score); From eee86f08983544218d9c3c2e8aac781f348885c0 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Mon, 14 Mar 2022 15:40:49 +0000 Subject: [PATCH 26/31] Improve l_run estQ assignment. If the sequence being inserted has differing base-calls to the flanking sequence, then est_seqQ's analysis of homopolymer runs and how they impact the score is irrelevant as the insertion isn't ambiguous anyway. This has a small reduction to FN on PacBio CCS with minimal change to FP. 
--- bam2bcf.c | 7 ++++ bam2bcf_indel.c | 99 +++++++++++++++++++++++-------------------------- 2 files changed, 54 insertions(+), 52 deletions(-) diff --git a/bam2bcf.c b/bam2bcf.c index 160a112df..1cd019f49 100644 --- a/bam2bcf.c +++ b/bam2bcf.c @@ -240,6 +240,8 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t seqQ = (3*seqQ + 2*q)/8; } if (_n > 20 && seqQ > 40) seqQ = 40; + // Note baseQ changes some output fields such as I16, but has no + // significant affect on "call". baseQ = p->aux>>8&0xff; is_diff = (b != 0); @@ -358,6 +360,11 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t for (i=0; i<4; i++) r->ADF[i] += lroundf((float)dp_ambig * r->ADF[i]/dp); } + // Else consider downgrading bca->bases[] scores by AD vs AD_ref_missed + // ratios. This is detrimental on Illumina, but beneficial on PacBio CCS. + // It's possibly related to the homopolyer error likelihoods or overall + // Indel accuracy. Maybe tie this in to the -h option? + r->ori_depth = ori_depth; // glfgen errmod_cal(bca->e, n, 5, bca->bases, r->p); // calculate PL of each genotype diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index e8cd0f7ee..38ada7831 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -1,43 +1,3 @@ -//#define CONS_DEBUG - -/* - -TODO: - -- Reevaluate the two STR indel-size adjusting modes. - Maybe no longer relevant. (Looks poor to me) - -- Consider limiting fract to never add more than current depth, so we - change cons to Ns but not to another base type entirely. - -- Consider a separate rfract for lift-over of SNPs than for indels. - SNPs is good at replacing bases with N where we're unsure on the - data. However ref_ins may cause issues with sizing? - rfract*.8 is working better (so far). Trying 0.5 too. - -- Left-align indels before consensus generation. 
Eg: - - /pos being studied - AGCTGGGGGGAATCG REF - AGCT-GGGGGAATCG Seq type -1 - ACGTGGGGG-AATGCG Seq type 0 - ^ - - Type 0 cons shouldn't include the right hand del, but it's outside - of "biggest_del" window. Expand this to STR size or left-align. - -- Long reads cause multiple scans of CIGAR to compute consensus. - We need a way of caching CIGAR/seq start coords for pos p=left so at - pos P where P>p we can start at p and continue instead of from the - start each time. - -- Improve QUAL scoring to consider AD vs DP. - Eg. AD 10,8 looks good if we have 18 sequences. High qual. - But AD 10,8 looks poor if we had 50 seqs. Why did we have to discard - 32 of them? - -*/ - /* bam2bcf_indel.c -- indel caller. Copyright (C) 2010, 2011 Broad Institute. @@ -63,6 +23,8 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +//#define CONS_DEBUG + #include #include #include @@ -126,8 +88,16 @@ static int tpos2qpos(const bam1_core_t *c, const uint32_t *cigar, int32_t tpos, return last_y; } -// FIXME: check if the inserted sequence is consistent with the homopolymer run -// l is the relative gap length and l_run is the length of the homopolymer on the reference +// l is the relative gap length and l_run is the length of the homopolymer +// on the reference. +// +// Larger seqQ is good, so increasing tandemQ calls more indels, +// and longer l_run means fewer calls. It is capped later at 255. +// For short l_runs, the qual is simply based on size of indel +// larger ones being considered more likely to be real. +// Longer indels get assigned a score based on the relative indel size +// to homopolymer, where l_run base will have already been verified by +// the caller to ensure it's compatible. 
static inline int est_seqQ(const bcf_callaux_t *bca, int l, int l_run) { int q, qh; @@ -408,7 +378,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, int left, int right, int sample, int type, int biggest_del, int *left_shift, int *right_shift, - int *band, int *tcon_len) { + int *band, int *tcon_len, int *cpos_pos) { // Map ASCII ACGTN* to 012345 static uint8_t base6[256] = { 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4, @@ -717,8 +687,13 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // Used in cnum==1 to do the opposite of whichever way we did before. int heti[1024] = {0}, hetd[1024] = {0}; + *cpos_pos = -1; for (cnum = 0; cnum < 2; cnum++) { for (i = k = 0; i < right-left; i++) { + // Location in consensus matching the indel itself + if (i >= pos-left+1 && *cpos_pos == -1) + *cpos_pos = k; + int max_v = 0, max_v2 = 0, max_j = 4, max_j2 = 4, tot = 0; for (j = 0; j < 6; j++) { // Top 2 consensus calls @@ -815,7 +790,6 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, (*right_shift)++; } else { // Finally the easy case - a non-indel base or an N - // FIXME: make cons[] in 0,1,2,3,4,5 terms if (max_v > CONS_CUTOFF*tot) cons[cnum][k++] = max_j; // "ACGTN*" else if (max_v > 0) @@ -1320,6 +1294,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // The length of the homopolymer run around the current position l_run = bcf_cgp_l_run(ref, pos); + int l_run_base = seq_nt16_table[(uint8_t)ref[pos+1]]; + int l_run_ins = 0; // construct the consensus sequence (minus indels, which are added later) if (max_ins > 0) { @@ -1371,21 +1347,38 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, char **tcons; int left_shift, right_shift; int tcon_len[2]; + int cpos_pos; tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, left, right, s, types[t], biggest_del, &left_shift, &right_shift, &band, - tcon_len); + tcon_len, &cpos_pos); #ifdef 
CONS_DEBUG - for (j = 0; j < 2; j++) { - int k; - fprintf(stderr, "Cons%d @ %d %4d/%3d ", - pos, types[t], left_shift); - for (k = 0; k < tcon_len[j]; k++) - putc("ACGTN"[(uint8_t)tcons[j][k]], stderr); - putc('\n', stderr); + { + int j; + for (j = 0; j < 2; j++) { + int k; + fprintf(stderr, "Cons%d @ %d %4d/%4d ", + j, pos, types[t], left_shift); + for (k = 0; k < tcon_len[j]; k++) { + if (k == cpos_pos) + putc('#', stderr); + putc("ACGTN"[(uint8_t)tcons[j][k]], stderr); + } + putc('\n', stderr); + } } #endif + // Scan for base-runs in the insertion. + int k = tcons[0][cpos_pos], j; + for (j = 0; j < types[t]; j++) + if (tcons[0][cpos_pos+j] != k) + break; + if (j && j == types[t]) + l_run_ins |= "\x1\x2\x4\x8\xf"[k]; // ACGTN + if (types[t] < 0) + l_run_ins |= 0xff; + // align each read to consensus(es) for (i = 0; i < n_plp[s]; ++i, ++K) { bam_pileup1_t *p = plp[s] + i; @@ -1542,6 +1535,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, } // compute indelQ + if (!(l_run_base & l_run_ins)) + l_run = 1; // different base type in ins to flanking region. n_alt = bcf_cgp_compute_indelQ(n, n_plp, plp, bca, inscns, l_run, max_ins, ref_type, types, n_types, score); From ed5458b658ebba282eae03aaaabe3e312888c80e Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 15 Mar 2022 12:29:17 +0000 Subject: [PATCH 27/31] Tidy the indel changes up a bit. --- bam2bcf_indel.c | 109 +++++++++++++++++++++--------------------------- str_finder.c | 28 ++++++------- 2 files changed, 61 insertions(+), 76 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 38ada7831..82e875522 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -1,7 +1,7 @@ /* bam2bcf_indel.c -- indel caller. Copyright (C) 2010, 2011 Broad Institute. - Copyright (C) 2012-2014,2016-2017, 2021 Genome Research Ltd. + Copyright (C) 2012-2014,2016-2017, 2021-2022 Genome Research Ltd. 
Author: Heng Li @@ -283,7 +283,7 @@ static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp, if (sz == 0 || j-i >= bca->min_support // Note, doesn't handle bca->per_sample_flt yet - || bca->per_sample_flt + || bca->per_sample_flt || (double)(j-i) / n_tot >= bca->min_frac) types[t++] = sz; i = j-1; @@ -309,7 +309,7 @@ static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp, } // Increment ins["str"] and freq["str"] -#define NI 10 // number of alternative insertion sequences +#define NI 100 // number of alternative insertion sequences // Could use a hash table too, but expectation is a tiny number of alternatives typedef struct { char *str[NI]; @@ -410,6 +410,10 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, int (*ref_base)[6] = calloc(right - left + 1, sizeof(*ref_base)); str_freq *ref_ins = calloc(right - left + 1, sizeof(*ref_ins)); int i, j, k, s = sample; + char **cons = NULL; + + if (!cons_base || !cons_ins || !ref_base || !ref_ins) + goto err; //-------------------------------------------------- // Accumulate sequences into cons_base and cons_ins arrays @@ -479,9 +483,13 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, if (p->indel == type /*&& x == pos+1*/) { // Assume any ins of the same size is the same ins. // (This rescues misaligned insertions.) - bcf_cgp_append_cons(&cons_ins[x-left], ins, ilen, 1); + if (bcf_cgp_append_cons(&cons_ins[x-left], ins, + ilen, 1) < 0) + goto err; } else if (x != pos+1){ - bcf_cgp_append_cons(&ref_ins[x-left], ins, ilen, 1); + if (bcf_cgp_append_cons(&ref_ins[x-left], ins, + ilen, 1) < 0) + goto err; } } break; @@ -550,7 +558,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // When evaluating this particular indel, we don't want to // penalise alignments by SNP errors elsewhere. This can // happen when we have low depth for a particular 'type'. - // + // // So add in a little data from ref_base/ref_ins. 
double rfract = (r - t*2)*.75 / (r+1); @@ -582,9 +590,10 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, for (j = 0; j < NI; j++) { if (!ref_ins[i].str[j]) break; - bcf_cgp_append_cons(&cons_ins[i], - ref_ins[i].str[j], ref_ins[i].len[j], - rfract * ref_ins[i].freq[j]); + if (bcf_cgp_append_cons(&cons_ins[i], + ref_ins[i].str[j], ref_ins[i].len[j], + rfract * ref_ins[i].freq[j]) < 0) + goto err; } } @@ -604,7 +613,9 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } max_len += ins; } - char **cons = malloc((max_len+1)*2 + sizeof(char *)*2); + cons = malloc((max_len+1)*2 + sizeof(char *)*2); + if (!cons) + goto err; cons[0] = (char *)&cons[2]; cons[1] = cons[0] + max_len+1; @@ -802,7 +813,8 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, tcon_len[cnum] = k; } - // FIXME: replace by string pool for rapid tidying + // TODO: replace by io_lib's string pool for rapid tidying. + // For now this isn't the bottleneck though. for (i = 0; i < right-left; i++) { for (j = 0; j < NI; j++) { if (cons_ins[i].str[j]) @@ -812,6 +824,7 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, } } + err: free(cons_base); free(ref_base); free(cons_ins); @@ -841,7 +854,9 @@ static int bcf_cgp_l_run(const char *ref, int pos) { // Compute the insertion consensus for this sample 's' via a basic -// majority rule +// majority rule. +// +// TODO: merge this into bcf_cgp_consensus as another return value? static char *bcf_cgp_calc_ins_cons(int n, int *n_plp, bam_pileup1_t **plp, int pos, int *types, int n_types, int max_ins, int s) { @@ -962,25 +977,18 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, // Trim poly_Ns at ends of ref. // This helps to keep len(ref) and len(query) similar, to reduce // band size and reduce the chance of -ve BAQ scores. - - // FIXME Maybe instead of l>ABS(type) it should be l>query_len/2 ? 
- // TODO: no difference to result, but what difference is there to - // speed? Is this worth it? -#if 1 for (l = 0; l < tend1-tbeg && l < tend2-tbeg; l++) if (ref1[l + tbeg-left] != 4 || ref2[l + tbeg-left] != 4) break; - if (l > ABS(type)) { + if (l > ABS(type)) tbeg += l-ABS(type); - } for (l = tend1-tbeg-1; l >= 0; l--) if (ref1[l + tbeg-left] != 4) break; l = tend1-tbeg-1 - l; - if (l > ABS(type)) { + if (l > ABS(type)) tend1 -= l-ABS(type); - } for (l = tend2-tbeg-1; l >= 0; l--) if (ref2[l + tbeg-left] != 4) @@ -989,7 +997,6 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, if (l > ABS(type)) { tend2 -= l-ABS(type); } -#endif // Get segment of quality, either ZQ tag or if absent QUAL. if (!(qq = (uint8_t*) calloc(qend - qbeg, 1))) @@ -1003,37 +1010,14 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, if (qval < 7) qval = 7; qq[l - qbeg] = qval; - - // Skew qq at qpos to be higher than background and qq at - // other regions to be lower. This means the alignment of - // indel we are currently assessing takes precedence over - // alignment of flanking regions. - // - // Ins; type = +ve - // Ref AGCTAG---CTGA - // Qry AGCTAGGGGCTGA (qpos..qpos+type) - // - // Del; type = -ve - // Ref AGCTAGGGGCTGA - // Qry AGCTAG---CTGA (qpos..qpos) - -// // Tests over 1-47MB -// // shift8b FP/GT/FN = 290/296/2310 -// // develop = 264/326/2282 -// if (l >= qpos-2 && l <= qpos+2+(type>0?type:0)) -// //qq[l-qbeg] += 15; //qq2 = 282/312/2334 -// qq[l-qbeg] *= 1.5; //qq3 = 284/305/2326 -// //qq[l-qbeg] *= 0.75;//qq4 = 287/333/2347 -//// else -//// qq[l-qbeg] *= 0.67; // qq = 269/343/2413 (qq3 with else clause) } // The bottom 8 bits are length-normalised score while // the top bits are unnormalised. // // Try original cons and new cons and pick best. - // This doesn't removed FN much (infact maybe adds very slightly), - // but it does reduce GT errors and some slight reduction to FP. 
+ // This doesn't reduce FN much (infact maybe adds very slightly), + // but it does reduce GT errors and is a slight reduction to FP. sc2 = probaln_glocal(ref2 + tbeg - left, tend2 - tbeg, query, qend - qbeg, qq, &apf, 0, 0); @@ -1083,7 +1067,6 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, // used for adjusting indelQ below l = (int)((100. * sc2 / (qend - qbeg) + .499) * bca->indel_bias); *score = sc2<<8 | MIN(255, l); - //fprintf(stderr, "score = %d, qend-qbeg = %d, => adj score %d\n", sc, qend-qbeg, l); rep_ele *reps, *elt, *tmp; uint8_t *seg = ref2 + tbeg - left; @@ -1112,9 +1095,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, iscore += (elt->end-elt->start) / elt->rep_len; // c if (elt->start+tbeg <= r_start || elt->end+tbeg >= r_end) { - //iscore += 2*(elt->end-elt->start); //h5 (STR2) - //iscore += 4*(elt->end-elt->start); //h5STR4 - iscore += (elt->end-elt->start); //h5STR1 + iscore += (elt->end-elt->start); } } @@ -1191,9 +1172,10 @@ static int bcf_cgp_compute_indelQ(int n, int *n_plp, bam_pileup1_t **plp, if (indelQ > seqQ) indelQ = seqQ; if (indelQ > 255) indelQ = 255; if (seqQ > 255) seqQ = 255; - p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; // use 22 bits in total - sumq[sc[0]&0x3f] += indelQ < seqQ? 
indelQ : seqQ; // FIXME: redunctant; always indelQ -// fprintf(stderr, " read=%d:%d name=%s call=%d indelQ=%d seqQ=%d\n", s, i, bam_get_qname(p->b), types[sc[0]&0x3f], indelQ, seqQ); + + // use 22 bits in total + p->aux = (sc[0]&0x3f)<<16 | seqQ<<8 | indelQ; + sumq[sc[0]&0x3f] += indelQ; } } // determine bca->indel_types[] and bca->inscns @@ -1299,7 +1281,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // construct the consensus sequence (minus indels, which are added later) if (max_ins > 0) { - // FIXME: replace filling inscns[] with calc_consensus return + // TODO: replace filling inscns[] with calc_consensus return // so the merges of the insertion consensus for type[t] is // reported directly. (It may need adjustment to avoid N) inscns = bcf_cgp_calc_ins_cons(n, n_plp, plp, pos, @@ -1349,7 +1331,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, int tcon_len[2]; int cpos_pos; tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, - left, right, s, types[t], biggest_del, + left, right, s, types[t], biggest_del, &left_shift, &right_shift, &band, tcon_len, &cpos_pos); #ifdef CONS_DEBUG @@ -1370,6 +1352,8 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, #endif // Scan for base-runs in the insertion. + // We use this to avoid over-correction in est_seqQ when the + // insertion is not part of the neighbouring homopolymer. 
int k = tcons[0][cpos_pos], j; for (j = 0; j < types[t]; j++) if (tcons[0][cpos_pos+j] != k) @@ -1437,16 +1421,17 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, { rep_ele *reps, *elt, *tmp; reps = find_STR(tcons[0], tcon_len[0], 0); - int max_str = 0, tot_str = 0; + //int max_str = 0; + int tot_str = 0; DL_FOREACH_SAFE(reps, elt, tmp) { - if (max_str < elt->end - elt->start) - max_str = elt->end - elt->start; + // if (max_str < elt->end - elt->start) + // max_str = elt->end - elt->start; tot_str += elt->end - elt->start; DL_DELETE(reps, elt); free(elt); } - // Max_str should be enough, but it's still not + // Ideally max_str should be enough, but it's still not // sufficient in longer range some repeats. //min_win_size += max_str; min_win_size += tot_str; @@ -1476,11 +1461,11 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, // entire left/right region, so this also returns the // equivalent genomic coords for qbeg/qend in tbeg/tend. qbeg = tpos2qpos(&p->b->core, bam_get_cigar(p->b), - left2/*+biggest_ins*/, 0, &tbeg); + left2, 0, &tbeg); qpos = tpos2qpos(&p->b->core, bam_get_cigar(p->b), pos, 0, &tend) - qbeg; qend = tpos2qpos(&p->b->core, bam_get_cigar(p->b), - right2/*-biggest_ins*/, 1, &tend); + right2, 1, &tend); int old_tend = tend; int old_tbeg = tbeg; diff --git a/str_finder.c b/str_finder.c index 79bd31c35..ebf26561d 100644 --- a/str_finder.c +++ b/str_finder.c @@ -184,33 +184,33 @@ rep_ele *find_STR(char *cons, int len, int lower_only) { w <<= 2; w |= cons[i]; //printf("%3d %c w=%08x\n", i, cons[i], w); - if ((w&0xfffffff) == ((w>>28)&0xfffffff)) + if ((w&0xfffffff) == ((w>>28)&0xfffffff)) add_rep(&reps, cons, len, i, 14, lower_only, w); - else if ((w&0x3ffffff) == ((w>>26)&0x3ffffff)) + else if ((w&0x3ffffff) == ((w>>26)&0x3ffffff)) add_rep(&reps, cons, len, i, 13, lower_only, w); - else if ((w&0x0ffffff) == ((w>>24)&0x0ffffff)) + else if ((w&0x0ffffff) == ((w>>24)&0x0ffffff)) add_rep(&reps, cons, len, 
i, 12, lower_only, w); - else if ((w&0x03fffff) == ((w>>22)&0x03fffff)) + else if ((w&0x03fffff) == ((w>>22)&0x03fffff)) add_rep(&reps, cons, len, i, 11, lower_only, w); - else if ((w&0x00fffff) == ((w>>20)&0x00fffff)) + else if ((w&0x00fffff) == ((w>>20)&0x00fffff)) add_rep(&reps, cons, len, i, 10, lower_only, w); - else if ((w&0x003ffff) == ((w>>18)&0x003ffff)) + else if ((w&0x003ffff) == ((w>>18)&0x003ffff)) add_rep(&reps, cons, len, i, 9, lower_only, w); - else if ((w&0xffff) == ((w>>16)&0xffff)) + else if ((w&0xffff) == ((w>>16)&0xffff)) add_rep(&reps, cons, len, i, 8, lower_only, w); - else if ((w&0x3fff) == ((w>>14)&0x3fff)) + else if ((w&0x3fff) == ((w>>14)&0x3fff)) add_rep(&reps, cons, len, i, 7, lower_only, w); - else if ((w&0x0fff) == ((w>>12)&0x0fff)) + else if ((w&0x0fff) == ((w>>12)&0x0fff)) add_rep(&reps, cons, len, i, 6, lower_only, w); - else if ((w&0x03ff) == ((w>>10)&0x03ff)) + else if ((w&0x03ff) == ((w>>10)&0x03ff)) add_rep(&reps, cons, len, i, 5, lower_only, w); - else if ((w&0x00ff) == ((w>> 8)&0x00ff)) + else if ((w&0x00ff) == ((w>> 8)&0x00ff)) add_rep(&reps, cons, len, i, 4, lower_only, w); - else if ((w&0x003f) == ((w>> 6)&0x003f)) + else if ((w&0x003f) == ((w>> 6)&0x003f)) add_rep(&reps, cons, len, i, 3, lower_only, w); - else if ((w&0x000f) == ((w>> 4)&0x000f)) + else if ((w&0x000f) == ((w>> 4)&0x000f)) add_rep(&reps, cons, len, i, 2, lower_only, w); - else if ((w&0x0003) == ((w>> 2)&0x0003)) + else if ((w&0x0003) == ((w>> 2)&0x0003)) add_rep(&reps, cons, len, i, 1, lower_only, w); } From 9de001b70000d65ce8943ad68af54d1b837a3788 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Tue, 15 Mar 2022 17:00:20 +0000 Subject: [PATCH 28/31] Fix test mpileup files --- test/mpileup/indel-AD.1.out | 8 ++++---- test/mpileup/mpileup.2.out | 2 +- test/mpileup/mpileup.4.out | 2 +- test/mpileup/mpileup.5.out | 2 +- test/mpileup/mpileup.6.out | 2 +- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/mpileup/indel-AD.1.out 
b/test/mpileup/indel-AD.1.out index 9d785f963..7e0ac2a13 100644 --- a/test/mpileup/indel-AD.1.out +++ b/test/mpileup/indel-AD.1.out @@ -166,9 +166,9 @@ 000000F 535 . G A,<*> 0 . DP=125;I16=65,52,0,1,4309,171791,12,144,7020,421200,60,3600,2679,64385,25,625;QS=0.997223,0.00277713,0;SGB=-0.379885;RPBZ=-1.14518;MQBZ=0;MQSBZ=0;BQBZ=-1.74874;SCBZ=0.0762539;FS=0;MQ0F=0 PL:AD 0,255,255,255,255,255:117,1,0 000000F 536 . T G,A,<*> 0 . DP=125;I16=65,51,0,2,4274,171298,24,288,6960,417600,120,7200,2661,64041,48,1154;QS=0.994416,0.002792,0.002792,0;VDB=0.1;SGB=-0.453602;RPBZ=-1.69957;MQBZ=0;MQSBZ=0;BQBZ=-2.39373;SCBZ=-0.714873;FS=0;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:116,1,1,0 000000F 537 . A <*> 0 . DP=125;I16=65,53,0,0,4390,175290,0,0,7080,424800,0,0,2713,65375,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,255,255:118,0 -000000F 537 . AC A 0 . INDEL;IDV=60;IMF=0.48;DP=125;I16=37,25,30,27,2480,99200,2280,91200,3720,223200,3420,205200,1399,33485,1298,31150;QS=0.260049,0.739951;VDB=0.0363603;SGB=-0.693147;RPBZ=-0.543708;MQBZ=0;MQSBZ=0;SCBZ=0.641853;FS=0;MQ0F=0 PL:AD 255,0,27:62,57 +000000F 537 . AC A 0 . INDEL;IDV=60;IMF=0.48;DP=125;I16=37,25,31,27,2480,99200,2320,92800,3720,223200,3480,208800,1399,33485,1313,31375;QS=0.386704,0.613296;VDB=0.0486419;SGB=-0.693147;RPBZ=-0.543708;MQBZ=0;MQSBZ=0;SCBZ=0.641853;FS=0;MQ0F=0 PL:AD 255,0,193:62,58 000000F 538 . C <*> 0 . DP=65;I16=36,26,0,0,2195,86349,0,0,3720,223200,0,0,1432,34600,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,187,255:62,0 -000000F 538 . CT C 0 . INDEL;IDV=64;IMF=0.512;DP=125;I16=30,26,36,25,2240,89600,2440,97600,3360,201600,3660,219600,1279,30735,1380,33214;QS=0.218762,0.781238;VDB=0.0165787;SGB=-0.693147;RPBZ=0.242079;MQBZ=0;MQSBZ=0;SCBZ=-0.994262;FS=0;MQ0F=0 PL:AD 255,0,27:56,61 +000000F 538 . CT C 0 . 
INDEL;IDV=64;IMF=0.512;DP=125;I16=31,27,37,26,2320,92800,2520,100800,3480,208800,3780,226800,1318,31556,1422,34128;QS=0.360065,0.639935;VDB=0.0467386;SGB=-0.693147;RPBZ=0.242079;MQBZ=0;MQSBZ=0;SCBZ=-0.994262;FS=0;MQ0F=0 PL:AD 255,0,195:58,62 000000F 539 . T <*> 0 . DP=60;I16=29,26,0,0,2120,86238,0,0,3300,198000,0,0,1260,30374,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,166,255:55,0 000000F 540 . G <*> 0 . DP=124;I16=64,53,0,0,4130,161310,0,0,7020,421200,0,0,2703,65511,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,255,255:117,0 000000F 541 . G <*> 0 . DP=124;I16=64,53,0,0,4143,160525,0,0,7020,421200,0,0,2705,65703,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,255,255:117,0 @@ -286,11 +286,11 @@ 000000F 653 . A <*> 0 . DP=21;I16=9,12,0,0,804,31130,0,0,1260,75600,0,0,303,5555,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,63,255:21,0 000000F 654 . A <*> 0 . DP=21;I16=9,12,0,0,659,22061,0,0,1260,75600,0,0,285,5117,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,63,255:21,0 000000F 655 . C <*> 0 . DP=21;I16=9,12,0,0,664,22342,0,0,1260,75600,0,0,266,4666,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,63,255:21,0 -000000F 655 . CACAATACAA CACAA 0 . INDEL;IDV=6;IMF=0.285714;DP=21;I16=0,2,5,1,240,28800,720,86400,120,7200,360,21600,46,1060,100,1788;QS=0.0977444,0.902256;VDB=0.00018837;SGB=-0.616816;RPBZ=-2.81289;MQBZ=0;MQSBZ=0;SCBZ=-2.52262;FS=0;MQ0F=0 PL:AD 159,0,2:2,6 +000000F 655 . CACAATACAA CACAA 0 . INDEL;IDV=6;IMF=0.285714;DP=21;I16=0,0,5,1,0,0,720,86400,0,0,360,21600,0,0,100,1788;QS=0,1;VDB=0.00211394;SGB=-0.616816;RPBZ=-2.81289;MQBZ=0;MQSBZ=0;SCBZ=-2.52262;FS=0;MQ0F=0 PL:AD 179,18,0:0,6 000000F 656 . A <*> 0 . DP=11;I16=4,7,0,0,404,15690,0,0,660,39600,0,0,141,2411,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,33,255:11,0 000000F 657 . C <*> 0 . DP=11;I16=4,7,0,0,413,15607,0,0,660,39600,0,0,131,2189,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,33,255:11,0 000000F 658 . A <*> 0 . DP=10;I16=3,7,0,0,121,1651,0,0,600,36000,0,0,122,1986,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,30,79:10,0 -000000F 658 . AA AAATTA 0 . 
INDEL;IDV=2;IMF=0.125;DP=16;I16=2,1,1,2,300,30000,300,30000,180,10800,180,10800,59,1205,50,902;QS=0.31694,0.68306;VDB=0.00155977;SGB=-0.511536;RPBZ=-1.70026;MQBZ=0;MQSBZ=0;SCBZ=-1.35678;FS=0;MQ0F=0 PL:AD 99,0,38:3,3 +000000F 658 . AA AAATTA 0 . INDEL;IDV=2;IMF=0.125;DP=16;I16=2,1,1,2,300,30000,300,30000,180,10800,180,10800,59,1205,50,902;QS=0.227273,0.772727;VDB=0.00155977;SGB=-0.511536;RPBZ=-1.70026;MQBZ=0;MQSBZ=0;SCBZ=-1.35678;FS=0;MQ0F=0 PL:AD 51,0,3:3,2 000000F 659 . A <*> 0 . DP=10;I16=3,5,0,0,86,1088,0,0,480,28800,0,0,75,1077,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,24,63:8,0 000000F 660 . T <*> 0 . DP=2;I16=0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;QS=0,0;FS=0;MQ0F=0 PL:AD 0,0,0:0,0 000000F 661 . A <*> 0 . DP=8;I16=0,2,0,0,8,32,0,0,120,7200,0,0,26,340,0,0;QS=1,0;FS=0;MQ0F=0 PL:AD 0,6,7:2,0 diff --git a/test/mpileup/mpileup.2.out b/test/mpileup/mpileup.2.out index 649046db6..fecae4050 100644 --- a/test/mpileup/mpileup.2.out +++ b/test/mpileup/mpileup.2.out @@ -224,7 +224,7 @@ 17 300 . A <*> 0 . DP=27;I16=11,15,0,0,1001,39455,0,0,1258,66538,0,0,469,10437,0,0;QS=3,0;MQSBZ=-3.34898;FS=0;MQ0F=0 PL:DP:DV 0,33,255:11:0 0,24,204:8:0 0,21,210:7:0 17 301 . G <*> 0 . DP=25;I16=10,14,0,0,928,36116,0,0,1169,62097,0,0,476,10632,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:DV 0,30,255:10:0 0,21,195:7:0 0,21,196:7:0 17 302 . T <*> 0 . DP=25;I16=10,14,0,0,879,32885,0,0,1169,62097,0,0,483,10849,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:DV 0,30,231:10:0 0,21,172:7:0 0,21,202:7:0 -17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.539485,2.46052;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV 161,0,99:11:6 158,0,14:7:6 201,21,0:7:7 +17 302 . T TA 0 . 
INDEL;IDV=7;IMF=1;DP=25;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.515744,2.48426;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV 167,0,95:11:6 158,0,11:7:6 201,21,0:7:7 17 303 . G <*> 0 . DP=25;I16=10,15,0,0,968,37972,0,0,1229,65697,0,0,497,11181,0,0;QS=3,0;MQSBZ=-2.97044;FS=0;MQ0F=0 PL:DP:DV 0,33,255:11:0 0,21,197:7:0 0,21,195:7:0 17 304 . C <*> 0 . DP=27;I16=11,16,0,0,991,37005,0,0,1318,70138,0,0,503,11359,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:DV 0,33,255:11:0 0,24,206:8:0 0,24,200:8:0 17 305 . C <*> 0 . DP=27;I16=11,16,0,0,1057,41761,0,0,1318,70138,0,0,510,11508,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:DV 0,33,255:11:0 0,24,213:8:0 0,24,211:8:0 diff --git a/test/mpileup/mpileup.4.out b/test/mpileup/mpileup.4.out index d7366fae6..6f7ac492f 100644 --- a/test/mpileup/mpileup.4.out +++ b/test/mpileup/mpileup.4.out @@ -228,7 +228,7 @@ 17 300 . A <*> 0 . DP=27;DPR=26,0;I16=11,15,0,0,1001,39455,0,0,1258,66538,0,0,469,10437,0,0;QS=3,0;MQSBZ=-3.34898;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,33,255:11:0:0:6,5,0,0:11,0 0,24,204:8:0:0:3,5,0,0:8,0 0,21,210:7:0:0:2,5,0,0:7,0 17 301 . G <*> 0 . DP=25;DPR=24,0;I16=10,14,0,0,928,36116,0,0,1169,62097,0,0,476,10632,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,30,255:10:0:0:5,5,0,0:10,0 0,21,195:7:0:0:3,4,0,0:7,0 0,21,196:7:0:0:2,5,0,0:7,0 17 302 . T <*> 0 . DP=25;DPR=24,0;I16=10,14,0,0,879,32885,0,0,1169,62097,0,0,483,10849,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,30,231:10:0:0:5,5,0,0:10,0 0,21,172:7:0:0:3,4,0,0:7,0 0,21,202:7:0:0:2,5,0,0:7,0 -17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;DPR=6,19;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.539485,2.46052;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 161,0,99:11:6:6:1,4,4,2:5,6 158,0,14:7:6:0:1,0,2,4:1,6 201,21,0:7:7:0:0,0,2,5:0,7 +17 302 . T TA 0 . 
INDEL;IDV=7;IMF=1;DP=25;DPR=6,19;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.515744,2.48426;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 167,0,95:11:6:6:1,4,4,2:5,6 158,0,11:7:6:0:1,0,2,4:1,6 201,21,0:7:7:0:0,0,2,5:0,7 17 303 . G <*> 0 . DP=25;DPR=25,0;I16=10,15,0,0,968,37972,0,0,1229,65697,0,0,497,11181,0,0;QS=3,0;MQSBZ=-2.97044;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,33,255:11:0:0:5,6,0,0:11,0 0,21,197:7:0:0:3,4,0,0:7,0 0,21,195:7:0:0:2,5,0,0:7,0 17 304 . C <*> 0 . DP=27;DPR=27,0;I16=11,16,0,0,991,37005,0,0,1318,70138,0,0,503,11359,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,33,255:11:0:0:5,6,0,0:11,0 0,24,206:8:0:0:4,4,0,0:8,0 0,24,200:8:0:0:2,6,0,0:8,0 17 305 . C <*> 0 . DP=27;DPR=27,0;I16=11,16,0,0,1057,41761,0,0,1318,70138,0,0,510,11508,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,33,255:11:0:0:5,6,0,0:11,0 0,24,213:8:0:0:4,4,0,0:8,0 0,24,211:8:0:0:2,6,0,0:8,0 diff --git a/test/mpileup/mpileup.5.out b/test/mpileup/mpileup.5.out index c72170dbd..964466088 100644 --- a/test/mpileup/mpileup.5.out +++ b/test/mpileup/mpileup.5.out @@ -230,7 +230,7 @@ 17 300 . A <*> 0 . DP=27;ADF=11,0;ADR=15,0;AD=26,0;I16=11,15,0,0,1001,39455,0,0,1258,66538,0,0,469,10437,0,0;QS=3,0;MQSBZ=-3.34898;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,33,255:11:0:6,0:5,0:11,0 0,24,204:8:0:3,0:5,0:8,0 0,21,210:7:0:2,0:5,0:7,0 17 301 . G <*> 0 . DP=25;ADF=10,0;ADR=14,0;AD=24,0;I16=10,14,0,0,928,36116,0,0,1169,62097,0,0,476,10632,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,30,255:10:0:5,0:5,0:10,0 0,21,195:7:0:3,0:4,0:7,0 0,21,196:7:0:2,0:5,0:7,0 17 302 . T <*> 0 . DP=25;ADF=10,0;ADR=14,0;AD=24,0;I16=10,14,0,0,879,32885,0,0,1169,62097,0,0,483,10849,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,30,231:10:0:5,0:5,0:10,0 0,21,172:7:0:3,0:4,0:7,0 0,21,202:7:0:2,0:5,0:7,0 -17 302 . T TA 0 . 
INDEL;IDV=7;IMF=1;DP=25;ADF=2,8;ADR=4,11;AD=6,19;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.539485,2.46052;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 161,0,99:11:6:1,4:4,2:5,6 158,0,14:7:0:1,2:0,4:1,6 201,21,0:7:0:0,2:0,5:0,7 +17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;ADF=2,8;ADR=4,11;AD=6,19;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.515744,2.48426;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 167,0,95:11:6:1,4:4,2:5,6 158,0,11:7:0:1,2:0,4:1,6 201,21,0:7:0:0,2:0,5:0,7 17 303 . G <*> 0 . DP=25;ADF=10,0;ADR=15,0;AD=25,0;I16=10,15,0,0,968,37972,0,0,1229,65697,0,0,497,11181,0,0;QS=3,0;MQSBZ=-2.97044;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,33,255:11:0:5,0:6,0:11,0 0,21,197:7:0:3,0:4,0:7,0 0,21,195:7:0:2,0:5,0:7,0 17 304 . C <*> 0 . DP=27;ADF=11,0;ADR=16,0;AD=27,0;I16=11,16,0,0,991,37005,0,0,1318,70138,0,0,503,11359,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,33,255:11:0:5,0:6,0:11,0 0,24,206:8:0:4,0:4,0:8,0 0,24,200:8:0:2,0:6,0:8,0 17 305 . C <*> 0 . DP=27;ADF=11,0;ADR=16,0;AD=27,0;I16=11,16,0,0,1057,41761,0,0,1318,70138,0,0,510,11508,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,33,255:11:0:5,0:6,0:11,0 0,24,213:8:0:4,0:4,0:8,0 0,24,211:8:0:2,0:6,0:8,0 diff --git a/test/mpileup/mpileup.6.out b/test/mpileup/mpileup.6.out index 4337f6de6..3e43cb714 100644 --- a/test/mpileup/mpileup.6.out +++ b/test/mpileup/mpileup.6.out @@ -62,7 +62,7 @@ 17 283 . C <*> . . END=296;MinDP=5;QS=3,0 PL:DP 0,33,240:11 0,18,119:6 0,15,122:5 17 297 . C G,<*> 0 . DP=25;I16=9,15,1,0,901,34305,4,16,1138,59338,60,3600,445,9901,10,100;QS=2.98261,0.0173913,0;SGB=-0.556633;RPBZ=-1.24856;MQBZ=0.806872;MQSBZ=-3.22749;BQBZ=-1.67542;SCBZ=-0.368383;FS=0;MQ0F=0 PL:DP:DV 0,33,255,33,255,255:11:0 0,15,168,21,171,168:8:1 0,18,161,18,161,161:6:0 17 298 . A <*> . . 
END=301;MinDP=7;QS=3,0 PL:DP 0,30,231:10 0,21,172:7 0,21,189:7 -17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.539485,2.46052;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV 161,0,99:11:6 158,0,14:7:6 201,21,0:7:7 +17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.515744,2.48426;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV 167,0,95:11:6 158,0,11:7:6 201,21,0:7:7 17 303 . G <*> . . END=334;MinDP=7;QS=3,0 PL:DP 0,30,235:10 0,21,197:7 0,21,195:7 17 335 . A G,<*> 0 . DP=32;I16=13,18,1,0,1084,40336,4,16,1589,87297,60,3600,555,11943,0,0;QS=2.98919,0.0108108,0;SGB=-0.556633;RPBZ=-1.67921;MQBZ=0.622171;MQSBZ=-2.25492;BQBZ=-1.68602;SCBZ=-0.258065;FS=0;MQ0F=0 PL:DP:DV 0,33,252,33,252,252:11:0 0,27,219,27,219,219:9:0 0,25,245,33,248,245:12:1 17 336 . A <*> . . MinDP=9;QS=3,0 PL:DP 0,33,255:11 0,27,212:9 0,36,255:12 From 16834c7ff96c6cd65588a788fef6228d369bcb40 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 17 Mar 2022 14:47:04 +0000 Subject: [PATCH 29/31] Further improve the indel calling. - We erroneously had the -m and -F pre-filter options applied as needing either to pass instead of both to pass in bcf_cgp_find_types. Fixing this is a significant reduction to FP rates. - The computation of which STRs spanned the indel being assessed was sometimes a little bit out, due to using "qpos" instead of "pos". This removes a few false negatives. - Improve the calculation of indel cost ("iscore") by counting the number of additional repeat units beyond the end of the sequence. The old penalty for STRs at the end of reads has been removed, meaning we regain some FNs while still penalising (more) the cases that lead to FPs.
Overall there are some small FP / FN adjustments, but the biggest improvement here is a significant drop in GT errors (~10% fewer). --- bam2bcf_indel.c | 36 +++++++++++++++++++++++------------- test/mpileup/indel-AD.1.out | 6 +++--- test/mpileup/indel-AD.2.out | 2 +- test/mpileup/indel-AD.3.out | 2 +- test/mpileup/indel-AD.4.out | 2 +- test/mpileup/mpileup.2.out | 2 +- test/mpileup/mpileup.4.out | 2 +- test/mpileup/mpileup.5.out | 2 +- test/mpileup/mpileup.6.out | 2 +- 9 files changed, 33 insertions(+), 23 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 82e875522..7f4403d95 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -281,10 +281,10 @@ static int *bcf_cgp_find_types(int n, int *n_plp, bam_pileup1_t **plp, break; if (sz == 0 - || j-i >= bca->min_support - // Note, doesn't handle bca->per_sample_flt yet - || bca->per_sample_flt - || (double)(j-i) / n_tot >= bca->min_frac) + || (j-i >= bca->min_support && + // Note, doesn't handle bca->per_sample_flt yet + (bca->per_sample_flt + || (double)(j-i) / n_tot >= bca->min_frac))) types[t++] = sz; i = j-1; } @@ -519,7 +519,8 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, // secondary consensus produced with the other // deletion. We set a marker for how long to // skip adding to ref_base. - skip_to = x+len; + if (x > skip_to) + skip_to = x+len; } } break; @@ -952,7 +953,7 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, int tbeg, int tend1, int tend2, int left, int right, int qbeg, int qend, - int qpos, int max_deletion, + int pos, int qpos, int max_deletion, int *score) { // Illumina probaln_par_t apf = { 1e-4, 1e-2, 10 }; @@ -1091,12 +1092,21 @@ static int bcf_cgp_align_score(bam_pileup1_t *p, bcf_callaux_t *bca, // This is emphasised further if the sequence ends with // soft clipping.
DL_FOREACH_SAFE(reps, elt, tmp) { - if (elt->start <= qpos && elt->end >= qpos) { - iscore += (elt->end-elt->start) / elt->rep_len; // c - if (elt->start+tbeg <= r_start || - elt->end+tbeg >= r_end) { - iscore += (elt->end-elt->start); - } + int str_beg = elt->start+tbeg; + int str_end = elt->end+tbeg; + + + if (str_beg <= pos && str_end >= pos) { + // Overlaps indel region; num repeat units. + iscore += (elt->end-elt->start) / elt->rep_len; + } +#define STR_HALO2 (2+2*elt->rep_len) + if (str_beg <= pos+STR_HALO2 && str_end >= pos-STR_HALO2) { + // Worst: extends beyond read end by >= 1 repeat unit + if (str_beg <= r_start-elt->rep_len) + iscore += 10*(r_start - str_beg)/elt->rep_len; + if (str_end >= r_end+elt->rep_len) + iscore += 10*(elt->end+tbeg - r_end)/elt->rep_len; } DL_DELETE(reps, elt); @@ -1505,7 +1515,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, r_start, r_end, long_read, tbeg, tend1, tend2, left2, left + tcon_len[0], - qbeg, qend, qpos, -biggest_del, + qbeg, qend, pos,qpos, -biggest_del, &score[K*n_types + t]) < 0) { goto err; } diff --git a/test/mpileup/indel-AD.1.out b/test/mpileup/indel-AD.1.out index 7e0ac2a13..6b28b5264 100644 --- a/test/mpileup/indel-AD.1.out +++ b/test/mpileup/indel-AD.1.out @@ -166,9 +166,9 @@ 000000F 535 . G A,<*> 0 . DP=125;I16=65,52,0,1,4309,171791,12,144,7020,421200,60,3600,2679,64385,25,625;QS=0.997223,0.00277713,0;SGB=-0.379885;RPBZ=-1.14518;MQBZ=0;MQSBZ=0;BQBZ=-1.74874;SCBZ=0.0762539;FS=0;MQ0F=0 PL:AD 0,255,255,255,255,255:117,1,0 000000F 536 . T G,A,<*> 0 . DP=125;I16=65,51,0,2,4274,171298,24,288,6960,417600,120,7200,2661,64041,48,1154;QS=0.994416,0.002792,0.002792,0;VDB=0.1;SGB=-0.453602;RPBZ=-1.69957;MQBZ=0;MQSBZ=0;BQBZ=-2.39373;SCBZ=-0.714873;FS=0;MQ0F=0 PL:AD 0,255,255,255,255,255,255,255,255,255:116,1,1,0 000000F 537 . A <*> 0 . DP=125;I16=65,53,0,0,4390,175290,0,0,7080,424800,0,0,2713,65375,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,255,255:118,0 -000000F 537 . AC A 0 . 
INDEL;IDV=60;IMF=0.48;DP=125;I16=37,25,31,27,2480,99200,2320,92800,3720,223200,3480,208800,1399,33485,1313,31375;QS=0.386704,0.613296;VDB=0.0486419;SGB=-0.693147;RPBZ=-0.543708;MQBZ=0;MQSBZ=0;SCBZ=0.641853;FS=0;MQ0F=0 PL:AD 255,0,193:62,58 +000000F 537 . AC A 0 . INDEL;IDV=60;IMF=0.48;DP=125;I16=36,26,31,27,2480,99200,2320,92800,3720,223200,3480,208800,1405,33737,1313,31375;QS=0.385658,0.614342;VDB=0.0342923;SGB=-0.693147;RPBZ=-0.543708;MQBZ=0;MQSBZ=0;SCBZ=0.641853;FS=0;MQ0F=0 PL:AD 255,0,193:62,58 000000F 538 . C <*> 0 . DP=65;I16=36,26,0,0,2195,86349,0,0,3720,223200,0,0,1432,34600,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,187,255:62,0 -000000F 538 . CT C 0 . INDEL;IDV=64;IMF=0.512;DP=125;I16=31,27,37,26,2320,92800,2520,100800,3480,208800,3780,226800,1318,31556,1422,34128;QS=0.360065,0.639935;VDB=0.0467386;SGB=-0.693147;RPBZ=0.242079;MQBZ=0;MQSBZ=0;SCBZ=-0.994262;FS=0;MQ0F=0 PL:AD 255,0,195:58,62 +000000F 538 . CT C 0 . INDEL;IDV=64;IMF=0.512;DP=125;I16=31,27,36,26,2320,92800,2480,99200,3480,208800,3720,223200,1318,31556,1405,33839;QS=0.360065,0.639935;VDB=0.0289027;SGB=-0.693147;RPBZ=0.242079;MQBZ=0;MQSBZ=0;SCBZ=-0.994262;FS=0;MQ0F=0 PL:AD 255,0,195:58,62 000000F 539 . T <*> 0 . DP=60;I16=29,26,0,0,2120,86238,0,0,3300,198000,0,0,1260,30374,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,166,255:55,0 000000F 540 . G <*> 0 . DP=124;I16=64,53,0,0,4130,161310,0,0,7020,421200,0,0,2703,65511,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,255,255:117,0 000000F 541 . G <*> 0 . DP=124;I16=64,53,0,0,4143,160525,0,0,7020,421200,0,0,2705,65703,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,255,255:117,0 @@ -290,7 +290,7 @@ 000000F 656 . A <*> 0 . DP=11;I16=4,7,0,0,404,15690,0,0,660,39600,0,0,141,2411,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,33,255:11,0 000000F 657 . C <*> 0 . DP=11;I16=4,7,0,0,413,15607,0,0,660,39600,0,0,131,2189,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,33,255:11,0 000000F 658 . A <*> 0 . 
DP=10;I16=3,7,0,0,121,1651,0,0,600,36000,0,0,122,1986,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,30,79:10,0 -000000F 658 . AA AAATTA 0 . INDEL;IDV=2;IMF=0.125;DP=16;I16=2,1,1,2,300,30000,300,30000,180,10800,180,10800,59,1205,50,902;QS=0.227273,0.772727;VDB=0.00155977;SGB=-0.511536;RPBZ=-1.70026;MQBZ=0;MQSBZ=0;SCBZ=-1.35678;FS=0;MQ0F=0 PL:AD 51,0,3:3,2 +000000F 658 . AA AAATTA 0 . INDEL;IDV=2;IMF=0.125;DP=16;I16=2,1,3,2,300,30000,500,50000,180,10800,300,18000,59,1205,76,1240;QS=0.111111,0.888889;VDB=0.000108203;SGB=-0.590765;RPBZ=-1.70026;MQBZ=0;MQSBZ=0;SCBZ=-1.35678;FS=0;MQ0F=0 PL:AD 124,1,0:3,5 000000F 659 . A <*> 0 . DP=10;I16=3,5,0,0,86,1088,0,0,480,28800,0,0,75,1077,0,0;QS=1,0;MQSBZ=0;FS=0;MQ0F=0 PL:AD 0,24,63:8,0 000000F 660 . T <*> 0 . DP=2;I16=0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0;QS=0,0;FS=0;MQ0F=0 PL:AD 0,0,0:0,0 000000F 661 . A <*> 0 . DP=8;I16=0,2,0,0,8,32,0,0,120,7200,0,0,26,340,0,0;QS=1,0;FS=0;MQ0F=0 PL:AD 0,6,7:2,0 diff --git a/test/mpileup/indel-AD.2.out b/test/mpileup/indel-AD.2.out index d932c5bd9..9cde5d51e 100644 --- a/test/mpileup/indel-AD.2.out +++ b/test/mpileup/indel-AD.2.out @@ -21,4 +21,4 @@ ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample 11 75 . G <*> 0 . DP=68;I16=6,62,0,0,2437,87909,0,0,3770,217210,0,0,838,15940,0,0;QS=1,0;MQSBZ=0.140975;FS=0;MQ0F=0 PL:AD 0,205,255:68,0 -11 75 . GTAAAATAAAATAAAATAAAATAAA GTAAAATAAAATAAAATAAAATAAAATAAA 0 . INDEL;IDV=6;IMF=0.0882353;DP=68;I16=5,9,1,5,1680,201600,720,86400,840,50400,174,5046,244,5778,147,3609;QS=0.730233,0.269767;VDB=0.00674908;SGB=-0.616816;RPBZ=-3.24592;MQBZ=-6.13241;MQSBZ=0.140975;SCBZ=-0.546919;FS=0;MQ0F=0 PL:AD 83,0,244:14,6 +11 75 . GTAAAATAAAATAAAATAAAATAAA GTAAAATAAAATAAAATAAAATAAAATAAA 0 . 
INDEL;IDV=6;IMF=0.0882353;DP=68;I16=4,8,1,5,1440,172800,720,86400,720,43200,174,5046,244,5778,147,3609;QS=0.702055,0.297945;VDB=0.001602;SGB=-0.616816;RPBZ=-3.24592;MQBZ=-6.13241;MQSBZ=0.140975;SCBZ=-0.546919;FS=0;MQ0F=0 PL:AD 88,0,221:12,6 diff --git a/test/mpileup/indel-AD.3.out b/test/mpileup/indel-AD.3.out index 2dd65ab61..c19a1febf 100644 --- a/test/mpileup/indel-AD.3.out +++ b/test/mpileup/indel-AD.3.out @@ -21,4 +21,4 @@ ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample 11 75 . G <*> 0 . DP=68;I16=6,62,0,0,2437,87909,0,0,3770,217210,0,0,838,15940,0,0;QS=1,0;MQSBZ=0.140975;FS=0;MQ0F=0 PL:AD 0,205,255:68,0 -11 75 . GTAAAATAAAATAAAATAAAATAAA GTAAAATAAAATAAAATAAAATAAAATAAA 0 . INDEL;IDV=6;IMF=0.0882353;DP=68;I16=5,9,1,5,1680,201600,720,86400,840,50400,174,5046,244,5778,147,3609;QS=0.730233,0.269767;VDB=0.00674908;SGB=-0.616816;RPBZ=-3.24592;MQBZ=-6.13241;MQSBZ=0.140975;SCBZ=-0.546919;FS=0;MQ0F=0 PL:AD 83,0,244:45,23 +11 75 . GTAAAATAAAATAAAATAAAATAAA GTAAAATAAAATAAAATAAAATAAAATAAA 0 . INDEL;IDV=6;IMF=0.0882353;DP=68;I16=4,8,1,5,1440,172800,720,86400,720,43200,174,5046,244,5778,147,3609;QS=0.702055,0.297945;VDB=0.001602;SGB=-0.616816;RPBZ=-3.24592;MQBZ=-6.13241;MQSBZ=0.140975;SCBZ=-0.546919;FS=0;MQ0F=0 PL:AD 88,0,221:43,25 diff --git a/test/mpileup/indel-AD.4.out b/test/mpileup/indel-AD.4.out index 21fc59740..d118a5daf 100644 --- a/test/mpileup/indel-AD.4.out +++ b/test/mpileup/indel-AD.4.out @@ -21,4 +21,4 @@ ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT sample 11 75 . G <*> 0 . DP=68;I16=6,62,0,0,2437,87909,0,0,3770,217210,0,0,838,15940,0,0;QS=1,0;MQSBZ=0.140975;FS=0;MQ0F=0 PL:AD 0,205,255:68,0 -11 75 . GTAAAATAAAATAAAATAAAATAAA GTAAAATAAAATAAAATAAAATAAAATAAA 0 . INDEL;IDV=6;IMF=0.0882353;DP=68;I16=5,9,1,5,1680,201600,720,86400,840,50400,174,5046,244,5778,147,3609;QS=0.730233,0.269767;VDB=0.00674908;SGB=-0.616816;RPBZ=-3.24592;MQBZ=-6.13241;MQSBZ=0.140975;SCBZ=-0.546919;FS=0;MQ0F=0 PL:AD 83,0,244:62,6 +11 75 . 
GTAAAATAAAATAAAATAAAATAAA GTAAAATAAAATAAAATAAAATAAAATAAA 0 . INDEL;IDV=6;IMF=0.0882353;DP=68;I16=4,8,1,5,1440,172800,720,86400,720,43200,174,5046,244,5778,147,3609;QS=0.702055,0.297945;VDB=0.001602;SGB=-0.616816;RPBZ=-3.24592;MQBZ=-6.13241;MQSBZ=0.140975;SCBZ=-0.546919;FS=0;MQ0F=0 PL:AD 88,0,221:62,6 diff --git a/test/mpileup/mpileup.2.out b/test/mpileup/mpileup.2.out index fecae4050..1b363cf30 100644 --- a/test/mpileup/mpileup.2.out +++ b/test/mpileup/mpileup.2.out @@ -224,7 +224,7 @@ 17 300 . A <*> 0 . DP=27;I16=11,15,0,0,1001,39455,0,0,1258,66538,0,0,469,10437,0,0;QS=3,0;MQSBZ=-3.34898;FS=0;MQ0F=0 PL:DP:DV 0,33,255:11:0 0,24,204:8:0 0,21,210:7:0 17 301 . G <*> 0 . DP=25;I16=10,14,0,0,928,36116,0,0,1169,62097,0,0,476,10632,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:DV 0,30,255:10:0 0,21,195:7:0 0,21,196:7:0 17 302 . T <*> 0 . DP=25;I16=10,14,0,0,879,32885,0,0,1169,62097,0,0,483,10849,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:DV 0,30,231:10:0 0,21,172:7:0 0,21,202:7:0 -17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.515744,2.48426;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV 167,0,95:11:6 158,0,11:7:6 201,21,0:7:7 +17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.543141,2.45686;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV 157,0,98:11:6 158,0,14:7:6 201,21,0:7:7 17 303 . G <*> 0 . DP=25;I16=10,15,0,0,968,37972,0,0,1229,65697,0,0,497,11181,0,0;QS=3,0;MQSBZ=-2.97044;FS=0;MQ0F=0 PL:DP:DV 0,33,255:11:0 0,21,197:7:0 0,21,195:7:0 17 304 . C <*> 0 . DP=27;I16=11,16,0,0,991,37005,0,0,1318,70138,0,0,503,11359,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:DV 0,33,255:11:0 0,24,206:8:0 0,24,200:8:0 17 305 . C <*> 0 . 
DP=27;I16=11,16,0,0,1057,41761,0,0,1318,70138,0,0,510,11508,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:DV 0,33,255:11:0 0,24,213:8:0 0,24,211:8:0 diff --git a/test/mpileup/mpileup.4.out b/test/mpileup/mpileup.4.out index 6f7ac492f..c6ece56bf 100644 --- a/test/mpileup/mpileup.4.out +++ b/test/mpileup/mpileup.4.out @@ -228,7 +228,7 @@ 17 300 . A <*> 0 . DP=27;DPR=26,0;I16=11,15,0,0,1001,39455,0,0,1258,66538,0,0,469,10437,0,0;QS=3,0;MQSBZ=-3.34898;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,33,255:11:0:0:6,5,0,0:11,0 0,24,204:8:0:0:3,5,0,0:8,0 0,21,210:7:0:0:2,5,0,0:7,0 17 301 . G <*> 0 . DP=25;DPR=24,0;I16=10,14,0,0,928,36116,0,0,1169,62097,0,0,476,10632,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,30,255:10:0:0:5,5,0,0:10,0 0,21,195:7:0:0:3,4,0,0:7,0 0,21,196:7:0:0:2,5,0,0:7,0 17 302 . T <*> 0 . DP=25;DPR=24,0;I16=10,14,0,0,879,32885,0,0,1169,62097,0,0,483,10849,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,30,231:10:0:0:5,5,0,0:10,0 0,21,172:7:0:0:3,4,0,0:7,0 0,21,202:7:0:0:2,5,0,0:7,0 -17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;DPR=6,19;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.515744,2.48426;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 167,0,95:11:6:6:1,4,4,2:5,6 158,0,11:7:6:0:1,0,2,4:1,6 201,21,0:7:7:0:0,0,2,5:0,7 +17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;DPR=6,19;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.543141,2.45686;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 157,0,98:11:6:6:1,4,4,2:5,6 158,0,14:7:6:0:1,0,2,4:1,6 201,21,0:7:7:0:0,0,2,5:0,7 17 303 . G <*> 0 . DP=25;DPR=25,0;I16=10,15,0,0,968,37972,0,0,1229,65697,0,0,497,11181,0,0;QS=3,0;MQSBZ=-2.97044;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,33,255:11:0:0:5,6,0,0:11,0 0,21,197:7:0:0:3,4,0,0:7,0 0,21,195:7:0:0:2,5,0,0:7,0 17 304 . C <*> 0 . 
DP=27;DPR=27,0;I16=11,16,0,0,991,37005,0,0,1318,70138,0,0,503,11359,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,33,255:11:0:0:5,6,0,0:11,0 0,24,206:8:0:0:4,4,0,0:8,0 0,24,200:8:0:0:2,6,0,0:8,0 17 305 . C <*> 0 . DP=27;DPR=27,0;I16=11,16,0,0,1057,41761,0,0,1318,70138,0,0,510,11508,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:DV:SP:DP4:DPR 0,33,255:11:0:0:5,6,0,0:11,0 0,24,213:8:0:0:4,4,0,0:8,0 0,24,211:8:0:0:2,6,0,0:8,0 diff --git a/test/mpileup/mpileup.5.out b/test/mpileup/mpileup.5.out index 964466088..908c02be5 100644 --- a/test/mpileup/mpileup.5.out +++ b/test/mpileup/mpileup.5.out @@ -230,7 +230,7 @@ 17 300 . A <*> 0 . DP=27;ADF=11,0;ADR=15,0;AD=26,0;I16=11,15,0,0,1001,39455,0,0,1258,66538,0,0,469,10437,0,0;QS=3,0;MQSBZ=-3.34898;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,33,255:11:0:6,0:5,0:11,0 0,24,204:8:0:3,0:5,0:8,0 0,21,210:7:0:2,0:5,0:7,0 17 301 . G <*> 0 . DP=25;ADF=10,0;ADR=14,0;AD=24,0;I16=10,14,0,0,928,36116,0,0,1169,62097,0,0,476,10632,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,30,255:10:0:5,0:5,0:10,0 0,21,195:7:0:3,0:4,0:7,0 0,21,196:7:0:2,0:5,0:7,0 17 302 . T <*> 0 . DP=25;ADF=10,0;ADR=14,0;AD=24,0;I16=10,14,0,0,879,32885,0,0,1169,62097,0,0,483,10849,0,0;QS=3,0;MQSBZ=-3.10529;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,30,231:10:0:5,0:5,0:10,0 0,21,172:7:0:3,0:4,0:7,0 0,21,202:7:0:2,0:5,0:7,0 -17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;ADF=2,8;ADR=4,11;AD=6,19;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.515744,2.48426;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 167,0,95:11:6:1,4:4,2:5,6 158,0,11:7:0:1,2:0,4:1,6 201,21,0:7:0:0,2:0,5:0,7 +17 302 . T TA 0 . 
INDEL;IDV=7;IMF=1;DP=25;ADF=2,8;ADR=4,11;AD=6,19;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.543141,2.45686;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 157,0,98:11:6:1,4:4,2:5,6 158,0,14:7:0:1,2:0,4:1,6 201,21,0:7:0:0,2:0,5:0,7 17 303 . G <*> 0 . DP=25;ADF=10,0;ADR=15,0;AD=25,0;I16=10,15,0,0,968,37972,0,0,1229,65697,0,0,497,11181,0,0;QS=3,0;MQSBZ=-2.97044;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,33,255:11:0:5,0:6,0:11,0 0,21,197:7:0:3,0:4,0:7,0 0,21,195:7:0:2,0:5,0:7,0 17 304 . C <*> 0 . DP=27;ADF=11,0;ADR=16,0;AD=27,0;I16=11,16,0,0,991,37005,0,0,1318,70138,0,0,503,11359,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,33,255:11:0:5,0:6,0:11,0 0,24,206:8:0:4,0:4,0:8,0 0,24,200:8:0:2,0:6,0:8,0 17 305 . C <*> 0 . DP=27;ADF=11,0;ADR=16,0;AD=27,0;I16=11,16,0,0,1057,41761,0,0,1318,70138,0,0,510,11508,0,0;QS=3,0;MQSBZ=-2.39388;FS=0;MQ0F=0 PL:DP:SP:ADF:ADR:AD 0,33,255:11:0:5,0:6,0:11,0 0,24,213:8:0:4,0:4,0:8,0 0,24,211:8:0:2,0:6,0:8,0 diff --git a/test/mpileup/mpileup.6.out b/test/mpileup/mpileup.6.out index 3e43cb714..28d9abfb8 100644 --- a/test/mpileup/mpileup.6.out +++ b/test/mpileup/mpileup.6.out @@ -62,7 +62,7 @@ 17 283 . C <*> . . END=296;MinDP=5;QS=3,0 PL:DP 0,33,240:11 0,18,119:6 0,15,122:5 17 297 . C G,<*> 0 . DP=25;I16=9,15,1,0,901,34305,4,16,1138,59338,60,3600,445,9901,10,100;QS=2.98261,0.0173913,0;SGB=-0.556633;RPBZ=-1.24856;MQBZ=0.806872;MQSBZ=-3.22749;BQBZ=-1.67542;SCBZ=-0.368383;FS=0;MQ0F=0 PL:DP:DV 0,33,255,33,255,255:11:0 0,15,168,21,171,168:8:1 0,18,161,18,161,161:6:0 17 298 . A <*> . . END=301;MinDP=7;QS=3,0 PL:DP 0,30,231:10 0,21,172:7 0,21,189:7 -17 302 . T TA 0 . INDEL;IDV=7;IMF=1;DP=25;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.515744,2.48426;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV 167,0,95:11:6 158,0,11:7:6 201,21,0:7:7 +17 302 . T TA 0 . 
INDEL;IDV=7;IMF=1;DP=25;I16=2,4,8,11,240,9600,760,30400,236,10564,993,55133,109,2229,377,8629;QS=0.543141,2.45686;VDB=0.27613;SGB=-4.22417;RPBZ=1.11989;MQBZ=1.47646;MQSBZ=-3.10529;SCBZ=-0.268121;FS=0;MQ0F=0 PL:DP:DV 157,0,98:11:6 158,0,14:7:6 201,21,0:7:7 17 303 . G <*> . . END=334;MinDP=7;QS=3,0 PL:DP 0,30,235:10 0,21,197:7 0,21,195:7 17 335 . A G,<*> 0 . DP=32;I16=13,18,1,0,1084,40336,4,16,1589,87297,60,3600,555,11943,0,0;QS=2.98919,0.0108108,0;SGB=-0.556633;RPBZ=-1.67921;MQBZ=0.622171;MQSBZ=-2.25492;BQBZ=-1.68602;SCBZ=-0.258065;FS=0;MQ0F=0 PL:DP:DV 0,33,252,33,252,252:11:0 0,27,219,27,219,219:9:0 0,25,245,33,248,245:12:1 17 336 . A <*> . . MinDP=9;QS=3,0 PL:DP 0,33,255:11 0,27,212:9 0,36,255:12 From 155e36ea0dcb1f8c550d0fca31029c2575f220a8 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Thu, 17 Mar 2022 17:23:45 +0000 Subject: [PATCH 30/31] Add bounds checking to heti/hetd arrays. We already had this elsewhere, but forgot this case. It trips up on GIAB HG002.GRCh38.PacBio_CCS_15Kb.bam at chr1:112149167 which has a 13KB deletion. --- bam2bcf_indel.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 7f4403d95..493dd2989 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -748,7 +748,8 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, ? 1 : (max_v_ins > .3*tot_sum ? -1:0); } else { - het_ins = (heti[i] == -1); // HET but uncalled before + // HET but uncalled before + het_ins = i < 1024 ? (heti[i] == -1) : 0; } } @@ -787,7 +788,8 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, : (cons_base[i][5] >= .3 * tot ? -1 : 0); } } else { - het_del = (hetd[i] == -1); // HET del uncalled on cnum 0 + // HET del uncalled on cnum 0 + het_del = i < 1024 ? 
(hetd[i] == -1) : 0; if (max_j == 5 && het_del == 0) { max_v = max_v2; max_j = max_j2; From 72c0f76aa696a6d777f9e86591e8de295aeeec58 Mon Sep 17 00:00:00 2001 From: James Bonfield Date: Wed, 23 Mar 2022 12:45:58 +0000 Subject: [PATCH 31/31] Add a check for end of reference when indel calling. This fixes a read buffer-overrun, although I'm baffled how we didn't hit this with the original code as it's never apparently known where the reference ends. --- bam2bcf.h | 2 +- bam2bcf_indel.c | 14 +++++++++----- mpileup.c | 2 +- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/bam2bcf.h b/bam2bcf.h index e778b8952..2ffca4c6a 100644 --- a/bam2bcf.h +++ b/bam2bcf.h @@ -156,7 +156,7 @@ extern "C" { int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int ref_base /*4-bit*/, bcf_call_t *call); int bcf_call2bcf(bcf_call_t *bc, bcf1_t *b, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref); - int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref); + int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, int ref_len); void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call); #ifdef __cplusplus diff --git a/bam2bcf_indel.c b/bam2bcf_indel.c index 493dd2989..cd5e7956b 100644 --- a/bam2bcf_indel.c +++ b/bam2bcf_indel.c @@ -375,7 +375,7 @@ static int bcf_cgp_append_cons(str_freq *sf, char *str, int len, int freq) { */ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, int pos, bcf_callaux_t *bca, const char *ref, - int left, int right, + int ref_len, int left, int right, int sample, int type, int biggest_del, int *left_shift, int *right_shift, int *band, int *tcon_len, int *cpos_pos) { @@ -808,8 +808,12 @@ static char **bcf_cgp_consensus(int n, int *n_plp, bam_pileup1_t **plp, cons[cnum][k++] = max_j; // "ACGTN*" else if (max_v > 0) cons[cnum][k++] = 4; // 'N'; - else - cons[cnum][k] = 
base6[(uint8_t)ref[left+k]], k++; + else { + cons[cnum][k] = left+k < ref_len + ? base6[(uint8_t)ref[left+k]] + : 4; + k++; + } } } @@ -1246,7 +1250,7 @@ specific sample? Needs to check bca->per_sample_flt (--per-sample-mF) opt. - 8: indel quality .. aux&0xff */ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, - bcf_callaux_t *bca, const char *ref) + bcf_callaux_t *bca, const char *ref, int ref_len) { if (ref == 0 || bca == 0) return -1; @@ -1342,7 +1346,7 @@ int bcf_call_gap_prep(int n, int *n_plp, bam_pileup1_t **plp, int pos, int left_shift, right_shift; int tcon_len[2]; int cpos_pos; - tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, + tcons = bcf_cgp_consensus(n, n_plp, plp, pos, bca, ref, ref_len, left, right, s, types[t], biggest_del, &left_shift, &right_shift, &band, tcon_len, &cpos_pos); diff --git a/mpileup.c b/mpileup.c index 33e832b0f..35c9b2117 100644 --- a/mpileup.c +++ b/mpileup.c @@ -575,7 +575,7 @@ static int mpileup_reg(mplp_conf_t *conf, uint32_t beg, uint32_t end) // check me: rghash in bcf_call_gap_prep() should have no effect, reads mplp_func already excludes them if (!(conf->flag&MPLP_NO_INDEL) && total_depth < conf->max_indel_depth && (bcf_callaux_clean(conf->bca, &conf->bc), - bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref) >= 0)) + bcf_call_gap_prep(conf->gplp->n, conf->gplp->n_plp, conf->gplp->plp, pos, conf->bca, ref, ref_len) >= 0)) { for (i = 0; i < conf->gplp->n; ++i) bcf_call_glfgen(conf->gplp->n_plp[i], conf->gplp->plp[i], -1, conf->bca, conf->bcr + i);