diff --git a/test/merge.10.a.vcf b/test/merge.10.a.vcf new file mode 100644 index 000000000..2b9be9c0b --- /dev/null +++ b/test/merge.10.a.vcf @@ -0,0 +1,9 @@ +##fileformat=VCFv4.1 +##contig= +##reference=file:///ref.fa +##INFO= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A +1 10 . C G . . AN=4;AC=2 GT 0/1 +1 12 . T A . . AN=4;AC=2 GT 0/1 diff --git a/test/merge.10.b.vcf b/test/merge.10.b.vcf new file mode 100644 index 000000000..b8b853394 --- /dev/null +++ b/test/merge.10.b.vcf @@ -0,0 +1,8 @@ +##fileformat=VCFv4.1 +##contig= +##reference=file:///ref.fa +##INFO= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT B +1 10 . CGT CGA,AGT . . AN=4;AC=1 GT 1/2 diff --git a/test/merge.10.none.out b/test/merge.10.none.out new file mode 100644 index 000000000..773da6351 --- /dev/null +++ b/test/merge.10.none.out @@ -0,0 +1,11 @@ +##fileformat=VCFv4.1 +##FILTER= +##contig= +##reference=file:///ref.fa +##INFO= +##INFO= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT A B +1 10 . C G . . AN=2;AC=1 GT 0/1 ./. +1 10 . CGT CGA,AGT . . AN=2;AC=1,1 GT ./. 1/2 +1 12 . T A . . AN=2;AC=1 GT 0/1 ./. diff --git a/test/test.pl b/test/test.pl index 6fb1c66c4..21b0762d7 100755 --- a/test/test.pl +++ b/test/test.pl @@ -97,6 +97,7 @@ test_vcf_merge($opts,in=>['merge.8.a','merge.8.b'],out=>'merge.8.out',args=>'-i AN:sum,AC:sum'); test_vcf_merge($opts,in=>['merge.9.a','merge.9.b'],out=>'merge.9.1.out',args=>''); test_vcf_merge($opts,in=>['merge.9.a','merge.9.b'],out=>'merge.9.2.out',args=>'-i AN:sum,AC:sum'); +test_vcf_merge($opts,in=>['merge.10.a','merge.10.b'],out=>'merge.10.none.out',args=>'-m none'); # test_vcf_merge_big($opts,in=>'merge_big.1',out=>'merge_big.1.1',nsmpl=>79000,nfiles=>79,nalts=>486,args=>''); # commented out for speed test_vcf_query($opts,in=>'query.string',out=>'query.string.1.out',args=>q[-f '%CHROM\\t%POS\\t%CLNREVSTAT\\n' -i'CLNREVSTAT="criteria_provided,_conflicting_interpretations"']); test_vcf_query($opts,in=>'query.string',out=>'query.string.1.out',args=>q[-f '%CHROM\\t%POS\\t%CLNREVSTAT\\n' -i'CLNREVSTAT="criteria_provided" || CLNREVSTAT="_conflicting_interpretations"']); diff --git a/vcfmerge.c b/vcfmerge.c index 60a80c9fa..528294cf4 100644 --- a/vcfmerge.c +++ b/vcfmerge.c @@ -149,7 +149,7 @@ typedef struct maux_t *maux; regidx_t *regs; // apply regions only after the blocks are expanded regitr_t *regs_itr; - int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref, no_index; + int header_only, collapse, output_type, force_samples, merge_by_id, do_gvcf, filter_logic, missing_to_ref, no_index, non_normalize_alleles; char *header_fname, *output_fname, *regions_list, *info_rules, *file_list; faidx_t *gvcf_fai; info_rule_t *rules; @@ -973,7 +973,7 @@ void merge_chrom2qual(args_t *args, bcf1_t *out) for (i=0; inals; i++) if ( i==0 || al_idxs[i] ) ma->out_als[k++] = strdup(ma->als[i]); assert( k==ma->nout_als ); - normalize_alleles(ma->out_als, ma->nout_als); + if (args->non_normalize_alleles != 1) normalize_alleles(ma->out_als, ma->nout_als); bcf_update_alleles(out_hdr, out, (const char**) ma->out_als, ma->nout_als); free(al_idxs); for (i=0; inout_als; i++) free(ma->out_als[i]); @@ -2799,7 +2799,7 @@ int can_merge(args_t *args) // All alleles of the tested record must be present in the // selected maux record plus variant types must be the same if ( (maux->var_types & line_type) != line_type ) continue; - if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) < 0 ) continue; // refs not compatible + if ( vcmp_set_ref(args->vcmp,maux->als[0],line->d.allele[0]) <= 0 ) continue; // refs not perfect match for (k=1; kn_allele; k++) { if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,line->d.allele[k])>=0 ) break; @@ -3134,6 +3134,7 @@ static void usage(void) fprintf(stderr, " -R, --regions-file FILE Restrict to regions listed in a file\n"); fprintf(stderr, " --regions-overlap 0|1|2 Include if POS in the region (0), record overlaps (1), variant overlaps (2) [1]\n"); fprintf(stderr, " --threads INT Use multithreading with worker threads [0]\n"); + fprintf(stderr, " -N, --non_normalize_alleles Do not normalize_alleles\n"); fprintf(stderr, "\n"); exit(1); } @@ -3150,6 +3151,7 @@ int main_vcfmerge(int argc, char *argv[]) args->record_cmd_line = 1; args->collapse = COLLAPSE_BOTH; args->clevel = -1; + args->non_normalize_alleles = 0; int regions_is_file = 0; int regions_overlap = 1; @@ -3175,11 +3177,13 @@ int main_vcfmerge(int argc, char *argv[]) {"no-version",no_argument,NULL,8}, {"no-index",no_argument,NULL,10}, {"filter-logic",required_argument,NULL,'F'}, + {"non_normalize_alleles",no_argument,NULL,'N'}, {NULL,0,NULL,0} }; char *tmp; - while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:",loptions,NULL)) >= 0) { + while ((c = getopt_long(argc, argv, "hm:f:r:R:o:O:i:l:g:F:0L:N",loptions,NULL)) >= 0) { switch (c) { + case 'N': args->non_normalize_alleles = 1; break; case 'L': args->local_alleles = strtol(optarg,&tmp,10); if ( *tmp ) error("Could not parse argument: --local-alleles %s\n", optarg); diff --git a/vcmp.c b/vcmp.c index dbdc4b7ac..c1d2daf33 100644 --- a/vcmp.c +++ b/vcmp.c @@ -58,7 +58,7 @@ int vcmp_set_ref(vcmp_t *vcmp, char *ref1, char *ref2) char *a = ref1, *b = ref2; while ( *a && *b && toupper(*a)==toupper(*b) ) { a++; b++; } - if ( !*a && !*b ) return 0; + if ( !*a && !*b ) return 1; // perfect match if ( *a && *b ) return -1; // refs not compatible int i; @@ -70,18 +70,19 @@ int vcmp_set_ref(vcmp_t *vcmp, char *ref1, char *ref2) hts_expand(char,vcmp->ndref+1,vcmp->mdref,vcmp->dref); for (i=0; indref; i++) vcmp->dref[i] = toupper(ref1[vcmp->nmatch+i]); vcmp->dref[vcmp->ndref] = 0; - return 0; + return 0; // compatible + } else if ( *b ) { // ref2 is longer + vcmp->nmatch = a-ref1; + while ( *b ) b++; + vcmp->ndref = (b-ref2) - vcmp->nmatch; + hts_expand(char,vcmp->ndref+1,vcmp->mdref,vcmp->dref); + for (i=0; indref; i++) vcmp->dref[i] = toupper(ref2[vcmp->nmatch+i]); + vcmp->dref[vcmp->ndref] = 0; + vcmp->ndref *= -1; + return 0; // compatible + } else { + return -1; // should not happen } - - // ref2 is longer - vcmp->nmatch = a-ref1; - while ( *b ) b++; - vcmp->ndref = (b-ref2) - vcmp->nmatch; - hts_expand(char,vcmp->ndref+1,vcmp->mdref,vcmp->dref); - for (i=0; indref; i++) vcmp->dref[i] = toupper(ref2[vcmp->nmatch+i]); - vcmp->dref[vcmp->ndref] = 0; - vcmp->ndref *= -1; - return 0; } int vcmp_find_allele(vcmp_t *vcmp, char **als1, int nals1, char *al2)