diff --git a/AssemblyAnalysis.snakefile b/AssemblyAnalysis.snakefile
index 34f59d9..64db819 100644
--- a/AssemblyAnalysis.snakefile
+++ b/AssemblyAnalysis.snakefile
@@ -27,13 +27,20 @@ asms=list(config["assemblies"].keys())
 
 # Index assemblies by name and haplotype
 asmFiles={}
+asmToSample={}
+allAssemblies=[]
 for asm in config["assemblies"]:
     asmFiles[asm] = {}
+    allAssemblies.append(asm)
     asmFiles[asm]["assemb"] = {}
     asmFiles[asm]["assemb"]["1"] = config["assemblies"][asm][0]
     asmFiles[asm]["assemb"]["2"] = config["assemblies"][asm][1]
     asmFiles[asm]["readType"] = config["assemblies"][asm][2]
+    asmFiles[asm]["sample"] = config["assemblies"][asm][3]
+    asmToSample[asm] = config["assemblies"][asm][3]
+
+samples=config["samples"]
 
 methods=["lra", "mm2"]
 ops=["ins", "del"]
 cats=["missed", "found"]
@@ -41,12 +48,12 @@ cats +=["missed_slop", "found_slop"]
 checks=["passed","failed"]
 flags=["1", "2", "3"] #Current number of filtering flags
 
-localrules:IndexAssembly,FindAvgCov,MakeBed,SplitBed,GetVariantSupport,FlagCentromere,FlagUnsupported,FlagHighCoverage,CollectFlagStats,CollectRunStats,CombineRunStats,CombineFlagStats,CompareToSVSet,CombineVariantSupport,CombineChroms,ReadVCFToBed,ClusterBed,ClusterDiploid
+localrules:IndexAssembly,FindAvgCov,SplitBed,FlagCentromere,FlagUnsupported,CollectFlagStats,CollectRunStats,CombineRunStats,CombineFlagStats,CompareToSVSet,CombineVariantSupport,CombineChroms,ReadVCFToBed,ClusterDiploid,ClusterBed
 
 rule all:
     input:
-        readVCF=expand("from_reads.{method}.vcf",method=methods),
-        readBed=expand("from_reads.{method}.{op}.bed",method=methods,op=ops),
+        readVCF=expand("from_reads.{sample}.{method}.vcf",sample=samples, method=methods),
+        readBed=expand("from_reads.{sample}.{method}.{op}.bed",sample=samples, method=methods,op=ops),
         aligned=expand("{asm}.{hap}/{method}/aln.bam",asm=asms,hap=haps,method=methods),
         asmBed=expand("{asm}.{hap}/{method}/variants.sv.bed",asm=asms,hap=haps,method=methods),
         splitBed=expand("{asm}.{hap}/{method}/variants.sv.{op}.bed",asm=asms,hap=haps,method=methods,op=ops),
@@ -56,7 +63,10 @@ rule all:
         clustDip=expand("{asm}_dip/{method}/variants.clustered.bed",asm=asms,method=methods),
         comparison=expand("{asm}_dip/{method}/variants.sv.{op}.{cat}.bed",asm=asms,method=methods,op=ops+["hap"],cat=cats),
         filtered=expand("{asm}.{hap}/{method}/sv.{check}.bed",asm=asms,hap=haps,method=methods,check=checks),
-        findCov=expand("coverage/{method}.txt",method=methods),
+        false_call=expand("{asm}.{hap}/{method}/sv.fp.bed",asm=asms,hap=haps,method=methods),
+        false_call_tr=expand("{asm}.{hap}/{method}/sv.fp.bed.tr",asm=asms,hap=haps,method=methods),
+        false_call_sd=expand("{asm}.{hap}/{method}/sv.fp.bed.sd",asm=asms,hap=haps,method=methods),
+        findCov=expand("coverage/{sample}.txt",sample=samples),
         statsAsm=expand("{asm}.{hap}/{method}/stats.txt",asm=asms,hap=haps,method=methods),
         statsAsmFlags=expand("{asm}.{hap}/{method}/stats_flag.txt",asm=asms,hap=haps,method=methods),
         statsAll="run_stats_all.txt",
@@ -68,6 +78,9 @@ rule IndexAssembly:
         asm=lambda wildcards: asmFiles[wildcards.asm]["assemb"][wildcards.hap]
     output:
         index="indices/{asm}.{hap}.assembly.fasta.fai"
+    params:
+        grid_opts=config["grid_small"],
+        node_constraint=""
     shell:"""
 mkdir -p indices
 ln -sfn {input.asm} indices/{wildcards.asm}.{wildcards.hap}.assembly.fasta && samtools faidx indices/{wildcards.asm}.{wildcards.hap}.assembly.fasta && rm -f indices/{wildcards.asm}.{wildcards.hap}.assembly.fasta
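Editorial note: the hunk above extends each config["assemblies"] entry with a fourth element naming the sample whose reads validate that assembly. A minimal, stand-alone sketch of the config shape the loop now expects — every name and path below is a hypothetical placeholder, not taken from the repository:

# Minimal sketch of the extended config parsed by the indexing loop above.
# Assembly names, file paths, and sample names are hypothetical.
config = {
    "samples": ["HG002"],
    "assemblies": {
        # name : [hap1 fasta, hap2 fasta, read type, sample]
        "hifiasm": ["asm/hifiasm.h1.fa", "asm/hifiasm.h2.fa", "CCS", "HG002"],
    },
}

asmFiles, asmToSample, allAssemblies = {}, {}, []
for asm in config["assemblies"]:
    hap1, hap2, readType, sample = config["assemblies"][asm]
    allAssemblies.append(asm)
    asmFiles[asm] = {"assemb": {"1": hap1, "2": hap2},
                     "readType": readType, "sample": sample}
    asmToSample[asm] = sample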
@@ -75,37 +88,64 @@ ln -sfn {input.asm} indices/{wildcards.asm}.{wildcards.hap}.assembly.fasta && sa
 
 rule FindAvgCov:
     input:
-        reads=lambda wildcards: config["bam"][wildcards.method]
+        bamcov=lambda wildcards: config["bamcov"][wildcards.sample]
     output:
-        cov="coverage/{method}.txt"
+        cov="coverage/{sample}.txt",
+    params:
+        grid_opts=config["grid_small"],
+        sd=SD,
+        node_constraint=""
     shell:"""
 mkdir -p coverage
-samtools coverage -H {input.reads} > {output.cov}
-"""
+bedtools intersect -v -a {input.bamcov} -b /home/cmb-16/mjc/shared/references/hg38/regions/cytobands/LowComplexity.bed | cut -f 4 | {params.sd}/stats > {output.cov}
+
+# Use hack to get past long coverage call
+"""
+# {params.sd}/stats
+#samtools coverage -H {input.reads} > {output.cov}
 
 #
 #Step 1: Map contigs
 #
+
+def GetConstraint(method):
+    if method == "mm2":
+        return " --constraint=\"[E5-2640v3]\""
+    else:
+        return ""
+
+def GetMemoryConstraint(method):
+    if method == "lra":
+        return 64
+    else:
+        return 32
+
 rule MapFasta:
     input:
         asm=lambda wildcards: asmFiles[wildcards.asm]["assemb"][wildcards.hap]
     output:
-        bam="{asm}.{hap}/{method}/aln.bam"
+        bam=protected("{asm}.{hap}/{method}/aln.bam")
     resources:
-        threads=4
+        mem_gb=lambda wildcards, attempt: GetMemoryConstraint(wildcards.method) + (attempt-1)*16
     params:
         grid_opts=config["grid_memory"],
         readType=lambda wildcards: asmFiles[wildcards.asm]["readType"],
-        ref=config["ref"]
+        ref=config["ref"],
+        node_constraint=lambda wildcards: GetConstraint(wildcards.method)
     shell:"""
+if [ ! -e {output.bam} ]; then
+mkdir -p {wildcards.asm}.{wildcards.hap}/{wildcards.method}
 if [ {wildcards.method} == mm2 ] ; then \
-    minimap2 -t 4 -ax asm10 --cs {params.ref} {input.asm} | samtools view -h -b - ; \
+    minimap2 -t 8 -ax asm10 --cs {params.ref} {input.asm} ; \
 elif [ {wildcards.method} == lra ] ; then \
-    lra align -{params.readType} -t 4 {params.ref} {input.asm} -p s ; fi | samtools sort -@2 -T $TMPDIR/$$ -o {output.bam}
+    /home/cmb-16/mjc/mchaisso/projects/LRA/lra align -t 4 {params.ref} {input.asm} -p s ; fi | samtools sort -@2 -T $TMPDIR/$$ -m2G -o {output.bam}
 samtools index {output.bam}
+else
+exit 0
+fi
 """
 
 #
@@ -117,19 +157,23 @@ rule MakeBed:
     output:
         bed="{asm}.{hap}/{method}/variants.sv.bed"
     params:
+        grid_opts=config["grid_small"],
         ref=config["ref"],
-        sd=SD
+        sd=SD,
+        node_constraint=""
+    resources:
+        mem_gb=12
     shell:"""
 samtools view -h {input.bam} | \
 if [ {wildcards.method} == mm2 ] ; then \
-    paftools.js sam2paf - | sort -k6,6 -k8,8n | paftools.js call -f {params.ref} - | \
-    grep -v "^#" | awk '{{diff=length($4)-length($5); if (diff <= -50 || diff >= 50) {{ \
+    paftools.js sam2paf -L - | sort -k6,6 -k8,8n | paftools.js call -f {params.ref} - | \
+    grep -v "^#" | awk 'BEGIN{{ OFS="\\t";}} {{diff=length($4)-length($5); if (diff <= -50 || diff >= 50) {{ \
     if (diff <= 0) {{oper="insertion"; diff*= -1}} else {{oper="deletion"}}; \
-    print $1,$2,$2+diff,oper,diff,$4,$5}} }}' OFS="\t"; \
+    print $1"\\t"$2"\\t"$2+diff"\\t"oper"\\t"diff"\\t"$4"\\t"$5"\\t0"}} }}' ; \
 elif [ {wildcards.method} == lra ] ; then \
-    {params.sd}/PrintGaps.py {params.ref} /dev/stdin | tail -n +2 | cut -f1-7 ; fi |\
-    awk -F " " '!keep[$1,$2,$3,$4]++' OFS="\t" | bedtools sort > {output.bed}
+    {params.sd}/PrintGaps.py {params.ref} /dev/stdin --minAlignmentLength 50000 --printLength | tail -n +2 | cut -f1-7,11 ; fi | \
+    awk '!keep[$1,$2,$3,$4]++' OFS="\\t" | bedtools sort > {output.bed}
 """
 
 #
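Editorial note: the mm2 branch above derives SV calls from the REF/ALT allele lengths that paftools.js call emits. A hedged, stand-alone restatement of that awk size/type logic in Python (this is an illustration, not the pipeline's code path):

# Sketch of the awk classification above: REF in field 4, ALT in field 5;
# length difference of at least 50 bp is called as an SV.
def classify(ref_allele, alt_allele, min_len=50):
    diff = len(ref_allele) - len(alt_allele)
    if abs(diff) < min_len:
        return None  # below the SV size cutoff
    return ("insertion", -diff) if diff < 0 else ("deletion", diff)

assert classify("A", "A" + "T" * 60) == ("insertion", 60)
assert classify("A" * 80, "A") == ("deletion", 79)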
@@ -140,10 +184,15 @@ rule SplitBed:
         bed="{asm}.{hap}/{meth}/variants.sv.bed"
     output:
         split=expand("{{asm}}.{{hap}}/{{meth}}/variants.sv.{op}.bed",op=ops)
+    params:
+        grid_opts=config["grid_small"],
+        node_constraint=""
+    resources:
+        mem_gb=4
     shell:"""
 beds[0]={output.split[0]} ; beds[1]={output.split[1]}
 oper=(insertion deletion) ; for op in {{0..1}} ; do egrep "^#|${{oper[$op]}}" {input.bed} | \
-    awk 'function max(x, y) {{return length(x) > length(y) ? x: y}} {{print $1,$2,$3,$4,$5,max($6,$7)}}' OFS="\t" > ${{beds[$op]}}; done
+    awk 'function max(x, y) {{return length(x) > length(y) ? x: y}} {{print $1,$2,$3,$4,$5,max($6,$7),$8}}' OFS="\\t" > ${{beds[$op]}}; done
 """
 
 #
@@ -154,8 +203,13 @@ rule ClusterBed:
         bed="{asm}.{hap}/{method}/variants.sv.bed"
     output:
         clustBed="{asm}.{hap}/{method}/variants.clusters.bed"
+    params:
+        grid_opts=config["grid_small"],
+        node_constraint=""
+    resources:
+        mem_gb=4
     shell:"""
-bedtools cluster -d 500 -i {input.bed} | bedtools groupby -g 8 -c 1,2,3,4,5,6,7,8 -o first,min,max,collapse,collapse,collapse,collapse,count | cut -f 2- | \
+bedtools cluster -d 500 -i {input.bed} | /home/cmb-16/mjc/mchaisso/software/bedtools2/bin/bedtools groupby -g 8 -c 1,2,3,4,5,6,7,8 -o first,min,max,collapse,collapse,collapse,collapse,count | cut -f 2- | \
     awk '{{ if (index($4,"deletion") > 0 && index($4,"insertion") > 0) {{ $4="locus";}} else {{ if (index($4,"deletion") == 0) {{ $4="insertion"; }} else {{ $4="deletion";}} }} print; }}' | tr " " "\\t" > {output.clustBed}
 """
 
@@ -164,8 +218,10 @@ rule ClusterDiploid:
         clustBed=expand("{{asm}}.{hap}/{{method}}/variants.clusters.bed",hap=haps)
     output:
         dip="{asm}_dip/{method}/variants.clustered.bed",
+    resources:
+        mem_gb=1
     shell:"""
-cat {input.clustBed} | bedtools sort | bedtools cluster -d 500 | bedtools groupby -c 9 -g 9 -o count -full > {output.dip}
+cat {input.clustBed} | bedtools sort | bedtools cluster -d 500 | /home/cmb-16/mjc/mchaisso/software/bedtools2/bin/bedtools groupby -c 9 -g 9 -o count -full > {output.dip}
 """
 
 rule CompareToSVSet:
@@ -178,7 +234,11 @@
         missed_slop="{asm}_dip/{method}/variants.sv.{op}.missed_slop.bed",
         found_slop="{asm}_dip/{method}/variants.sv.{op}.found_slop.bed"
     params:
-        ref=config["ref"]
+        ref=config["ref"],
+        grid_opts=config["grid_small"],
+        node_constraint=""
+    resources:
+        mem_gb=4
     shell:"""
 
 bedtools intersect -v -a {input.set} -b {input.bed} > {output.missed}
@@ -193,26 +253,38 @@ bedtools slop -i {input.bed} -b 1000 -g {params.ref}.fai | bedtools intersect -u
 #
 rule CreateChromVCF:
     input:
-        reads=lambda wildcards: config["bam"][wildcards.method]
+        reads=lambda wildcards: config["bam"][wildcards.sample][wildcards.method]
     output:
-        vcf="gaps/{method}.{chrom}.gaps.vcf"
+        vcf="gaps/{sample}.{method}.{chrom}.gaps.vcf"
+    wildcard_constraints:
+        sample="|".join(samples),
+        method="lra|mm2"
    resources:
-        threads=16
+        threads=16,
+        mem_gb=4
     params:
         grid_opts=config["grid_large"],
         ref=config["ref"],
-        sample=config["sample"],
-        sd=SD
+        sd=SD,
+        node_constraint=""
     shell:"""
 mkdir -p gaps
-{params.sd}/SamToVCF.py --ref {params.ref} --sample {params.sample} --sam {input.reads} --minLength 25 --chrom {wildcards.chrom} > {output.vcf}
+{params.sd}/SamToVCF.py --ref {params.ref} --sample {wildcards.sample} --sam {input.reads} --minLength 25 --chrom {wildcards.chrom} > {output.vcf}
 """
 
 rule CombineChroms:
     input:
-        vcfs=expand("gaps/{{method}}.{chrom}.gaps.vcf", chrom=allChroms)
+        vcfs=expand("gaps/{{sample}}.{{method}}.{chrom}.gaps.vcf", chrom=allChroms)
     output:
-        gapVcf="from_reads.{method}.vcf"
+        gapVcf="from_reads.{sample}.{method}.vcf"
+    wildcard_constraints:
+        sample="|".join(samples),
+        method="lra|mm2"
+    params:
+        grid_opts=config["grid_small"],
+        node_constraint=""
+    resources:
+        mem_gb=1
     shell:"""
 grep "^#" {input.vcfs[0]} > {output.gapVcf}
 cat {input.vcfs} | grep -v "^#" >> {output.gapVcf}
@@ -220,11 +292,18 @@ cat {input.vcfs} | grep -v "^#" >> {output.gapVcf}
 
 rule ReadVCFToBed:
     input:
-        vcf="from_reads.{method}.vcf"
+        vcf="from_reads.{sample}.{method}.vcf"
     output:
-        bed=expand("from_reads.{{method}}.{op}.bed",op=ops)
+        bed=expand("from_reads.{{sample}}.{{method}}.{op}.bed",op=ops)
+    wildcard_constraints:
+        sample="|".join(samples),
+        method="lra|mm2"
     params:
-        sd=SD
+        grid_opts=config["grid_small"],
+        sd=SD,
+        node_constraint=""
+    resources:
+        mem_gb=1
     shell:"""
 {params.sd}/VcfToSplitBed.sh {input.vcf} {output.bed[1]} {output.bed[0]}
 """
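Editorial note: the CreateChromVCF constraint originally read `sample="^.+"`; since the wildcard does not sit at the start of the output path ("gaps/" precedes it), the anchored `^` could never match, so it has been changed to the same `"|".join(samples)` form the neighboring rules use. Because `from_reads.{sample}.{method}.vcf` packs two wildcards into one dot-separated name, pinning each wildcard to its known values is what keeps the parse unambiguous. A rough stand-alone illustration of the idea (Snakemake's internals differ, but constrained wildcards compile to named regex groups along these lines):

import re

# Rough illustration only; sample names here are hypothetical.
samples = ["HG002", "HG00514"]
pattern = re.compile(
    r"from_reads\.(?P<sample>" + "|".join(map(re.escape, samples)) +
    r")\.(?P<method>lra|mm2)\.vcf")

m = pattern.fullmatch("from_reads.HG00514.mm2.vcf")
assert m and m.group("sample") == "HG00514"
assert m.group("method") == "mm2"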
@@ -235,17 +314,26 @@ rule ReadVCFToBed:
 rule GetVariantSupport:
     input:
         bed="{asm}.{hap}/{method}/variants.sv.{op}.bed",
-        sup="from_reads.{method}.{op}.bed"
+        sup=lambda wildcards: "from_reads." + asmToSample[wildcards.asm]+".{meth}.{op}.bed"
+    wildcard_constraints:
+        asm="|".join(allAssemblies),
+        method="lra|mm2",
+        hap="|".join(haps),
+        op="|".join(ops)
     output:
         sup="{asm}.{hap}/{method}/variants.sv.{op}.bed.{meth}.support"
     params:
         grid_opts=config["grid_medium"],
         ref=config["ref"],
-        sd=SD
+        sd=SD,
+        node_constraint=""
+    resources:
+# This needs a gargantuan amount of memory from a bug in groupby
+        mem_gb=lambda wildcards, attempt: 16,
     shell:"""
 bedtools slop -g {params.ref}.fai -b 1000 -i {input.sup} | \
-    bedtools intersect -loj -a {input.bed} -b stdin | \
-    bedtools groupby -c 11 -o collapse -full | \
+    bedtools intersect -sorted -loj -a {input.bed} -b stdin | \
+    /home/cmb-16/mjc/mchaisso/software/bedtools2/bin/bedtools groupby -c 12 -o collapse -full | \
     {params.sd}/GetSVSupport.py > {output.sup}
 """
 
@@ -254,8 +342,17 @@ rule CombineVariantSupport:
         meth=expand("{{asm}}.{{hap}}/{{method}}/variants.sv.{{op}}.bed.{meth}.support", meth=methods),
     output:
         comb="{asm}.{hap}/{method}/variants.sv.{op}.bed.support.combined"
+    wildcard_constraints:
+        asm="|".join(allAssemblies),
+        method="lra|mm2",
+        hap="|".join(haps)
+    params:
+        grid_opts=config["grid_small"],
+        node_constraint=""
+    resources:
+        mem_gb=1
     shell:"""
-paste {input.meth[0]} <( cut -f 7 {input.meth[1]} ) > {output.comb}
+paste {input.meth[0]} <( cut -f 6 {input.meth[1]} ) > {output.comb}
 """
 
 #
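Editorial note: the Flag* rules below all share one convention — `bedtools intersect -c` appends an overlap count to each record, and an awk one-liner converts it into a trailing 0/1 flag that CheckFlags later sums. A hedged stand-alone sketch of that convention (illustration only; field names are generic):

# Sketch of the 0/1 flag convention used by the Flag* rules below:
# the overlap count appended by `bedtools intersect -c` becomes a flag.
def add_flag(fields):
    flag = "1" if int(fields[-1]) > 0 else "0"
    return fields + [flag]

assert add_flag(["chr1", "100", "250", "insertion", "150", "2"])[-1] == "1"
assert add_flag(["chr1", "500", "560", "deletion", "60", "0"])[-1] == "0"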
@@ -266,39 +363,62 @@ rule FlagCentromere:
         sup=expand("{{asm}}.{{hap}}/{{meth}}/variants.sv.{op}.bed.support.combined",op=ops)
     output:
         euchr="{asm}.{hap}/{meth}/sv.flag.1.bed"
+    wildcard_constraints:
+        meth="lra|mm2",
+        hap="|".join(haps)
     params:
-        grid_opts=config["grid_small"]
+        grid_opts=config["grid_small"],
+        node_constraint=""
+    resources:
+        mem_gb=4
     shell:"""
-
-bedtools intersect -c -a <(cat {input.sup}) -b /home/cmb-16/mjc/shared/references/hg38/regions/cytobands/hg38.heterochromatic.bed | awk '{{if ($9 > 0) print $0,1 ; else print $0,0}}' OFS="\t" > {output.euchr}
+bedtools intersect -c -a <(cat {input.sup}) -b /home/cmb-16/mjc/shared/references/hg38/regions/cytobands/LowComplexity.bed | awk '{{if ($8 > 0) print $0,1 ; else print $0,0}}' OFS="\\t" > {output.euchr}
 """
 
 rule FlagHighCoverage:
     input:
         sup=expand("{{asm}}.{{hap}}/{{meth}}/variants.sv.{op}.bed.support.combined",op=ops),
-        reads=lambda wildcards: config["bam"][wildcards.meth],
-        cov="coverage/{meth}.txt",
+        cov=lambda wildcards: "coverage/"+asmFiles[wildcards.asm]["sample"]+".txt",
+        bamcov=lambda wildcards: config["bamcov"][asmToSample[wildcards.asm]],
+        reads=lambda wildcards: config["bam"][asmToSample[wildcards.asm]][wildcards.meth],
     output:
         hcov="{asm}.{hap}/{meth}/sv.flag.2.bed"
+    wildcard_constraints:
+        meth="lra|mm2",
+        hap="|".join(haps)
     params:
-        grid_opts=config["grid_small"]
+        grid_opts=config["grid_small"],
+        node_constraint=""
+    resources:
+        mem_gb=20
     shell:"""
+#
+#Removed to make way for quick hack on input coverage.
+#avgCov=$(cat {input.cov} | awk '{{ total+=$7 }} END {{ print total/NR }}')
+#
+avgCov=`cut -f 1 {input.cov}`
+which bedtools
-avgCov=$(cat {input.cov} | awk '{{ total+=$7 }} END {{ print total/NR }}')
+bedtools intersect -loj -a <(cat {input.sup}) -b {input.bamcov} | /home/cmb-16/mjc/shared/software_packages/bedtools2/bin/bedtools groupby -g 1,2,3 -c 11 -o mean -full | awk -v avg=$avgCov 'BEGIN{{OFS="\\t";}} {{ if ($NF > 2*avg) print $1,$2,$3,$4,$5,$6,$7,1; else print $1,$2,$3,$4,$5,$6,$7,0; }}' > {output.hcov}
-samtools bedcov <(cat {input.sup}) {input.reads} | awk -v avg=$avgCov '{{if ($9/$5 > 2*avg) print $0,1 ; else print $0,0}}' OFS="\t" > {output.hcov}
 """
+##samtools bedcov <(cat {input.sup}) {input.reads} | awk -v avg=$avgCov '{{if ($9/$5 > 2*avg) print $0,1 ; else print $0,0}}' OFS="\t" > {output.hcov}
 
 rule FlagUnsupported:
     input:
         sup=expand("{{asm}}.{{hap}}/{{meth}}/variants.sv.{op}.bed.support.combined",op=ops)
     output:
         unsup="{asm}.{hap}/{meth}/sv.flag.3.bed"
+    wildcard_constraints:
+        meth="lra|mm2",
+        hap="|".join(haps)
     params:
-        grid_opts=config["grid_small"]
+        grid_opts=config["grid_small"],
+        node_constraint=""
+    resources:
+        mem_gb=1
     shell:"""
-
-cat {input.sup} | awk '{{if ($7 < 3 && $8 < 3) print $0,1 ; else print $0,0}}' OFS="\t" > {output.unsup}
+cat {input.sup} | awk '{{if ($7 < 3 && $8 < 3) print $0,1 ; else print $0,0}}' OFS="\\t" > {output.unsup}
 """
 
 rule CheckFlags:
@@ -306,15 +426,54 @@
     input:
         flagged=expand("{{asm}}.{{hap}}/{{meth}}/sv.flag.{flag}.bed",flag=flags)
     output:
         filtered=expand("{{asm}}.{{hap}}/{{meth}}/sv.{check}.bed",check=checks)
+    wildcard_constraints:
+        meth="lra|mm2",
+        hap="|".join(haps)
     params:
-        grid_opts=config["grid_small"]
+        grid_opts=config["grid_small"],
+        node_constraint=""
+    resources:
+        mem_gb=1
     shell:"""
 touch {output.filtered}
-paste {input.flagged[0]} <( cut -f 10 {input.flagged[1]} ) <( cut -f 9 {input.flagged[2]} ) | awk -v asm="{wildcards.asm}" -v meth="{wildcards.meth}" -v hap="{wildcards.hap}" '{{if ($10+$11+$12 == 0) print $1,$2,$3,asm"."hap"/"meth"/"$4,$5,$6 > "{output.filtered[0]}" ; else print $1,$2,$3,asm"."hap"/"meth"/"$4,$5,$6,$10,$11,$12 > "{output.filtered[1]}"; }}' OFS="\t"
+paste {input.flagged[0]} <( cut -f 8 {input.flagged[1]} ) <( cut -f 8 {input.flagged[2]} ) | awk -v asm="{wildcards.asm}" -v meth="{wildcards.meth}" -v hap="{wildcards.hap}" '{{if ($9+$10+$11 == 0) print $1,$2,$3,asm"."hap"/"meth"/"$4,$5,$6 > "{output.filtered[0]}" ; else print $1,$2,$3,asm"."hap"/"meth"/"$4,$5,$6,$9,$10,$11 > "{output.filtered[1]}"; }}' OFS="\\t"
 """
+
+rule TallyFP:
+    input:
+        filtered="{asm}.{hap}/{meth}/sv.failed.bed",
+    output:
+        fp="{asm}.{hap}/{meth}/sv.fp.bed",
+    params:
+        grid_opts=config["grid_small"],
+        node_constraint=""
+    resources:
+        mem_gb=1
+    shell:"""
+cat {input.filtered} | awk '{{ if ($7==0 && $8 == 0 && $9 == 1) print; }}' > {output.fp}
+"""
+
+rule TallyFPTR:
+    input:
+        fp="{asm}.{hap}/{meth}/sv.fp.bed",
+    output:
+        tr="{asm}.{hap}/{meth}/sv.fp.bed.tr",
+        sd="{asm}.{hap}/{meth}/sv.fp.bed.sd",
+    params:
+        grid_opts=config["grid_small"],
+        node_constraint=""
+    resources:
+        mem_gb=1
+    shell:"""
+bedtools intersect -u -a {input.fp} -b /home/cmb-16/mjc/shared/references/hg38/regions/tandem_repeats.bed > {output.tr}
+bedtools intersect -u -a {input.fp} -b /home/cmb-16/mjc/shared/references/hg38/regions/segdups/grch38_superdups.merged.bed > {output.sd}
+"""
+
 
 #
 #Step 8: Collect statistics on this run of the pipeline
 #
@@ -324,16 +483,21 @@ rule CollectRunStats:
         covs="{asm}.{hap}/{method}/aln.bam"
     output:
         stats="{asm}.{hap}/{method}/stats.txt"
+    wildcard_constraints:
+        method="lra|mm2"
     params:
         grid_opts=config["grid_small"],
-        ref=config["ref"]
+        ref=config["ref"],
+        node_constraint=""
+    resources:
+        mem_gb=4
     shell:"""
 halfGnm=$(cat {params.ref}.fai | awk '{{size += $2}} END {{print int(size/2)}}')
 n50=$(sort -nr -k 2 {input.asm} | awk -v gnm="$halfGnm" '{{contgSum += $2 ; if(contgSum >= gnm) {{print $2}} }} END{{if(contgSum < gnm) {{print "notFound"}} }}' > {output.stats}.tmp ; head -n 1 {output.stats}.tmp) && rm -f {output.stats}.tmp
-cov=$(samtools coverage -H {input.covs} | awk '{{count+=$5; total+=$3}} END {{print count/total}}')
+cov=`samtools view {input.covs} | samToBed /dev/stdin | bedtools sort | bedtools merge | awk '{{ s+=$3-$2 }} END {{ print s;}}'`
 echo -e "{wildcards.method}.{wildcards.asm}.{wildcards.hap}\t$n50\t$cov" >> {output.stats}
 """
 
@@ -343,9 +507,16 @@ rule CollectFlagStats:
         sup=expand("{{asm}}.{{hap}}/{{method}}/sv.{check}.bed",check=checks)
     output:
         flags="{asm}.{hap}/{method}/stats_flag.txt"
+    wildcard_constraints:
+        asm=".+",
+        method="lra|mm2",
+        hap="|".join(haps)
     params:
         grid_opts=config["grid_small"],
-        ref=config["ref"]
+        ref=config["ref"],
+        node_constraint=""
+    resources:
+        mem_gb=1
     shell:"""
 
 pass=$(cat {input.sup[0]} | wc -l)
@@ -367,6 +538,11 @@ rule CombineRunStats:
         flags=expand("{asm}.{hap}/{method}/stats_flag.txt",asm=asms,hap=haps,method=methods)
     output:
         all="run_stats_all.txt"
+    params:
+        grid_opts=config["grid_small"],
+        node_constraint=""
+    resources:
+        mem_gb=1
     shell:"""
 echo -e "Assembly\tN50\tBasesCov\tTotal\tTotal-Centro\tTotal-Centro-HighCov\tSup\n" | cat - <(paste <(cat {input.stats}) <(cat {input.flags} | cut -f 2-5)) > {output.all}
 
@@ -377,7 +553,14 @@ rule CombineFlagStats:
         stats=expand("{asm}.{hap}/{method}/stats_flag.txt",asm=asms,hap=haps,method=methods)
     output:
         all="run_stats_all_flags.txt"
+    wildcard_constraints:
+        method="lra|mm2",
+        hap="|".join(haps)
+    params:
+        grid_opts=config["grid_small"],
+        node_constraint=""
+    resources:
+        mem_gb=1
     shell:"""
-
 echo -e "Assembly\tTotal\tTotal-Centro\tTotal-Centro-HighCov\tFullSup\tCentro\tHighCov\tUnSup\tCentroHigh\tCentroUnsup\tHighUnsup\tCentroHighUnsup" | cat - {input.stats} > {output.all}
 """
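Editorial note: two small fixes above — the CombineFlagStats constraint read `method="lra,mm2"` (a comma where a pipe belongs, so it could never match) and the CollectFlagStats constraint `asm="^.+"` carried a needless `^` anchor; both are corrected. The CreateAssemblyJSON.py change below threads a single --partition option into every sbatch line of the generated config. For a hypothetical `--partition qcb` run, the relevant emitted fields would look like this (values abridged and hypothetical):

"partition" : "qcb",
"grid_large" : "sbatch --time=24:00:00 --partition=qcb",
"grid_small" : "sbatch --time=4:00:00 --partition=qcb",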
diff --git a/CreateAssemblyJSON.py b/CreateAssemblyJSON.py
index 80d3b31..77f4ce9 100755
--- a/CreateAssemblyJSON.py
+++ b/CreateAssemblyJSON.py
@@ -9,6 +9,7 @@ ap.add_argument("--sample", help="Sample", required=True)
 ap.add_argument("--vcf", help="Phasing vcf", required=True)
 ap.add_argument("--ref", help="Ref", required=True)
+ap.add_argument("--partition", help="Optional partition", default="cmb")
 
 args=ap.parse_args()
 
 if args.readtype == "ccs" or args.readtype == "ont":
@@ -16,19 +17,28 @@ else:
     consensus = "arrow"
 
+part=args.partition
+
 sys.stdout.write("""{{
 "bam" : [{}],
 "working_dir" : "{}",
 "sample" : "{}",
 "ref" : "{}",
 "vcf" : "{}",
-"grid_large" : "sbatch --time=24:00:00 --partition=cmb",
-"grid_manycore" : "sbatch --time=48:00:00 --partition=cmb",
-"grid_medium" : "sbatch --time=24:00:00 --partition=cmb",
-"grid_small" : "sbatch --time=4:00:00 --partition=cmb",
-"grid_blat" : "sbatch --time=24:00:00 --partition=cmb",
+"partition" : "{}",
+"grid_large" : "sbatch --time=24:00:00 --partition={}",
+"grid_manycore" : "sbatch --time=48:00:00 --partition={}",
+"grid_medium" : "sbatch --time=24:00:00 --partition={}",
+"grid_small" : "sbatch --time=4:00:00 --partition={}",
+"grid_blat" : "sbatch --time=24:00:00 --partition={}",
 "read-type" : "{}",
 "consensus" : "{}",
 "assembler" : "{}"
-}}""".format(", ".join(args.bams), args.workingDir, args.sample, args.ref, args.vcf, args.readtype, consensus, args.assembler))
+}}""".format(", ".join(args.bams), args.workingDir, args.sample, args.ref, args.vcf,
+             args.partition,
+             args.partition,
+             args.partition,
+             args.partition,
+             args.partition,
+             args.partition, args.readtype, consensus, args.assembler))
diff --git a/GetSVSupport.py b/GetSVSupport.py
index 25248af..8eb01ba 100755
--- a/GetSVSupport.py
+++ b/GetSVSupport.py
@@ -2,6 +2,7 @@
 import sys
 
+n=0
 for line in sys.stdin:
     vals=line.split()
     svLen = int(vals[2]) - int(vals[1])
@@ -17,6 +18,9 @@
             nSup+=1
     vals = vals[0:5] + [str(nSup)]
     sys.stdout.write("\t".join(vals) + "\n")
+    n+=1
+    if n % 200 == 0:
+        sys.stdout.flush()
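Editorial note on the fixes above: the added template line originally read `"partition" : {},`, which would emit unquoted JSON (e.g. `"partition" : cmb,`), so the placeholder is now quoted. The GetSVSupport.py hunk adds a periodic flush; a hedged stand-alone restatement of that pattern (illustration only, not the script itself):

import sys

# Write records as they come, but flush stdout every N records so a
# downstream consumer in a shell pipeline sees output promptly.
def emit(records, every=200):
    for n, rec in enumerate(records, start=1):
        sys.stdout.write(rec + "\n")
        if n % every == 0:
            sys.stdout.flush()

if __name__ == "__main__":
    emit("rec{}".format(i) for i in range(1000))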
diff --git a/MultiAssemblerAssembly.snakefile b/MultiAssemblerAssembly.snakefile
index 7d126a2..b852300 100644
--- a/MultiAssemblerAssembly.snakefile
+++ b/MultiAssemblerAssembly.snakefile
@@ -23,7 +23,8 @@ haps=["1", "2"]
 assemblers=config["assemblers"]
 datasets = [entry for entry in config["datasets"].keys()]
 
-
+if "partition" not in config:
+    config["partition"] = "cmb"
 rule all:
     input:
         config=expand("{dataset}/{assembler}/partitioned_assembly.json", dataset=datasets, assembler=assemblers, hap=haps),
@@ -33,7 +34,8 @@ rule all:
 rule GenerateJSON:
     input:
         bams=lambda wildcards: config["datasets"][wildcards.datasetID]["bams"],
-        vcf=lambda wildcards: config["datasets"][wildcards.datasetID]["vcf"]
+        vcf=lambda wildcards: config["datasets"][wildcards.datasetID]["vcf"],
+        json="multi_asm.json"
     output:
         json="{datasetID}/{assembler}/partitioned_assembly.json",
     params:
@@ -42,10 +44,11 @@ rule GenerateJSON:
         wd=lambda wildcards: config["workingDir"] + "/" + wildcards.datasetID,
         ref=config["ref"],
         readtype=lambda wildcards: config["datasets"][wildcards.datasetID]["datatype"],
-        workingDir=config["workingDir"]
+        workingDir=config["workingDir"],
+        part=config["partition"]
     shell:"""
 mkdir -p {wildcards.datasetID}/{wildcards.assembler}
-{params.sd}/CreateAssemblyJSON.py --bams {input.bams} --workingDir {params.workingDir}/{wildcards.datasetID}/{wildcards.assembler}/run --sample {params.sample} --ref {params.ref} --vcf {input.vcf} --readtype {params.readtype} --assembler {wildcards.assembler} > {wildcards.datasetID}/{wildcards.assembler}/partitioned_assembly.json
+{params.sd}/CreateAssemblyJSON.py --bams {input.bams} --workingDir {params.workingDir}/{wildcards.datasetID}/{wildcards.assembler}/run --sample {params.sample} --ref {params.ref} --vcf {input.vcf} --readtype {params.readtype} --assembler {wildcards.assembler} --partition={params.part} > {wildcards.datasetID}/{wildcards.assembler}/partitioned_assembly.json
 """
 
@@ -59,10 +62,13 @@ rule RunAssembly:
         pd=config["pd"],
         jobs_per_run=config["jobs_per_run"]
     shell:"""
+if [ -e {output.asms[0]} ]; then
+    exit 0;
+fi
 mkdir -p {params.wd}
 cp {input.json} {params.wd}/
-pushd {params.wd} && snakemake -p -s {params.pd}/PartitionedAssembly.snakefile -j {params.jobs_per_run} --cluster " {{params.grid_opts}} -c {{resources.threads}} --mem={{resources.mem_gb}}G {{params.node_constraint}} " --restart-times 4 && popd
-cp {params.wd}/assembly.*.consensus.fasta {wildcards.dataset}/{wildcards.assembler}/
+pushd {params.wd} && snakemake -p -s {params.pd}/PartitionedAssembly.snakefile -j {params.jobs_per_run} --cluster " {{params.grid_opts}} -c {{resources.threads}} --mem={{resources.mem_gb}}G {{params.node_constraint}} " --restart-times 100 --rerun-incomplete && popd ;
+cp {params.wd}/assembly.*.consensus.fasta {wildcards.dataset}/{wildcards.assembler}/ ; rm -rf {params.wd}/
 """
diff --git a/PrintGaps.py b/PrintGaps.py
index ebf9568..c8706f2 100755
--- a/PrintGaps.py
+++ b/PrintGaps.py
@@ -19,7 +19,6 @@ ap.add_argument("--minFraction", help="Minimum fraction of contig to keep in an alignment.", default=0.00,type=float)
 ap.add_argument("--maxLength", help="Maximum gap length.", default=None, type=int)
 ap.add_argument("--outFile", help="Print output here, default= stdout", default=None)
-ap.add_argument("--context", help="Print surrounding context", default=0, type=int)
 ap.add_argument("--condense", help="Pack indels if the matches separating them is less than this value.", default=0, type=int)
 ap.add_argument("--tsd", help="Attempt to find Target Site Duplications at most this length", default=20, type=int)
 ap.add_argument("--outsam", help="Write the modified condensed sam to a file.", default=None)
@@ -30,6 +29,7 @@ ap.add_argument("--contigBed", help="Print where contigs map.", default=None)
 ap.add_argument("--status", help="Print how far along the alignments are.", default=False, action='store_true')
 ap.add_argument("--blacklist", help="Exclude contigs on this list from callsets.", default=None)
+ap.add_argument("--printAlignedLength", help="Print the length of the aligned sequence.", default=False, action='store_true')
 ap.add_argument("--flank", help="Amount of flank to compute identity for ", default=0,type=int)
 ap.add_argument("--flankIdentity", help="Identity required of flank.", default=0.95, type=float)
 ap.add_argument("--maxMasked", help="Ignore variants with this many or more N's.,0=allow all", default=0,type=int)
@@ -133,8 +133,6 @@ def IsClip(c):
 coordRe = re.compile(".*(chr.*)\.(\d+)-(\d+).*")
 
 outFile.write("#chrom\ttStart\ttEnd\tsvType\tsvLen\tsvSeq\ttsd\tqName\tqStart\tqEnd")
-if (args.context):
-    outFile.write("\tcontext")
 if (args.printStrand):
     outFile.write("\tqueryStrand")
 if (args.printLength):
@@ -191,19 +189,6 @@ def RemoveMismatches(aln):
 #        aln.tStart -=1
         aln.tEnd -=1
 
-        if (args.onTarget == True):
-            coordReMatch = coordRe.match(aln.title)
-            if (coordReMatch is not None):
-                coordMatchGroups = coordReMatch.groups()
-                srcChrom = coordMatchGroups[0]
-                srcStart = int(coordMatchGroups[1])
-                srcEnd = int(coordMatchGroups[2])
-                if (srcChrom != aln.tName):
-                    print("off target chromosome: " + srcChrom + " " + aln.tName)
-                    continue
-                if (((srcStart >= aln.tStart and srcStart < aln.tEnd) or (srcEnd >= aln.tStart and srcEnd < aln.tEnd) or (srcStart < aln.tStart and srcEnd > aln.tEnd )) == False):
-                    print("no overlap " + srcChrom + " " + str(srcStart) + " " + str(srcEnd) + " alignment: " + str(aln.tStart) + " "+ str(aln.tEnd))
-                    continue
 
         if (aln.mapqv < args.minq):
             continue
@@ -385,7 +370,7 @@ def RemoveMismatches(aln):
 
         if (args.flank != 0 and (frontIdent < args.flankIdentity or backIdent < args.flankIdentity)):
             continue
-        
+
         for i in range(len(packedOps)):
             op = packedOps[i]
             oplen = packedLengths[i]
@@ -396,18 +381,14 @@
             if (IsMatch(op)):
                 # Inside match block (if op == M)
                 if (args.snv is not None):
-                    targetSeq = Tools.ExtractSeq((aln.tName, tPos,tPos+oplen), genomeFile, fai)
-                    querySeq = aln.seq[qPos:qPos+oplen]
+                    targetSeq = Tools.ExtractSeq((aln.tName, tPos,tPos+oplen), genomeFile, fai).strip().upper()
+                    querySeq = aln.seq[qPos:qPos+oplen].strip().upper()
                     nMis = 0
-#                    if (len(targetSeq) != len(querySeq)):
-#                        print "ERROR IN SEQ"
-#                        print aln.title
                     for mp in range(0,len(targetSeq)):
                         if (mp >= len(querySeq) or mp >= len(targetSeq)):
                             print("ERROR with seq " + aln.title)
                             continue
-
-                        if (querySeq[mp].upper() != targetSeq[mp].upper() and targetSeq[mp].upper() != 'N' and querySeq[mp].upper() != 'N'):
+                        if (querySeq[mp] != targetSeq[mp] and targetSeq[mp].upper() != 'N' and querySeq[mp].upper() != 'N'):
                             nMis +=1
                             snvOut.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(aln.tName, tPos+mp, tPos+mp+1, targetSeq[mp], querySeq[mp], aln.title, mp+qPos ))
                         if (args.nloc is not None and (targetSeq[mp].upper() == 'N' or querySeq[mp].upper() == 'N')):
@@ -484,13 +465,10 @@
                 else:
                     outFile = h2File
                 outFile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(chrName, tPos, tPos + oplen, "insertion", oplen, gapSeq, tsd, aln.title, qPos, qPos + oplen))
-                if (args.context > 0):
-                    outFile.write("\t{}".format(homopolymer))
                 if (args.printStrand):
                     outFile.write("\t{}".format(aln.qStrand))
                 if (args.printLength):
-                    outFile.write("\t{}".format(aln.seq))
-
+                    outFile.write("\t{}".format(int(aln.qEnd - aln.qStart)))
                 if args.fractionMasked is True:
                     nMasked = gapSeq.count('N')
                     frac = '0'
@@ -511,15 +489,10 @@
                 chrName = aln.tName
                 if (tPos > fai[chrName][0]):
                     print("ERROR! tpos is past the genome end." + str(tPos) + " " + str(fai[chrName][0]))
-                delStart = max(tPos - args.context, 0)
-                delEnd = min(tPos + args.context + oplen, fai[chrName][0])
+                delStart = tPos
+                delEnd = tPos + oplen
                 if (delEnd < delStart):
                     continue
-                context= aln.seq[qPos+oplen:min(qPos+oplen+args.context, len(aln.seq))]
-                if (context == "A"*len(context) or context == "T"*len(context)):
-                    homopolymer="T"
-                else:
-                    homopolymer="F"
 #                delSeq = genomeDict[chrName].seq[delStart:delEnd].tostring()
                 delSeq = Tools.ExtractSeq([chrName, delStart, delEnd], genomeFile, fai)
@@ -559,8 +532,10 @@
                     print("SET OUTFILE \n\n\n")
                 outFile.write("{}\t{}\t{}\t{}\t{}\t{}\tno_tsd\t{}\t{}\t{}".format(chrName, tPos, tPos + oplen, "deletion", oplen, delSeq, aln.title, qPos, qPos))
-                if (args.context > 0):
-                    outFile.write("\t{}".format(homopolymer))
+                if (args.printStrand):
+                    outFile.write("\t{}".format(aln.qStrand))
+                if (args.printLength):
+                    outFile.write("\t{}".format(int(aln.qEnd - aln.qStart)))
                 if args.fractionMasked is True:
                     nMasked = delSeq.count('N')
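End note: with --printLength now emitting the aligned query length (qEnd - qStart) instead of the full query sequence, the extra column that MakeBed keeps via `cut -f1-7,11` is a small integer (the mm2 branch writes a literal 0 in that slot). A hypothetical downstream consumer, just to make the assumed eight-column layout concrete — the file name, threshold, and column semantics here are inferred, and the sketch assumes PrintGaps.py ran without --printStrand:

import csv
import sys

# Hypothetical reader for the 8-column variants.sv.bed written by MakeBed:
# chrom, tStart, tEnd, svType, svLen, svSeq, tsd, alignedLength.
with open("variants.sv.bed") as bed:
    for row in csv.reader(bed, delimiter="\t"):
        if len(row) < 8:
            continue  # defensively skip malformed rows
        chrom, start, end, svtype, svlen, seq, tsd, aln_len = row[:8]
        # lra calls carry the aligned contig length; mm2 calls carry 0.
        if int(aln_len) and int(aln_len) < 50000:
            continue  # e.g. drop calls from short alignments
        sys.stdout.write("\t".join([chrom, start, end, svtype, svlen]) + "\n")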