diff --git a/bin/GetGeneASE.py b/bin/GetGeneASE.py index c8d11d6..8323664 100755 --- a/bin/GetGeneASE.py +++ b/bin/GetGeneASE.py @@ -22,6 +22,9 @@ ########### import sys # Access to simple command-line arguments import argparse # Access to long command-line parsing +from collections import defaultdict +from bisect import bisect +import pandas as pd # Us from ASEr import run # File handling utilities @@ -146,16 +149,20 @@ def read_snp_phasing_file(snp_file): :snp_file: A bed format file with phased SNPs, can be produces with the create_phased_bed script. :returns: A dictionary of:: - position => REF|ALT + chromosome => [(pos, REF|ALT), ...] """ - snp_phase_dict = {} + snp_phase_dict = defaultdict(list) with run.open_zipped(snp_file) as snp_file: for line in snp_file: line = line.rstrip('\n') line_t = line.split('\t') - pos = str(line_t[0]) + '|' + str(line_t[2]) - snp_phase_dict[pos] = line_t[3] + chrom = line_t[0] + pos = int(line_t[2]) + if len(snp_phase_dict[chrom]) and pos < snp_phase_dict[chrom][-1][0]: + raise ValueError("SNP file {} should be sorted by chromosome position" + .format(snp_file.name)) + snp_phase_dict[chrom].append((pos, line_t[3])) return snp_phase_dict @@ -261,7 +268,8 @@ def main(argv=None): sys.exit(1) features[name] = 1 - chromosome[name] = line_t[0] + chrom = line_t[0] + chromosome[name] = chrom ori[name] = line_t[6] orientation = line_t[6] @@ -275,10 +283,14 @@ def main(argv=None): position[name].append(int(line_t[4])) # Now go through the positions overlapped by the annotation and add in SNPs if appropriate - for i in range(int(line_t[3]), int(line_t[4])+1): - pos = line_t[0] + '|' + str(i) + if chrom not in snp_phase_dict: + continue + lower = bisect(snp_phase_dict[chrom], (int(line_t[3]), )) + upper = bisect(snp_phase_dict[chrom], (int(line_t[4])+1, )) + for i, snp in snp_phase_dict[chrom][lower:upper]: + pos = chrom + '|' + str(i) - if pos in snp_counts_dict and pos in snp_phase_dict: + if pos in snp_counts_dict: # Count SNPs if name in total_snps: @@ -289,7 +301,7 @@ def main(argv=None): # Get REF|ALT counts # Parse the REF|ALT dict - refalt = snp_phase_dict[pos].split('|') + refalt = snp.split('|') # Get pos/neg counts ref_pos_counts = snp_counts_dict[pos][0][refalt[0]] @@ -328,7 +340,7 @@ def main(argv=None): # Add it to the total SNP arrays if name in snp_array: snp_array[name].append( - str(i) + ',' + str(snp_phase_dict[pos]) + + str(i) + ',' + str(snp) + ',' + str(ref_pos_counts) + '|' + str(alt_pos_counts)) phased_snp_array.append( @@ -340,7 +352,7 @@ def main(argv=None): else: snp_array[name] = [] snp_array[name].append( - str(i) + ',' + str(snp_phase_dict[pos]) + + str(i) + ',' + str(snp) + ',' + str(ref_pos_counts) + '|' + str(alt_pos_counts)) phased_snp_array.append( @@ -379,7 +391,7 @@ def main(argv=None): # Add it to the total SNP array if name in snp_array: snp_array[name].append( - str(i) + ',' + str(snp_phase_dict[pos]) + + str(i) + ',' + str(snp) + ',' + str(ref_neg_counts) + '|' + str(alt_neg_counts)) phased_snp_array.append( @@ -391,7 +403,7 @@ def main(argv=None): else: snp_array[name] = [] snp_array[name].append( - str(i) + ',' + str(snp_phase_dict[pos]) + + str(i) + ',' + str(snp) + ',' + str(ref_neg_counts) + '|' + str(alt_neg_counts)) phased_snp_array.append( @@ -455,55 +467,44 @@ def main(argv=None): # Add it to the total SNP array if name in snp_array: snp_array[name].append( - str(i) + ',' + str(snp_phase_dict[pos]) + + str(i) + ',' + str(snp) + ',' + str(int(tot_ref)) + '|' + str(int(tot_alt))) else: snp_array[name] = [] snp_array[name].append( - str(i) + ',' + str(snp_phase_dict[pos]) + ',' + + str(i) + ',' + str(snp) + ',' + str(int(tot_ref)) + '|' + str(int(tot_alt))) # Print the output keys = sorted(list(features.keys())) with open(args.outfile, 'w') as outfile: - # Header - outfile.write('FEATURE\tCHROMOSOME\tORIENTATION\tSTART-STOP\t' + - 'REFERENCE_COUNTS\tALT_COUNTS\tTOTAL_SNPS\tREF_BIASED\t' + - 'ALT_BIASED\tREF-ALT_RATIO\tSNPS\n') - - for key in keys: - # Get the ultimate 5'-3' positions - position[key].sort(key=int) - posit = str(position[key][0]) + '-' + str(position[key][-1]) - - if key in total_ref: - pos = key.split('|') - if ref_biased[key] >= alt_biased[key]: - if alt_biased[key] == 0: - rat = 1 - else: - rat = ref_biased[key]/float(alt_biased[key]+ref_biased[key]) - else: - if ref_biased[key] == 0: - rat = 1 - else: - rat = alt_biased[key]/float(ref_biased[key]+alt_biased[key]) + posits = {key:'{}-{}'.format(min(position[key]), max(position[key])) for key in keys} + snp_arrays = {key: ';'.join(snp_array[key]) for key in total_ref} + out = pd.DataFrame({ + 'CHROMOSOME': chromosome, + 'ORIENTATION': ori, + 'START-STOP': posits, + 'REFERENCE_COUNTS': total_ref, + 'ALT_COUNTS' : total_alt, + 'TOTAL_SNPS': total_snps, + 'REF_BIASED' : ref_biased, + 'ALT_BIASED' : alt_biased, + 'REF-ALT_RATIO': pd.np.nan, + 'SNPS' : snp_arrays}, + index=keys, + columns=['CHROMOSOME', 'ORIENTATION', 'START-STOP', + 'REFERENCE_COUNTS', 'ALT_COUNTS', 'TOTAL_SNPS', 'REF_BIASED', + 'ALT_BIASED', 'REF-ALT_RATIO', 'SNPS']) + out.index.name = 'FEATURE' + out['REF-ALT_RATIO'] = (out.ix[:, ['REF_BIASED', 'ALT_BIASED']].max(axis=1) + / out.ix[:, ['REF_BIASED', 'ALT_BIASED']].sum(axis=1)) + out.ix[(out.REF_BIASED == 0) & (out.ALT_BIASED == 0), 'REF-ALT_RATIO'] = 1 + out.to_csv(outfile, sep='\t', na_rep='NA', float_format='%.16g') + - snp_array_out = ';'.join(snp_array[key]) - outfile.write('\t'.join( - [str(i) for i in [key, chromosome[key], ori[key], posit, - total_ref[key], total_alt[key], - total_snps[key], ref_biased[key], - alt_biased[key], rat, snp_array_out]]) + - '\n') - # No counts for this feature - else: - outfile.write('\t'.join( - [str(i) for i in [key, chromosome[key], ori[key], posit, - 'NA\tNA\tNA\tNA\tNA\tNA\tNA\n']])) if args.write is True: with open(args.outfile + '.snps.txt', 'w') as outfile: diff --git a/setup.py b/setup.py index 9d2015b..7c6bb74 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ keywords='ASE allele-specific expression RNA-seq fastq BAM SAM SNP', - install_requires=['pybedtools', 'pysam'], + install_requires=['pybedtools', 'pysam', 'pandas'], scripts=scpts, packages=['ASEr']