From 4f5b462c6fdd95444e34e40e2160c626438bfad6 Mon Sep 17 00:00:00 2001 From: petercombs Date: Thu, 24 Mar 2016 12:08:00 -0700 Subject: [PATCH 1/3] Change the ouptut function to use pandas --- bin/GetGeneASE.py | 58 ++++++++++++++++++++--------------------------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/bin/GetGeneASE.py b/bin/GetGeneASE.py index c8d11d6..e7fdbdb 100755 --- a/bin/GetGeneASE.py +++ b/bin/GetGeneASE.py @@ -22,6 +22,7 @@ ########### import sys # Access to simple command-line arguments import argparse # Access to long command-line parsing +import pandas as pd # Us from ASEr import run # File handling utilities @@ -468,42 +469,31 @@ def main(argv=None): keys = sorted(list(features.keys())) with open(args.outfile, 'w') as outfile: - # Header - outfile.write('FEATURE\tCHROMOSOME\tORIENTATION\tSTART-STOP\t' + - 'REFERENCE_COUNTS\tALT_COUNTS\tTOTAL_SNPS\tREF_BIASED\t' + - 'ALT_BIASED\tREF-ALT_RATIO\tSNPS\n') - - for key in keys: - # Get the ultimate 5'-3' positions - position[key].sort(key=int) - posit = str(position[key][0]) + '-' + str(position[key][-1]) - - if key in total_ref: - pos = key.split('|') - if ref_biased[key] >= alt_biased[key]: - if alt_biased[key] == 0: - rat = 1 - else: - rat = ref_biased[key]/float(alt_biased[key]+ref_biased[key]) - else: - if ref_biased[key] == 0: - rat = 1 - else: - rat = alt_biased[key]/float(ref_biased[key]+alt_biased[key]) + posits = {key:'{}-{}'.format(min(position[key]), max(position[key])) for key in keys} + snp_arrays = {key: ';'.join(snp_array[key]) for key in total_ref} + out = pd.DataFrame({ + 'CHROMOSOME': chromosome, + 'ORIENTATION': ori, + 'START-STOP': posits, + 'REFERENCE_COUNTS': total_ref, + 'ALT_COUNTS' : total_alt, + 'TOTAL_SNPS': total_snps, + 'REF_BIASED' : ref_biased, + 'ALT_BIASED' : alt_biased, + 'REF-ALT_RATIO': pd.np.nan, + 'SNPS' : snp_arrays}, + index=keys, + columns=['CHROMOSOME', 'ORIENTATION', 'START-STOP', + 'REFERENCE_COUNTS', 'ALT_COUNTS', 'TOTAL_SNPS', 'REF_BIASED', + 'ALT_BIASED', 'REF-ALT_RATIO', 'SNPS']) + out.index.name = 'FEATURE' + out['REF-ALT_RATIO'] = (out.ix[:, ['REF_BIASED', 'ALT_BIASED']].max(axis=1) + / out.ix[:, ['REF_BIASED', 'ALT_BIASED']].sum(axis=1)) + out.ix[(out.REF_BIASED == 0) & (out.ALT_BIASED == 0), 'REF-ALT_RATIO'] = 1 + out.to_csv(outfile, sep='\t', na_rep='NA', float_format='%.16g') + - snp_array_out = ';'.join(snp_array[key]) - outfile.write('\t'.join( - [str(i) for i in [key, chromosome[key], ori[key], posit, - total_ref[key], total_alt[key], - total_snps[key], ref_biased[key], - alt_biased[key], rat, snp_array_out]]) + - '\n') - # No counts for this feature - else: - outfile.write('\t'.join( - [str(i) for i in [key, chromosome[key], ori[key], posit, - 'NA\tNA\tNA\tNA\tNA\tNA\tNA\n']])) if args.write is True: with open(args.outfile + '.snps.txt', 'w') as outfile: From 835325e1e9171603485a0035380e61ed6ba35157 Mon Sep 17 00:00:00 2001 From: petercombs Date: Thu, 24 Mar 2016 12:36:29 -0700 Subject: [PATCH 2/3] Speedup for GetGeneASE Instead of looping over coordinates of each feature and checking if present, we use the sorted nature of the phased snp data to pull out from there all the SNPs that we care about. --- bin/GetGeneASE.py | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/bin/GetGeneASE.py b/bin/GetGeneASE.py index e7fdbdb..8323664 100755 --- a/bin/GetGeneASE.py +++ b/bin/GetGeneASE.py @@ -22,6 +22,8 @@ ########### import sys # Access to simple command-line arguments import argparse # Access to long command-line parsing +from collections import defaultdict +from bisect import bisect import pandas as pd # Us @@ -147,16 +149,20 @@ def read_snp_phasing_file(snp_file): :snp_file: A bed format file with phased SNPs, can be produces with the create_phased_bed script. :returns: A dictionary of:: - position => REF|ALT + chromosome => [(pos, REF|ALT), ...] """ - snp_phase_dict = {} + snp_phase_dict = defaultdict(list) with run.open_zipped(snp_file) as snp_file: for line in snp_file: line = line.rstrip('\n') line_t = line.split('\t') - pos = str(line_t[0]) + '|' + str(line_t[2]) - snp_phase_dict[pos] = line_t[3] + chrom = line_t[0] + pos = int(line_t[2]) + if len(snp_phase_dict[chrom]) and pos < snp_phase_dict[chrom][-1][0]: + raise ValueError("SNP file {} should be sorted by chromosome position" + .format(snp_file.name)) + snp_phase_dict[chrom].append((pos, line_t[3])) return snp_phase_dict @@ -262,7 +268,8 @@ def main(argv=None): sys.exit(1) features[name] = 1 - chromosome[name] = line_t[0] + chrom = line_t[0] + chromosome[name] = chrom ori[name] = line_t[6] orientation = line_t[6] @@ -276,10 +283,14 @@ def main(argv=None): position[name].append(int(line_t[4])) # Now go through the positions overlapped by the annotation and add in SNPs if appropriate - for i in range(int(line_t[3]), int(line_t[4])+1): - pos = line_t[0] + '|' + str(i) + if chrom not in snp_phase_dict: + continue + lower = bisect(snp_phase_dict[chrom], (int(line_t[3]), )) + upper = bisect(snp_phase_dict[chrom], (int(line_t[4])+1, )) + for i, snp in snp_phase_dict[chrom][lower:upper]: + pos = chrom + '|' + str(i) - if pos in snp_counts_dict and pos in snp_phase_dict: + if pos in snp_counts_dict: # Count SNPs if name in total_snps: @@ -290,7 +301,7 @@ def main(argv=None): # Get REF|ALT counts # Parse the REF|ALT dict - refalt = snp_phase_dict[pos].split('|') + refalt = snp.split('|') # Get pos/neg counts ref_pos_counts = snp_counts_dict[pos][0][refalt[0]] @@ -329,7 +340,7 @@ def main(argv=None): # Add it to the total SNP arrays if name in snp_array: snp_array[name].append( - str(i) + ',' + str(snp_phase_dict[pos]) + + str(i) + ',' + str(snp) + ',' + str(ref_pos_counts) + '|' + str(alt_pos_counts)) phased_snp_array.append( @@ -341,7 +352,7 @@ def main(argv=None): else: snp_array[name] = [] snp_array[name].append( - str(i) + ',' + str(snp_phase_dict[pos]) + + str(i) + ',' + str(snp) + ',' + str(ref_pos_counts) + '|' + str(alt_pos_counts)) phased_snp_array.append( @@ -380,7 +391,7 @@ def main(argv=None): # Add it to the total SNP array if name in snp_array: snp_array[name].append( - str(i) + ',' + str(snp_phase_dict[pos]) + + str(i) + ',' + str(snp) + ',' + str(ref_neg_counts) + '|' + str(alt_neg_counts)) phased_snp_array.append( @@ -392,7 +403,7 @@ def main(argv=None): else: snp_array[name] = [] snp_array[name].append( - str(i) + ',' + str(snp_phase_dict[pos]) + + str(i) + ',' + str(snp) + ',' + str(ref_neg_counts) + '|' + str(alt_neg_counts)) phased_snp_array.append( @@ -456,13 +467,13 @@ def main(argv=None): # Add it to the total SNP array if name in snp_array: snp_array[name].append( - str(i) + ',' + str(snp_phase_dict[pos]) + + str(i) + ',' + str(snp) + ',' + str(int(tot_ref)) + '|' + str(int(tot_alt))) else: snp_array[name] = [] snp_array[name].append( - str(i) + ',' + str(snp_phase_dict[pos]) + ',' + + str(i) + ',' + str(snp) + ',' + str(int(tot_ref)) + '|' + str(int(tot_alt))) # Print the output From 3731656c80626b4618b4ac91a1958b6453284ec8 Mon Sep 17 00:00:00 2001 From: Mike Dacre Date: Thu, 24 Mar 2016 16:18:40 -0700 Subject: [PATCH 3/3] Add pandas to the requirements of this package --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9d2015b..7c6bb74 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,7 @@ keywords='ASE allele-specific expression RNA-seq fastq BAM SAM SNP', - install_requires=['pybedtools', 'pysam'], + install_requires=['pybedtools', 'pysam', 'pandas'], scripts=scpts, packages=['ASEr']