From 4f5b462c6fdd95444e34e40e2160c626438bfad6 Mon Sep 17 00:00:00 2001
From: petercombs <pcombs@stanford.edu>
Date: Thu, 24 Mar 2016 12:08:00 -0700
Subject: [PATCH 1/3] Change the ouptut function to use pandas

---
 bin/GetGeneASE.py | 58 ++++++++++++++++++++---------------------------
 1 file changed, 24 insertions(+), 34 deletions(-)

diff --git a/bin/GetGeneASE.py b/bin/GetGeneASE.py
index c8d11d6..e7fdbdb 100755
--- a/bin/GetGeneASE.py
+++ b/bin/GetGeneASE.py
@@ -22,6 +22,7 @@
 ###########
 import sys              # Access to simple command-line arguments
 import argparse         # Access to long command-line parsing
+import pandas as pd
 
 # Us
 from ASEr import run    # File handling utilities
@@ -468,42 +469,31 @@ def main(argv=None):
     keys = sorted(list(features.keys()))
 
     with open(args.outfile, 'w') as outfile:
-        # Header
-        outfile.write('FEATURE\tCHROMOSOME\tORIENTATION\tSTART-STOP\t' +
-                      'REFERENCE_COUNTS\tALT_COUNTS\tTOTAL_SNPS\tREF_BIASED\t' +
-                      'ALT_BIASED\tREF-ALT_RATIO\tSNPS\n')
-
-        for key in keys:
-            # Get the ultimate 5'-3' positions
-            position[key].sort(key=int)
-            posit = str(position[key][0]) + '-' + str(position[key][-1])
-
-            if key in total_ref:
-                pos = key.split('|')
-                if ref_biased[key] >= alt_biased[key]:
-                    if alt_biased[key] == 0:
-                        rat = 1
-                    else:
-                        rat = ref_biased[key]/float(alt_biased[key]+ref_biased[key])
-                else:
-                    if ref_biased[key] == 0:
-                        rat = 1
-                    else:
-                        rat = alt_biased[key]/float(ref_biased[key]+alt_biased[key])
+        posits = {key:'{}-{}'.format(min(position[key]), max(position[key])) for key in keys}
+        snp_arrays = {key: ';'.join(snp_array[key]) for key in total_ref}
+        out = pd.DataFrame({
+            'CHROMOSOME': chromosome,
+            'ORIENTATION': ori,
+            'START-STOP': posits,
+            'REFERENCE_COUNTS': total_ref,
+            'ALT_COUNTS' : total_alt,
+            'TOTAL_SNPS': total_snps,
+            'REF_BIASED' : ref_biased,
+            'ALT_BIASED' : alt_biased,
+            'REF-ALT_RATIO': pd.np.nan,
+            'SNPS' : snp_arrays},
+            index=keys,
+            columns=['CHROMOSOME', 'ORIENTATION', 'START-STOP',
+                'REFERENCE_COUNTS', 'ALT_COUNTS', 'TOTAL_SNPS', 'REF_BIASED',
+                'ALT_BIASED', 'REF-ALT_RATIO', 'SNPS'])
+        out.index.name = 'FEATURE'
+        out['REF-ALT_RATIO'] = (out.ix[:, ['REF_BIASED', 'ALT_BIASED']].max(axis=1)
+                / out.ix[:, ['REF_BIASED', 'ALT_BIASED']].sum(axis=1))
+        out.ix[(out.REF_BIASED == 0) & (out.ALT_BIASED == 0), 'REF-ALT_RATIO'] = 1
+        out.to_csv(outfile, sep='\t', na_rep='NA', float_format='%.16g')
+
 
-                snp_array_out = ';'.join(snp_array[key])
 
-                outfile.write('\t'.join(
-                    [str(i) for i in [key, chromosome[key], ori[key], posit,
-                                      total_ref[key], total_alt[key],
-                                      total_snps[key], ref_biased[key],
-                                      alt_biased[key], rat, snp_array_out]]) +
-                              '\n')
-            # No counts for this feature
-            else:
-                outfile.write('\t'.join(
-                    [str(i) for i in [key, chromosome[key], ori[key], posit,
-                                      'NA\tNA\tNA\tNA\tNA\tNA\tNA\n']]))
 
     if args.write is True:
         with open(args.outfile + '.snps.txt', 'w') as outfile:

From 835325e1e9171603485a0035380e61ed6ba35157 Mon Sep 17 00:00:00 2001
From: petercombs <pcombs@stanford.edu>
Date: Thu, 24 Mar 2016 12:36:29 -0700
Subject: [PATCH 2/3] Speedup for GetGeneASE

Instead of looping over coordinates of each feature and checking if
present, we use the sorted nature of the phased snp data to pull out
from there all the SNPs that we care about.
---
 bin/GetGeneASE.py | 41 ++++++++++++++++++++++++++---------------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/bin/GetGeneASE.py b/bin/GetGeneASE.py
index e7fdbdb..8323664 100755
--- a/bin/GetGeneASE.py
+++ b/bin/GetGeneASE.py
@@ -22,6 +22,8 @@
 ###########
 import sys              # Access to simple command-line arguments
 import argparse         # Access to long command-line parsing
+from collections import defaultdict
+from bisect import bisect
 import pandas as pd
 
 # Us
@@ -147,16 +149,20 @@ def read_snp_phasing_file(snp_file):
     :snp_file: A bed format file with phased SNPs, can be produces with the
                create_phased_bed script.
     :returns:  A dictionary of::
-                    position => REF|ALT
+                    chromosome => [(pos, REF|ALT), ...]
     """
-    snp_phase_dict = {}
+    snp_phase_dict = defaultdict(list)
     with run.open_zipped(snp_file) as snp_file:
         for line in snp_file:
             line   = line.rstrip('\n')
             line_t = line.split('\t')
 
-            pos = str(line_t[0]) + '|' + str(line_t[2])
-            snp_phase_dict[pos] = line_t[3]
+            chrom = line_t[0]
+            pos = int(line_t[2])
+            if len(snp_phase_dict[chrom]) and pos < snp_phase_dict[chrom][-1][0]:
+                raise ValueError("SNP file {} should be sorted by chromosome position"
+                        .format(snp_file.name))
+            snp_phase_dict[chrom].append((pos, line_t[3]))
     return snp_phase_dict
 
 
@@ -262,7 +268,8 @@ def main(argv=None):
                 sys.exit(1)
 
             features[name] = 1
-            chromosome[name] = line_t[0]
+            chrom = line_t[0]
+            chromosome[name] = chrom
 
             ori[name] = line_t[6]
             orientation = line_t[6]
@@ -276,10 +283,14 @@ def main(argv=None):
                 position[name].append(int(line_t[4]))
 
             # Now go through the positions overlapped by the annotation and add in SNPs if appropriate
-            for i in range(int(line_t[3]), int(line_t[4])+1):
-                pos = line_t[0] + '|' + str(i)
+            if chrom not in snp_phase_dict:
+                continue
+            lower = bisect(snp_phase_dict[chrom], (int(line_t[3]), ))
+            upper = bisect(snp_phase_dict[chrom], (int(line_t[4])+1, ))
+            for i, snp in snp_phase_dict[chrom][lower:upper]:
+                pos = chrom + '|' + str(i)
 
-                if pos in snp_counts_dict and pos in snp_phase_dict:
+                if pos in snp_counts_dict:
 
                     # Count SNPs
                     if name in total_snps:
@@ -290,7 +301,7 @@ def main(argv=None):
 
                     # Get REF|ALT counts
                     # Parse the REF|ALT dict
-                    refalt = snp_phase_dict[pos].split('|')
+                    refalt = snp.split('|')
 
                     # Get pos/neg counts
                     ref_pos_counts = snp_counts_dict[pos][0][refalt[0]]
@@ -329,7 +340,7 @@ def main(argv=None):
                             # Add it to the total SNP arrays
                             if name in snp_array:
                                 snp_array[name].append(
-                                    str(i) + ',' + str(snp_phase_dict[pos]) +
+                                    str(i) + ',' + str(snp) +
                                     ',' + str(ref_pos_counts) + '|' +
                                     str(alt_pos_counts))
                                 phased_snp_array.append(
@@ -341,7 +352,7 @@ def main(argv=None):
                             else:
                                 snp_array[name] = []
                                 snp_array[name].append(
-                                    str(i) + ',' + str(snp_phase_dict[pos]) +
+                                    str(i) + ',' + str(snp) +
                                     ',' + str(ref_pos_counts) + '|' +
                                     str(alt_pos_counts))
                                 phased_snp_array.append(
@@ -380,7 +391,7 @@ def main(argv=None):
                             # Add it to the total SNP array
                             if name in snp_array:
                                 snp_array[name].append(
-                                    str(i) + ',' + str(snp_phase_dict[pos]) +
+                                    str(i) + ',' + str(snp) +
                                     ',' + str(ref_neg_counts) + '|' +
                                     str(alt_neg_counts))
                                 phased_snp_array.append(
@@ -392,7 +403,7 @@ def main(argv=None):
                             else:
                                 snp_array[name] = []
                                 snp_array[name].append(
-                                    str(i) + ',' + str(snp_phase_dict[pos]) +
+                                    str(i) + ',' + str(snp) +
                                     ',' + str(ref_neg_counts) + '|' +
                                     str(alt_neg_counts))
                                 phased_snp_array.append(
@@ -456,13 +467,13 @@ def main(argv=None):
                         # Add it to the total SNP array
                         if name in snp_array:
                             snp_array[name].append(
-                                str(i) + ',' + str(snp_phase_dict[pos]) +
+                                str(i) + ',' + str(snp) +
                                 ',' + str(int(tot_ref)) + '|' +
                                 str(int(tot_alt)))
                         else:
                             snp_array[name] = []
                             snp_array[name].append(
-                                str(i) + ',' + str(snp_phase_dict[pos]) + ',' +
+                                str(i) + ',' + str(snp) + ',' +
                                 str(int(tot_ref)) + '|' + str(int(tot_alt)))
 
     # Print the output

From 3731656c80626b4618b4ac91a1958b6453284ec8 Mon Sep 17 00:00:00 2001
From: Mike Dacre <mike.dacre@gmail.com>
Date: Thu, 24 Mar 2016 16:18:40 -0700
Subject: [PATCH 3/3] Add pandas to the requirements of this package

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 9d2015b..7c6bb74 100644
--- a/setup.py
+++ b/setup.py
@@ -45,7 +45,7 @@
 
     keywords='ASE allele-specific expression RNA-seq fastq BAM SAM SNP',
 
-    install_requires=['pybedtools', 'pysam'],
+    install_requires=['pybedtools', 'pysam', 'pandas'],
     scripts=scpts,
     packages=['ASEr']