Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 50 additions & 49 deletions bin/GetGeneASE.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
###########
import sys # Access to simple command-line arguments
import argparse # Access to long command-line parsing
from collections import defaultdict
from bisect import bisect
import pandas as pd

# Us
from ASEr import run # File handling utilities
Expand Down Expand Up @@ -146,16 +149,20 @@ def read_snp_phasing_file(snp_file):
:snp_file: A bed format file with phased SNPs, can be produces with the
create_phased_bed script.
:returns: A dictionary of::
position => REF|ALT
chromosome => [(pos, REF|ALT), ...]
"""
snp_phase_dict = {}
snp_phase_dict = defaultdict(list)
with run.open_zipped(snp_file) as snp_file:
for line in snp_file:
line = line.rstrip('\n')
line_t = line.split('\t')

pos = str(line_t[0]) + '|' + str(line_t[2])
snp_phase_dict[pos] = line_t[3]
chrom = line_t[0]
pos = int(line_t[2])
if len(snp_phase_dict[chrom]) and pos < snp_phase_dict[chrom][-1][0]:
raise ValueError("SNP file {} should be sorted by chromosome position"
.format(snp_file.name))
snp_phase_dict[chrom].append((pos, line_t[3]))
return snp_phase_dict


Expand Down Expand Up @@ -261,7 +268,8 @@ def main(argv=None):
sys.exit(1)

features[name] = 1
chromosome[name] = line_t[0]
chrom = line_t[0]
chromosome[name] = chrom

ori[name] = line_t[6]
orientation = line_t[6]
Expand All @@ -275,10 +283,14 @@ def main(argv=None):
position[name].append(int(line_t[4]))

# Now go through the positions overlapped by the annotation and add in SNPs if appropriate
for i in range(int(line_t[3]), int(line_t[4])+1):
pos = line_t[0] + '|' + str(i)
if chrom not in snp_phase_dict:
continue
lower = bisect(snp_phase_dict[chrom], (int(line_t[3]), ))
upper = bisect(snp_phase_dict[chrom], (int(line_t[4])+1, ))
for i, snp in snp_phase_dict[chrom][lower:upper]:
pos = chrom + '|' + str(i)

if pos in snp_counts_dict and pos in snp_phase_dict:
if pos in snp_counts_dict:

# Count SNPs
if name in total_snps:
Expand All @@ -289,7 +301,7 @@ def main(argv=None):

# Get REF|ALT counts
# Parse the REF|ALT dict
refalt = snp_phase_dict[pos].split('|')
refalt = snp.split('|')

# Get pos/neg counts
ref_pos_counts = snp_counts_dict[pos][0][refalt[0]]
Expand Down Expand Up @@ -328,7 +340,7 @@ def main(argv=None):
# Add it to the total SNP arrays
if name in snp_array:
snp_array[name].append(
str(i) + ',' + str(snp_phase_dict[pos]) +
str(i) + ',' + str(snp) +
',' + str(ref_pos_counts) + '|' +
str(alt_pos_counts))
phased_snp_array.append(
Expand All @@ -340,7 +352,7 @@ def main(argv=None):
else:
snp_array[name] = []
snp_array[name].append(
str(i) + ',' + str(snp_phase_dict[pos]) +
str(i) + ',' + str(snp) +
',' + str(ref_pos_counts) + '|' +
str(alt_pos_counts))
phased_snp_array.append(
Expand Down Expand Up @@ -379,7 +391,7 @@ def main(argv=None):
# Add it to the total SNP array
if name in snp_array:
snp_array[name].append(
str(i) + ',' + str(snp_phase_dict[pos]) +
str(i) + ',' + str(snp) +
',' + str(ref_neg_counts) + '|' +
str(alt_neg_counts))
phased_snp_array.append(
Expand All @@ -391,7 +403,7 @@ def main(argv=None):
else:
snp_array[name] = []
snp_array[name].append(
str(i) + ',' + str(snp_phase_dict[pos]) +
str(i) + ',' + str(snp) +
',' + str(ref_neg_counts) + '|' +
str(alt_neg_counts))
phased_snp_array.append(
Expand Down Expand Up @@ -455,55 +467,44 @@ def main(argv=None):
# Add it to the total SNP array
if name in snp_array:
snp_array[name].append(
str(i) + ',' + str(snp_phase_dict[pos]) +
str(i) + ',' + str(snp) +
',' + str(int(tot_ref)) + '|' +
str(int(tot_alt)))
else:
snp_array[name] = []
snp_array[name].append(
str(i) + ',' + str(snp_phase_dict[pos]) + ',' +
str(i) + ',' + str(snp) + ',' +
str(int(tot_ref)) + '|' + str(int(tot_alt)))

# Print the output
keys = sorted(list(features.keys()))

with open(args.outfile, 'w') as outfile:
# Header
outfile.write('FEATURE\tCHROMOSOME\tORIENTATION\tSTART-STOP\t' +
'REFERENCE_COUNTS\tALT_COUNTS\tTOTAL_SNPS\tREF_BIASED\t' +
'ALT_BIASED\tREF-ALT_RATIO\tSNPS\n')

for key in keys:
# Get the ultimate 5'-3' positions
position[key].sort(key=int)
posit = str(position[key][0]) + '-' + str(position[key][-1])

if key in total_ref:
pos = key.split('|')
if ref_biased[key] >= alt_biased[key]:
if alt_biased[key] == 0:
rat = 1
else:
rat = ref_biased[key]/float(alt_biased[key]+ref_biased[key])
else:
if ref_biased[key] == 0:
rat = 1
else:
rat = alt_biased[key]/float(ref_biased[key]+alt_biased[key])
posits = {key:'{}-{}'.format(min(position[key]), max(position[key])) for key in keys}
snp_arrays = {key: ';'.join(snp_array[key]) for key in total_ref}
out = pd.DataFrame({
'CHROMOSOME': chromosome,
'ORIENTATION': ori,
'START-STOP': posits,
'REFERENCE_COUNTS': total_ref,
'ALT_COUNTS' : total_alt,
'TOTAL_SNPS': total_snps,
'REF_BIASED' : ref_biased,
'ALT_BIASED' : alt_biased,
'REF-ALT_RATIO': pd.np.nan,
'SNPS' : snp_arrays},
index=keys,
columns=['CHROMOSOME', 'ORIENTATION', 'START-STOP',
'REFERENCE_COUNTS', 'ALT_COUNTS', 'TOTAL_SNPS', 'REF_BIASED',
'ALT_BIASED', 'REF-ALT_RATIO', 'SNPS'])
out.index.name = 'FEATURE'
out['REF-ALT_RATIO'] = (out.ix[:, ['REF_BIASED', 'ALT_BIASED']].max(axis=1)
/ out.ix[:, ['REF_BIASED', 'ALT_BIASED']].sum(axis=1))
out.ix[(out.REF_BIASED == 0) & (out.ALT_BIASED == 0), 'REF-ALT_RATIO'] = 1
out.to_csv(outfile, sep='\t', na_rep='NA', float_format='%.16g')


snp_array_out = ';'.join(snp_array[key])

outfile.write('\t'.join(
[str(i) for i in [key, chromosome[key], ori[key], posit,
total_ref[key], total_alt[key],
total_snps[key], ref_biased[key],
alt_biased[key], rat, snp_array_out]]) +
'\n')
# No counts for this feature
else:
outfile.write('\t'.join(
[str(i) for i in [key, chromosome[key], ori[key], posit,
'NA\tNA\tNA\tNA\tNA\tNA\tNA\n']]))

if args.write is True:
with open(args.outfile + '.snps.txt', 'w') as outfile:
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@

keywords='ASE allele-specific expression RNA-seq fastq BAM SAM SNP',

install_requires=['pybedtools', 'pysam'],
install_requires=['pybedtools', 'pysam', 'pandas'],
scripts=scpts,
packages=['ASEr']

Expand Down