Skip to content

Commit 60d8f75

Browse files
committed
Just adding all possible files for release. These will be pruned later.
1 parent db618cc commit 60d8f75

36 files changed

+2109
-593
lines changed

AnnotateWithTRF.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
#!/usr/bin/env python
2+
3+
import argparse
4+
import sys
5+
import intervaltree as itree
6+
7+
ap = argparse.ArgumentParser(description="Annotate a gap bed file with an associated TRF table.")
8+
ap.add_argument("trf", help="Input TRF table file.")
9+
ap.add_argument("bed_out", help="Output BED file.")
10+
args = ap.parse_args()
11+
12+
trfFile = open(args.trf, 'r')
13+
prevSeqName = ""
14+
15+
annotations = {}
16+
for line in trfFile:
17+
if (line[0] == '@'):
18+
seqName = line[1:].strip()
19+
else:
20+
vals = line.split()
21+
if (seqName not in annotations):
22+
annotations[seqName] = itree.IntervalTree()
23+
annotations[seqName].addi(int(vals[0]), int(vals[1]))
24+
25+
26+
27+
28+
bedFile = open(args.bed, 'r')
29+
bedOutFile = open(args.bed_out, 'w')
30+
h = {}
31+
for line in bedFile:
32+
if line[0] == "#":
33+
header = line.split()
34+
h={header[i] : i for i in range(0,len(header))}
35+
bedOutFile.write("nTR\tfracTR\n")
36+
continue
37+
vals = line.split()
38+
seqTitle = '/'.join(vals[0:3])
39+
totalTR = 0
40+
if (seqTitle in annotations):
41+
seq = vals[h["svSeq"]]
42+
i = 0
43+
annotations[seqTitle].merge_overlaps()
44+
45+
for intv in annotations[seqTitle]:
46+
totalTR += intv[1] - intv[0]
47+
seq = seq[:intv[0]] + seq[intv[0]:intv[1]].lower() + seq[intv[1]:]
48+
vals[5] = seq
49+
50+
vals= [str(totalTR), "{:2.2f}".format(float(totalTR)/len(seq))]
51+
else:
52+
vals=["0","0"]
53+
54+
bedOutFile.write('\t'.join(vals) + "\n")
55+

BamToCoverage.cpp

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
#include <iostream>
2+
#include <vector>
3+
#include <string>
4+
#include <sstream>
5+
#include "htslib/sam.h"
6+
#include <cmath>
7+
8+
using namespace std;
9+
10+
int main(int argc, char* argv[]) {
11+
string filename = argv[1];
12+
13+
htsFile *htsfp;
14+
bam_hdr_t *samHeader;
15+
16+
17+
htsfp = hts_open(filename.c_str(),"r");
18+
const htsFormat *fmt = hts_get_format(htsfp);
19+
20+
21+
samHeader = sam_hdr_read(htsfp);
22+
23+
24+
vector<vector<int> > covBins;
25+
for (int i =0; i < samHeader->n_targets; i++) {
26+
covBins.push_back(vector<int>() );
27+
int last=covBins.size()-1;
28+
covBins[last].resize(samHeader->target_len[i]/100+1);
29+
}
30+
31+
32+
bam1_t *b = bam_init1();
33+
int res=1;
34+
res= sam_read1(htsfp, samHeader, b);
35+
long readIndex=0;
36+
long nCounted=0;
37+
while (res > 0) {
38+
readIndex+=1;
39+
long alnPos = b->core.pos;
40+
int tid=b->core.tid;
41+
if (alnPos >= 0) {
42+
covBins[tid][alnPos/100]+=1;
43+
nCounted+=1;
44+
uint8_t *xaData = bam_aux_get(b, "XA");
45+
if (xaData != 0) {
46+
char *xaString=bam_aux2Z(xaData);
47+
stringstream auxStrm((char*)xaString);
48+
string aln;
49+
int index=0;
50+
while(std::getline(auxStrm, aln, ';')) {
51+
52+
size_t pos=0;
53+
while ((pos=aln.find(',',pos)) != string::npos) {
54+
aln[pos] = '\t';
55+
}
56+
stringstream elemStrm(aln);
57+
string token;
58+
int ap=0;
59+
string chrom;
60+
long signedPos;
61+
string cigar;
62+
int mapq;
63+
elemStrm >> chrom >> signedPos >> cigar >> mapq;
64+
int tid =sam_hdr_name2tid(samHeader, chrom.c_str());
65+
covBins[tid][abs(signedPos)/100]+=1;
66+
nCounted+=1;
67+
index+=1;
68+
}
69+
}
70+
}
71+
res = sam_read1(htsfp, samHeader, b);
72+
if (readIndex %1000000 == 0) {
73+
cerr << "proc " << readIndex /1000000 << "M\t" << nCounted << endl;
74+
}
75+
}
76+
77+
for (int i=0; i < covBins.size(); i++) {
78+
string name=samHeader->target_name[i];
79+
for (int j=0; j < covBins[i].size(); j++) {
80+
if (covBins[i][j] > 0) {
81+
cout << name << "\t" << j*100 << "\t" << (j+1)*100 << "\t" << covBins[i][j] << endl;
82+
}
83+
}
84+
}
85+
}

BamToMask.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
#include <iostream>
2+
#include <vector>
3+
#include <string>
4+
#include <sstream>
5+
#include "htslib/sam.h"
6+
#include <cmath>
7+
8+
using namespace std;
9+
10+
int main(int argc, char* argv[]) {
11+
string filename = argv[1];
12+
13+
htsFile *htsfp;
14+
bam_hdr_t *samHeader;
15+
16+
17+
htsfp = hts_open(filename.c_str(),"r");
18+
const htsFormat *fmt = hts_get_format(htsfp);
19+
20+
21+
samHeader = sam_hdr_read(htsfp);
22+
23+
24+
vector<vector<int> > covBins;
25+
for (int i =0; i < samHeader->n_targets; i++) {
26+
covBins.push_back(vector<int>() );
27+
int last=covBins.size()-1;
28+
covBins[last].resize(samHeader->target_len[i]/100+1);
29+
}
30+
31+
32+
bam1_t *b = bam_init1();
33+
int res=1;
34+
res= sam_read1(htsfp, samHeader, b);
35+
long readIndex=0;
36+
long nCounted=0;
37+
while (res > 0) {
38+
readIndex+=1;
39+
long alnPos = b->core.pos;
40+
int tid=b->core.tid;
41+
if (alnPos >= 0) {
42+
covBins[tid][alnPos/100]+=1;
43+
nCounted+=1;
44+
uint8_t *xaData = bam_aux_get(b, "XA");
45+
if (xaData != 0) {
46+
char *xaString=bam_aux2Z(xaData);
47+
stringstream auxStrm((char*)xaString);
48+
string aln;
49+
int index=0;
50+
int nAux=0;
51+
while(std::getline(auxStrm, aln, ';')) {
52+
nAux+=1;
53+
}
54+
if (nAux>= 20) {
55+
cout << samHeader->target_name[tid] << "\t" << alnPos << "\t" << alnPos + b->core.l_qseq << "\t" << nAux << endl;
56+
}
57+
}
58+
res = sam_read1(htsfp, samHeader, b);
59+
if (readIndex %1000000 == 0) {
60+
cerr << "proc " << readIndex /1000000 << "M\t" << nCounted << endl;
61+
}
62+
}
63+
}
64+
}

Bed12ToArcs.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#!/usr/bin/env python
2+
import sys
3+
dupGenes={}
4+
arcsFile=open(sys.argv[1],'w')
5+
namesFile=open(sys.argv[2],'w')
6+
for line in sys.stdin:
7+
vals=line.split()
8+
if vals[3] not in dupGenes:
9+
dupGenes[vals[3]] = []
10+
dupGenes[vals[3]].append(vals)
11+
12+
13+
for dupGene in dupGenes.keys():
14+
l=len(dupGenes[dupGene])
15+
if l > 1:
16+
for i in range(0,l-1):
17+
for j in range(i+1,l):
18+
vi=dupGenes[dupGene][i]
19+
vj=dupGenes[dupGene][j]
20+
21+
nExon=max(int(vi[9]), int(vj[9]))
22+
if vi[0] == vj[0]:
23+
if nExon == 1:
24+
col="paired-6-qual-1"
25+
else:
26+
col="paired-6-qual-2"
27+
else:
28+
if nExon == 1:
29+
col="paired-6-qual-3"
30+
else:
31+
col="paired-6-qual-4"
32+
arcsFile.write("var" + vi[0] + "\t" + vi[1] + "\t" + vi[2] + "\tvar" + vj[0] + "\t" + vj[1] + "\t" + vj[2]+ "\tcolor="+col + "\n")
33+
for i in range(0,l):
34+
vi=dupGenes[dupGene][i]
35+
namesFile.write("var" + vi[0] + "\t" + vi[1] + "\t" + vi[2] + "\t" + dupGene + "\n")
36+
37+

CollapsedDupsToName.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/usr/bin/env python
2+
3+
4+
import sys
5+
inFile=open(sys.argv[1])
6+
outFile=open(sys.argv[2], 'w')
7+
8+
for line in inFile:
9+
vals=line.split()
10+
vals[3] = vals[3].split("|")[5]
11+
outFile.write("\t".join(vals) + "\n")
12+

CombineMask.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,9 @@ int main(int argc, char* argv[]) {
5959
}
6060
}
6161
outFile << ">" << ks[0]->name.s << endl;
62-
int p=0;
62+
long p=0;
6363
while (p < ks[0]->seq.l) {
64-
int end = min(p+60, (int) ks[0]->seq.l);
64+
long int end = min(p+60, (long int) ks[0]->seq.l);
6565
string sub(&ks[0]->seq.s[p], end-p);
6666
outFile << sub << endl;
6767
p=end;

CombineMasked.py

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
fileList=sys.argv[1]
44
originalFai=sys.argv[2]
55
outputFile=sys.argv[3]
6+
chromoutFilename=sys.argv[4]
67

78
origFaiFile=open(originalFai)
89
origOrder = [ l.split()[0] for l in origFaiFile]
@@ -13,6 +14,8 @@
1314
ifl=open(fileList)
1415
inFiles=ifl.readlines()
1516

17+
18+
1619
for line in inFiles:
1720

1821

@@ -29,8 +32,9 @@
2932
chroms[pre][suf] = line.rstrip()
3033

3134
outFile=open(outputFile,'w')
32-
33-
35+
chromOutFile=open(chromoutFilename,'w')
36+
wroteChromOut=False
37+
chromOutSeq=""
3438
for origContig in origOrder:
3539
if origContig not in chroms:
3640
sys.stderr.write("ERROR: Missing contig: " + origContig + "\n")
@@ -39,8 +43,25 @@
3943
outFile.write(">" + pre + "\n")
4044
nParts=max(chroms[pre].keys())
4145
seq=""
46+
chromOutSeq=""
4247
for i in range(0,nParts+1):
43-
partFile=open(chroms[pre][i])
48+
fn=chroms[pre][i]
49+
partFile=open(fn)
50+
51+
outName=fn[:fn.rfind(".")] + ".out"
52+
cf=open(outName)
53+
outHeader = [cf.readline(), cf.readline(), cf.readline()]
54+
if (wroteChromOut is False):
55+
chromOutSeq="\n".join(outHeader)
56+
wroteChromOut = True
57+
58+
for out in cf:
59+
vals=out.split()
60+
vals[5] = str(int(vals[5]) + len(seq))
61+
vals[6] = str(int(vals[6]) + len(seq))
62+
chromOutSeq+="\t".join(vals) + "\n"
63+
64+
4465
partFile.readline()
4566
seq+="".join([l.strip() for l in partFile.readlines()])
4667

@@ -52,5 +73,6 @@
5273
chromSeq += "\n"
5374

5475
outFile.write(chromSeq)
76+
chromOutFile.write(chromOutSeq)
5577

5678

CountContig.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#include <iostream>
2+
#include <string>
3+
#include <vector>
4+
using namespace std;
5+
int main() {
6+
int nl=0, nh=0, other=0;
7+
vector<int> low(256,0), high(256,0);
8+
string prevTitle = "";
9+
int start, cur;
10+
char prev='Z';
11+
while (cin) {
12+
string line;
13+
cin >> line;
14+
if (line.size() > 0 and line[0] == '>') {
15+
cerr << "proc " << line << endl;
16+
if (prev == 'Z') {
17+
cout << cur-prev << endl;
18+
}
19+
cur=0;
20+
start=0;
21+
prev='A';
22+
}
23+
else {
24+
for (int i=0; i < line.size(); i++) {
25+
if ( line[i] == 'N' or line[i] == 'n') {
26+
if (prev != 'n') {
27+
cout << cur - start<< endl;
28+
}
29+
prev='n';
30+
}
31+
else {
32+
cur++;
33+
if (prev == 'n') {
34+
start = 0;
35+
cur=0;
36+
}
37+
prev=line[i];
38+
}
39+
}
40+
}
41+
}
42+
}
43+

CountMasked.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
#!/usr/bin/env python
2+
import sys
3+
from Bio import Seq
4+
from Bio import SeqIO
5+
from Bio import SeqRecord
6+
printCount=False
7+
if len(sys.argv) > 2:
8+
if sys.argv[2] == "--count":
9+
printCount=True
10+
11+
for rec in SeqIO.parse(sys.argv[1], "fasta"):
12+
n = rec.seq.count("N") + rec.seq.count("a") + rec.seq.count("c") + rec.seq.count("g") + rec.seq.count("t") + rec.seq.count("n")
13+
output =rec.id + "\t{:2.2f}".format(n/len(rec.seq))
14+
if printCount:
15+
output+= "\t" + str(n)
16+
print(output)

0 commit comments

Comments
 (0)