@@ -104,55 +104,58 @@ def rev_comp(string):
104104 bases = [complement [base ] for base in bases ]
105105 bases .reverse ()
106106 return '' .join (bases )
107-
108- in_file = inputGFF
109- in_handle = open (in_file )
110- #gene_locations = {}
107+
111108 try :
112- parsed_gff = GFF .parse (in_handle )
109+ with open (inputGFF ) as in_handle :
110+ _ = next (GFF .parse (in_handle ))
113111 except :
114112 print ('Parsing of GFF failed. This is probably because your biopython version is too new. Try downgrading to 1.76 or older' )
115- for rec in GFF .parse (in_handle ):
116- tmp = []
117- for r in rec .features :
118- if "minced" in r .qualifiers ['source' ][0 ] or "Minced" in r .qualifiers ['source' ][0 ]:
119- # This catches CRISPR repeats.
120- continue
121- if r .sub_features :
122- prodigal_bool = 'Prodigal' in r .sub_features [0 ].qualifiers ['source' ][0 ] or 'prodigal' in r .sub_features [0 ].qualifiers ['source' ][0 ]
123- else :
124- prodigal_bool = 'Prodigal' in r .qualifiers ['source' ][0 ] or 'prodigal' in r .qualifiers ['source' ][0 ]
125-
126- if prodigal_bool :
127- # Prokka not only finds protein sequences, but also t-/r-RNA sequences. In order to only parse protein coding sequences,
128- # I search for Prodigal/Prodigal in the source entry of the sub_features attribute.
129-
130- # the sub_features attribute of a seq_record object is apparently deprecated. I couldn't find any other way to access
131- # the required information, though. Should probably be fixed when I can.
132- indices = str (r .location ).split ('[' )[1 ].split (']' )[0 ].split (':' )
133- indices = [int (x ) for x in indices ]
134- sense = str (r .location ).split ('(' )[1 ].split (')' )[0 ]
135- if sense == "-" :
136- gene_seq = rev_comp (rec .seq [indices [0 ]:indices [1 ]])
137- else :
138- gene_seq = rec .seq [indices [0 ]:indices [1 ]]
113+ sys .exit (1 )
139114
140- if (str (gene_seq [0 :3 ]) == "ATG" or str (gene_seq [0 :3 ]) == "GTG" or str (gene_seq [0 :3 ]) == "TTG" ):
141- pass
115+ with open (inputGFF ) as in_handle :
116+
117+ for rec in GFF .parse (in_handle ):
118+ tmp = []
119+ for r in rec .features :
120+ if "minced" in r .qualifiers ['source' ][0 ] or "Minced" in r .qualifiers ['source' ][0 ]:
121+ # This catches CRISPR repeats.
122+ continue
123+ if r .sub_features :
124+ prodigal_bool = 'Prodigal' in r .sub_features [0 ].qualifiers ['source' ][0 ] or 'prodigal' in r .sub_features [0 ].qualifiers ['source' ][0 ]
142125 else :
143- warnings .warn (str (r .id ) + " doesn't start with a common start codon. Beware. Continuing." )
126+ prodigal_bool = 'Prodigal' in r .qualifiers ['source' ][0 ] or 'prodigal' in r .qualifiers ['source' ][0 ]
127+
128+ if prodigal_bool :
129+ # Prokka not only finds protein sequences, but also t-/r-RNA sequences. In order to only parse protein coding sequences,
130+ # I search for Prodigal/Prodigal in the source entry of the sub_features attribute.
131+
132+ # the sub_features attribute of a seq_record object is apparently deprecated. I couldn't find any other way to access
133+ # the required information, though. Should probably be fixed when I can.
134+ indices = str (r .location ).split ('[' )[1 ].split (']' )[0 ].split (':' )
135+ indices = [int (x ) for x in indices ]
136+ sense = str (r .location ).split ('(' )[1 ].split (')' )[0 ]
137+ if sense == "-" :
138+ gene_seq = rev_comp (rec .seq [indices [0 ]:indices [1 ]])
139+ else :
140+ gene_seq = rec .seq [indices [0 ]:indices [1 ]]
141+
142+ if (str (gene_seq [0 :3 ]) == "ATG" or str (gene_seq [0 :3 ]) == "GTG" or str (gene_seq [0 :3 ]) == "TTG" ):
143+ pass
144+ else :
145+ warnings .warn (str (r .id ) + " doesn't start with a common start codon. Beware. Continuing." )
146+
147+ if (str (gene_seq [- 3 :]) == "TAG" or str (gene_seq [- 3 :]) == "TAA" or str (gene_seq [- 3 :]) == "TGA" ):
148+ pass
149+ else :
150+ warnings .warn (str (r .id ) + " doesn't stop with a usual stop codon. Beware. Continuing." )
151+ tmp .append ((indices , sense ))
152+
153+ if str (rec .id ) in self .contigs :
154+ self .contigs [str (rec .id )].annotations .append (tmp )
155+ else :
156+ warnings .warn (str (rec .id ) + " is not tracked by the BAMFile." )
144157
145- if (str (gene_seq [- 3 :]) == "TAG" or str (gene_seq [- 3 :]) == "TAA" or str (gene_seq [- 3 :]) == "TGA" ):
146- pass
147- else :
148- warnings .warn (str (r .id ) + " doesn't stop with a usual stop codon. Beware. Continuing." )
149- tmp .append ((indices , sense ))
150-
151- if str (rec .id ) in self .contigs :
152- self .contigs [str (rec .id )].annotations .append (tmp )
153- else :
154- warnings .warn (str (rec .id ) + " is not tracked by the BAMFile." )
155- in_handle .close ()
158+
156159
157160
158161 def parallel_reference_free_consensus (self ,ncores = 4 ,** kwargs ):
0 commit comments