@@ -76,14 +76,12 @@ def htmlentityreplace_errors(exc):
7676
7777 del register_error
7878
79- def encode (text , encoding ):
80- return text .encode (encoding , unicode_encode_errors )
8179
8280class HTMLSerializer (object ):
8381
8482 # attribute quoting options
8583 quote_attr_values = False
86- quote_char = '"'
84+ quote_char = u'"'
8785 use_best_quote_char = True
8886
8987 # tag syntax options
@@ -159,7 +157,22 @@ def __init__(self, **kwargs):
159157 self .errors = []
160158 self .strict = False
161159
160+ def encode (self , string ):
161+ assert (isinstance (string , unicode ))
162+ if self .encoding :
163+ return string .encode (self .encoding , unicode_encode_errors )
164+ else :
165+ return string
166+
167+ def encodeStrict (self , string ):
168+ assert (isinstance (string , unicode ))
169+ if self .encoding :
170+ return string .encode (self .encoding , "strict" )
171+ else :
172+ return string
173+
162174 def serialize (self , treewalker , encoding = None ):
175+ self .encoding = encoding
163176 in_cdata = False
164177 self .errors = []
165178 if encoding and self .inject_meta_charset :
@@ -195,27 +208,19 @@ def serialize(self, treewalker, encoding=None):
195208 doctype += u" %s%s%s" % (quote_char , token ["systemId" ], quote_char )
196209
197210 doctype += u">"
198-
199- if encoding :
200- yield doctype .encode (encoding )
201- else :
202- yield doctype
211+ yield self .encodeStrict (doctype )
203212
204213 elif type in ("Characters" , "SpaceCharacters" ):
205214 if type == "SpaceCharacters" or in_cdata :
206215 if in_cdata and token ["data" ].find ("</" ) >= 0 :
207216 self .serializeError (_ ("Unexpected </ in CDATA" ))
208- if encoding :
209- yield token ["data" ].encode (encoding , "strict" )
210- else :
211- yield token ["data" ]
212- elif encoding :
213- yield encode (escape (token ["data" ]), encoding )
217+ yield self .encode (token ["data" ])
214218 else :
215- yield escape (token ["data" ])
219+ yield self . encode ( escape (token ["data" ]) )
216220
217221 elif type in ("StartTag" , "EmptyTag" ):
218222 name = token ["name" ]
223+ yield self .encodeStrict (u"<%s" % name )
219224 if name in rcdataElements and not self .escape_rcdata :
220225 in_cdata = True
221226 elif in_cdata :
@@ -225,69 +230,56 @@ def serialize(self, treewalker, encoding=None):
225230 #TODO: Add namespace support here
226231 k = attr_name
227232 v = attr_value
228- if encoding :
229- k = k .encode (encoding , "strict" )
230- attributes .append (' ' )
233+ yield self .encodeStrict (u' ' )
231234
232- attributes . append (k )
235+ yield self . encodeStrict (k )
233236 if not self .minimize_boolean_attributes or \
234237 (k not in booleanAttributes .get (name , tuple ()) \
235238 and k not in booleanAttributes .get ("" , tuple ())):
236- attributes . append ( "=" )
239+ yield self . encodeStrict ( u"=" )
237240 if self .quote_attr_values or not v :
238241 quote_attr = True
239242 else :
240243 quote_attr = reduce (lambda x ,y : x or (y in v ),
241- spaceCharacters + ">\" '=" , False )
242- v = v .replace ("&" , "&amp;" )
243- if self .escape_lt_in_attrs : v = v .replace ("<" , "&lt;" )
244- if encoding :
245- v = encode (v , encoding )
244+ spaceCharacters + u">\" '=" , False )
245+ v = v .replace (u"&" , u"&amp;" )
246+ if self .escape_lt_in_attrs : v = v .replace (u"<" , u"&lt;" )
246247 if quote_attr :
247248 quote_char = self .quote_char
248249 if self .use_best_quote_char :
249- if "'" in v and '"' not in v :
250- quote_char = '"'
251- elif '"' in v and "'" not in v :
252- quote_char = "'"
253- if quote_char == "'" :
254- v = v .replace ("'" , "&#39;" )
250+ if u"'" in v and u'"' not in v :
251+ quote_char = u'"'
252+ elif u'"' in v and u"'" not in v :
253+ quote_char = u"'"
254+ if quote_char == u"'" :
255+ v = v .replace (u"'" , u"&#39;" )
255256 else :
256- v = v .replace ('"' , "&quot;" )
257- attributes . append (quote_char )
258- attributes . append (v )
259- attributes . append (quote_char )
257+ v = v .replace (u'"' , u"&quot;" )
258+ yield self . encodeStrict (quote_char )
259+ yield self . encode (v )
260+ yield self . encodeStrict (quote_char )
260261 else :
261- attributes . append (v )
262+ yield self . encode (v )
262263 if name in voidElements and self .use_trailing_solidus :
263264 if self .space_before_trailing_solidus :
264- attributes . append ( " /" )
265+ yield self . encodeStrict ( u" /" )
265266 else :
266- attributes .append ("/" )
267- if encoding :
268- yield "<%s%s>" % (name .encode (encoding , "strict" ), "" .join (attributes ))
269- else :
270- yield u"<%s%s>" % (name , u"" .join (attributes ))
267+ yield self .encodeStrict (u"/" )
268+ yield self .encode (u">" )
271269
272270 elif type == "EndTag" :
273271 name = token ["name" ]
274272 if name in rcdataElements :
275273 in_cdata = False
276274 elif in_cdata :
277275 self .serializeError (_ ("Unexpected child element of a CDATA element" ))
278- end_tag = u"</%s>" % name
279- if encoding :
280- end_tag = end_tag .encode (encoding , "strict" )
281- yield end_tag
276+ yield self .encodeStrict (u"</%s>" % name )
282277
283278 elif type == "Comment" :
284279 data = token ["data" ]
285280 if data .find ("--" ) >= 0 :
286281 self .serializeError (_ ("Comment contains --" ))
287- comment = u"<!--%s-->" % token ["data" ]
288- if encoding :
289- comment = comment .encode (encoding , unicode_encode_errors )
290- yield comment
282+ yield self .encodeStrict (u"<!--%s-->" % token ["data" ])
291283
292284 elif type == "Entity" :
293285 name = token ["name" ]
@@ -298,9 +290,7 @@ def serialize(self, treewalker, encoding=None):
298290 data = entities [key ]
299291 else :
300292 data = u"&%s;" % name
301- if encoding :
302- data = data .encode (encoding , unicode_encode_errors )
303- yield data
293+ yield self .encodeStrict (data )
304294
305295 else :
306296 self .serializeError (token ["data" ])
0 commit comments