55except ImportError :
66 import simplejson as json
77
8- from html5lib import html5parser , sanitizer , constants
8+ from html5lib import html5parser , sanitizer , constants , treebuilders
99
1010
11- def runSanitizerTest (name , expected , input ):
12- expected = '' .join ([token .toxml () for token in html5parser .HTMLParser ().
13- parseFragment (expected ).childNodes ])
def toxmlFactory():
    """Build a serializer that renders one parsed element as XML text.

    The "etree" tree builder is looked up once, when the factory is called,
    and the returned callable closes over it.

    Returns:
        A one-argument callable that passes ``element`` to the builder's
        ``implementation.tostring`` with UTF-8 encoding and returns the
        result decoded from UTF-8.
    """
    etree_builder = treebuilders.getTreeBuilder("etree")

    def toxml(element):
        # Serialize to UTF-8 and decode straight back: this encode/decode
        # roundtrip is required for Python 2.6 compatibility.
        return etree_builder.implementation.tostring(
            element, encoding="utf-8").decode("utf-8")

    return toxml
20+
21+
def runSanitizerTest(name, expected, input, toxml=None):
    """Assert that sanitizing ``input`` produces the markup in ``expected``.

    Args:
        name: Label of the generated test case; not used in the body.
        expected: Markup fragment the sanitized output must match. It is
            parsed with a plain ``HTMLParser`` and re-serialized so that it
            is compared in serialized form rather than as written.
        input: Markup fragment handed to ``sanitize_html``.
        toxml: Optional callable turning a parsed node into XML text.
            Defaults to a serializer built by ``toxmlFactory()``.

    Raises:
        AssertionError: If the two serializations differ.
    """
    if toxml is None:
        toxml = toxmlFactory()
    fragment = html5parser.HTMLParser().parseFragment(expected)
    expected = ''.join([toxml(node) for node in fragment])
    # JSON roundtrip kept from the original; presumably it normalizes the
    # string type across Python versions -- TODO confirm.
    expected = json.loads(json.dumps(expected))
    # Forward the serializer so both sides of the comparison are rendered by
    # the same callable; previously a caller-supplied ``toxml`` only affected
    # the expected side while ``sanitize_html`` built its own default.
    assert expected == sanitize_html(input, toxml)
1629
1730
def sanitize_html(stream, toxml=None):
    """Parse ``stream`` through the sanitizing tokenizer and serialize it.

    Args:
        stream: Markup fragment to sanitize.
        toxml: Optional callable turning a parsed node into XML text.
            Defaults to a serializer built by ``toxmlFactory()``.

    Returns:
        The serializations of every node yielded by the parsed fragment,
        concatenated in order into one string.
    """
    serialize = toxmlFactory() if toxml is None else toxml
    sanitizing_parser = html5parser.HTMLParser(tokenizer=sanitizer.HTMLSanitizer)
    fragment = sanitizing_parser.parseFragment(stream)
    return ''.join(map(serialize, fragment))
2237
2338
def test_should_handle_astral_plane_characters():
    """Check that characters outside the BMP survive sanitization intact."""
    # The expected literal is written without the stray spaces before ``>``
    # and before the closing tag: the input paragraph contains only the two
    # characters separated by one space, so the serialized element cannot
    # carry extra whitespace there.
    expected = ('<html:p xmlns:html="http://www.w3.org/1999/xhtml">'
                '\U0001d4b5 \U0001d538</html:p>')
    # Input uses the same \U escapes as the expected side so the source line
    # stays ASCII-only; the runtime value is the same two astral characters.
    assert expected == sanitize_html("<p>\U0001d4b5 \U0001d538</p>")
2641
2742
2843def test_sanitizer ():
44+ toxml = toxmlFactory ()
2945 for tag_name in sanitizer .HTMLSanitizer .allowed_elements :
3046 if tag_name in ['caption' , 'col' , 'colgroup' , 'optgroup' , 'option' , 'table' , 'tbody' , 'td' , 'tfoot' , 'th' , 'thead' , 'tr' ]:
3147 continue # TODO
@@ -34,25 +50,30 @@ def test_sanitizer():
3450 if tag_name == 'image' :
3551 yield (runSanitizerTest , "test_should_allow_%s_tag" % tag_name ,
3652 "<img title=\" 1\" />foo <bad>bar</bad> baz" ,
37- "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name , tag_name ))
53+ "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name , tag_name ),
54+ toxml )
3855 elif tag_name == 'br' :
3956 yield (runSanitizerTest , "test_should_allow_%s_tag" % tag_name ,
4057 "<br title=\" 1\" />foo <bad>bar</bad> baz<br/>" ,
41- "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name , tag_name ))
58+ "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name , tag_name ),
59+ toxml )
4260 elif tag_name in constants .voidElements :
4361 yield (runSanitizerTest , "test_should_allow_%s_tag" % tag_name ,
4462 "<%s title=\" 1\" />foo <bad>bar</bad> baz" % tag_name ,
45- "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name , tag_name ))
63+ "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name , tag_name ),
64+ toxml )
4665 else :
4766 yield (runSanitizerTest , "test_should_allow_%s_tag" % tag_name ,
4867 "<%s title=\" 1\" >foo <bad>bar</bad> baz</%s>" % (tag_name , tag_name ),
49- "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name , tag_name ))
68+ "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name , tag_name ),
69+ toxml )
5070
5171 for tag_name in sanitizer .HTMLSanitizer .allowed_elements :
5272 tag_name = tag_name .upper ()
5373 yield (runSanitizerTest , "test_should_forbid_%s_tag" % tag_name ,
5474 "<%s title=\" 1\" >foo <bad>bar</bad> baz</%s>" % (tag_name , tag_name ),
55- "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name , tag_name ))
75+ "<%s title='1'>foo <bad>bar</bad> baz</%s>" % (tag_name , tag_name ),
76+ toxml )
5677
5778 for attribute_name in sanitizer .HTMLSanitizer .allowed_attributes :
5879 if attribute_name != attribute_name .lower ():
@@ -61,20 +82,24 @@ def test_sanitizer():
6182 continue
6283 yield (runSanitizerTest , "test_should_allow_%s_attribute" % attribute_name ,
6384 "<p %s=\" foo\" >foo <bad>bar</bad> baz</p>" % attribute_name ,
64- "<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name )
85+ "<p %s='foo'>foo <bad>bar</bad> baz</p>" % attribute_name ,
86+ toxml )
6587
6688 for attribute_name in sanitizer .HTMLSanitizer .allowed_attributes :
6789 attribute_name = attribute_name .upper ()
6890 yield (runSanitizerTest , "test_should_forbid_%s_attribute" % attribute_name ,
6991 "<p>foo <bad>bar</bad> baz</p>" ,
70- "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name )
92+ "<p %s='display: none;'>foo <bad>bar</bad> baz</p>" % attribute_name ,
93+ toxml )
7194
7295 for protocol in sanitizer .HTMLSanitizer .allowed_protocols :
7396 yield (runSanitizerTest , "test_should_allow_%s_uris" % protocol ,
7497 "<a href=\" %s\" >foo</a>" % protocol ,
75- """<a href="%s">foo</a>""" % protocol )
98+ """<a href="%s">foo</a>""" % protocol ,
99+ toxml )
76100
77101 for protocol in sanitizer .HTMLSanitizer .allowed_protocols :
78102 yield (runSanitizerTest , "test_should_allow_uppercase_%s_uris" % protocol ,
79103 "<a href=\" %s\" >foo</a>" % protocol ,
80- """<a href="%s">foo</a>""" % protocol )
104+ """<a href="%s">foo</a>""" % protocol ,
105+ toxml )
0 commit comments