@@ -56,35 +56,44 @@ def extract(htmlstring,
5656 ', "ignore" or "strict"' )
5757 try :
5858 tree = parse_xmldom_html (htmlstring , encoding = encoding )
59- processors = []
60- if 'microdata' in syntaxes :
61- processors .append (
62- ('microdata' , MicrodataExtractor (
63- add_html_node = return_html_node ).extract_items , tree ))
64- if 'json-ld' in syntaxes :
65- processors .append (('json-ld' , JsonLdExtractor ().extract_items ,
66- tree ))
67- if 'opengraph' in syntaxes :
68- processors .append (('opengraph' , OpenGraphExtractor ().extract_items ,
69- tree ))
70- if 'microformat' in syntaxes :
71- processors .append (
72- ('microformat' , MicroformatExtractor ().extract_items ,
73- htmlstring ))
74- if 'rdfa' in syntaxes :
75- processors .append (('rdfa' , RDFaExtractor ().extract_items , tree ))
76- output = {}
77- for label , extract , document in processors :
78- try :
79- output [label ] = list (extract (document , base_url = base_url ))
80- except Exception :
81- if errors == 'log' :
82- logger .exception ('Failed to extract {}' .format (label ))
83- if errors == 'ignore' :
84- pass
85- if errors == 'strict' :
86- raise
87-
59+ except Exception as e :
60+ if errors == 'ignore' :
61+ return {}
62+ if errors == 'log' :
63+ logger .exception (
64+ 'Failed to parse html, exception raised {}' .format (e ))
65+ return {}
66+ if errors == 'strict' :
67+ raise e
68+ processors = []
69+ if 'microdata' in syntaxes :
70+ processors .append (
71+ ('microdata' , MicrodataExtractor (
72+ add_html_node = return_html_node ).extract_items , tree ))
73+ if 'json-ld' in syntaxes :
74+ processors .append (('json-ld' , JsonLdExtractor ().extract_items ,
75+ tree ))
76+ if 'opengraph' in syntaxes :
77+ processors .append (('opengraph' , OpenGraphExtractor ().extract_items ,
78+ tree ))
79+ if 'microformat' in syntaxes :
80+ processors .append (
81+ ('microformat' , MicroformatExtractor ().extract_items ,
82+ htmlstring ))
83+ if 'rdfa' in syntaxes :
84+ processors .append (('rdfa' , RDFaExtractor ().extract_items , tree ))
85+ output = {}
86+ for label , extract , document in processors :
87+ try :
88+ output [label ] = list (extract (document , base_url = base_url ))
89+ except Exception :
90+ if errors == 'log' :
91+ logger .exception ('Failed to extract {}' .format (label ))
92+ if errors == 'ignore' :
93+ pass
94+ if errors == 'strict' :
95+ raise
96+ try :
8897 if uniform :
8998 if 'microdata' in syntaxes :
9099 output ['microdata' ] = _umicrodata_microformat (
@@ -98,5 +107,11 @@ def extract(htmlstring,
98107 except Exception as e :
99108 if errors == 'ignore' :
100109 return {}
101- raise e
110+ if errors == 'log' :
111+ logger .exception (
112+ 'Failed to uniform extracted, exception raised {}' .format (e ))
113+ return {}
114+ if errors == 'strict' :
115+ raise e
116+
102117 return output
0 commit comments