1717
1818abortedFlag = None
1919
20- def parseSitemap (url , retVal = None ):
20+ def parseSitemap (url , retVal = None , visited = None ):
2121 global abortedFlag
2222
2323 if retVal is not None :
@@ -27,25 +27,41 @@ def parseSitemap(url, retVal=None):
2727 if retVal is None :
2828 abortedFlag = False
2929 retVal = OrderedSet ()
30+ visited = set ()
31+
32+ if url in visited :
33+ return retVal
34+
35+ visited .add (url )
3036
3137 try :
3238 content = Request .getPage (url = url , raise404 = True )[0 ] if not abortedFlag else ""
3339 except _http_client .InvalidURL :
3440 errMsg = "invalid URL given for sitemap ('%s')" % url
3541 raise SqlmapSyntaxException (errMsg )
3642
37- for match in re .finditer (r"<loc>\s*([^<]+)" , content or "" ):
38- if abortedFlag :
39- break
40- url = match .group (1 ).strip ()
41- if url .endswith (".xml" ) and "sitemap" in url .lower ():
42- if kb .followSitemapRecursion is None :
43- message = "sitemap recursion detected. Do you want to follow? [y/N] "
44- kb .followSitemapRecursion = readInput (message , default = 'N' , boolean = True )
45- if kb .followSitemapRecursion :
46- parseSitemap (url , retVal )
47- else :
48- retVal .add (url )
43+ if content :
44+ content = re .sub (r"" , "" , content , flags = re .DOTALL )
45+
46+ for match in re .finditer (r"<\w*?loc[^>]*>\s*([^<]+)" , content , re .I ):
47+ if abortedFlag :
48+ break
49+
50+ foundUrl = match .group (1 ).strip ()
51+
52+ # Basic validation to avoid junk
53+ if not foundUrl .startswith ("http" ):
54+ continue
55+
56+ if foundUrl .endswith (".xml" ) and "sitemap" in foundUrl .lower ():
57+ if kb .followSitemapRecursion is None :
58+ message = "sitemap recursion detected. Do you want to follow? [y/N] "
59+ kb .followSitemapRecursion = readInput (message , default = 'N' , boolean = True )
60+
61+ if kb .followSitemapRecursion :
62+ parseSitemap (foundUrl , retVal , visited )
63+ else :
64+ retVal .add (foundUrl )
4965
5066 except KeyboardInterrupt :
5167 abortedFlag = True
0 commit comments