1- import re
21from collections import namedtuple
32
4- import markdown
3+ from urlextract import URLExtract
54
6- REGULAR_EXP = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
5+ LINKS = namedtuple ( "LINKS" , [ "line" , "urls" , "skip" ])
76
8- LINKS = namedtuple ( "LINKS" , [ "line" , "urls" , "skip" , "valid" ] )
7+ EXTRACTOR = URLExtract ( )
98
109
1110def parse_line (line ):
1211 """Parse links from line/string
1312
1413 Args:
15- string : data string
14+ line : data line
1615 Returns:
1716 list of links
1817 """
19- string = line .strip ()
20- html_format = markdown .markdown (string , output_format = "html" )
21- links = re .findall (REGULAR_EXP , html_format )
22-
23- # TODO: Improve regex to remove this workaround for trailing </p> or </li>
24- links = [
25- l .replace ("</p>" , "" ).replace ("</li>" , "" ).replace ("</a>" , "" ).replace (")" , "" )
26- for l in links
27- ]
18+ links = EXTRACTOR .find_urls (line )
2819 return links
2920
3021
@@ -41,37 +32,5 @@ def parse_file(file_path):
4132 line_links = parse_line (line )
4233 if line_links :
4334 skip = True if "noqa" in line else False
44- links .append (LINKS (line = line_number + 1 , urls = line_links , skip = skip , valid = False ))
35+ links .append (LINKS (line = line_number + 1 , urls = line_links , skip = skip ))
4536 return links
46-
47-
48- def link_validator (links_list ):
49- """Validate link
50- Args:
51- links_list: List of links.
52-
53- Return:
54- Named tuple of the valid and invalid links.
55- """
56- validated_list = []
57-
58- regex = re .compile (
59- r"^(?:http|ftp)s?://" # http:// or https://
60- r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"
61- # for domain
62- r"localhost|" # localhost...
63- r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip
64- r"(?::\d+)?" # optional port
65- r"(?:/?|[/?]\S+)$" ,
66- re .IGNORECASE ,
67- )
68-
69- for link in links_list :
70- urls = []
71- for i in link .urls :
72- if re .match (regex , i ):
73- urls .append (i )
74- else :
75- validated_list .append (LINKS (line = link .line , urls = [i ], valid = False , skip = True ))
76- validated_list .append (LINKS (line = link .line , urls = urls , skip = False , valid = True ))
77- return validated_list
0 commit comments