Skip to content

Commit 094ee84

Browse files
committed
Fix link parsing
1 parent 3a0119c commit 094ee84

File tree

5 files changed: +28 additions, -60 deletions (lines changed)

5 files changed: +28 additions, -60 deletions (lines changed)

linkstatus/linkstatus.py

Lines changed: 3 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66
import pkg_resources
77
import requests
88

9-
from linkstatus.parser import link_validator
109
from linkstatus.parser import parse_file
1110

1211
CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
@@ -28,12 +27,13 @@ def link_status(link, timeout=5):
2827
status_code = requests.get(link, headers=headers, timeout=timeout).status_code
2928
except requests.exceptions.SSLError:
3029
status_code = requests.get(link, verify=False, headers=headers, timeout=timeout).status_code
30+
except requests.exceptions.MissingSchema:
31+
status_code = "Schema missing try with http/https"
3132
except Exception: # noqa
3233
# TODO: include exception in logging
3334
status_code = None
34-
pass
3535

36-
return status_code == 200, status_code
36+
return status_code == requests.codes.ok, status_code
3737

3838

3939
def all_files(source, recursive=False):
@@ -80,7 +80,6 @@ def main(source, recursive, timeout, retry):
8080

8181
for f in files:
8282
links = parse_file(f)
83-
links = link_validator(links)
8483
if links:
8584
click.echo(click.style("Links in File: '{}'".format(f), bg="blue", fg="white"))
8685

linkstatus/parser.py

Lines changed: 6 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -1,30 +1,21 @@
1-
import re
21
from collections import namedtuple
32

4-
import markdown
3+
from urlextract import URLExtract
54

6-
REGULAR_EXP = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
5+
LINKS = namedtuple("LINKS", ["line", "urls", "skip"])
76

8-
LINKS = namedtuple("LINKS", ["line", "urls", "skip", "valid"])
7+
EXTRACTOR = URLExtract()
98

109

1110
def parse_line(line):
1211
"""Parse links from line/string
1312
1413
Args:
15-
string: data string
14+
line: data line
1615
Returns:
1716
list of links
1817
"""
19-
string = line.strip()
20-
html_format = markdown.markdown(string, output_format="html")
21-
links = re.findall(REGULAR_EXP, html_format)
22-
23-
# TODO: Improve regex to remove this workaround for trailing </p> or </li>
24-
links = [
25-
l.replace("</p>", "").replace("</li>", "").replace("</a>", "").replace(")", "")
26-
for l in links
27-
]
18+
links = EXTRACTOR.find_urls(line)
2819
return links
2920

3021

@@ -41,37 +32,5 @@ def parse_file(file_path):
4132
line_links = parse_line(line)
4233
if line_links:
4334
skip = True if "noqa" in line else False
44-
links.append(LINKS(line=line_number + 1, urls=line_links, skip=skip, valid=False))
35+
links.append(LINKS(line=line_number + 1, urls=line_links, skip=skip))
4536
return links
46-
47-
48-
def link_validator(links_list):
49-
"""Validate link
50-
Args:
51-
links_list: List of links.
52-
53-
Return:
54-
Named tuple of the valid and invalid links.
55-
"""
56-
validated_list = []
57-
58-
regex = re.compile(
59-
r"^(?:http|ftp)s?://" # http:// or https://
60-
r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"
61-
# for domain
62-
r"localhost|" # localhost...
63-
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # ...or ip
64-
r"(?::\d+)?" # optional port
65-
r"(?:/?|[/?]\S+)$",
66-
re.IGNORECASE,
67-
)
68-
69-
for link in links_list:
70-
urls = []
71-
for i in link.urls:
72-
if re.match(regex, i):
73-
urls.append(i)
74-
else:
75-
validated_list.append(LINKS(line=link.line, urls=[i], valid=False, skip=True))
76-
validated_list.append(LINKS(line=link.line, urls=urls, skip=False, valid=True))
77-
return validated_list

setup.cfg

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,8 +35,8 @@ zip_safe = False
3535
setup_requires = setuptools_scm
3636
install_requires =
3737
click
38-
markdown
3938
requests
39+
urlextract
4040
include_package_data = True
4141
python_requires = >=3.6
4242

tests/data.yaml

Lines changed: 15 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -18,9 +18,12 @@ data:
1818
status: False
1919
line: 'L8'
2020
markdown_file.md:
21-
'https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet':
21+
'https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet#links':
2222
status: True
2323
line: 'L4'
24+
'www.google.com':
25+
status: False
26+
line: 'L6'
2427
'http://www.google.com':
2528
status: True
2629
line: 'L8'
@@ -33,16 +36,25 @@ data:
3336
'http://www.example.com':
3437
status: True
3538
line: 'L24'
39+
'https://www.mozilla.org':
40+
status: True
41+
line: 'L28'
42+
'http://slashdot.org':
43+
status: True
44+
line: 'L30'
45+
'http://www.foo.com':
46+
status: True
47+
line: 'L32'
3648
'https://github.com/pythonpune/linkcheck':
3749
status: False
3850
line: 'L34'
3951
'https://github.com//pythonpune/':
4052
status: True
4153
line: 'L39'
42-
'http://<hostname>:<port>':
54+
'http://localhost:8080':
4355
status: False
4456
line: 'L41'
45-
'https://<hostname>:<port>/pages':
57+
'https://localhost:8080/foo':
4658
status: False
4759
line: 'L43'
4860
recursive:

tests/data/markdown_file.md

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -29,7 +29,7 @@ Some text to show that the reference links can follow later.
2929

3030
[1]: http://slashdot.org
3131

32-
[link text itself]: http://www.example.com
32+
[link text itself]: http://www.foo.com
3333

3434
[broken link](https://github.com/pythonpune/linkcheck)
3535

@@ -38,8 +38,6 @@ Some text to show that the reference links can follow later.
3838

3939
https://github.com//pythonpune/
4040

41-
http://<hostname>:<port>
41+
http://localhost:8080
4242

43-
https://<hostname>:<port>/pages
44-
45-
file:///etc/hosts
43+
https://localhost:8080/foo

0 commit comments

Comments
 (0)