Fix link parsing: extract URLs with urlextract instead of markdown + regex, and drop link_validator

digitronik · digitronik · commit 094ee84ca080 · 2020-08-02T16:42:35.000+05:30
diff --git a/linkstatus/linkstatus.py b/linkstatus/linkstatus.py
@@ -6,7 +6,6 @@
 import pkg_resources
 import requests
 
-from linkstatus.parser import link_validator
 from linkstatus.parser import parse_file
 
 CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])
@@ -28,12 +27,13 @@ def link_status(link, timeout=5):
         status_code = requests.get(link, headers=headers, timeout=timeout).status_code
     except requests.exceptions.SSLError:
         status_code = requests.get(link, verify=False, headers=headers, timeout=timeout).status_code
+    except requests.exceptions.MissingSchema:
+        status_code = "Schema missing try with http/https"
     except Exception:  # noqa
         # TODO: include exception in logging
         status_code = None
-        pass
 
-    return status_code == 200, status_code
+    return status_code == requests.codes.ok, status_code
 
 
 def all_files(source, recursive=False):
@@ -80,7 +80,6 @@ def main(source, recursive, timeout, retry):
 
     for f in files:
         links = parse_file(f)
-        links = link_validator(links)
         if links:
             click.echo(click.style("Links in File: '{}'".format(f), bg="blue", fg="white"))
 
diff --git a/linkstatus/parser.py b/linkstatus/parser.py
@@ -1,30 +1,21 @@
-import re
 from collections import namedtuple
 
-import markdown
+from urlextract import URLExtract
 
-REGULAR_EXP = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
+LINKS = namedtuple("LINKS", ["line", "urls", "skip"])
 
-LINKS = namedtuple("LINKS", ["line", "urls", "skip", "valid"])
+EXTRACTOR = URLExtract()
 
 
 def parse_line(line):
     """Parse links from line/string
 
     Args:
-        string: data string
+        line: data line
     Returns:
         list of links
     """
-    string = line.strip()
-    html_format = markdown.markdown(string, output_format="html")
-    links = re.findall(REGULAR_EXP, html_format)
-
-    # TODO: Improve regex to remove this workaround for trailing </p> or </li>
-    links = [
-        l.replace("</p>", "").replace("</li>", "").replace("</a>", "").replace(")", "")
-        for l in links
-    ]
+    links = EXTRACTOR.find_urls(line)
     return links
 
 
@@ -41,37 +32,5 @@ def parse_file(file_path):
             line_links = parse_line(line)
             if line_links:
                 skip = True if "noqa" in line else False
-                links.append(LINKS(line=line_number + 1, urls=line_links, skip=skip, valid=False))
+                links.append(LINKS(line=line_number + 1, urls=line_links, skip=skip))
     return links
-
-
-def link_validator(links_list):
-    """Validate link
-    Args:
-        links_list: List of links.
-
-    Return:
-        Named tuple of the valid and invalid links.
-    """
-    validated_list = []
-
-    regex = re.compile(
-        r"^(?:http|ftp)s?://"  # http:// or https://
-        r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"
-        # for domain
-        r"localhost|"  # localhost...
-        r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or ip
-        r"(?::\d+)?"  # optional port
-        r"(?:/?|[/?]\S+)$",
-        re.IGNORECASE,
-    )
-
-    for link in links_list:
-        urls = []
-        for i in link.urls:
-            if re.match(regex, i):
-                urls.append(i)
-            else:
-                validated_list.append(LINKS(line=link.line, urls=[i], valid=False, skip=True))
-        validated_list.append(LINKS(line=link.line, urls=urls, skip=False, valid=True))
-    return validated_list
diff --git a/setup.cfg b/setup.cfg
@@ -35,8 +35,8 @@ zip_safe = False
 setup_requires = setuptools_scm
 install_requires =
     click
-    markdown
     requests
+    urlextract
 include_package_data = True
 python_requires = >=3.6
 
diff --git a/tests/data.yaml b/tests/data.yaml
@@ -18,9 +18,12 @@ data:
         status: False
         line: 'L8'
     markdown_file.md:
-      'https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet':
+      'https://github.com/adam-p/markdown-here/wiki/Markdown-Cheatsheet#links':
         status: True
         line: 'L4'
+      'www.google.com':
+        status: False
+        line: 'L6'
       'http://www.google.com':
         status: True
         line: 'L8'
@@ -33,16 +36,25 @@ data:
       'http://www.example.com':
         status: True
         line: 'L24'
+      'https://www.mozilla.org':
+        status: True
+        line: 'L28'
+      'http://slashdot.org':
+        status: True
+        line: 'L30'
+      'http://www.foo.com':
+        status: True
+        line: 'L32'
       'https://github.com/pythonpune/linkcheck':
         status: False
         line: 'L34'
       'https://github.com//pythonpune/':
         status: True
         line: 'L39'
-      'http://<hostname>:<port>':
+      'http://localhost:8080':
         status: False
         line: 'L41'
-      'https://<hostname>:<port>/pages':
+      'https://localhost:8080/foo':
         status: False
         line: 'L43'
 recursive:
diff --git a/tests/data/markdown_file.md b/tests/data/markdown_file.md
@@ -29,7 +29,7 @@ Some text to show that the reference links can follow later.
 
 [1]: http://slashdot.org
 
-[link text itself]: http://www.example.com
+[link text itself]: http://www.foo.com
 
 [broken link](https://github.com/pythonpune/linkcheck)
 
@@ -38,8 +38,6 @@ Some text to show that the reference links can follow later.
 
 https://github.com//pythonpune/
 
-http://<hostname>:<port>
+http://localhost:8080
 
-https://<hostname>:<port>/pages
-
-file:///etc/hosts
+https://localhost:8080/foo