From a446a6f82f78fc70702cef6b5c10bcf33a15d5fc Mon Sep 17 00:00:00 2001 From: Mike7R Date: Fri, 29 Jan 2016 01:55:28 +0100 Subject: [PATCH 1/2] Fixed regular expressions --- regex.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/regex.py b/regex.py index e8eb2bb..d4c7337 100644 --- a/regex.py +++ b/regex.py @@ -27,7 +27,7 @@ PROXY_HTML = re.compile(r'.*?', +COUNTRY_HTML = re.compile(r'class="country".*?\/>.*?', re.DOTALL) # This regex is used to recover the country @@ -70,5 +70,5 @@ # This regex is used to recover the type and anonymity level in the proxy # HTML code -TYPE_ANONYMITY = re.compile(r'(.*?)\s*(.*)') +TYPE_ANONYMITY = re.compile(r'\s*(.*)\s*\s*\s*(.*)\s*') From aeaab8ceace8fc2a0211c6b99b8e112c4f805578 Mon Sep 17 00:00:00 2001 From: Mike7R Date: Fri, 29 Jan 2016 02:15:37 +0100 Subject: [PATCH 2/2] and this strips the whitespaces --- parser.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/parser.py b/parser.py index d88ca6b..530d5d5 100644 --- a/parser.py +++ b/parser.py @@ -60,7 +60,7 @@ def parse_proxy(proxy_html): # We get the chunk of code corresponding to the country... country_html = regex.COUNTRY_HTML.search(proxy_html).group(0) # ...and we parse it - country = regex.COUNTRY.search(country_html).group(1) + country = regex.COUNTRY.search(country_html).group(1).strip() # We get the chunk of code corresponding to the speed... speed_html = regex.SPEED_HTML.search(proxy_html).group(1) @@ -75,8 +75,8 @@ def parse_proxy(proxy_html): # We get the chunk of code corresponding to the type and anonymity... match = regex.TYPE_ANONYMITY.search(proxy_html) # ...and we parse it - type = match.group(1) - anonymity = match.group(2) + type = match.group(1).strip() + anonymity = match.group(2).strip() # We return a tuple return ip, int(port), type, country, anonymity, speed, connection_time