Merge pull request #820 from markshannon/python-incomplete-url-sanitize

taus-semmle · web-flow · commit 3d7821c83aa7 · 2019-01-29T17:48:42.000+01:00
Python: Two new queries for URL and hostname sanitization (CWE-020).
diff --git a/change-notes/1.20/analysis-python.md b/change-notes/1.20/analysis-python.md
@@ -14,6 +14,8 @@ Removes false positives seen when using Python 3.6, but not when using earlier v
  | **Query**                   | **Tags**  | **Purpose**                                                        |
 |-----------------------------|-----------|--------------------------------------------------------------------|
 | Default version of SSL/TLS may be insecure (`py/insecure-default-protocol`) | security, external/cwe/cwe-327 | Finds instances where an insecure default protocol may be used. Results are shown on LGTM by default. |
+| Incomplete regular expression for hostnames (`py/incomplete-hostname-regexp`) | security, external/cwe/cwe-020 | Finds instances where a hostname is incompletely sanitized because a regular expression contains an unescaped character. Results are shown on LGTM by default. |
+| Incomplete URL substring sanitization (`py/incomplete-url-substring-sanitization`) | security, external/cwe/cwe-020 | Finds instances where a URL is incompletely sanitized due to insufficient checks. Results are shown on LGTM by default. |
 | Overly permissive file permissions (`py/overly-permissive-file`) | security, external/cwe/cwe-732 | Finds instances where a file is created with overly permissive permissions. Results are not shown on LGTM by default. |
 | Use of insecure SSL/TLS version (`py/insecure-protocol`) | security, external/cwe/cwe-327 | Finds instances where a known insecure protocol has been specified. Results are shown on LGTM by default. |
 
diff --git a/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qhelp b/javascript/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qhelp
@@ -15,9 +15,9 @@
 
 		<p>
 
-			However, it is notoriously error-prone to treat the URL as
-			a string and check if one of the allowed hosts is a substring of the
-			URL. Malicious URLs can bypass such security checks by embedding one
+			However, treating the URL as a string and checking if one of the
+			allowed hosts is a substring of the URL is very prone to errors.
+			Malicious URLs can bypass such security checks by embedding one
 			of the allowed hosts in an unexpected location.
 
 		</p>
diff --git a/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp b/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.qhelp
@@ -0,0 +1,69 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+
+    <overview>
+        <p>
+
+            Sanitizing untrusted URLs is an important technique for
+            preventing attacks such as request forgeries and malicious
+            redirections. Often, this is done by checking that the host of a URL
+            is in a set of allowed hosts.
+
+        </p>
+
+        <p>
+
+            If a regular expression implements such a check, it is
+            easy to accidentally make the check too permissive by not escaping the
+            <code>.</code> meta-characters appropriately.
+
+            Even if the check is not used in a security-critical
+            context, the incomplete check may still cause undesirable behaviors
+            when it accidentally succeeds.
+
+        </p>
+    </overview>
+
+    <recommendation>
+        <p>
+
+            Escape all meta-characters appropriately when constructing
+            regular expressions for security checks, pay special attention to the
+            <code>.</code> meta-character.
+
+        </p>
+    </recommendation>
+
+    <example>
+
+        <p>
+
+            The following example code checks that a URL redirection
+            will reach the <code>example.com</code> domain, or one of its
+            subdomains.
+
+        </p>
+
+        <sample src="examples/IncompleteHostnameRegExp.py"/>
+
+        <p>
+            The <code>unsafe</code> check is easy to bypass because the unescaped
+            <code>.</code> allows for any character before
+            <code>example.com</code>, effectively allowing the redirect to go to
+            an attacker-controlled domain such as <code>wwwXexample.com</code>.
+
+        </p>
+        <p>
+            The <code>safe</code> check closes this vulnerability by escaping the <code>.</code>
+            so that URLs of the form <code>wwwXexample.com</code> are rejected.
+        </p>
+
+    </example>
+
+    <references>
+        <li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a></li>
+        <li>OWASP: <a href="https://www.owasp.org/index.php/Unvalidated_Redirects_and_Forwards_Cheat_Sheet">XSS Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
+    </references>
+</qhelp>
diff --git a/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql b/python/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql
@@ -0,0 +1,44 @@
+/**
+ * @name Incomplete regular expression for hostnames
+ * @description Matching a URL or hostname against a regular expression that contains an unescaped dot as part of the hostname might match more hostnames than expected.
+ * @kind problem
+ * @problem.severity warning
+ * @precision high
+ * @id py/incomplete-hostname-regexp
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-20
+ */
+
+import python
+import semmle.python.regex
+
+private string commonTopLevelDomainRegex() {
+    result = "com|org|edu|gov|uk|net|io"
+}
+
+/**
+ * Holds if `pattern` is a regular expression pattern for URLs with a host matched by `hostPart`,
+ * and `pattern` contains a subtle mistake that allows it to match unexpected hosts.
+ */
+bindingset[pattern]
+predicate isIncompleteHostNameRegExpPattern(string pattern, string hostPart) {
+  hostPart = pattern
+        .regexpCapture("(?i).*" +
+            // an unescaped single `.`
+            "(?<!\\\\)[.]" +
+            // immediately followed by a sequence of subdomains, perhaps with some regex characters mixed in, followed by a known TLD
+            "([():|?a-z0-9-]+(\\\\)?[.](" + commonTopLevelDomainRegex() + "))" + ".*", 1)
+}
+
+from Regex r, string pattern, string hostPart
+where
+  (
+    r.getText() = pattern
+  ) and
+  isIncompleteHostNameRegExpPattern(pattern, hostPart) and
+  // ignore patterns with capture groups after the TLD
+  not pattern.regexpMatch("(?i).*[.](" + commonTopLevelDomainRegex()  + ").*[(][?]:.*[)].*")
+select r,
+  "This regular expression has an unescaped '.' before '" + hostPart +
+    "', so it might match more hosts than expected."
diff --git a/python/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qhelp b/python/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.qhelp
@@ -0,0 +1,85 @@
+<!DOCTYPE qhelp PUBLIC
+"-//Semmle//qhelp//EN"
+"qhelp.dtd">
+<qhelp>
+
+    <overview>
+        <p>
+
+            Sanitizing untrusted URLs is an important technique for
+            preventing attacks such as request forgeries and malicious
+            redirections. Usually, this is done by checking that the host of a URL
+            is in a set of allowed hosts.
+
+        </p>
+
+        <p>
+            However, treating the URL as a string and checking if one of the
+            allowed hosts is a substring of the URL is very prone to errors.
+            Malicious URLs can bypass such security checks by embedding one
+            of the allowed hosts in an unexpected location.
+
+        </p>
+
+        <p>
+
+            Even if the substring check is not used in a
+            security-critical context, the incomplete check may still cause
+            undesirable behaviors when the check succeeds accidentally.
+
+        </p>
+    </overview>
+
+    <recommendation>
+        <p>
+
+            Parse a URL before performing a check on its host value,
+            and ensure that the check handles arbitrary subdomain sequences
+            correctly.
+
+        </p>
+    </recommendation>
+
+    <example>
+
+        <p>
+
+            The following example code checks that a URL redirection
+            will reach the <code>example.com</code> domain.
+
+        </p>
+
+        <sample src="examples/IncompleteUrlSubstringSanitization.py"/>
+
+        <p>
+
+            The first two examples show unsafe checks that are easily bypassed.
+            In <code>unsafe1</code> the attacker can simply add 
+            <code>example.com</code> anywhere in the url. For example,
+            <code>http://evil-example.net/example.com</code>.
+        </p>
+        <p>
+            In <code>unsafe2</code> the attacker must use a hostname ending in
+            <code>example.com</code>, but that is easy to do.  For example,
+            <code>http://benign-looking-prefix-example.com</code>.
+
+        </p>
+
+        <p>
+
+            The second two examples show safe checks.
+            In <code>safe1</code>, a white-list is used. Although fairly inflexible,
+            this is easy to get right and is most likely to be safe.
+        </p>
+        <p>
+            In <code>safe2</code>, <code>urlparse</code> is used to parse the URL,
+            then the hostname is checked to make sure it ends with <code>.example.com</code>.
+        </p>
+
+    </example>
+
+    <references>
+        <li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a></li>
+        <li>OWASP: <a href="https://www.owasp.org/index.php/Unvalidated_Redirects_and_Forwards_Cheat_Sheet">XSS Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
+    </references>
+</qhelp>
diff --git a/python/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql b/python/ql/src/Security/CWE-020/IncompleteUrlSubstringSanitization.ql
@@ -0,0 +1,58 @@
+/**
+ * @name Incomplete URL substring sanitization
+ * @description Security checks on the substrings of an unparsed URL are often vulnerable to bypassing.
+ * @kind problem
+ * @problem.severity warning
+ * @precision high
+ * @id py/incomplete-url-substring-sanitization
+ * @tags correctness
+ *       security
+ *       external/cwe/cwe-20
+ */
+
+
+import python
+import semmle.python.regex
+
+private string commonTopLevelDomainRegex() {
+    result = "com|org|edu|gov|uk|net|io"
+}
+
+predicate looksLikeUrl(StrConst s) {
+    exists(string text |
+        text = s.getText()
+        |
+        text.regexpMatch("(?i)([a-z]*:?//)?\\.?([a-z0-9-]+\\.)+(" + 
+            commonTopLevelDomainRegex() +")(:[0-9]+)?/?")
+        or
+        // target is a HTTP URL to a domain on any TLD
+        text.regexpMatch("(?i)https?://([a-z0-9-]+\\.)+([a-z]+)(:[0-9]+)?/?")
+    )
+}
+
+predicate incomplete_sanitization(Expr sanitizer, StrConst url) {
+    looksLikeUrl(url) and
+    (
+        sanitizer.(Compare).compares(url, any(In i), _)
+        or
+        call_to_startswith(sanitizer, url)
+        or
+        unsafe_call_to_endswith(sanitizer, url)
+    )
+}
+
+predicate call_to_startswith(Call sanitizer, StrConst url) {
+    sanitizer.getFunc().(Attribute).getName() = "startswith"
+    and
+    sanitizer.getArg(0) = url
+}
+
+predicate unsafe_call_to_endswith(Call sanitizer, StrConst url) {
+    sanitizer.getFunc().(Attribute).getName() = "endswith" and
+    sanitizer.getArg(0) = url and
+    not url.getText().regexpMatch("(?i)\\.([a-z0-9-]+)(\\.[a-z0-9-]+)+")
+}
+
+from Expr sanitizer, StrConst url
+where incomplete_sanitization(sanitizer, url)
+select sanitizer, "'$@' may be at an arbitrary position in the sanitized URL.", url, url.getText()
diff --git a/python/ql/src/Security/CWE-020/examples/IncompleteHostnameRegExp.py b/python/ql/src/Security/CWE-020/examples/IncompleteHostnameRegExp.py
@@ -0,0 +1,19 @@
+from flask import Flask, request, redirect
+import re
+
+app = Flask(__name__)
+
+UNSAFE_REGEX = re.compile("(www|beta).example.com/")
+SAFE_REGEX = re.compile(r"(www|beta)\.example\.com/")
+
+@app.route('/some/path/bad')
+def unsafe(request):
+    target = request.args.get('target', '')
+    if UNSAFE_REGEX.match(target):
+        return redirect(target)
+
+@app.route('/some/path/good')
+def safe(request):
+    target = request.args.get('target', '')
+    if SAFE_REGEX.match(target):
+        return redirect(target)
diff --git a/python/ql/src/Security/CWE-020/examples/IncompleteUrlSubstringSanitization.py b/python/ql/src/Security/CWE-020/examples/IncompleteUrlSubstringSanitization.py
@@ -0,0 +1,45 @@
+from flask import Flask, request, redirect
+from urllib.parse import urlparse
+
+app = Flask(__name__)
+
+# Not safe, as "evil-example.net/example.com" would be accepted
+
+@app.route('/some/path/bad1')
+def unsafe1(request):
+    target = request.args.get('target', '')
+    if "example.com" in target:
+        return redirect(target)
+
+# Not safe, as "benign-looking-prefix-example.com" would be accepted
+
+@app.route('/some/path/bad2')
+def unsafe2(request):
+    target = request.args.get('target', '')
+    if target.endswith("example.com"):
+        return redirect(target)
+
+
+
+#Simplest and safest approach is to use a white-list
+
+@app.route('/some/path/good1')
+def safe1(request):
+    whitelist = [
+        "example.com/home",
+        "example.com/login",
+    ]
+    target = request.args.get('target', '')
+    if target in whitelist:
+        return redirect(target)
+
+#More complex example allowing sub-domains.
+
+@app.route('/some/path/good2')
+def safe2(request):
+    target = request.args.get('target', '')
+    host = urlparse(target).hostname
+    #Note the '.' preceding example.com
+    if host and host.endswith(".example.com"):
+        return redirect(target)
+
diff --git a/python/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected b/python/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.expected
@@ -0,0 +1 @@
+| hosttest.py:6:27:6:51 | Str | This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. |
diff --git a/python/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.qlref b/python/ql/test/query-tests/Security/CWE-020/IncompleteHostnameRegExp.qlref
@@ -0,0 +1 @@
+Security/CWE-020/IncompleteHostnameRegExp.ql
diff --git a/python/ql/test/query-tests/Security/CWE-020/IncompleteUrlSubstringSanitization.expected b/python/ql/test/query-tests/Security/CWE-020/IncompleteUrlSubstringSanitization.expected
@@ -0,0 +1,2 @@
+| urltest.py:9:8:9:30 | Compare | '$@' may be at an arbitrary position in the sanitized URL. | urltest.py:9:8:9:20 | Str | example.com |
+| urltest.py:15:8:15:37 | Attribute() | '$@' may be at an arbitrary position in the sanitized URL. | urltest.py:15:24:15:36 | Str | example.com |
diff --git a/python/ql/test/query-tests/Security/CWE-020/IncompleteUrlSubstringSanitization.qlref b/python/ql/test/query-tests/Security/CWE-020/IncompleteUrlSubstringSanitization.qlref
@@ -0,0 +1 @@
+Security/CWE-020/IncompleteUrlSubstringSanitization.ql
diff --git a/python/ql/test/query-tests/Security/CWE-020/hosttest.py b/python/ql/test/query-tests/Security/CWE-020/hosttest.py
@@ -0,0 +1,19 @@
+from flask import Flask, request, redirect
+import re
+
+app = Flask(__name__)
+
+UNSAFE_REGEX = re.compile("(www|beta).example.com/")
+SAFE_REGEX = re.compile(r"(www|beta)\.example\.com/")
+
+@app.route('/some/path/bad')
+def unsafe(request):
+    target = request.args.get('target', '')
+    if UNSAFE_REGEX.match(target):
+        return redirect(target)
+
+@app.route('/some/path/good')
+def safe(request):
+    target = request.args.get('target', '')
+    if SAFE_REGEX.match(target):
+        return redirect(target)
diff --git a/python/ql/test/query-tests/Security/CWE-020/urltest.py b/python/ql/test/query-tests/Security/CWE-020/urltest.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+\| hosttest.py:6:27:6:51 \| Str \| This regular expression has an unescaped '.' before 'example.com', so it might match more hosts than expected. \|`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+Security/CWE-020/IncompleteHostnameRegExp.ql`