Skip to content

Commit 80ff63a

Browse files
authored
Merge pull request #1387 from esben-semmle/js/unanchored-url-regex
Approved by mc-semmle, xiemaisi
2 parents bd1920c + 04868e5 commit 80ff63a

File tree

15 files changed

+610
-77
lines changed

15 files changed

+610
-77
lines changed

change-notes/1.21/analysis-javascript.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
| **Query** | **Tags** | **Purpose** |
2929
|-----------------------------------------------|------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
30+
| Missing regular expression anchor (`js/regex/missing-regexp-anchor`) | correctness, security, external/cwe/cwe-20 | Highlights regular expression patterns that may be missing an anchor, indicating a possible violation of [CWE-20](https://cwe.mitre.org/data/definitions/20.html). Results are not shown on LGTM by default. |
3031
| Prototype pollution (`js/prototype-pollution`) | security, external/cwe-250, external/cwe-400 | Highlights code that allows an attacker to modify a built-in prototype object through an unsanitized recursive merge function. The results are shown on LGTM by default. |
3132

3233
## Changes to existing queries

javascript/config/suites/javascript/security

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
+ semmlecode-javascript-queries/Security/CWE-020/IncompleteHostnameRegExp.ql: /Security/CWE/CWE-020
44
+ semmlecode-javascript-queries/Security/CWE-020/IncompleteUrlSubstringSanitization.ql: /Security/CWE/CWE-020
55
+ semmlecode-javascript-queries/Security/CWE-020/IncorrectSuffixCheck.ql: /Security/CWE/CWE-020
6+
+ semmlecode-javascript-queries/Security/CWE-020/MissingRegExpAnchor.ql: /Security/CWE/CWE-020
67
+ semmlecode-javascript-queries/Security/CWE-022/TaintedPath.ql: /Security/CWE/CWE-022
78
+ semmlecode-javascript-queries/Security/CWE-022/ZipSlip.ql: /Security/CWE/CWE-022
89
+ semmlecode-javascript-queries/Security/CWE-078/CommandInjection.ql: /Security/CWE/CWE-078

javascript/ql/src/Security/CWE-020/IncompleteHostnameRegExp.ql

Lines changed: 13 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -12,38 +12,6 @@
1212

1313
import javascript
1414

15-
/**
16-
* Gets a node whose value may flow (inter-procedurally) to a position where it is interpreted
17-
* as a regular expression.
18-
*/
19-
DataFlow::Node regExpSource(DataFlow::Node re, DataFlow::TypeBackTracker t) {
20-
t.start() and
21-
re = result and
22-
isInterpretedAsRegExp(result)
23-
or
24-
exists(DataFlow::TypeBackTracker t2, DataFlow::Node succ | succ = regExpSource(re, t2) |
25-
t2 = t.smallstep(result, succ)
26-
or
27-
any(TaintTracking::AdditionalTaintStep dts).step(result, succ) and
28-
t = t2
29-
)
30-
}
31-
32-
DataFlow::Node regExpSource(DataFlow::Node re) {
33-
result = regExpSource(re, DataFlow::TypeBackTracker::end())
34-
}
35-
36-
/** Holds if `re` is a regular expression with value `pattern`. */
37-
predicate regexp(DataFlow::Node re, string pattern, string kind, DataFlow::Node aux) {
38-
re.asExpr().(RegExpLiteral).getValue() = pattern and
39-
kind = "regular expression" and
40-
aux = re
41-
or
42-
re = regExpSource(aux) and
43-
pattern = re.getStringValue() and
44-
kind = "string, which is used as a regular expression $@,"
45-
}
46-
4715
/**
4816
* Holds if `pattern` is a regular expression pattern for URLs with a host matched by `hostPart`,
4917
* and `pattern` contains a subtle mistake that allows it to match unexpected hosts.
@@ -58,11 +26,21 @@ predicate isIncompleteHostNameRegExpPattern(string pattern, string hostPart) {
5826
"([():|?a-z0-9-]+(\\\\)?[.]" + RegExpPatterns::commonTLD() + ")" + ".*", 1)
5927
}
6028

61-
from DataFlow::Node re, string pattern, string hostPart, string kind, DataFlow::Node aux
62-
where regexp(re, pattern, kind, aux) and
29+
from RegExpPatternSource re, string pattern, string hostPart, string kind, DataFlow::Node aux
30+
where
31+
pattern = re.getPattern() and
6332
isIncompleteHostNameRegExpPattern(pattern, hostPart) and
33+
(
34+
if re.getAParse() != re
35+
then (
36+
kind = "string, which is used as a regular expression $@," and
37+
aux = re.getAParse()
38+
) else (
39+
kind = "regular expression" and aux = re
40+
)
41+
) and
6442
// ignore patterns with capture groups after the TLD
6543
not pattern.regexpMatch("(?i).*[.](" + RegExpPatterns::commonTLD() + ").*[(][?]:.*[)].*")
6644
select re,
6745
"This " + kind + " has an unescaped '.' before '" + hostPart +
68-
"', so it might match more hosts than expected.", aux, "here"
46+
"', so it might match more hosts than expected.", aux, "here"
Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
<!DOCTYPE qhelp PUBLIC
2+
"-//Semmle//qhelp//EN"
3+
"qhelp.dtd">
4+
<qhelp>
5+
6+
<overview>
7+
<p>
8+
9+
Sanitizing untrusted input with regular expressions is a
10+
common technique. However, it is error-prone to match untrusted input
11+
against regular expressions without anchors such as <code>^</code> or
12+
<code>$</code>. Malicious input can bypass such security checks by
13+
embedding one of the allowed patterns in an unexpected location.
14+
15+
</p>
16+
17+
<p>
18+
19+
Even if the matching is not done in a security-critical
20+
context, it may still cause undesirable behavior when the regular
21+
expression accidentally matches.
22+
23+
</p>
24+
</overview>
25+
26+
<recommendation>
27+
<p>
28+
29+
Use anchors to ensure that regular expressions match at
30+
the expected locations.
31+
32+
</p>
33+
</recommendation>
34+
35+
<example>
36+
37+
<p>
38+
39+
The following example code checks that a URL redirection
40+
will reach the <code>example.com</code> domain, or one of its
41+
subdomains, and not some malicious site.
42+
43+
</p>
44+
45+
<sample src="examples/MissingRegExpAnchor_BAD.js"/>
46+
47+
<p>
48+
49+
The check with the regular expression match is, however, easy to bypass. For example
50+
by embedding <code>example.com</code> in the path component:
51+
<code>http://evil-example.net/example.com</code>, or in the query
52+
string component: <code>http://evil-example.net/?x=example.com</code>.
53+
54+
Address these shortcomings by using anchors in the regular expression instead:
55+
56+
</p>
57+
58+
<sample src="examples/MissingRegExpAnchor_GOOD.js"/>
59+
60+
<p>
61+
62+
A related mistake is to write a regular expression with
63+
multiple alternatives, but to only include an anchor for one of the
64+
alternatives. As an example, the regular expression
65+
<code>/^www\\.example\\.com|beta\\.example\\.com/</code> will match the host
66+
<code>evil.beta.example.com</code> because the regular expression is parsed
67+
as <code>/(^www\\.example\\.com)|(beta\\.example\\.com)/</code>
68+
69+
</p>
70+
</example>
71+
72+
<references>
73+
<li>MDN: <a href="https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions">Regular Expressions</a></li>
74+
<li>OWASP: <a href="https://www.owasp.org/index.php/Server_Side_Request_Forgery">SSRF</a></li>
75+
<li>OWASP: <a href="https://www.owasp.org/index.php/Unvalidated_Redirects_and_Forwards_Cheat_Sheet">XSS Unvalidated Redirects and Forwards Cheat Sheet</a>.</li>
76+
</references>
77+
</qhelp>
Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
/**
2+
* @name Missing regular expression anchor
3+
* @description Regular expressions without anchors can be vulnerable to bypassing.
4+
* @kind problem
5+
* @problem.severity warning
6+
* @precision medium
7+
* @id js/regex/missing-regexp-anchor
8+
* @tags correctness
9+
* security
10+
* external/cwe/cwe-20
11+
*/
12+
13+
import javascript
14+
15+
/**
16+
* Holds if `src` is a pattern for a collection of alternatives where
17+
* only the first or last alternative is anchored, indicating a
18+
* precedence mistake explained by `msg`.
19+
*
20+
* The canonical example of such a mistake is: `^a|b|c`, which is
21+
* parsed as `(^a)|(b)|(c)`.
22+
*/
23+
predicate isInterestingSemiAnchoredRegExpString(RegExpPatternSource src, string msg) {
24+
exists(string str, string maybeGroupedStr, string regex, string anchorPart, string escapedDot |
25+
// a dot that might be escaped in a regular expression, for example `/\./` or `new RegExp('\\.')`
26+
escapedDot = "\\\\[.]" and
27+
// a string that is mostly free from special reqular expression symbols
28+
str = "(?:(?:" + escapedDot + ")|[a-z:/.?_,@0-9 -])+" and
29+
// the string may be wrapped in parentheses
30+
maybeGroupedStr = "(?:" + str + "|\\(" + str + "\\))" and
31+
(
32+
// a problematic pattern: `^a|b|...|x`
33+
regex = "(?i)(\\^" + maybeGroupedStr + ")(?:\\|" + maybeGroupedStr + ")+"
34+
or
35+
// a problematic pattern: `a|b|...|x$`
36+
regex = "(?i)(?:" + maybeGroupedStr + "\\|)+(" + maybeGroupedStr + "\\$)"
37+
) and
38+
anchorPart = src.getPattern().regexpCapture(regex, 1) and
39+
anchorPart.regexpMatch("(?i).*[a-z].*") and
40+
msg = "Misleading operator precedence. The subexpression '" + anchorPart + "' is anchored, but the other parts of this regular expression are not"
41+
)
42+
}
43+
44+
/**
45+
* Holds if `src` is an unanchored pattern for a URL, indicating a
46+
* mistake explained by `msg`.
47+
*/
48+
predicate isInterestingUnanchoredRegExpString(RegExpPatternSource src, string msg) {
49+
exists(string pattern | pattern = src.getPattern() |
50+
// a substring sequence of a protocol and subdomains, perhaps with some regex characters mixed in, followed by a known TLD
51+
pattern
52+
.regexpMatch("(?i)[():|?a-z0-9-\\\\./]+[.]" + RegExpPatterns::commonTLD() +
53+
"([/#?():]\\S*)?") and
54+
// without any anchors
55+
pattern.regexpMatch("[^$^]+") and
56+
// that is not used for capture or replace
57+
not exists(DataFlow::MethodCallNode mcn, string name | name = mcn.getMethodName() |
58+
name = "exec" and
59+
mcn = src.getARegExpObject().getAMethodCall() and
60+
exists(mcn.getAPropertyRead())
61+
or
62+
exists(DataFlow::Node arg |
63+
arg = mcn.getArgument(0) and
64+
(
65+
src.getARegExpObject().flowsTo(arg) or
66+
src.getAParse() = arg
67+
)
68+
|
69+
name = "replace"
70+
or
71+
name = "match" and exists(mcn.getAPropertyRead())
72+
)
73+
) and
74+
msg = "When this is used as a regular expression on a URL, it may match anywhere, and arbitrary hosts may come before or after it."
75+
)
76+
}
77+
78+
from DataFlow::Node nd, string msg
79+
where
80+
isInterestingUnanchoredRegExpString(nd, msg)
81+
or
82+
isInterestingSemiAnchoredRegExpString(nd, msg)
83+
select nd, msg

javascript/ql/src/Security/CWE-020/examples/IncompleteHostnameRegExp.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ app.get('/some/path', function(req, res) {
22
let url = req.param('url'),
33
host = urlLib.parse(url).host;
44
// BAD: the host of `url` may be controlled by an attacker
5-
let regex = /((www|beta).)?example.com/;
5+
let regex = /^((www|beta).)?example.com/;
66
if (host.match(regex)) {
77
res.redirect(url);
88
}
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
app.get("/some/path", function(req, res) {
2+
let url = req.param("url");
3+
// BAD: the host of `url` may be controlled by an attacker
4+
if (url.match(/https?:\/\/www\.example\.com\//)) {
5+
res.redirect(url);
6+
}
7+
});
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
app.get("/some/path", function(req, res) {
2+
let url = req.param("url");
3+
// GOOD: the host of `url` can not be controlled by an attacker
4+
if (url.match(/^https?:\/\/www\.example\.com\//)) {
5+
res.redirect(url);
6+
}
7+
});

javascript/ql/src/semmle/javascript/Regexp.qll

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,9 @@ abstract class RegExpTerm extends Locatable, @regexpterm {
4141

4242
override string toString() { regexpterm(this, _, _, _, result) }
4343

44+
/** Gets the raw source text of this term. */
45+
string getRawValue() { regexpterm(this, _, _, _, result) }
46+
4447
/** Holds if this regular expression term can match the empty string. */
4548
abstract predicate isNullable();
4649

@@ -404,3 +407,94 @@ module RegExpPatterns {
404407
result = "(?:com|org|edu|gov|uk|net|io)(?![a-z0-9])"
405408
}
406409
}
410+
411+
/**
412+
* Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted
413+
* as a part of a regular expression.
414+
*/
415+
private DataFlow::Node regExpSource(DataFlow::Node re, DataFlow::TypeBackTracker t) {
416+
t.start() and
417+
re = result and
418+
isInterpretedAsRegExp(result)
419+
or
420+
exists(DataFlow::TypeBackTracker t2, DataFlow::Node succ | succ = regExpSource(re, t2) |
421+
t2 = t.smallstep(result, succ)
422+
or
423+
any(TaintTracking::AdditionalTaintStep dts).step(result, succ) and
424+
t = t2
425+
)
426+
}
427+
428+
/**
429+
* Gets a node whose value may flow (inter-procedurally) to `re`, where it is interpreted
430+
* as a part of a regular expression.
431+
*/
432+
private DataFlow::Node regExpSource(DataFlow::Node re) {
433+
result = regExpSource(re, DataFlow::TypeBackTracker::end())
434+
}
435+
436+
/**
437+
* A node whose value may flow to a position where it is interpreted
438+
* as a part of a regular expression.
439+
*/
440+
abstract class RegExpPatternSource extends DataFlow::Node {
441+
/**
442+
* Gets a node where the pattern of this node is parsed as a part of
443+
* a regular expression.
444+
*/
445+
abstract DataFlow::Node getAParse();
446+
447+
/**
448+
* Gets the pattern of this node that is interpreted as a part of a
449+
* regular expression.
450+
*/
451+
abstract string getPattern();
452+
453+
/**
454+
* Gets a regular expression object that is constructed from the pattern
455+
* of this node.
456+
*/
457+
abstract DataFlow::SourceNode getARegExpObject();
458+
}
459+
460+
/**
461+
* A regular expression literal, viewed as the pattern source for itself.
462+
*/
463+
private class RegExpLiteralPatternSource extends RegExpPatternSource {
464+
string pattern;
465+
466+
RegExpLiteralPatternSource() {
467+
exists(string raw | raw = asExpr().(RegExpLiteral).getRoot().getRawValue() |
468+
// hide the fact that `/` is escaped in the literal
469+
pattern = raw.regexpReplaceAll("\\\\/", "/")
470+
)
471+
}
472+
473+
override DataFlow::Node getAParse() { result = this }
474+
475+
override string getPattern() { result = pattern }
476+
477+
override DataFlow::SourceNode getARegExpObject() { result = this }
478+
}
479+
480+
/**
481+
* A node whose string value may flow to a position where it is interpreted
482+
* as a part of a regular expression.
483+
*/
484+
private class StringRegExpPatternSource extends RegExpPatternSource {
485+
DataFlow::Node parse;
486+
487+
StringRegExpPatternSource() { this = regExpSource(parse) }
488+
489+
override DataFlow::Node getAParse() { result = parse }
490+
491+
override DataFlow::SourceNode getARegExpObject() {
492+
exists(DataFlow::InvokeNode constructor |
493+
constructor = DataFlow::globalVarRef("RegExp").getAnInvocation() and
494+
parse = constructor.getArgument(0) and
495+
result = constructor
496+
)
497+
}
498+
499+
override string getPattern() { result = getStringValue() }
500+
}

0 commit comments

Comments
 (0)