Skip to content

Commit cb8e5fa

Browse files
authored
Merge pull request #2411 from asger-semmle/regexp-sanitizer-guards
Approved by esbena, max-schaefer
2 parents 837b1e2 + abec4ba commit cb8e5fa

File tree

14 files changed

+697
-437
lines changed

14 files changed

+697
-437
lines changed

javascript/ql/src/semmle/javascript/Expr.qll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -454,16 +454,16 @@ class RegExpLiteral extends @regexpliteral, Literal, RegExpParent {
454454
string getFlags() { result = getValue().regexpCapture(".*/(\\w*)$", 1) }
455455

456456
/** Holds if this regular expression has an `m` flag. */
457-
predicate isMultiline() { getFlags().matches("%m%") }
457+
predicate isMultiline() { RegExp::isMultiline(getFlags()) }
458458

459459
/** Holds if this regular expression has a `g` flag. */
460-
predicate isGlobal() { getFlags().matches("%g%") }
460+
predicate isGlobal() { RegExp::isGlobal(getFlags()) }
461461

462462
/** Holds if this regular expression has an `i` flag. */
463-
predicate isIgnoreCase() { getFlags().matches("%i%") }
463+
predicate isIgnoreCase() { RegExp::isIgnoreCase(getFlags()) }
464464

465465
/** Holds if this regular expression has an `s` flag. */
466-
predicate isDotAll() { getFlags().matches("%s%") }
466+
predicate isDotAll() { RegExp::isDotAll(getFlags()) }
467467
}
468468

469469
/**

javascript/ql/src/semmle/javascript/Regexp.qll

Lines changed: 133 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -113,9 +113,7 @@ class RegExpTerm extends Locatable, @regexpterm {
113113
/**
114114
* Holds if this is the root term of a regular expression.
115115
*/
116-
predicate isRootTerm() {
117-
not getParent() instanceof RegExpTerm
118-
}
116+
predicate isRootTerm() { not getParent() instanceof RegExpTerm }
119117

120118
/**
121119
* Gets the outermost term of this regular expression.
@@ -130,19 +128,15 @@ class RegExpTerm extends Locatable, @regexpterm {
130128
/**
131129
* Holds if this term occurs as part of a regular expression literal.
132130
*/
133-
predicate isPartOfRegExpLiteral() {
134-
exists(getLiteral())
135-
}
131+
predicate isPartOfRegExpLiteral() { exists(getLiteral()) }
136132

137133
/**
138134
* Holds if this term occurs as part of a string literal.
139135
*
140136
* This predicate holds regardless of whether the string literal is actually
141137
* used as a regular expression. See `isUsedAsRegExp`.
142138
*/
143-
predicate isPartOfStringLiteral() {
144-
getRootTerm().getParent() instanceof StringLiteral
145-
}
139+
predicate isPartOfStringLiteral() { getRootTerm().getParent() instanceof StringLiteral }
146140

147141
/**
148142
* Holds if this term is part of a regular expression literal, or a string literal
@@ -344,8 +338,7 @@ class RegExpAnchor extends RegExpTerm, @regexp_anchor {
344338
* ^
345339
* ```
346340
*/
347-
class RegExpCaret extends RegExpAnchor, @regexp_caret {
348-
}
341+
class RegExpCaret extends RegExpAnchor, @regexp_caret { }
349342

350343
/**
351344
* A dollar assertion `$` matching the end of a line.
@@ -356,8 +349,7 @@ class RegExpCaret extends RegExpAnchor, @regexp_caret {
356349
* $
357350
* ```
358351
*/
359-
class RegExpDollar extends RegExpAnchor, @regexp_dollar {
360-
}
352+
class RegExpDollar extends RegExpAnchor, @regexp_dollar { }
361353

362354
/**
363355
* A word boundary assertion.
@@ -940,3 +932,131 @@ private class StringRegExpPatternSource extends RegExpPatternSource {
940932

941933
override RegExpTerm getRegExpTerm() { result = asExpr().(StringLiteral).asRegExp() }
942934
}
935+
936+
module RegExp {
937+
/** Gets the string `"?"` used to represent a regular expression whose flags are unknown. */
938+
string unknownFlag() { result = "?" }
939+
940+
/** Holds if `flags` includes the `m` flag. */
941+
bindingset[flags]
942+
predicate isMultiline(string flags) { flags.matches("%m%") }
943+
944+
/** Holds if `flags` includes the `g` flag. */
945+
bindingset[flags]
946+
predicate isGlobal(string flags) { flags.matches("%g%") }
947+
948+
/** Holds if `flags` includes the `i` flag. */
949+
bindingset[flags]
950+
predicate isIgnoreCase(string flags) { flags.matches("%i%") }
951+
952+
/** Holds if `flags` includes the `s` flag. */
953+
bindingset[flags]
954+
predicate isDotAll(string flags) { flags.matches("%s%") }
955+
956+
/** Holds if `flags` includes the `m` flag or is the unknown flag `?`. */
957+
bindingset[flags]
958+
predicate maybeMultiline(string flags) { flags = unknownFlag() or isMultiline(flags) }
959+
960+
/** Holds if `flags` includes the `g` flag or is the unknown flag `?`. */
961+
bindingset[flags]
962+
predicate maybeGlobal(string flags) { flags = unknownFlag() or isGlobal(flags) }
963+
964+
/** Holds if `flags` includes the `i` flag or is the unknown flag `?`. */
965+
bindingset[flags]
966+
predicate maybeIgnoreCase(string flags) { flags = unknownFlag() or isIgnoreCase(flags) }
967+
968+
/** Holds if `flags` includes the `s` flag or is the unknown flag `?`. */
969+
bindingset[flags]
970+
predicate maybeDotAll(string flags) { flags = unknownFlag() or isDotAll(flags) }
971+
972+
/** Holds if `term` and all of its disjuncts are anchored on both ends. */
973+
predicate isFullyAnchoredTerm(RegExpTerm term) {
974+
exists(RegExpSequence seq | term = seq |
975+
seq.getChild(0) instanceof RegExpCaret and
976+
seq.getLastChild() instanceof RegExpDollar
977+
)
978+
or
979+
isFullyAnchoredTerm(term.(RegExpGroup).getAChild())
980+
or
981+
isFullyAnchoredAlt(term, term.getNumChild())
982+
}
983+
984+
/** Holds if the first `i` disjuncts of `term` are fully anchored. */
985+
private predicate isFullyAnchoredAlt(RegExpAlt term, int i) {
986+
isFullyAnchoredTerm(term.getChild(0)) and i = 1
987+
or
988+
isFullyAnchoredAlt(term, i - 1) and
989+
isFullyAnchoredTerm(term.getChild(i - 1))
990+
}
991+
992+
/**
993+
* Holds if `term` matches any character except for explicitly listed exceptions.
994+
*
995+
* For example, holds for `.`, `[^<>]`, or `\W`, but not for `[a-z]`, `\w`, or `[^\W\S]`.
996+
*/
997+
predicate isWildcardLike(RegExpTerm term) {
998+
term instanceof RegExpDot
999+
or
1000+
term.(RegExpCharacterClassEscape).getValue().isUppercase()
1001+
or
1002+
// [^a-z]
1003+
exists(RegExpCharacterClass cls | term = cls |
1004+
cls.isInverted() and
1005+
not cls.getAChild().(RegExpCharacterClassEscape).getValue().isUppercase()
1006+
)
1007+
or
1008+
// [\W]
1009+
exists(RegExpCharacterClass cls | term = cls |
1010+
not cls.isInverted() and
1011+
cls.getAChild().(RegExpCharacterClassEscape).getValue().isUppercase()
1012+
)
1013+
}
1014+
1015+
/**
1016+
* Holds if `term` is a generic sanitizer for strings that match (if `outcome` is true)
1017+
* or strings that don't match (if `outcome` is false).
1018+
*
1019+
* Specifically, whitelisting regexps such as `^(foo|bar)$` sanitize matches in the true case.
1020+
* Inverted character classes such as `[^a-z]` or `\W` sanitize matches in the false case.
1021+
*/
1022+
predicate isGenericRegExpSanitizer(RegExpTerm term, boolean outcome) {
1023+
term.isRootTerm() and
1024+
(
1025+
outcome = true and
1026+
isFullyAnchoredTerm(term) and
1027+
not isWildcardLike(term.getAChild*())
1028+
or
1029+
// Character set restrictions like `/[^a-z]/.test(x)` sanitize in the false case
1030+
outcome = false and
1031+
exists(RegExpTerm root |
1032+
root = term
1033+
or
1034+
root = term.(RegExpGroup).getAChild()
1035+
|
1036+
isWildcardLike(root)
1037+
or
1038+
isWildcardLike(root.(RegExpAlt).getAChild())
1039+
)
1040+
)
1041+
}
1042+
1043+
/**
1044+
* Gets the AST of a regular expression object that can flow to `node`.
1045+
*/
1046+
RegExpTerm getRegExpObjectFromNode(DataFlow::Node node) {
1047+
exists(DataFlow::RegExpCreationNode regexp |
1048+
regexp.getAReference().flowsTo(node) and
1049+
result = regexp.getRoot()
1050+
)
1051+
}
1052+
1053+
/**
1054+
* Gets the AST of a regular expression that can flow to `node`,
1055+
* including `RegExp` objects as well as strings interpreted as regular expressions.
1056+
*/
1057+
RegExpTerm getRegExpFromNode(DataFlow::Node node) {
1058+
result = getRegExpObjectFromNode(node)
1059+
or
1060+
result = node.asExpr().(StringLiteral).asRegExp()
1061+
}
1062+
}

javascript/ql/src/semmle/javascript/dataflow/Nodes.qll

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -546,6 +546,11 @@ class RegExpLiteralNode extends DataFlow::ValueNode, DataFlow::SourceNode {
546546

547547
/** Gets the root term of this regular expression. */
548548
RegExpTerm getRoot() { result = astNode.getRoot() }
549+
550+
/** Gets the flags of this regular expression literal. */
551+
string getFlags() {
552+
result = astNode.getFlags()
553+
}
549554
}
550555

551556
/**
@@ -1315,3 +1320,110 @@ module PartialInvokeNode {
13151320
* This contributes additional argument-passing flow edges that should be added to all data flow configurations.
13161321
*/
13171322
deprecated class AdditionalPartialInvokeNode = PartialInvokeNode::Range;
1323+
1324+
/**
1325+
* An invocation of the `RegExp` constructor.
1326+
*
1327+
* Example:
1328+
* ```js
1329+
* new RegExp("#[a-z]+", "g");
1330+
* RegExp("^\w*$");
1331+
* ```
1332+
*/
1333+
class RegExpConstructorInvokeNode extends DataFlow::InvokeNode {
1334+
RegExpConstructorInvokeNode() {
1335+
this = DataFlow::globalVarRef("RegExp").getAnInvocation()
1336+
}
1337+
1338+
/**
1339+
* Gets the AST of the regular expression created here, provided that the
1340+
* first argument is a string literal.
1341+
*/
1342+
RegExpTerm getRoot() {
1343+
result = getArgument(0).asExpr().(StringLiteral).asRegExp()
1344+
}
1345+
1346+
/**
1347+
* Gets the flags provided in the second argument, or an empty string if no
1348+
* flags are provided.
1349+
*
1350+
* Has no result if the flags are provided but are not constant.
1351+
*/
1352+
string getFlags() {
1353+
result = getArgument(1).getStringValue()
1354+
or
1355+
not exists(getArgument(1)) and
1356+
result = ""
1357+
}
1358+
1359+
/**
1360+
* Gets the flags provided in the second argument, or an empty string if no
1361+
* flags are provided, or the string `"?"` if the provided flags are not known.
1362+
*/
1363+
string tryGetFlags() {
1364+
result = getFlags()
1365+
or
1366+
not exists(getFlags()) and
1367+
result = RegExp::unknownFlag()
1368+
}
1369+
}
1370+
1371+
/**
1372+
* A data flow node corresponding to a regular expression literal or
1373+
* an invocation of the `RegExp` constructor.
1374+
*
1375+
* Examples:
1376+
* ```js
1377+
* new RegExp("#[a-z]+", "g");
1378+
* RegExp("^\w*$");
1379+
* /[a-z]+/i
1380+
* ```
1381+
*/
1382+
class RegExpCreationNode extends DataFlow::SourceNode {
1383+
RegExpCreationNode() {
1384+
this instanceof RegExpConstructorInvokeNode or
1385+
this instanceof RegExpLiteralNode
1386+
}
1387+
1388+
/**
1389+
* Gets the root term of the created regular expression, if it is known.
1390+
*
1391+
* Has no result for calls to `RegExp` with a non-constant argument.
1392+
*/
1393+
RegExpTerm getRoot() {
1394+
result = this.(RegExpConstructorInvokeNode).getRoot() or
1395+
result = this.(RegExpLiteralNode).getRoot()
1396+
}
1397+
1398+
/**
1399+
* Gets the provided regular expression flags, if they are known.
1400+
*/
1401+
string getFlags() {
1402+
result = this.(RegExpConstructorInvokeNode).getFlags() or
1403+
result = this.(RegExpLiteralNode).getFlags()
1404+
}
1405+
1406+
/**
1407+
* Gets the flags provided in the second argument, or an empty string if no
1408+
* flags are provided, or the string `"?"` if the provided flags are not known.
1409+
*/
1410+
string tryGetFlags() {
1411+
result = this.(RegExpConstructorInvokeNode).tryGetFlags() or
1412+
result = this.(RegExpLiteralNode).getFlags()
1413+
}
1414+
1415+
/** Gets a data flow node referring to this regular expression. */
1416+
private DataFlow::SourceNode getAReference(DataFlow::TypeTracker t) {
1417+
t.start() and
1418+
result = this
1419+
or
1420+
exists(DataFlow::TypeTracker t2 |
1421+
result = getAReference(t2).track(t2, t)
1422+
)
1423+
}
1424+
1425+
/** Gets a data flow node referring to this regular expression. */
1426+
DataFlow::SourceNode getAReference() {
1427+
result = getAReference(DataFlow::TypeTracker::end())
1428+
}
1429+
}

javascript/ql/src/semmle/javascript/dataflow/TaintTracking.qll

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -696,33 +696,37 @@ module TaintTracking {
696696
*/
697697
class SanitizingRegExpTest extends AdditionalSanitizerGuardNode, DataFlow::ValueNode {
698698
Expr expr;
699+
boolean sanitizedOutcome;
699700

700701
SanitizingRegExpTest() {
701702
exists(MethodCallExpr mce, Expr base, string m, Expr firstArg |
702703
mce = astNode and mce.calls(base, m) and firstArg = mce.getArgument(0)
703704
|
704705
// /re/.test(u) or /re/.exec(u)
705-
base.analyze().getAType() = TTRegExp() and
706+
RegExp::isGenericRegExpSanitizer(RegExp::getRegExpObjectFromNode(base.flow()), sanitizedOutcome) and
706707
(m = "test" or m = "exec") and
707708
firstArg = expr
708709
or
709710
// u.match(/re/) or u.match("re")
710711
base = expr and
711712
m = "match" and
712-
exists(InferredType firstArgType | firstArgType = firstArg.analyze().getAType() |
713-
firstArgType = TTRegExp() or firstArgType = TTString()
714-
)
713+
RegExp::isGenericRegExpSanitizer(RegExp::getRegExpFromNode(firstArg.flow()), sanitizedOutcome)
715714
)
716715
or
717716
// m = /re/.exec(u) and similar
718-
DataFlow::valueNode(astNode.(AssignExpr).getRhs()).(SanitizingRegExpTest).getSanitizedExpr() =
719-
expr
717+
exists(SanitizingRegExpTest other |
718+
other = DataFlow::valueNode(astNode.(AssignExpr).getRhs()) and
719+
expr = other.getSanitizedExpr() and
720+
sanitizedOutcome = other.getSanitizedOutcome()
721+
)
720722
}
721723

722724
private Expr getSanitizedExpr() { result = expr }
723725

726+
private boolean getSanitizedOutcome() { result = sanitizedOutcome }
727+
724728
override predicate sanitizes(boolean outcome, Expr e) {
725-
(outcome = true or outcome = false) and
729+
outcome = sanitizedOutcome and
726730
e = expr
727731
}
728732

javascript/ql/test/library-tests/TaintBarriers/SanitizingGuard.expected

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,5 @@
1-
| tst.js:5:9:5:19 | /x/.test(v) | ExampleConfiguration | false | tst.js:5:18:5:18 | v |
2-
| tst.js:5:9:5:19 | /x/.test(v) | ExampleConfiguration | true | tst.js:5:18:5:18 | v |
3-
| tst.js:11:9:11:20 | v.match(/x/) | ExampleConfiguration | false | tst.js:11:9:11:9 | v |
4-
| tst.js:11:9:11:20 | v.match(/x/) | ExampleConfiguration | true | tst.js:11:9:11:9 | v |
1+
| tst.js:5:9:5:21 | /^x$/.test(v) | ExampleConfiguration | true | tst.js:5:20:5:20 | v |
2+
| tst.js:11:9:11:25 | v.match(/[^a-z]/) | ExampleConfiguration | false | tst.js:11:9:11:9 | v |
53
| tst.js:23:9:23:27 | o.hasOwnProperty(v) | ExampleConfiguration | true | tst.js:23:26:23:26 | v |
64
| tst.js:35:9:35:14 | v in o | ExampleConfiguration | true | tst.js:35:9:35:9 | v |
75
| tst.js:47:9:47:25 | o[v] == undefined | ExampleConfiguration | false | tst.js:47:11:47:11 | v |

javascript/ql/test/library-tests/TaintBarriers/TaintedSink.expected

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
| tst.js:3:10:3:10 | v | tst.js:2:13:2:20 | SOURCE() |
2+
| tst.js:8:14:8:14 | v | tst.js:2:13:2:20 | SOURCE() |
3+
| tst.js:12:14:12:14 | v | tst.js:2:13:2:20 | SOURCE() |
24
| tst.js:21:10:21:10 | v | tst.js:20:13:20:20 | SOURCE() |
35
| tst.js:26:14:26:14 | v | tst.js:20:13:20:20 | SOURCE() |
46
| tst.js:33:10:33:10 | v | tst.js:32:13:32:20 | SOURCE() |

javascript/ql/test/library-tests/TaintBarriers/isBarrier.expected

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,4 @@
11
| tst.js:6:14:6:14 | v | ExampleConfiguration |
2-
| tst.js:8:14:8:14 | v | ExampleConfiguration |
3-
| tst.js:12:14:12:14 | v | ExampleConfiguration |
42
| tst.js:14:14:14:14 | v | ExampleConfiguration |
53
| tst.js:24:14:24:14 | v | ExampleConfiguration |
64
| tst.js:36:14:36:14 | v | ExampleConfiguration |

0 commit comments

Comments
 (0)