Merge branch 'main' of github.com:github/codeql into SharedDataflow_NestedComprehensions

yoff · yoff · commit 551ae42fb987 · 2020-08-25T15:45:20.000+02:00
diff --git a/python/ql/src/experimental/dataflow/internal/TaintTrackingPrivate.qll b/python/ql/src/experimental/dataflow/internal/TaintTrackingPrivate.qll
@@ -3,13 +3,6 @@ private import experimental.dataflow.DataFlow
 private import experimental.dataflow.internal.DataFlowPrivate
 private import experimental.dataflow.internal.TaintTrackingPublic
 
-/**
- * Holds if taint can flow in one local step from `nodeFrom` to `nodeTo` excluding
- * local data flow steps. That is, `nodeFrom` and `nodeTo` are likely to represent
- * different objects.
- */
-predicate localAdditionalTaintStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) { none() }
-
 /**
  * Holds if `node` should be a barrier in all global taint flow configurations
  * but not in local taint.
@@ -25,3 +18,108 @@ predicate defaultAdditionalTaintStep(DataFlow::Node nodeFrom, DataFlow::Node nod
   or
   any(AdditionalTaintStep a).step(nodeFrom, nodeTo)
 }
+
+/**
+ * Holds if `nodeTo` is tainted by `nodeFrom` through a single local step, not counting
+ * local data flow steps; `nodeFrom` and `nodeTo` are therefore likely to be different
+ * objects. Covers string manipulation, subscripting and `+` concatenation.
+ */
+predicate localAdditionalTaintStep(DataFlow::Node nodeFrom, DataFlow::Node nodeTo) {
+  stringManipulation(nodeFrom, nodeTo)
+  or
+  subscriptStep(nodeFrom, nodeTo)
+  or
+  concatStep(nodeFrom, nodeTo)
+}
+
+/**
+ * Holds if `nodeTo` is a `+` expression and `nodeFrom` is one of its operands.
+ *
+ * The interesting operand types (like string, list, tuple) cannot easily be told apart,
+ * so every `+` operation is considered to propagate taint. After consulting with the JS
+ * team, this over-approximation doesn't sound like a big problem in practice.
+ */
+predicate concatStep(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeTo) {
+  exists(BinaryExprNode plus |
+    plus = nodeTo.getNode() and plus.getOp() instanceof Add and
+    nodeFrom.getNode() = plus.getAnOperand()
+  )
+}
+
+/**
+ * Holds if `nodeTo` is a subscript node whose subscripted object is `nodeFrom`.
+ */
+predicate subscriptStep(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeTo) {
+  exists(SubscriptNode sub | sub = nodeTo.getNode() | sub.getObject() = nodeFrom.getNode())
+}
+
+/**
+ * Holds if taint can flow from `nodeFrom` to `nodeTo` with a step related to string
+ * manipulation: string conversion, string method calls, `%`, `*` or f-strings.
+ *
+ * Note that since we cannot easily distinguish when something is a string, this can
+ * also make taint flow on `<non string>.replace(foo, bar)`.
+ */
+predicate stringManipulation(DataFlow::CfgNode nodeFrom, DataFlow::CfgNode nodeTo) {
+  // converting something tainted with `str`/`bytes`/`unicode` makes the result tainted
+  exists(CallNode call | call = nodeTo.getNode() |
+    call.getFunction().(NameNode).getId() in ["str", "bytes", "unicode"] and
+    (
+      nodeFrom.getNode() = call.getArg(0)
+      or
+      nodeFrom.getNode() = call.getArgByName("object")
+    )
+  )
+  or
+  // String methods called as `obj.meth(..)`; does not recognize `meth = "foo".upper; meth()`
+  exists(CallNode call, string method_name, ControlFlowNode object |
+    call = nodeTo.getNode() and
+    object = call.getFunction().(AttrNode).getObject(method_name)
+  |
+    nodeFrom.getNode() = object and
+    method_name in ["capitalize", "casefold", "center", "expandtabs", "format", "format_map",
+          "join", "ljust", "lstrip", "lower", "replace", "rjust", "rstrip", "strip", "swapcase",
+          "title", "upper", "zfill", "encode", "decode"]
+    or
+    method_name = "replace" and
+    nodeFrom.getNode() = call.getArg(1) // second argument of `replace`
+    or
+    method_name = "format" and
+    nodeFrom.getNode() = call.getAnArg()
+    or
+    // str -> List[str]
+    // TODO: check if these should be handled differently with regard to content
+    nodeFrom.getNode() = object and
+    method_name in ["partition", "rpartition", "rsplit", "split", "splitlines"]
+    or
+    // List[str] -> str
+    // TODO: check if these should be handled differently with regard to content
+    method_name = "join" and
+    nodeFrom.getNode() = call.getArg(0)
+    or
+    // Mapping[str, Any] -> str
+    method_name = "format_map" and
+    nodeFrom.getNode() = call.getArg(0)
+  )
+  or
+  // % formatting -- taint flows from either operand
+  exists(BinaryExprNode fmt | fmt = nodeTo.getNode() |
+    fmt.getOp() instanceof Mod and
+    (
+      fmt.getLeft() = nodeFrom.getNode()
+      or
+      fmt.getRight() = nodeFrom.getNode()
+    )
+  )
+  or
+  // string multiplication -- `"foo" * 10`; only the left operand propagates taint
+  exists(BinaryExprNode mult | mult = nodeTo.getNode() |
+    mult.getOp() instanceof Mult and
+    mult.getLeft() = nodeFrom.getNode()
+  )
+  or
+  // f-strings -- taint flows from any value of the f-string to the f-string itself
+  nodeTo.getNode().getNode().(Fstring).getAValue() = nodeFrom.getNode().getNode()
+  // TODO: Handle encode/decode from base64/quopri
+  // TODO: Handle os.path.join
+  // TODO: Handle functions in https://docs.python.org/3/library/binascii.html
+}
diff --git a/python/ql/test/experimental/dataflow/options b/python/ql/test/experimental/dataflow/options
@@ -0,0 +1 @@
+semmle-extractor-options: --max-import-depth=1
diff --git a/python/ql/test/experimental/dataflow/tainttracking/TestTaintLib.qll b/python/ql/test/experimental/dataflow/tainttracking/TestTaintLib.qll
@@ -0,0 +1,72 @@
+import python
+import experimental.dataflow.TaintTracking
+import experimental.dataflow.DataFlow
+
+class TestTaintTrackingConfiguration extends TaintTracking::Configuration {
+  TestTaintTrackingConfiguration() { this = "TestTaintTrackingConfiguration" }
+
+  override predicate isSource(DataFlow::Node source) {
+    exists(NameNode name | name.getId() in ["TAINTED_STRING", "TAINTED_BYTES"] |
+      source.(DataFlow::CfgNode).getNode() = name)
+  }
+
+  override predicate isSink(DataFlow::Node sink) {
+    exists(CallNode check | sink.(DataFlow::CfgNode).getNode() = check.getAnArg() |
+      check.getFunction().(NameNode).getId() in ["ensure_tainted", "ensure_not_tainted"]
+    )
+  }
+}
+
+/** Gets a compact textual rendering of `e`, used as the `repr` column of the test output. */
+private string repr(Expr e) {
+  // fallback: any expression kind that has no dedicated rendering below
+  not (
+    e instanceof Num or e instanceof StrConst or e instanceof Subscript or
+    e instanceof Call or e instanceof Attribute
+  ) and
+  result = e.toString()
+  or
+  result = e.(Num).getN()
+  or
+  exists(StrConst s, string prefix | s = e and prefix = s.getPrefix() |
+    result = prefix + s.getText() + prefix.regexpReplaceAll("[a-zA-Z]+", "")
+  )
+  or
+  exists(Subscript sub | sub = e | result = repr(sub.getObject()) + "[" + repr(sub.getIndex()) + "]")
+  or
+  // calls are abbreviated: `f(..)` when there is any argument, `f()` otherwise
+  exists(Call c, string args | c = e and result = repr(c.getFunc()) + args |
+    if exists(c.getAnArg()) or exists(c.getANamedArg()) then args = "(..)" else args = "()"
+  )
+  or
+  exists(Attribute attr | attr = e | result = repr(attr.getObject()) + "." + attr.getName())
+}
+
+/**
+ * Gets one result row per argument of an `ensure_tainted`/`ensure_not_tainted` call that
+ * occurs inside a function in `test.py`; `test_res` is "ok  " if taint is as expected, else "fail".
+ */
+query predicate test_taint(string arg_location, string test_res, string function_name, string repr) {
+  exists(Call call, Expr arg, boolean expected_taint, boolean has_taint |
+    call.getLocation().getFile().getShortName() = "test.py" and
+    (
+      call.getFunc().(Name).getId() = "ensure_tainted" and expected_taint = true
+      or
+      call.getFunc().(Name).getId() = "ensure_not_tainted" and expected_taint = false
+    ) and
+    arg = call.getAnArg() and
+    (
+      // TODO: Replace with `hasFlowToExpr` once that is working
+      if
+        exists(TaintTracking::Configuration c |
+          c.hasFlowTo(any(DataFlow::Node n | n.(DataFlow::CfgNode).getNode() = arg.getAFlowNode()))
+        )
+      then has_taint = true else has_taint = false
+    ) and
+    (if expected_taint = has_taint then test_res = "ok  " else test_res = "fail") and
+    // select (`test_res` is already bound by the comparison above)
+    arg_location = arg.getLocation().toString() and
+    function_name = call.getScope().(Function).getName() and
+    repr = repr(arg)
+  )
+}
diff --git a/python/ql/test/experimental/dataflow/tainttracking/string-py3/TestTaint.expected b/python/ql/test/experimental/dataflow/tainttracking/string-py3/TestTaint.expected
@@ -0,0 +1,10 @@
+| test.py:26 | ok   | str_methods | ts.casefold() |
+| test.py:28 | ok   | str_methods | ts.format_map(..) |
+| test.py:29 | fail | str_methods | "{unsafe}".format_map(..) |
+| test.py:40 | fail | binary_decode_encode | base64.a85encode(..) |
+| test.py:41 | fail | binary_decode_encode | base64.a85decode(..) |
+| test.py:44 | fail | binary_decode_encode | base64.b85encode(..) |
+| test.py:45 | fail | binary_decode_encode | base64.b85decode(..) |
+| test.py:48 | fail | binary_decode_encode | base64.encodebytes(..) |
+| test.py:49 | fail | binary_decode_encode | base64.decodebytes(..) |
+| test.py:57 | ok   | f_strings | Fstring |
diff --git a/python/ql/test/experimental/dataflow/tainttracking/string-py3/TestTaint.ql b/python/ql/test/experimental/dataflow/tainttracking/string-py3/TestTaint.ql
@@ -0,0 +1 @@
+import experimental.dataflow.tainttracking.TestTaintLib
diff --git a/python/ql/test/experimental/dataflow/tainttracking/string-py3/options b/python/ql/test/experimental/dataflow/tainttracking/string-py3/options
@@ -0,0 +1 @@
+semmle-extractor-options: --max-import-depth=1 --lang=3
diff --git a/python/ql/test/experimental/dataflow/tainttracking/string-py3/test.py b/python/ql/test/experimental/dataflow/tainttracking/string-py3/test.py
@@ -0,0 +1,64 @@
+# Python 3 specific taint tracking for string
+
+TAINTED_STRING = "TAINTED_STRING"  # references to this name are taint sources in TestTaintLib.qll
+TAINTED_BYTES = b"TAINTED_BYTES"  # references to this name are taint sources in TestTaintLib.qll
+
+
+def ensure_tainted(*args):
+    print("- ensure_tainted")
+    for i, arg in enumerate(args):
+        print("arg {}: {!r}".format(i, arg))
+
+
+def ensure_not_tainted(*args):
+    print("- ensure_not_tainted")
+    for i, arg in enumerate(args):
+        print("arg {}: {!r}".format(i, arg))
+
+
+# Actual tests
+
+def str_methods():
+    print("\n# str_methods")
+    ts = TAINTED_STRING
+    tb = TAINTED_BYTES  # NOTE(review): not used in this function
+    ensure_tainted(
+        ts.casefold(),
+
+        ts.format_map({}),
+        "{unsafe}".format_map({"unsafe": ts}),  # currently reported as "fail" in TestTaint.expected
+    )
+
+
+def binary_decode_encode():
+    print("\n#percent_fmt")
+    tb = TAINTED_BYTES
+    import base64
+
+    ensure_tainted(
+        # New in Python 3.4
+        base64.a85encode(tb),
+        base64.a85decode(base64.a85encode(tb)),
+
+        # New in Python 3.4
+        base64.b85encode(tb),
+        base64.b85decode(base64.b85encode(tb)),
+
+        # New in Python 3.1
+        base64.encodebytes(tb),
+        base64.decodebytes(base64.encodebytes(tb)),
+    )
+
+
+def f_strings():
+    print("\n#f_strings")
+    ts = TAINTED_STRING
+
+    ensure_tainted(f"foo {ts} bar")  # rendered as `Fstring` in TestTaint.expected
+
+
+# Make tests runnable
+
+str_methods()
+binary_decode_encode()
+f_strings()
diff --git a/python/ql/test/experimental/dataflow/tainttracking/string/TestTaint.expected b/python/ql/test/experimental/dataflow/tainttracking/string/TestTaint.expected
@@ -0,0 +1,62 @@
+| test.py:32 | ok   | str_operations | ts |
+| test.py:33 | ok   | str_operations | BinaryExpr |
+| test.py:34 | ok   | str_operations | BinaryExpr |
+| test.py:35 | ok   | str_operations | BinaryExpr |
+| test.py:36 | ok   | str_operations | ts[Slice] |
+| test.py:37 | ok   | str_operations | ts[Slice] |
+| test.py:38 | ok   | str_operations | ts[Slice] |
+| test.py:39 | ok   | str_operations | ts[0] |
+| test.py:40 | ok   | str_operations | str(..) |
+| test.py:41 | ok   | str_operations | bytes(..) |
+| test.py:42 | ok   | str_operations | unicode(..) |
+| test.py:51 | ok   | str_methods | ts.capitalize() |
+| test.py:52 | ok   | str_methods | ts.center(..) |
+| test.py:53 | ok   | str_methods | ts.expandtabs() |
+| test.py:55 | ok   | str_methods | ts.format() |
+| test.py:56 | ok   | str_methods | "{}".format(..) |
+| test.py:57 | ok   | str_methods | "{unsafe}".format(..) |
+| test.py:59 | ok   | str_methods | ts.join(..) |
+| test.py:60 | fail | str_methods | "".join(..) |
+| test.py:62 | ok   | str_methods | ts.ljust(..) |
+| test.py:63 | ok   | str_methods | ts.lstrip() |
+| test.py:64 | ok   | str_methods | ts.lower() |
+| test.py:66 | ok   | str_methods | ts.replace(..) |
+| test.py:67 | ok   | str_methods | "safe".replace(..) |
+| test.py:69 | ok   | str_methods | ts.rjust(..) |
+| test.py:70 | ok   | str_methods | ts.rstrip() |
+| test.py:71 | ok   | str_methods | ts.strip() |
+| test.py:72 | ok   | str_methods | ts.swapcase() |
+| test.py:73 | ok   | str_methods | ts.title() |
+| test.py:74 | ok   | str_methods | ts.upper() |
+| test.py:75 | ok   | str_methods | ts.zfill(..) |
+| test.py:77 | ok   | str_methods | ts.encode(..) |
+| test.py:78 | ok   | str_methods | ts.encode(..).decode(..) |
+| test.py:80 | ok   | str_methods | tb.decode(..) |
+| test.py:81 | ok   | str_methods | tb.decode(..).encode(..) |
+| test.py:84 | ok   | str_methods | ts.partition(..) |
+| test.py:85 | ok   | str_methods | ts.rpartition(..) |
+| test.py:86 | ok   | str_methods | ts.rsplit(..) |
+| test.py:87 | ok   | str_methods | ts.split(..) |
+| test.py:88 | ok   | str_methods | ts.splitlines() |
+| test.py:93 | ok   | str_methods | "safe".replace(..) |
+| test.py:95 | fail | str_methods | ts.join(..) |
+| test.py:96 | fail | str_methods | ts.join(..) |
+| test.py:106 | fail | non_syntactic | meth() |
+| test.py:107 | fail | non_syntactic | _str(..) |
+| test.py:116 | ok   | percent_fmt | BinaryExpr |
+| test.py:117 | ok   | percent_fmt | BinaryExpr |
+| test.py:118 | fail | percent_fmt | BinaryExpr |
+| test.py:128 | fail | binary_decode_encode | base64.b64encode(..) |
+| test.py:129 | fail | binary_decode_encode | base64.b64decode(..) |
+| test.py:131 | fail | binary_decode_encode | base64.standard_b64encode(..) |
+| test.py:132 | fail | binary_decode_encode | base64.standard_b64decode(..) |
+| test.py:134 | fail | binary_decode_encode | base64.urlsafe_b64encode(..) |
+| test.py:135 | fail | binary_decode_encode | base64.urlsafe_b64decode(..) |
+| test.py:137 | fail | binary_decode_encode | base64.b32encode(..) |
+| test.py:138 | fail | binary_decode_encode | base64.b32decode(..) |
+| test.py:140 | fail | binary_decode_encode | base64.b16encode(..) |
+| test.py:141 | fail | binary_decode_encode | base64.b16decode(..) |
+| test.py:156 | fail | binary_decode_encode | base64.encodestring(..) |
+| test.py:157 | fail | binary_decode_encode | base64.decodestring(..) |
+| test.py:162 | fail | binary_decode_encode | quopri.encodestring(..) |
+| test.py:163 | fail | binary_decode_encode | quopri.decodestring(..) |
diff --git a/python/ql/test/experimental/dataflow/tainttracking/string/TestTaint.ql b/python/ql/test/experimental/dataflow/tainttracking/string/TestTaint.ql
@@ -0,0 +1 @@
+import experimental.dataflow.tainttracking.TestTaintLib
diff --git a/python/ql/test/experimental/dataflow/tainttracking/string/test.py b/python/ql/test/experimental/dataflow/tainttracking/string/test.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+semmle-extractor-options: --max-import-depth=1`
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+import experimental.dataflow.tainttracking.TestTaintLib`