Merge pull request #3961 from tausbn/python-add-typetracker

calumgrant · web-flow · commit 29b3759655cf · 2020-09-02T09:42:14.000+01:00
Python: Add type tracker and step summary implementation.
diff --git a/python/ql/src/experimental/dataflow/TypeTracker.qll b/python/ql/src/experimental/dataflow/TypeTracker.qll
@@ -0,0 +1,282 @@
+/** Provides classes and predicates for type tracking by means of step summaries. */
+
+import python
+import internal.DataFlowPublic
+import internal.DataFlowPrivate
+
+/** Any string that may appear as the name of an attribute or access path. */
+class AttributeName extends string {
+  AttributeName() { this = any(Attribute a).getName() }
+}
+
+/** Either an attribute name, or the empty string (representing no attribute). */
+class OptionalAttributeName extends string {
+  OptionalAttributeName() { this instanceof AttributeName or this = "" }
+}
+
+/**
+ * A description of a step on an inter-procedural data flow path.
+ */
+private newtype TStepSummary =
+  LevelStep() or
+  CallStep() or
+  ReturnStep() or
+  StoreStep(AttributeName attr) or
+  LoadStep(AttributeName attr)
+
+/**
+ * INTERNAL: Use `TypeTracker` or `TypeBackTracker` instead.
+ *
+ * A description of a step on an inter-procedural data flow path.
+ */
+class StepSummary extends TStepSummary {
+  /** Gets a textual representation of this step summary. */
+  string toString() {
+    this instanceof LevelStep and result = "level"
+    or
+    this instanceof CallStep and result = "call"
+    or
+    this instanceof ReturnStep and result = "return"
+    or
+    exists(string attr | this = StoreStep(attr) | result = "store " + attr)
+    or
+    exists(string attr | this = LoadStep(attr) | result = "load " + attr)
+  }
+}
+
+module StepSummary {
+  cached
+  predicate step(Node nodeFrom, Node nodeTo, StepSummary summary) {
+    exists(Node mid | EssaFlow::essaFlowStep*(nodeFrom, mid) and smallstep(mid, nodeTo, summary))
+  }
+
+  predicate smallstep(Node nodeFrom, Node nodeTo, StepSummary summary) {
+    EssaFlow::essaFlowStep(nodeFrom, nodeTo) and
+    summary = LevelStep()
+    or
+    callStep(nodeFrom, nodeTo) and summary = CallStep()
+    or
+    returnStep(nodeFrom, nodeTo) and
+    summary = ReturnStep()
+    or
+    exists(string attr |
+      basicStoreStep(nodeFrom, nodeTo, attr) and
+      summary = StoreStep(attr)
+      or
+      basicLoadStep(nodeFrom, nodeTo, attr) and summary = LoadStep(attr)
+    )
+  }
+}
+
+/** Holds if `nodeFrom` is passed as an argument in a call, and `nodeTo` is the corresponding parameter. */
+predicate callStep(ArgumentNode nodeFrom, ParameterNode nodeTo) {
+  // TODO: Support special methods?
+  exists(DataFlowCall call, int i |
+    nodeFrom.argumentOf(call, i) and nodeTo.isParameterOf(call.getCallable(), i)
+  )
+}
+
+/** Holds if `nodeFrom` steps to `nodeTo` by being returned from a call. */
+predicate returnStep(ReturnNode nodeFrom, Node nodeTo) {
+  exists(DataFlowCall call |
+    nodeFrom.getEnclosingCallable() = call.getCallable() and nodeTo.asCfgNode() = call.getNode()
+  )
+}
+
+/**
+ * Holds if `nodeFrom` is being written to the `attr` attribute of the object in `nodeTo`.
+ *
+ * Note that the choice of `nodeTo` does not have to make sense "chronologically".
+ * All we care about is whether the `attr` attribute of `nodeTo` can have a specific type,
+ * and the assumption is that if a specific type appears here, then any access of that
+ * particular attribute can yield something of that particular type.
+ *
+ * Thus, in an example such as
+ *
+ * ```python
+ * def foo(y):
+ *    x = Foo()
+ *    bar(x)
+ *    x.attr = y
+ *    baz(x)
+ *
+ * def bar(x):
+ *    z = x.attr
+ * ```
+ * for the attribute write `x.attr = y`, we will have `attr` being the literal string `"attr"`,
+ * `nodeFrom` will be `y`, and `nodeTo` will be the object `Foo()` created on the first line of the
+ * function. This means we will track the fact that `x.attr` can have the type of `y` into the
+ * assignment to `z` inside `bar`, even though this attribute write happens _after_ `bar` is called.
+ */
+predicate basicStoreStep(Node nodeFrom, Node nodeTo, string attr) {
+  exists(AttributeAssignment a, Node var |
+    a.getName() = attr and
+    EssaFlow::essaFlowStep*(nodeTo, var) and
+    var.asVar() = a.getInput() and
+    nodeFrom.asCfgNode() = a.getValue()
+  )
+}
+
+/**
+ * Holds if `nodeTo` is the result of accessing the `attr` attribute of `nodeFrom`.
+ */
+predicate basicLoadStep(Node nodeFrom, Node nodeTo, string attr) {
+  exists(AttrNode s | nodeTo.asCfgNode() = s and s.getObject(attr) = nodeFrom.asCfgNode())
+}
+
+/**
+ * A utility class that is equivalent to `boolean` but does not require type joining.
+ */
+private class Boolean extends boolean {
+  Boolean() { this = true or this = false }
+}
+
+private newtype TTypeTracker = MkTypeTracker(Boolean hasCall, OptionalAttributeName attr)
+
+/**
+ * Summary of the steps needed to track a value to a given dataflow node.
+ *
+ * This can be used to track objects that implement a certain API in order to
+ * recognize calls to that API. Note that type-tracking does not by itself provide a
+ * source/sink relation, that is, it may determine that a node has a given type,
+ * but it won't determine where that type came from.
+ *
+ * It is recommended that all uses of this type are written in the following form,
+ * for tracking some type `myType`:
+ * ```ql
+ * DataFlow::Node myType(DataFlow::TypeTracker t) {
+ *   t.start() and
+ *   result = < source of myType >
+ *   or
+ *   exists(DataFlow::TypeTracker t2 |
+ *     result = myType(t2).track(t2, t)
+ *   )
+ * }
+ *
+ * DataFlow::Node myType() { exists(DataFlow::TypeTracker t | t.end() | result = myType(t)) }
+ * ```
+ *
+ * Instead of `result = myType(t2).track(t2, t)`, you can also use the equivalent
+ * `t = t2.step(myType(t2), result)`. If you additionally want to track individual
+ * intra-procedural steps, use `t = t2.smallstep(myType(t2), result)`.
+ */
+class TypeTracker extends TTypeTracker {
+  Boolean hasCall;
+  OptionalAttributeName attr;
+
+  TypeTracker() { this = MkTypeTracker(hasCall, attr) }
+
+  /** Gets the summary resulting from appending `step` to this type-tracking summary. */
+  cached
+  TypeTracker append(StepSummary step) {
+    step = LevelStep() and result = this
+    or
+    step = CallStep() and result = MkTypeTracker(true, attr)
+    or
+    step = ReturnStep() and hasCall = false and result = this
+    or
+    step = LoadStep(attr) and result = MkTypeTracker(hasCall, "")
+    or
+    exists(string p | step = StoreStep(p) and attr = "" and result = MkTypeTracker(hasCall, p))
+  }
+
+  /** Gets a textual representation of this summary. */
+  string toString() {
+    exists(string withCall, string withAttr |
+      (if hasCall = true then withCall = "with" else withCall = "without") and
+      (if attr != "" then withAttr = " with attribute " + attr else withAttr = "") and
+      result = "type tracker " + withCall + " call steps" + withAttr
+    )
+  }
+
+  /**
+   * Holds if this is the starting point of type tracking.
+   */
+  predicate start() { hasCall = false and attr = "" }
+
+  /**
+   * Holds if this is the starting point of type tracking, and the value starts in the attribute named `attrName`.
+   * The type tracking only ends after the attribute has been loaded.
+   */
+  predicate startInAttr(AttributeName attrName) { hasCall = false and attr = attrName }
+
+  /**
+   * Holds if this is the starting point of type tracking
+   * when tracking a parameter into a call, but not out of it.
+   */
+  predicate call() { hasCall = true and attr = "" }
+
+  /**
+   * Holds if this is the end point of type tracking.
+   */
+  predicate end() { attr = "" }
+
+  /**
+   * INTERNAL. DO NOT USE.
+   *
+   * Gets a boolean indicating whether this type has been tracked into a call.
+   */
+  boolean hasCall() { result = hasCall }
+
+  /**
+   * INTERNAL. DO NOT USE.
+   *
+   * Gets the attribute associated with this type tracker.
+   */
+  string getAttr() { result = attr }
+
+  /**
+   * Gets a type tracker that starts where this one has left off to allow continued
+   * tracking.
+   *
+   * This predicate is only defined if the type has not been tracked into an attribute.
+   */
+  TypeTracker continue() { attr = "" and result = this }
+
+  /**
+   * Gets the summary that corresponds to having taken a forwards
+   * heap and/or inter-procedural step from `nodeFrom` to `nodeTo`.
+   */
+  pragma[inline]
+  TypeTracker step(Node nodeFrom, Node nodeTo) {
+    exists(StepSummary summary |
+      StepSummary::step(nodeFrom, nodeTo, summary) and
+      result = this.append(summary)
+    )
+  }
+
+  /**
+   * Gets the summary that corresponds to having taken a forwards
+   * local, heap and/or inter-procedural step from `nodeFrom` to `nodeTo`.
+   *
+   * Unlike `TypeTracker::step`, this predicate exposes every individual edge
+   * in the flow graph, rather than folding any preceding local flow steps into a single step.
+   * It may therefore be less performant.
+   *
+   * Type tracking predicates using small steps typically take the following form:
+   * ```ql
+   * DataFlow::Node myType(DataFlow::TypeTracker t) {
+   *   t.start() and
+   *   result = < source of myType >
+   *   or
+   *   exists (DataFlow::TypeTracker t2 |
+   *     t = t2.smallstep(myType(t2), result)
+   *   )
+   * }
+   *
+   * DataFlow::Node myType() {
+   *   exists(DataFlow::TypeTracker t | t.end() | result = myType(t))
+   * }
+   * ```
+   */
+  pragma[inline]
+  TypeTracker smallstep(Node nodeFrom, Node nodeTo) {
+    exists(StepSummary summary |
+      StepSummary::smallstep(nodeFrom, nodeTo, summary) and
+      result = this.append(summary)
+    )
+    or
+    EssaFlow::essaFlowStep(nodeFrom, nodeTo) and
+    result = this
+  }
+}
diff --git a/python/ql/src/experimental/dataflow/internal/DataFlowPublic.qll b/python/ql/src/experimental/dataflow/internal/DataFlowPublic.qll
@@ -4,6 +4,7 @@
 
 private import python
 private import DataFlowPrivate
+import experimental.dataflow.TypeTracker
 
 /**
  * IPA type for data flow nodes.
@@ -69,6 +70,14 @@ class Node extends TNode {
 
   /** Convenience method for casting to ExprNode and calling getNode and getNode again. */
   Expr asExpr() { none() }
+
+  /**
+   * Gets a node that this node may flow to using one heap and/or interprocedural step.
+   *
+   * See `TypeTracker` for more details about how to use this.
+   */
+  pragma[inline]
+  Node track(TypeTracker t2, TypeTracker t) { t = t2.step(this, result) }
 }
 
 class EssaNode extends Node, TEssaNode {
diff --git a/python/ql/test/experimental/dataflow/typetracking/attribute_tests.py b/python/ql/test/experimental/dataflow/typetracking/attribute_tests.py
@@ -0,0 +1,31 @@
+class SomeClass:
+    pass
+
+def simple_read_write():
+    x = SomeClass() # $tracked=foo
+    x.foo = tracked # $tracked $tracked=foo
+    y = x.foo # $tracked=foo $tracked
+    do_stuff(y) # $tracked
+
+def foo():
+    x = SomeClass() # $tracked=attr
+    bar(x) # $tracked=attr
+    x.attr = tracked # $tracked=attr $tracked
+    baz(x) # $tracked=attr
+
+def bar(x): # $tracked=attr
+    z = x.attr # $tracked $tracked=attr
+    do_stuff(z) # $tracked
+
+def expects_int(x): # $int=field $f+:str=field
+    do_int_stuff(x.field) # $int $f+:str $int=field $f+:str=field
+
+def expects_string(x): # $f+:int=field $str=field
+    do_string_stuff(x.field) # $f+:int $str $f+:int=field $str=field
+
+def test_incompatible_types():
+    x = SomeClass() # $int,str=field
+    x.field = int(5) # $int=field $f+:str=field $int $f+:str
+    expects_int(x) # $int=field $f+:str=field
+    x.field = str("Hello") # $f+:int=field $str=field $f+:int $str
+    expects_string(x) # $f+:int=field $str=field
diff --git a/python/ql/test/experimental/dataflow/typetracking/test.py b/python/ql/test/experimental/dataflow/typetracking/test.py
@@ -0,0 +1,61 @@
+def get_tracked():
+    x = tracked # $tracked
+    return x # $tracked
+
+def use_tracked_foo(x): # $tracked
+    do_stuff(x) # $tracked
+
+def foo():
+    use_tracked_foo(
+        get_tracked() # $tracked
+    )
+
+def use_tracked_bar(x): # $tracked
+    do_stuff(x) # $tracked
+
+def bar():
+    x = get_tracked() # $tracked
+    use_tracked_bar(x) # $tracked
+
+def use_tracked_baz(x): # $tracked
+    do_stuff(x) # $tracked
+
+def baz():
+    x = tracked # $tracked
+    use_tracked_baz(x) # $tracked
+
+def id(x): # $tracked
+    return x # $tracked
+
+def use_tracked_quux(x): # $f-:tracked
+    do_stuff(y) # call after return -- not tracked in here.
+
+def quux():
+    x = tracked # $tracked
+    y = id(x) # $tracked
+    use_tracked_quux(y) # not tracked out of call to id.
+
+g = None
+
+def write_g(x): # $tracked
+    g = x # $tracked
+
+def use_g():
+    do_stuff(g) # $f-:tracked // no global flow for now.
+
+def global_var_write_test():
+    x = tracked # $tracked
+    write_g(x) # $tracked
+    use_g()
+
+def expects_int(x): # $int
+    do_int_stuff(x) # $int
+
+def expects_string(x): # $str
+    do_string_stuff(x) # $str
+
+def redefine_test():
+    x = int(5) # $int
+    expects_int(x) # $int
+    x = str("Hello") # $str
+    expects_string(x) # $str
diff --git a/python/ql/test/experimental/dataflow/typetracking/tracked.expected b/python/ql/test/experimental/dataflow/typetracking/tracked.expected
diff --git a/python/ql/test/experimental/dataflow/typetracking/tracked.ql b/python/ql/test/experimental/dataflow/typetracking/tracked.ql