@@ -12,76 +12,165 @@ import semmle.python.dataflow.new.DataFlow
1212private import semmle.python.internal.CachedStages
1313
1414/**
15- * Provides classes and predicates for working with APIs used in a database.
15+ * Provides classes and predicates for working with the API boundary between the current
16+ * codebase and external libraries.
17+ *
18+ * See `API::Node` for more in-depth documentation.
1619 */
1720module API {
1821 /**
19- * An abstract representation of a definition or use of an API component such as a function
20- * exported by a Python package, or its result.
22+ * A node in the API graph, representing a value that has crossed the boundary between this
23+ * codebase and an external library (or in general, any external codebase).
24+ *
25+ * ### Basic usage
26+ *
27+ * API graphs are typically used to identify "API calls", that is, calls to an external function
28+ * whose implementation is not necessarily part of the current codebase.
29+ *
30+ * The most basic use of API graphs is typically as follows:
31+ * 1. Start with `API::moduleImport` for the relevant library.
32+ * 2. Follow up with a chain of accessors such as `getMember` describing how to get to the relevant API function.
33+ * 3. Map the resulting API graph nodes to data-flow nodes, using `asSource` or `asSink`.
34+ *
35+ * For example, a simplified way to get the first argument of a call to `json.dumps` would be
36+ * ```ql
37+ * API::moduleImport("json").getMember("dumps").getParameter(0).asSink()
38+ * ```
39+ *
40+ * The most commonly used accessors are `getMember`, `getParameter`, and `getReturn`.
41+ *
42+ * ### API graph nodes
43+ *
44+ * There are two kinds of nodes in the API graphs, distinguished by who is "holding" the value:
45+ * - **Use-nodes** represent values held by the current codebase, which came from an external library.
46+ * (The current codebase is "using" a value that came from the library).
47+ * - **Def-nodes** represent values held by the external library, which came from this codebase.
48+ * (The current codebase "defines" the value seen by the library).
49+ *
50+ * API graph nodes are associated with data-flow nodes in the current codebase.
51+ * (API graphs are designed to work when external libraries are not part of the database,
52+ * so we do not associate with concrete data-flow nodes from the external library).
53+ * - **Use-nodes** are associated with data-flow nodes where a value enters the current codebase,
54+ * such as the return value of a call to an external function.
55+ * - **Def-nodes** are associated with data-flow nodes where a value leaves the current codebase,
56+ * such as an argument passed in a call to an external function.
57+ *
58+ *
59+ * ### Access paths and edge labels
60+ *
61+ * Nodes in the API graph are associated with a set of access paths, describing a series of operations
62+ * that may be performed to obtain that value.
63+ *
64+ * For example, the access path `API::moduleImport("json").getMember("dumps")` represents the action of
65+ * importing `json` and then accessing the member `dumps` on the resulting object.
66+ *
67+ * Each edge in the graph is labelled by such an "operation". For an edge `A->B`, the type of the `A` node
68+ * determines who is performing the operation, and the type of the `B` node determines who ends up holding
69+ * the result:
70+ * - An edge starting from a use-node describes what the current codebase is doing to a value that
71+ * came from a library.
72+ * - An edge starting from a def-node describes what the external library might do to a value that
73+ * came from the current codebase.
74+ * - An edge ending in a use-node means the result ends up in the current codebase (at its associated data-flow node).
75+ * - An edge ending in a def-node means the result ends up in external code (its associated data-flow node is
76+ * the place where it was "last seen" in the current codebase before flowing out).
77+ *
78+ * Because the implementation of the external library is not visible, it is not known exactly what operations
79+ * it will perform on values that flow there. Instead, the edges starting from a def-node are operations that would
80+ * lead to an observable effect within the current codebase, without knowing for certain if the library will actually perform
81+ * those operations. (When constructing these edges, we assume the library is somewhat well-behaved).
82+ *
83+ * For example, given this snippet:
84+ * ```python
85+ * import foo
86+ * foo.bar(lambda x: doSomething(x))
87+ * ```
88+ * A callback is passed to the external function `foo.bar`. We can't know if `foo.bar` will actually invoke this callback.
89+ * But _if_ the library should decide to invoke the callback, then a value will flow into the current codebase via the `x` parameter.
90+ * For that reason, an edge is generated representing the argument-passing operation that might be performed by `foo.bar`.
91+ * This edge is going from the def-node associated with the callback to the use-node associated with the parameter `x`.
2192 */
2293 class Node extends Impl:: TApiNode {
2394 /**
24- * Gets a data-flow node corresponding to a use of the API component represented by this node .
95+ * Gets a data-flow node where this value may flow after entering the current codebase.
2596 *
26- * For example, `import re; re.escape` is a use of the `escape` function from the
27- * `re` module, and `import re; re.escape("hello")` is a use of the return of that function.
28- *
29- * This includes indirect uses found via data flow, meaning that in
30- * ```python
31- * def f(x):
32- * pass
33- *
34- * f(obj.foo)
35- * ```
36- * both `obj.foo` and `x` are uses of the `foo` member from `obj`.
97+ * This is similar to `asSource()` but additionally includes nodes that are transitively reachable by data flow.
98+ * See `asSource()` for examples.
3799 */
38- DataFlow:: Node getAUse ( ) {
100+ DataFlow:: Node getAValueReachableFromSource ( ) {
39101 exists ( DataFlow:: LocalSourceNode src | Impl:: use ( this , src ) |
40102 Impl:: trackUseNode ( src ) .flowsTo ( result )
41103 )
42104 }
43105
44106 /**
45- * Gets a data-flow node corresponding to the right-hand side of a definition of the API
46- * component represented by this node .
107+ * Gets a data-flow node where this value leaves the current codebase and flows into an
108+ * external library (or in general, any external codebase).
47109 *
48- * For example, in the property write `foo.bar = x`, variable `x` is the the right-hand side
49- * of a write to the `bar` property of `foo` .
110+ * Concretely, this is either an argument passed to a call to external code,
111+ * or the right-hand side of an attribute write on an object flowing into such a call.
50112 *
51- * Note that for parameters, it is the arguments flowing into that parameter that count as
52- * right-hand sides of the definition, not the declaration of the parameter itself.
53- * Consequently, in :
113+ * For example:
54114 * ```python
55- * from mypkg import foo;
115+ * import foo
116+ *
117+ * # 'x' is matched by API::moduleImport("foo").getMember("bar").getParameter(0).asSink()
56118 * foo.bar(x)
119+ *
120+ * # 'x' is matched by API::moduleImport("foo").getMember("bar").getParameter(0).getMember("prop").asSink()
121+ * obj.prop = x
122+ * foo.bar(obj)
57123 * ```
58- * `x` is the right-hand side of a definition of the first parameter of `bar` from the `mypkg.foo` module.
124+ *
125+ * This predicate does not include nodes transitively reaching the sink by data flow;
126+ * use `getAValueReachingSink` for that.
59127 */
60- DataFlow:: Node getARhs ( ) { Impl:: rhs ( this , result ) }
128+ DataFlow:: Node asSink ( ) { Impl:: rhs ( this , result ) }
61129
62130 /**
63- * Gets a data-flow node that may interprocedurally flow to the right-hand side of a definition
64- * of the API component represented by this node.
131+ * Gets a data-flow node that transitively flows to an external library (or in general, any external codebase).
132+ *
133+ * This is similar to `asSink()` but additionally includes nodes that transitively reach a sink by data flow.
134+ * See `asSink()` for examples.
65135 */
66- DataFlow:: Node getAValueReachingRhs ( ) { result = Impl:: trackDefNode ( this .getARhs ( ) ) }
136+ DataFlow:: Node getAValueReachingSink ( ) { result = Impl:: trackDefNode ( this .asSink ( ) ) }
67137
68138 /**
69- * Gets an immediate use of the API component represented by this node .
139+ * Gets a data-flow node where this value enters the current codebase.
70140 *
71- * For example, `import re; re.escape` is a an immediate use of the `escape` member
72- * from the `re` module.
141+ * For example:
142+ * ```python
143+ * # API::moduleImport("re").asSource()
144+ * import re
145+ *
146+ * # API::moduleImport("re").getMember("escape").asSource()
147+ * re.escape
148+ *
149+ * # API::moduleImport("re").getMember("escape").getReturn().asSource()
150+ * re.escape()
151+ * ```
73152 *
74- * Unlike `getAUse()`, this predicate only gets the immediate references, not the indirect uses
75- * found via data flow. This means that in `x = re.escape` only `re.escape` is a reference
76- * to the `escape` member of `re`, neither `x` nor any node that `x` flows to is a reference to
77- * this API component.
153+ * This predicate does not include nodes transitively reachable by data flow;
154+ * use `getAValueReachableFromSource` for that.
78155 */
79- DataFlow:: LocalSourceNode getAnImmediateUse ( ) { Impl:: use ( this , result ) }
156+ DataFlow:: LocalSourceNode asSource ( ) { Impl:: use ( this , result ) }
157+
158+ /** DEPRECATED. This predicate has been renamed to `getAValueReachableFromSource()`. */
159+ deprecated DataFlow:: Node getAUse ( ) { result = this .getAValueReachableFromSource ( ) }
160+
161+ /** DEPRECATED. This predicate has been renamed to `asSource()`. */
162+ deprecated DataFlow:: LocalSourceNode getAnImmediateUse ( ) { result = this .asSource ( ) }
163+
164+ /** DEPRECATED. This predicate has been renamed to `asSink()`. */
165+ deprecated DataFlow:: Node getARhs ( ) { result = this .asSink ( ) }
166+
167+ /** DEPRECATED. This predicate has been renamed to `getAValueReachingSink()`. */
168+ deprecated DataFlow:: Node getAValueReachingRhs ( ) { result = this .getAValueReachingSink ( ) }
80169
81170 /**
82171 * Gets a call to the function represented by this API component.
83172 */
84- CallNode getACall ( ) { result = this .getReturn ( ) .getAnImmediateUse ( ) }
173+ CallNode getACall ( ) { result = this .getReturn ( ) .asSource ( ) }
85174
86175 /**
87176 * Gets a node representing member `m` of this API component.
@@ -306,7 +395,7 @@ module API {
306395 class CallNode extends DataFlow:: CallCfgNode {
307396 API:: Node callee ;
308397
309- CallNode ( ) { this = callee .getReturn ( ) .getAnImmediateUse ( ) }
398+ CallNode ( ) { this = callee .getReturn ( ) .asSource ( ) }
310399
311400 /** Gets the API node for the `i`th parameter of this invocation. */
312401 pragma [ nomagic]
@@ -319,14 +408,14 @@ module API {
319408 * Gets an API node where a RHS of the node is the `i`th argument to this call.
320409 */
321410 pragma [ noinline]
322- private Node getAParameterCandidate ( int i ) { result .getARhs ( ) = this .getArg ( i ) }
411+ private Node getAParameterCandidate ( int i ) { result .asSink ( ) = this .getArg ( i ) }
323412
324413 /** Gets the API node for a parameter of this invocation. */
325414 Node getAParameter ( ) { result = this .getParameter ( _) }
326415
327416 /** Gets the object that this method-call is being called on, if this is a method-call. */
328417 Node getSelfParameter ( ) {
329- result .getARhs ( ) = this .( DataFlow:: MethodCallNode ) .getObject ( ) and
418+ result .asSink ( ) = this .( DataFlow:: MethodCallNode ) .getObject ( ) and
330419 result = callee .getSelfParameter ( )
331420 }
332421
@@ -346,13 +435,13 @@ module API {
346435
347436 pragma [ noinline]
348437 private Node getAKeywordParameterCandidate ( string name ) {
349- result .getARhs ( ) = this .getArgByName ( name )
438+ result .asSink ( ) = this .getArgByName ( name )
350439 }
351440
352441 /** Gets the API node for the return value of this call. */
353442 Node getReturn ( ) {
354443 result = callee .getReturn ( ) and
355- result .getAnImmediateUse ( ) = this
444+ result .asSource ( ) = this
356445 }
357446
358447 /**
0 commit comments