|
12 | 12 | */ |
13 | 13 |
|
14 | 14 | import python |
15 | | -private import LegacyPointsTo |
| 15 | +import semmle.python.dataflow.new.DataFlow |
| 16 | +private import semmle.python.dataflow.new.internal.DataFlowDispatch |
| 17 | +private import semmle.python.ApiGraphs |
16 | 18 |
|
17 | | -/* |
18 | | - * This assumes that any indexing operation where the value is not a sequence or numpy array involves hashing. |
19 | | - * For sequences, the index must be an int, which are hashable, so we don't need to treat them specially. |
20 | | - * For numpy arrays, the index may be a list, which are not hashable and needs to be treated specially. |
| 19 | +/** |
| 20 | + * Holds if `cls` explicitly sets `__hash__` to `None`, making instances unhashable. |
21 | 21 | */ |
22 | | - |
23 | | -predicate numpy_array_type(ClassValue na) { |
24 | | - exists(ModuleValue np | np.getName() = "numpy" or np.getName() = "numpy.core" | |
25 | | - na.getASuperType() = np.attr("ndarray") |
26 | | - ) |
| 22 | +predicate setsHashToNone(Class cls) { |
| 23 | + DuckTyping::getAnAttributeValue(cls, "__hash__") instanceof None |
27 | 24 | } |
28 | 25 |
|
29 | | -predicate has_custom_getitem(Value v) { |
30 | | - v.getClass().lookup("__getitem__") instanceof PythonFunctionValue |
| 26 | +/** |
| 27 | + * Holds if `cls` is a user-defined class whose instances are unhashable. |
| 28 | + * A new-style class without `__hash__` is unhashable, as is one that explicitly |
| 29 | + * sets `__hash__ = None`. |
| 30 | + */ |
| 31 | +predicate isUnhashableUserClass(Class cls) { |
| 32 | + DuckTyping::isNewStyle(cls) and |
| 33 | + not DuckTyping::hasMethod(cls, "__hash__") and |
| 34 | + not DuckTyping::hasUnresolvedBase(getADirectSuperclass*(cls)) |
31 | 35 | or |
32 | | - numpy_array_type(v.getClass()) |
| 36 | + setsHashToNone(cls) |
33 | 37 | } |
34 | 38 |
|
35 | | -predicate explicitly_hashed(ControlFlowNode f) { |
36 | | - exists(CallNode c, GlobalVariable hash | |
37 | | - c.getArg(0) = f and c.getFunction().(NameNode).uses(hash) and hash.getId() = "hash" |
| 39 | +/** |
| 40 | + * Gets the name of a builtin type whose instances are unhashable. |
| 41 | + */ |
| 42 | +string getUnhashableBuiltinName() { result = ["list", "set", "dict", "bytearray"] } |
| 43 | + |
| 44 | +/** |
| 45 | + * Holds if `origin` is a local source node tracking an unhashable instance that |
| 46 | + * flows to `node`, with `clsName` describing the class for the alert. |
| 47 | + */ |
| 48 | +predicate isUnhashable(DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName) { |
| 49 | + exists(Class c | |
| 50 | + isUnhashableUserClass(c) and |
| 51 | + origin = classInstanceTracker(c) and |
| 52 | + origin.flowsTo(node) and |
| 53 | + clsName = c.getName() |
38 | 54 | ) |
| 55 | + or |
| 56 | + clsName = getUnhashableBuiltinName() and |
| 57 | + origin = API::builtin(clsName).getAnInstance().asSource() and |
| 58 | + origin.flowsTo(node) |
| 59 | +} |
| 60 | + |
| 61 | +predicate explicitly_hashed(DataFlow::Node node) { |
| 62 | + node = API::builtin("hash").getACall().getArg(0) |
39 | 63 | } |
40 | 64 |
|
41 | | -predicate unhashable_subscript(ControlFlowNode f, ClassValue c, ControlFlowNode origin) { |
42 | | - is_unhashable(f, c, origin) and |
43 | | - exists(SubscriptNode sub | sub.getIndex() = f | |
44 | | - exists(Value custom_getitem | |
45 | | - sub.getObject().(ControlFlowNodeWithPointsTo).pointsTo(custom_getitem) and |
46 | | - not has_custom_getitem(custom_getitem) |
47 | | - ) |
| 65 | +/** |
| 66 | + * Holds if the subscript object in `sub[...]` is known to use hashing for indexing, |
| 67 | + * i.e. it does not have a custom `__getitem__` that could accept unhashable indices. |
| 68 | + */ |
| 69 | +predicate subscriptUsesHashing(Subscript sub) { |
| 70 | + DataFlow::exprNode(sub.getObject()) = |
| 71 | + API::builtin("dict").getAnInstance().getAValueReachableFromSource() |
| 72 | + or |
| 73 | + exists(Class cls | |
| 74 | + classInstanceTracker(cls) |
| 75 | + .(DataFlow::LocalSourceNode) |
| 76 | + .flowsTo(DataFlow::exprNode(sub.getObject())) and |
| 77 | + not DuckTyping::hasMethod(cls, "__getitem__") |
48 | 78 | ) |
49 | 79 | } |
50 | 80 |
|
51 | | -predicate is_unhashable(ControlFlowNodeWithPointsTo f, ClassValue cls, ControlFlowNode origin) { |
52 | | - exists(Value v | f.pointsTo(v, origin) and v.getClass() = cls | |
53 | | - not cls.hasAttribute("__hash__") and not cls.failedInference(_) and cls.isNewStyle() |
54 | | - or |
55 | | - cls.lookup("__hash__") = Value::named("None") |
| 81 | +predicate unhashable_subscript(DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName) { |
| 82 | + exists(Subscript sub | |
| 83 | + node = DataFlow::exprNode(sub.getIndex()) and |
| 84 | + subscriptUsesHashing(sub) |
| 85 | + | |
| 86 | + isUnhashable(origin, node, clsName) |
56 | 87 | ) |
57 | 88 | } |
58 | 89 |
|
59 | 90 | /** |
60 | | - * Holds if `f` is inside a `try` that catches `TypeError`. For example: |
61 | | - * |
62 | | - * try: |
63 | | - * ... f ... |
64 | | - * except TypeError: |
65 | | - * ... |
66 | | - * |
67 | | - * This predicate is used to eliminate false positive results. If `hash` |
68 | | - * is called on an unhashable object then a `TypeError` will be thrown. |
69 | | - * But this is not a bug if the code catches the `TypeError` and handles |
70 | | - * it. |
| 91 | + * Holds if `e` is inside a `try` that catches `TypeError`. |
71 | 92 | */ |
72 | | -predicate typeerror_is_caught(ControlFlowNode f) { |
| 93 | +predicate typeerror_is_caught(Expr e) { |
73 | 94 | exists(Try try | |
74 | | - try.getBody().contains(f.getNode()) and |
75 | | - try.getAHandler().getType().(ExprWithPointsTo).pointsTo(ClassValue::typeError()) |
| 95 | + try.getBody().contains(e) and |
| 96 | + try.getAHandler().getType() = API::builtin("TypeError").getAValueReachableFromSource().asExpr() |
76 | 97 | ) |
77 | 98 | } |
78 | 99 |
|
79 | | -from ControlFlowNode f, ClassValue c, ControlFlowNode origin |
| 100 | +from DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName |
80 | 101 | where |
81 | | - not typeerror_is_caught(f) and |
| 102 | + not typeerror_is_caught(node.asExpr()) and |
82 | 103 | ( |
83 | | - explicitly_hashed(f) and is_unhashable(f, c, origin) |
| 104 | + explicitly_hashed(node) and isUnhashable(origin, node, clsName) |
84 | 105 | or |
85 | | - unhashable_subscript(f, c, origin) |
| 106 | + unhashable_subscript(origin, node, clsName) |
86 | 107 | ) |
87 | | -select f.getNode(), "This $@ of $@ is unhashable.", origin, "instance", c, c.getQualifiedName() |
| 108 | +select node, "This $@ of $@ is unhashable.", origin, "instance", origin, clsName |
0 commit comments