Skip to content

Commit 7b34f11

Browse files
committed
Python: Port HashedButNoHash.ql
This one is a bit more involved. Of note is the fact that it at present only uses local flow when determining the origin of some value (whereas the points-to version used global flow). It may be desirable to rewrite this query to use global data-flow, but this should be done with some care (as using "all unhashable objects" as the set of sources is somewhat iffy with respect to performance). For that reason, I'm sticking to mostly local flow (except for well behaved things like types and built-ins).
1 parent 6826c4e commit 7b34f11

File tree

2 files changed

+69
-48
lines changed

2 files changed

+69
-48
lines changed

python/ql/src/Expressions/HashedButNoHash.ql

Lines changed: 68 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -12,76 +12,97 @@
1212
*/
1313

1414
import python
15-
private import LegacyPointsTo
15+
import semmle.python.dataflow.new.DataFlow
16+
private import semmle.python.dataflow.new.internal.DataFlowDispatch
17+
private import semmle.python.ApiGraphs
1618

17-
/*
18-
* This assumes that any indexing operation where the value is not a sequence or numpy array involves hashing.
19-
* For sequences, the index must be an int, which are hashable, so we don't need to treat them specially.
20-
* For numpy arrays, the index may be a list, which are not hashable and needs to be treated specially.
19+
/**
20+
* Holds if `cls` explicitly sets `__hash__` to `None`, making instances unhashable.
2121
*/
22-
23-
predicate numpy_array_type(ClassValue na) {
24-
exists(ModuleValue np | np.getName() = "numpy" or np.getName() = "numpy.core" |
25-
na.getASuperType() = np.attr("ndarray")
26-
)
22+
predicate setsHashToNone(Class cls) {
23+
DuckTyping::getAnAttributeValue(cls, "__hash__") instanceof None
2724
}
2825

29-
predicate has_custom_getitem(Value v) {
30-
v.getClass().lookup("__getitem__") instanceof PythonFunctionValue
26+
/**
27+
* Holds if `cls` is a user-defined class whose instances are unhashable.
28+
* A new-style class without `__hash__` is unhashable, as is one that explicitly
29+
* sets `__hash__ = None`.
30+
*/
31+
predicate isUnhashableUserClass(Class cls) {
32+
DuckTyping::isNewStyle(cls) and
33+
not DuckTyping::hasMethod(cls, "__hash__") and
34+
not DuckTyping::hasUnresolvedBase(getADirectSuperclass*(cls))
3135
or
32-
numpy_array_type(v.getClass())
36+
setsHashToNone(cls)
3337
}
3438

35-
predicate explicitly_hashed(ControlFlowNode f) {
36-
exists(CallNode c, GlobalVariable hash |
37-
c.getArg(0) = f and c.getFunction().(NameNode).uses(hash) and hash.getId() = "hash"
39+
/**
40+
* Gets the name of a builtin type whose instances are unhashable.
41+
*/
42+
string getUnhashableBuiltinName() { result = ["list", "set", "dict", "bytearray"] }
43+
44+
/**
45+
* Holds if `origin` is a local source node tracking an unhashable instance that
46+
* flows to `node`, with `clsName` describing the class for the alert.
47+
*/
48+
predicate isUnhashable(DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName) {
49+
exists(Class c |
50+
isUnhashableUserClass(c) and
51+
origin = classInstanceTracker(c) and
52+
origin.flowsTo(node) and
53+
clsName = c.getName()
3854
)
55+
or
56+
clsName = getUnhashableBuiltinName() and
57+
origin = API::builtin(clsName).getAnInstance().asSource() and
58+
origin.flowsTo(node)
59+
}
60+
61+
predicate explicitly_hashed(DataFlow::Node node) {
62+
node = API::builtin("hash").getACall().getArg(0)
3963
}
4064

41-
predicate unhashable_subscript(ControlFlowNode f, ClassValue c, ControlFlowNode origin) {
42-
is_unhashable(f, c, origin) and
43-
exists(SubscriptNode sub | sub.getIndex() = f |
44-
exists(Value custom_getitem |
45-
sub.getObject().(ControlFlowNodeWithPointsTo).pointsTo(custom_getitem) and
46-
not has_custom_getitem(custom_getitem)
47-
)
65+
/**
66+
* Holds if the subscript object in `sub[...]` is known to use hashing for indexing,
67+
* i.e. it does not have a custom `__getitem__` that could accept unhashable indices.
68+
*/
69+
predicate subscriptUsesHashing(Subscript sub) {
70+
DataFlow::exprNode(sub.getObject()) =
71+
API::builtin("dict").getAnInstance().getAValueReachableFromSource()
72+
or
73+
exists(Class cls |
74+
classInstanceTracker(cls)
75+
.(DataFlow::LocalSourceNode)
76+
.flowsTo(DataFlow::exprNode(sub.getObject())) and
77+
not DuckTyping::hasMethod(cls, "__getitem__")
4878
)
4979
}
5080

51-
predicate is_unhashable(ControlFlowNodeWithPointsTo f, ClassValue cls, ControlFlowNode origin) {
52-
exists(Value v | f.pointsTo(v, origin) and v.getClass() = cls |
53-
not cls.hasAttribute("__hash__") and not cls.failedInference(_) and cls.isNewStyle()
54-
or
55-
cls.lookup("__hash__") = Value::named("None")
81+
predicate unhashable_subscript(DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName) {
82+
exists(Subscript sub |
83+
node = DataFlow::exprNode(sub.getIndex()) and
84+
subscriptUsesHashing(sub)
85+
|
86+
isUnhashable(origin, node, clsName)
5687
)
5788
}
5889

5990
/**
60-
* Holds if `f` is inside a `try` that catches `TypeError`. For example:
61-
*
62-
* try:
63-
* ... f ...
64-
* except TypeError:
65-
* ...
66-
*
67-
* This predicate is used to eliminate false positive results. If `hash`
68-
* is called on an unhashable object then a `TypeError` will be thrown.
69-
* But this is not a bug if the code catches the `TypeError` and handles
70-
* it.
91+
* Holds if `e` is inside a `try` that catches `TypeError`.
7192
*/
72-
predicate typeerror_is_caught(ControlFlowNode f) {
93+
predicate typeerror_is_caught(Expr e) {
7394
exists(Try try |
74-
try.getBody().contains(f.getNode()) and
75-
try.getAHandler().getType().(ExprWithPointsTo).pointsTo(ClassValue::typeError())
95+
try.getBody().contains(e) and
96+
try.getAHandler().getType() = API::builtin("TypeError").getAValueReachableFromSource().asExpr()
7697
)
7798
}
7899

79-
from ControlFlowNode f, ClassValue c, ControlFlowNode origin
100+
from DataFlow::LocalSourceNode origin, DataFlow::Node node, string clsName
80101
where
81-
not typeerror_is_caught(f) and
102+
not typeerror_is_caught(node.asExpr()) and
82103
(
83-
explicitly_hashed(f) and is_unhashable(f, c, origin)
104+
explicitly_hashed(node) and isUnhashable(origin, node, clsName)
84105
or
85-
unhashable_subscript(f, c, origin)
106+
unhashable_subscript(origin, node, clsName)
86107
)
87-
select f.getNode(), "This $@ of $@ is unhashable.", origin, "instance", c, c.getQualifiedName()
108+
select node, "This $@ of $@ is unhashable.", origin, "instance", origin, clsName
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
| expressions_test.py:42:20:42:25 | unhash | This $@ of $@ is unhashable. | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | instance | file://:0:0:0:0 | builtin-class list | list |
1+
| expressions_test.py:42:20:42:25 | ControlFlowNode for unhash | This $@ of $@ is unhashable. | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | instance | expressions_test.py:41:32:41:37 | ControlFlowNode for list() | list |

0 commit comments

Comments
 (0)