Skip to content

Commit 3d7db0e

Browse files
committed
add pandas code execution sinks, add proper tests
1 parent 6b63492 commit 3d7db0e

File tree

2 files changed

+199
-0
lines changed

2 files changed

+199
-0
lines changed

python/ql/lib/semmle/python/frameworks/Pandas.qll

Lines changed: 114 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,4 +34,118 @@ private module Pandas {
3434

3535
override string getFormat() { result = "pickle" }
3636
}
37+
38+
/**
 * Provides the extension point for modeling `pandas.DataFrame` objects.
 */
module DataFrame {
39+
/**
40+
* A `pandas.DataFrame` object.
* Extend this abstract class to model an additional way of obtaining a DataFrame.
41+
* See https://pandas.pydata.org/docs/reference/frame.html
42+
*/
43+
abstract class Range extends API::Node {
44+
// Delegates to the `API::Node` rendering of this node.
override string toString() { result = this.(API::Node).toString() }
45+
}
46+
}
47+
48+
/**
 * A `pandas.DataFrame` object, including DataFrames that are derived from
 * another DataFrame through a DataFrame-returning method or attribute.
 * Use this class where you want to find all `pandas.DataFrame` objects.
 * See https://pandas.pydata.org/pandas-docs/stable/reference/frame.html
 */
class DataFrame extends API::Node {
  DataFrame() {
    // Primary DataFrames: everything modeled through the `DataFrame::Range` extension point.
    this = any(DataFrame::Range df)
    or
    // Derived DataFrames: the result of calling a DataFrame-returning method on any
    // DataFrame. The recursion through `DataFrame` follows call chains of arbitrary
    // length, e.g. `df.dropna().head().query(..)`.
    this =
      any(DataFrame df)
          .getMember([
              // construction and conversion
              "copy", "from_records", "from_dict", "from_spmatrix", "assign", "select_dtypes",
              "set_flags", "astype", "infer_objects", "to_dense", "to_period", "__dataframe__",
              // indexing and selection
              "head", "tail", "xs", "get", "isin", "where", "mask", "query", "filter", "first",
              "last", "take", "truncate", "sample", "at_time", "between_time", "asof",
              // binary operators and comparisons
              "add", "sub", "mul", "div", "truediv", "floordiv", "mod", "pow", "dot", "radd",
              "rsub", "rmul", "rdiv", "rtruediv", "rfloordiv", "rmod", "rpow", "lt", "gt", "le",
              "ge", "ne", "eq", "combine", "compare",
              // function application and statistics
              "agg", "aggregate", "apply", "transform", "all", "any", "clip", "corr", "cov",
              "cummax", "cummin", "cumprod", "cumsum", "describe", "mode", "pct_change",
              "quantile", "rank", "round", "sem", "idxmin", "nlargest", "nsmallest",
              // relabeling and reindexing
              "add_prefix", "add_suffix", "drop", "drop_duplicates", "reindex", "reindex_like",
              "reset_index", "set_axis", "droplevel", "reorder_levels", "swaplevel",
              // missing data handling
              "bfill", "dropna", "ffill", "fillna", "interpolate", "isna", "isnull", "notna",
              "notnull", "pad", "replace",
              // reshaping, sorting and combining
              "pivot", "pivot_table", "sort_values", "sort_index", "stack", "unstack", "melt",
              "explode", "squeeze", "T", "transpose", "join", "merge",
              // time series and plotting
              "shift", "tz_convert", "tz_localize", "asfreq", "box"
            ])
          .getReturn()
    or
    // `DataFrame.T` is a property, so the attribute read itself (without a call)
    // already evaluates to the transposed DataFrame.
    this = any(DataFrame df).getMember("T")
  }

  override string toString() { result = this.(API::Node).toString() }
}
86+
87+
/**
 * The object produced by calling the `pandas.DataFrame` constructor.
 * See https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html
 */
class DataFrameConstructor extends DataFrame::Range {
  DataFrameConstructor() {
    exists(API::Node dataFrameClass |
      dataFrameClass = API::moduleImport("pandas").getMember("DataFrame")
    |
      this = dataFrameClass.getReturn()
    )
  }
}
94+
95+
/**
 * A `pandas.DataFrame` returned by one of the `pandas.read_*` functions.
 * See https://pandas.pydata.org/docs/reference/io.html
 */
class InputRead extends DataFrame::Range {
  InputRead() {
    // Readers whose return value is modeled as a DataFrame.
    this =
      API::moduleImport("pandas")
          .getMember([
              "read_csv", "read_fwf", "read_pickle", "read_table", "read_clipboard", "read_excel",
              "read_xml", "read_parquet", "read_orc", "read_spss", "read_sql_table",
              "read_sql_query", "read_sql", "read_gbq", "read_stata"
            ])
          .getReturn()
    or
    // The result of `read_html` is subscripted to reach an individual DataFrame,
    // e.g. `pd.read_html(..)[0]`.
    this = API::moduleImport("pandas").getMember("read_html").getReturn().getASubscript()
    or
    exists(API::CallNode readSasCall |
      readSasCall = API::moduleImport("pandas").getMember("read_sas").getACall() and
      // Bind the result to this specific call, so that the argument checks below
      // apply to the call that produced `this` and not to an unrelated `read_sas` call.
      this = readSasCall.getReturn()
    |
      // `read_sas` returns a DataFrame if `iterator=False` and `chunksize=None`,
      // which are also the defaults. An absent argument satisfies both checks.
      not readSasCall.getParameter(5, "iterator").asSink().asExpr().(BooleanLiteral) instanceof
        True and
      not exists(
        readSasCall.getParameter(4, "chunksize").asSink().asExpr().(IntegerLiteral).getN()
      )
    )
  }
}
130+
131+
/**
 * A call to the `query` or `eval` method of a `pandas.DataFrame`.
 * The expression passed to the call is reported as executed code.
 * See https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html
 * https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.eval.html
 */
class DataFlowQueryCall extends CodeExecution::Range, API::CallNode {
  DataFlowQueryCall() {
    exists(DataFrame frame, string methodName | methodName = ["query", "eval"] |
      this = frame.getMember(methodName).getACall()
    )
  }

  override DataFlow::Node getCode() {
    // The expression is the first positional argument, or the `expr` keyword argument.
    exists(API::Node exprArg | exprArg = this.getParameter(0, "expr") |
      result = exprArg.asSink()
    )
  }
}
141+
142+
/**
 * A call to the module-level function `pandas.eval`.
 * The expression passed to the call is reported as executed code.
 * See https://pandas.pydata.org/docs/reference/api/pandas.eval.html
 */
class PandasEval extends CodeExecution::Range, API::CallNode {
  PandasEval() {
    exists(API::Node evalFunction |
      evalFunction = API::moduleImport("pandas").getMember("eval")
    |
      this = evalFunction.getACall()
    )
  }

  override DataFlow::Node getCode() {
    // The expression is the first positional argument, or the `expr` keyword argument.
    exists(API::Node exprArg | exprArg = this.getParameter(0, "expr") |
      result = exprArg.asSink()
    )
  }
}
37151
}
Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,85 @@
1+
# Test input for the pandas code-execution model. The trailing `getCode` annotations
# name the argument that is expected to be reported as executed code.
import pandas as pd
2+
3+
4+
# A DataFrame built with the constructor; the sinks are reached through one
# intermediate DataFrame-returning method (`sample`, `mod`).
df = pd.DataFrame({'temp_c': [17.0, 25.0]}, index=['Portland', 'Berkeley'])
5+
df.sample().query("query") # $getCode="query"
6+
df.mod().query("query") # $getCode="query"
7+
# The module-level `pandas.eval` is a sink in its own right.
pd.eval("pythonExpr", target=df) # $getCode="pythonExpr"
8+
9+
# Each `pandas.read_*` function below yields a DataFrame, so `query` and `eval`
# on its result are expected to be reported.
df = pd.read_csv("filepath")
10+
df.query("query") # $getCode="query"
11+
df.eval("query") # $getCode="query"
12+
# One level of chaining through a DataFrame-returning method.
df.copy().query("query") # $getCode="query"
13+
14+
df = pd.read_fwf("filepath")
15+
df.query("query") # $getCode="query"
16+
df.eval("query") # $getCode="query"
17+
18+
19+
# `read_pickle` additionally carries the decoding expectations of the pickle model.
df = pd.read_pickle("filepath") # $ decodeInput="filepath" decodeOutput=pd.read_pickle(..) decodeFormat=pickle decodeMayExecuteInput
20+
df.query("query") # $getCode="query"
21+
df.eval("query") # $getCode="query"
22+
23+
df = pd.read_table("filepath")
24+
df.query("query") # $getCode="query"
25+
df.eval("query") # $getCode="query"
26+
27+
df = pd.read_clipboard("filepath")
28+
df.query("query") # $getCode="query"
29+
df.eval("query") # $getCode="query"
30+
31+
df = pd.read_excel("filepath")
32+
df.query("query") # $getCode="query"
33+
df.eval("query") # $getCode="query"
34+
35+
# The `read_html` result is subscripted to reach a single DataFrame.
df = pd.read_html("filepath")
36+
df[0].query("query") # $getCode="query"
37+
38+
df = pd.read_xml("filepath")
39+
df.query("query") # $getCode="query"
40+
df.eval("query") # $getCode="query"
41+
42+
df = pd.read_parquet("filepath")
43+
df.query("query") # $getCode="query"
44+
df.eval("query") # $getCode="query"
45+
46+
df = pd.read_orc("filepath")
47+
df.query("query") # $getCode="query"
48+
df.eval("query") # $getCode="query"
49+
50+
df = pd.read_spss("filepath")
51+
df.query("query") # $getCode="query"
52+
df.eval("query") # $getCode="query"
53+
54+
df = pd.read_sql_table("filepath", 'postgres:///db_name')
55+
df.query("query") # $getCode="query"
56+
df.eval("query") # $getCode="query"
57+
58+
df = pd.read_sql_query("filepath", 'postgres:///db_name')
59+
df.query("query") # $getCode="query"
60+
df.eval("query") # $getCode="query"
61+
62+
df = pd.read_sql("filepath", 'postgres:///db_name')
63+
df.query("query") # $getCode="query"
64+
df.eval("query") # $getCode="query"
65+
66+
df = pd.read_gbq("filepath")
67+
df.query("query") # $getCode="query"
68+
df.eval("query") # $getCode="query"
69+
70+
df = pd.read_stata("filepath")
71+
df.query("query") # $getCode="query"
72+
df.eval("query") # $getCode="query"
73+
74+
# `read_sas` with default arguments yields a DataFrame, so both sinks are expected.
df = pd.read_sas("filepath")
75+
df.query("query") # $getCode="query"
76+
df.eval("query") # $getCode="query"
77+
# No annotation on the next three cases: with `iterator=True` and/or an integer
# `chunksize`, the result is not modeled as a DataFrame, so nothing should be reported.
df = pd.read_sas("filepath", iterator=True, chunksize=1)
78+
df.query("query")
79+
df = pd.read_sas("filepath", iterator=False, chunksize=1)
80+
df.query("query")
81+
df = pd.read_sas("filepath", iterator=True, chunksize=None)
82+
df.query("query")
83+
# Explicitly passing the default values behaves like the default call above.
df = pd.read_sas("filepath", iterator=False, chunksize=None)
84+
df.query("query") # $getCode="query"
85+
df.eval("query") # $getCode="query"

0 commit comments

Comments
 (0)