Skip to content

Commit 9a4ad3c

Browse files
authored
Add compiler optimization to remove px.contains calls with empty strings (#1679)
1 parent 8157d69 commit 9a4ad3c

File tree

6 files changed

+326
-0
lines changed

6 files changed

+326
-0
lines changed

src/carnot/planner/compiler/optimizer/BUILD.bazel

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,3 +82,13 @@ pl_cc_test(
8282
"//src/carnot/udf_exporter:cc_library",
8383
],
8484
)
85+
86+
pl_cc_test(
87+
name = "prune_unused_contains_rule_test",
88+
srcs = ["prune_unused_contains_rule_test.cc"],
89+
deps = [
90+
":cc_library",
91+
"//src/carnot/planner/compiler:test_utils",
92+
"//src/carnot/udf_exporter:cc_library",
93+
],
94+
)

src/carnot/planner/compiler/optimizer/optimizer.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include "src/carnot/planner/compiler/optimizer/merge_nodes_rule.h"
2727
#include "src/carnot/planner/compiler/optimizer/prune_unconnected_operators_rule.h"
2828
#include "src/carnot/planner/compiler/optimizer/prune_unused_columns_rule.h"
29+
#include "src/carnot/planner/compiler/optimizer/prune_unused_contains_rule.h"
2930
#include "src/carnot/planner/compiler_state/compiler_state.h"
3031
#include "src/carnot/planner/compiler_state/registry_info.h"
3132
#include "src/carnot/planner/ir/ir.h"
@@ -62,10 +63,16 @@ class Optimizer : public RuleExecutor<IR> {
6263
prune_unused_columns->AddRule<PruneUnusedColumnsRule>();
6364
}
6465

66+
void CreatePruneUnusedContainsBatch() {
67+
RuleBatch* prune_unused_columns = CreateRuleBatch<DoOnce>("PruneUnusedContains");
68+
prune_unused_columns->AddRule<PruneUnusedContainsRule>();
69+
}
70+
6571
Status Init() {
6672
CreatePruneUnconnectedOpsBatch();
6773
CreateMergeNodesBatch();
6874
CreatePruneUnusedColumnsBatch();
75+
CreatePruneUnusedContainsBatch();
6976
return Status::OK();
7077
}
7178

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
/*
2+
* Copyright 2018- The Pixie Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*
16+
* SPDX-License-Identifier: Apache-2.0
17+
*/
18+
19+
#include "src/carnot/planner/compiler/optimizer/prune_unused_contains_rule.h"
20+
21+
#include <algorithm>
22+
#include <queue>
23+
24+
namespace px {
25+
namespace carnot {
26+
namespace planner {
27+
namespace compiler {
28+
29+
StatusOr<bool> PruneUnusedContainsRule::RemoveMatchingFilter(IRNode* ir_node) {
30+
auto ir_graph = ir_node->graph();
31+
auto node_id = ir_node->id();
32+
if (!Match(ir_node, Filter())) return false;
33+
34+
FilterIR* filter = static_cast<FilterIR*>(ir_node);
35+
ExpressionIR* expr = filter->filter_expr();
36+
37+
if (!Match(expr, Func("contains"))) return false;
38+
39+
FuncIR* func = static_cast<FuncIR*>(expr);
40+
auto args = func->all_args();
41+
auto str = args[0];
42+
auto substr = args[1];
43+
44+
if (!Match(substr, String(""))) {
45+
return false;
46+
}
47+
48+
DCHECK_EQ(ir_graph->dag().ParentsOf(node_id).size(), 1UL);
49+
auto parent_id = ir_graph->dag().ParentsOf(node_id)[0];
50+
auto parent_node = ir_graph->Get(parent_id);
51+
52+
PX_RETURN_IF_ERROR(ir_graph->DeleteNode(func->id()));
53+
54+
// str and substr may be connected to other nodes in the
55+
// IR graph. We should only delete those nodes once their
56+
// last parent is removed. This should happen with the
57+
// preceding FuncIR deletion.
58+
if (ir_graph->dag().ParentsOf(str->id()).size() == 0) {
59+
PX_RETURN_IF_ERROR(ir_graph->DeleteNode(str->id()));
60+
}
61+
if (ir_graph->dag().ParentsOf(substr->id()).size() == 0) {
62+
PX_RETURN_IF_ERROR(ir_graph->DeleteNode(substr->id()));
63+
}
64+
65+
// Reparent any child nodes of the filter
66+
for (int64_t child_id : ir_graph->dag().DependenciesOf(node_id)) {
67+
auto child_node = ir_graph->Get(child_id);
68+
69+
if (!Match(child_node, Operator())) {
70+
continue;
71+
}
72+
auto child_op_node = static_cast<OperatorIR*>(child_node);
73+
74+
PX_RETURN_IF_ERROR(child_op_node->ReplaceParent(static_cast<OperatorIR*>(ir_node),
75+
static_cast<OperatorIR*>(parent_node)));
76+
}
77+
PX_RETURN_IF_ERROR(ir_graph->DeleteNode(node_id));
78+
return true;
79+
}
80+
81+
StatusOr<bool> PruneUnusedContainsRule::Apply(IRNode* ir_node) {
82+
return RemoveMatchingFilter(ir_node);
83+
}
84+
85+
} // namespace compiler
86+
} // namespace planner
87+
} // namespace carnot
88+
} // namespace px
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
/*
2+
* Copyright 2018- The Pixie Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*
16+
* SPDX-License-Identifier: Apache-2.0
17+
*/
18+
19+
#pragma once
20+
21+
#include <vector>
22+
23+
#include "src/carnot/planner/rules/rules.h"
24+
25+
namespace px {
26+
namespace carnot {
27+
namespace planner {
28+
namespace compiler {
29+
30+
/**
31+
* @brief This rule removes contains functions that compare against an empty string.
32+
* This results in returning the original dataset and so it is equivalent to no filter
33+
* as seen in the example below:
34+
*
35+
* df = df[px.contains(df.pod, "")]
36+
*
37+
*/
38+
class PruneUnusedContainsRule : public Rule {
39+
public:
40+
PruneUnusedContainsRule()
41+
: Rule(nullptr, /*use_topo*/ true, /*reverse_topological_execution*/ true) {}
42+
43+
protected:
44+
StatusOr<bool> Apply(IRNode* ir_node) override;
45+
StatusOr<bool> RemoveMatchingFilter(IRNode* ir_node);
46+
};
47+
48+
} // namespace compiler
49+
} // namespace planner
50+
} // namespace carnot
51+
} // namespace px
Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
/*
2+
* Copyright 2018- The Pixie Authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*
16+
* SPDX-License-Identifier: Apache-2.0
17+
*/
18+
19+
#include <string>
20+
21+
#include <gtest/gtest.h>
22+
23+
#include "src/carnot/planner/compiler/analyzer/analyzer.h"
24+
#include "src/carnot/planner/compiler/optimizer/prune_unused_contains_rule.h"
25+
#include "src/carnot/planner/compiler/test_utils.h"
26+
27+
namespace px {
28+
namespace carnot {
29+
namespace planner {
30+
namespace compiler {
31+
32+
using table_store::schema::Relation;
33+
34+
using PruneUnusedContainsRuleTest = RulesTest;
35+
36+
TEST_F(PruneUnusedContainsRuleTest, Basic) {
37+
MemorySourceIR* mem_src = MakeMemSource(MakeRelation());
38+
compiler_state_->relation_map()->emplace("table", MakeRelation());
39+
40+
auto constant1 = graph->CreateNode<StringIR>(ast, "testing 1").ValueOrDie();
41+
auto constant2 = graph->CreateNode<StringIR>(ast, "testing 2").ValueOrDie();
42+
auto empty_str = graph->CreateNode<StringIR>(ast, "").ValueOrDie();
43+
44+
auto filter_func_1 =
45+
graph
46+
->CreateNode<FuncIR>(ast, FuncIR::Op{FuncIR::Opcode::non_op, "", "contains"},
47+
std::vector<ExpressionIR*>{constant1, empty_str})
48+
.ValueOrDie();
49+
50+
auto filter_func_2 =
51+
graph
52+
->CreateNode<FuncIR>(ast, FuncIR::Op{FuncIR::Opcode::non_op, "", "contains"},
53+
std::vector<ExpressionIR*>{constant2, empty_str})
54+
.ValueOrDie();
55+
56+
auto filter_ir_1 = graph->CreateNode<FilterIR>(ast, mem_src, filter_func_1).ValueOrDie();
57+
auto filter_ir_2 = graph->CreateNode<FilterIR>(ast, filter_ir_1, filter_func_2).ValueOrDie();
58+
59+
auto filter_ir_1_id = filter_ir_1->id();
60+
auto filter_ir_2_id = filter_ir_2->id();
61+
auto filter_func_ir_1_id = filter_func_1->id();
62+
auto filter_func_ir_2_id = filter_func_2->id();
63+
64+
auto limit = MakeLimit(static_cast<OperatorIR*>(filter_ir_2), 1000);
65+
66+
ResolveTypesRule type_rule(compiler_state_.get());
67+
ASSERT_OK(type_rule.Execute(graph.get()));
68+
69+
PruneUnusedContainsRule rule;
70+
auto result = rule.Execute(graph.get());
71+
ASSERT_OK(result);
72+
ASSERT_TRUE(result.ConsumeValueOrDie());
73+
74+
EXPECT_FALSE(graph->HasNode(filter_ir_1_id));
75+
EXPECT_FALSE(graph->HasNode(filter_ir_2_id));
76+
EXPECT_FALSE(graph->HasNode(filter_func_ir_1_id));
77+
EXPECT_FALSE(graph->HasNode(filter_func_ir_2_id));
78+
EXPECT_TRUE(graph->HasNode(limit->id()));
79+
}
80+
81+
TEST_F(PruneUnusedContainsRuleTest, IgnoresNonEmptyStringContains) {
82+
MemorySourceIR* mem_src = MakeMemSource(MakeRelation());
83+
compiler_state_->relation_map()->emplace("table", MakeRelation());
84+
85+
auto constant1 = graph->CreateNode<StringIR>(ast, "testing").ValueOrDie();
86+
auto constant2 = graph->CreateNode<StringIR>(ast, "existing value").ValueOrDie();
87+
88+
auto filter_func =
89+
graph
90+
->CreateNode<FuncIR>(ast, FuncIR::Op{FuncIR::Opcode::non_op, "", "contains"},
91+
std::vector<ExpressionIR*>{constant1, constant2})
92+
.ValueOrDie();
93+
auto filter_func_id = filter_func->id();
94+
95+
auto filter_ir = graph->CreateNode<FilterIR>(ast, mem_src, filter_func).ValueOrDie();
96+
auto filter_ir_id = filter_ir->id();
97+
98+
ResolveTypesRule type_rule(compiler_state_.get());
99+
ASSERT_OK(type_rule.Execute(graph.get()));
100+
101+
PruneUnusedContainsRule rule;
102+
auto result = rule.Execute(graph.get());
103+
ASSERT_OK(result);
104+
ASSERT_FALSE(result.ConsumeValueOrDie());
105+
106+
EXPECT_TRUE(graph->HasNode(filter_ir_id));
107+
EXPECT_TRUE(graph->HasNode(filter_func_id));
108+
}
109+
110+
} // namespace compiler
111+
} // namespace planner
112+
} // namespace carnot
113+
} // namespace px

src/carnot/planner/logical_planner_test.cc

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -673,6 +673,63 @@ TEST_F(LogicalPlannerTest, filter_pushdown_bug) {
673673
ASSERT_OK(plan->ToProto());
674674
}
675675

676+
const char kHttpDataScript[] = R"pxl(
677+
import px
678+
679+
680+
def http_data(start_time: str, source_filter: str, destination_filter: str, num_head: int):
681+
682+
df = px.DataFrame(table='http_events', start_time=start_time)
683+
684+
# Add context.
685+
df.node = df.ctx['node']
686+
df.pid = px.upid_to_pid(df.upid)
687+
688+
# Filter out entities as specified by the user.
689+
df = df[px.contains("source", source_filter)]
690+
df = df[px.contains("destination", destination_filter)]
691+
692+
# Add additional filters below:
693+
694+
# Restrict number of results.
695+
df = df.head(num_head)
696+
697+
# Order columns.
698+
699+
return df
700+
)pxl";
701+
702+
TEST_F(LogicalPlannerTest, VerifyEmptyContainsCallsDoNotSegFaultTest) {
703+
auto planner = LogicalPlanner::Create(info_).ConsumeValueOrDie();
704+
plannerpb::QueryRequest req;
705+
req.set_query_str(kHttpDataScript);
706+
auto f = req.add_exec_funcs();
707+
f->set_func_name("http_data");
708+
f->set_output_table_prefix("http_data");
709+
auto start_time = f->add_arg_values();
710+
start_time->set_name("start_time");
711+
start_time->set_value("-5m");
712+
713+
auto source_filter = f->add_arg_values();
714+
source_filter->set_name("source_filter");
715+
source_filter->set_value("");
716+
717+
auto dest_filter = f->add_arg_values();
718+
dest_filter->set_name("destination_filter");
719+
dest_filter->set_value("");
720+
721+
auto num_head = f->add_arg_values();
722+
num_head->set_name("num_head");
723+
num_head->set_value("1000");
724+
725+
*req.mutable_logical_planner_state() =
726+
testutils::CreateTwoPEMsOneKelvinPlannerState(testutils::kHttpEventsSchema);
727+
auto plan_or_s = planner->Plan(req);
728+
ASSERT_OK(plan_or_s);
729+
auto plan = plan_or_s.ConsumeValueOrDie();
730+
EXPECT_OK(plan->ToProto());
731+
}
732+
676733
TEST_F(LogicalPlannerTest, create_compiler_state_has_endpoint_config) {
677734
auto state = testutils::CreateTwoPEMsOneKelvinPlannerState(kCheckoutProbeTableSchema);
678735
auto endpoint_config = state.mutable_otel_endpoint_config();

0 commit comments

Comments
 (0)