Skip to content

Commit 16d838f

Browse files
committed
add filter parameter for explicit splitting
1 parent c8130ba commit 16d838f

File tree

2 files changed

+37
-2
lines changed

2 files changed

+37
-2
lines changed

dataikuapi/dss/ml.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import json
55
import time
66
from .metrics import ComputedMetrics
7-
from .utils import DSSDatasetSelectionBuilder
7+
from .utils import DSSDatasetSelectionBuilder, DSSFilterBuilder
88

99
class PredictionSplitParamsHandler(object):
1010
"""Object to modify the train/test splitting params."""
@@ -57,14 +57,16 @@ def set_split_kfold(self, n_folds = 5, selection = None, dataset_name=None):
5757
if dataset_name is not None:
5858
sp["ssdDatasetSmartName"] = dataset_name
5959

60-
def set_split_explicit(self, train_selection, test_selection, dataset_name=None, test_dataset_name=None):
60+
def set_split_explicit(self, train_selection, test_selection, dataset_name=None, test_dataset_name=None, train_filter=None, test_filter=None):
6161
"""
6262
Sets the train/test split to an explicit extract of one or two datasets
6363
6464
:param object train_selection: A :class:`DSSDatasetSelectionBuilder` to build the settings of the extract of the train dataset. May be None (won't be changed)
6565
:param object test_selection: A :class:`DSSDatasetSelectionBuilder` to build the settings of the extract of the test dataset. May be None (won't be changed)
6666
:param str dataset_name: Name of dataset to use for the extracts. If None, the main dataset used to create the ML Task will be used.
6767
:param str test_dataset_name: Name of a second dataset to use for the test data extract. If None, both extracts are done from dataset_name
68+
:param object train_filter: A :class:`DSSFilterBuilder` to build the settings of the filter of the train dataset. May be None (won't be changed)
69+
:param object test_filter: A :class:`DSSFilterBuilder` to build the settings of the filter of the test dataset. May be None (won't be changed)
6870
"""
6971
sp = self.mltask_settings["splitParams"]
7072
if dataset_name is None:
@@ -94,6 +96,17 @@ def set_split_explicit(self, train_selection, test_selection, dataset_name=None,
9496
else:
9597
test_split["selection"] = test_selection
9698

99+
if train_filter is not None:
100+
if isinstance(train_filter, DSSFilterBuilder):
101+
train_split["filter"] = train_filter.build()
102+
else:
103+
train_split["filter"] = train_filter
104+
if test_filter is not None:
105+
if isinstance(test_filter, DSSFilterBuilder):
106+
test_split["filter"] = test_filter.build()
107+
else:
108+
test_split["filter"] = test_filter
109+
97110

98111
class DSSMLTaskSettings(object):
99112
"""

dataikuapi/dss/utils.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,3 +42,25 @@ def with_selected_partitions(self, ids):
4242
return self
4343

4444

45+
class DSSFilterBuilder(object):
    """
    Builder for a "filter". In DSS, a filter is used to define a subset of rows for processing.

    The ``with_*`` methods mutate the builder and return it, so calls can be chained::

        DSSFilterBuilder().with_formula("x > 1").with_distinct().build()
    """

    def __init__(self):
        # A fresh builder describes a disabled filter with no expression.
        self.filter = {"enabled": False, "distinct": False, "expression": None, "uiData": {"mode": "CUSTOM"}}

    def build(self):
        """
        Returns the built filter dict.

        The returned dict is an independent snapshot: the nested "uiData" dict is
        copied as well, so reusing or further modifying this builder afterwards
        does not alter a filter that was already built (and possibly already
        stored in some settings).

        :returns: a new dict with the keys "enabled", "distinct", "expression" and "uiData"
        :rtype: dict
        """
        return dict(self.filter, uiData=dict(self.filter["uiData"]))

    def with_distinct(self):
        """
        Sets the filter to deduplicate.

        :returns: this builder, to allow chaining
        """
        self.filter["distinct"] = True
        return self

    def with_formula(self, expression):
        """
        Enables the filter and sets the expression used to select rows.

        :param str expression: the filter expression
        :returns: this builder, to allow chaining
        """
        self.filter["enabled"] = True
        self.filter["expression"] = expression
        self.filter["uiData"]["mode"] = "CUSTOM"
        return self

0 commit comments

Comments
 (0)