Skip to content

Commit 09c5530

Browse files
authored
Merge PR #45 DSSMLTaskSettings sorted split
For time-based ordering from feature/dss60-time-variable-in-visual-ml
2 parents ce4dac4 + 385546a commit 09c5530

File tree

1 file changed

+34
-0
lines changed

1 file changed

+34
-0
lines changed

dataikuapi/dss/ml.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,40 @@ def get_split_params(self):
140140
"""
141141
return PredictionSplitParamsHandler(self.mltask_settings)
142142

143+
def split_ordered_by(self, feature_name, ascending=True):
    """
    Use a variable to sort the data for train/test split and hyperparameter optimization.

    The ML task settings are updated in place: time-based ordering is enabled on
    ``feature_name``, the feature's missing-value handling is set to ``DROP_ROW``,
    and, where applicable, the single-dataset split and the grid-search mode are
    switched to their ordered variants.

    :param str feature_name: Name of the variable to use
    :param bool ascending: True iff the test set is expected to have larger time values than the train set
    :raises ValueError: if ``feature_name`` is not one of the features of this ML task
    """
    settings = self.mltask_settings
    per_feature = settings["preprocessing"]["per_feature"]

    # Validate first so that an unknown feature leaves the current settings untouched.
    if feature_name not in per_feature:
        raise ValueError("Feature %s doesn't exist in this ML task, can't use as time" % feature_name)

    # Clear any previously configured ordering before applying the new one.
    self.remove_ordered_split()

    time_settings = settings['time']
    time_settings['enabled'] = True
    time_settings['timeVariable'] = feature_name
    time_settings['ascending'] = ascending

    per_feature[feature_name]['missing_handling'] = "DROP_ROW"

    # The sorted split mode is only set when train and test come from a single dataset.
    split_params = settings['splitParams']
    if split_params['ttPolicy'] == "SPLIT_SINGLE_DATASET":
        split_params['ssdSplitMode'] = "SORTED"
        split_params['ssdColumn'] = feature_name

    # Swap the hyperparameter search strategy for its time-series counterpart;
    # any other mode is left as it is.
    grid_search = settings['modeling']['gridSearchParams']
    current_mode = grid_search['mode']
    for plain_mode, ordered_mode in (("KFOLD", "TIME_SERIES_KFOLD"),
                                     ("SHUFFLE", "TIME_SERIES_SINGLE_SPLIT")):
        if current_mode == plain_mode:
            grid_search['mode'] = ordered_mode
            break
163+
164+
def remove_ordered_split(self):
    """
    Remove time-based ordering.

    The ML task settings are updated in place: the time settings are disabled and
    their variable cleared, a single-dataset split goes back to ``RANDOM`` with no
    split column, and a time-series grid-search mode is replaced by its
    non-ordered counterpart.
    """
    settings = self.mltask_settings

    time_settings = settings['time']
    time_settings['enabled'] = False
    time_settings['timeVariable'] = None

    # Only the single-dataset split carries a split mode and column to reset.
    split_params = settings['splitParams']
    if split_params['ttPolicy'] == "SPLIT_SINGLE_DATASET":
        split_params['ssdSplitMode'] = "RANDOM"
        split_params['ssdColumn'] = None

    # Map the time-series search modes back to their plain equivalents;
    # any other mode is left as it is.
    grid_search = settings['modeling']['gridSearchParams']
    current_mode = grid_search['mode']
    for ordered_mode, plain_mode in (("TIME_SERIES_KFOLD", "KFOLD"),
                                     ("TIME_SERIES_SINGLE_SPLIT", "SHUFFLE")):
        if current_mode == ordered_mode:
            grid_search['mode'] = plain_mode
            break
143177

144178
def get_feature_preprocessing(self, feature_name):
145179
"""

0 commit comments

Comments
 (0)