11from ..utils import DataikuException
22from ..utils import DataikuUTF8CSVReader
33from ..utils import DataikuStreamedHttpUTF8CSVReader
4- import json
4+ import json , warnings
55import time
66from .metrics import ComputedMetrics
77from .utils import DSSDatasetSelectionBuilder , DSSFilterBuilder
1010class PredictionSplitParamsHandler (object ):
1111 """Object to modify the train/test splitting params."""
1212
13+ SPLIT_PARAMS_KEY = 'splitParams'
14+
1315 def __init__ (self , mltask_settings ):
1416 """Do not call directly, use :meth:`DSSMLTaskSettings.get_split_params`"""
1517 self .mltask_settings = mltask_settings
1618
19+ def get_raw (self ):
20+ """Gets the raw settings of the prediction split configuration. This returns a reference to the raw settings, not a copy,
21+ so changes made to the returned object will be reflected when saving.
22+
23+ :rtype: dict
24+ """
25+ return self .mltask_settings [PredictionSplitParamsHandler .SPLIT_PARAMS_KEY ]
26+
27+
1728 def set_split_random (self , train_ratio = 0.8 , selection = None , dataset_name = None ):
1829 """
1930 Sets the train/test split to random splitting of an extract of a single dataset
@@ -22,7 +33,7 @@ def set_split_random(self, train_ratio = 0.8, selection = None, dataset_name=Non
2233 :param object selection: A :class:`~dataikuapi.dss.utils.DSSDatasetSelectionBuilder` to build the settings of the extract of the dataset. May be None (won't be changed)
2334 :param str dataset_name: Name of dataset to split. If None, the main dataset used to create the visual analysis will be used.
2435 """
25- sp = self .mltask_settings ["splitParams" ]
36+ sp = self .mltask_settings [PredictionSplitParamsHandler . SPLIT_PARAMS_KEY ]
2637 sp ["ttPolicy" ] = "SPLIT_SINGLE_DATASET"
2738 if selection is not None :
2839 if isinstance (selection , DSSDatasetSelectionBuilder ):
@@ -36,6 +47,8 @@ def set_split_random(self, train_ratio = 0.8, selection = None, dataset_name=Non
3647 if dataset_name is not None :
3748 sp ["ssdDatasetSmartName" ] = dataset_name
3849
50+ return self
51+
3952 def set_split_kfold (self , n_folds = 5 , selection = None , dataset_name = None ):
4053 """
4154 Sets the train/test split to k-fold splitting of an extract of a single dataset
@@ -44,7 +57,7 @@ def set_split_kfold(self, n_folds = 5, selection = None, dataset_name=None):
4457 :param object selection: A :class:`~dataikuapi.dss.utils.DSSDatasetSelectionBuilder` to build the settings of the extract of the dataset. May be None (won't be changed)
4558 :param str dataset_name: Name of dataset to split. If None, the main dataset used to create the visual analysis will be used.
4659 """
47- sp = self .mltask_settings ["splitParams" ]
60+ sp = self .mltask_settings [PredictionSplitParamsHandler . SPLIT_PARAMS_KEY ]
4861 sp ["ttPolicy" ] = "SPLIT_SINGLE_DATASET"
4962 if selection is not None :
5063 if isinstance (selection , DSSDatasetSelectionBuilder ):
@@ -58,6 +71,8 @@ def set_split_kfold(self, n_folds = 5, selection = None, dataset_name=None):
5871 if dataset_name is not None :
5972 sp ["ssdDatasetSmartName" ] = dataset_name
6073
74+ return self
75+
6176 def set_split_explicit (self , train_selection , test_selection , dataset_name = None , test_dataset_name = None , train_filter = None , test_filter = None ):
6277 """
6378 Sets the train/test split to explicit extract of one or two dataset(s)
@@ -69,7 +84,7 @@ def set_split_explicit(self, train_selection, test_selection, dataset_name=None,
6984 :param object train_filter: A :class:`~dataikuapi.dss.utils.DSSFilterBuilder` to build the settings of the filter of the train dataset. May be None (won't be changed)
7085 :param object test_filter: A :class:`~dataikuapi.dss.utils.DSSFilterBuilder` to build the settings of the filter of the test dataset. May be None (won't be changed)
7186 """
72- sp = self .mltask_settings ["splitParams" ]
87+ sp = self .mltask_settings [PredictionSplitParamsHandler . SPLIT_PARAMS_KEY ]
7388 if dataset_name is None :
7489 raise Exception ("For explicit splitting a dataset_name is mandatory" )
7590 if test_dataset_name is None or test_dataset_name == dataset_name :
@@ -108,44 +123,15 @@ def set_split_explicit(self, train_selection, test_selection, dataset_name=None,
108123 else :
109124 test_split ["filter" ] = test_filter
110125
126+ return self
111127
112- class DSSMLTaskSettings (object ):
113- """
114- Object to read and modify the settings of a ML task.
115-
116- Do not create this object directly, use :meth:`DSSMLTask.get_settings()` instead
117- """
118- def __init__ (self , client , project_key , analysis_id , mltask_id , mltask_settings ):
119- self .client = client
120- self .project_key = project_key
121- self .analysis_id = analysis_id
122- self .mltask_id = mltask_id
123- self .mltask_settings = mltask_settings
124-
125- def get_raw (self ):
126- """
127- Gets the raw settings of this ML Task. This returns a reference to the raw settings, not a copy,
128- so changes made to the returned object will be reflected when saving.
129-
130- :rtype: dict
131- """
132- return self .mltask_settings
133-
134- def get_split_params (self ):
135- """
136- Gets an object to modify train/test splitting params.
137-
138- :rtype: :class:`PredictionSplitParamsHandler`
128+ def set_time_ordering (self , feature_name , ascending = True ):
139129 """
140- return PredictionSplitParamsHandler (self .mltask_settings )
141-
142- def split_ordered_by (self , feature_name , ascending = True ):
143- """
144- Uses a variable to sort the data for train/test split and hyperparameter optimization
130+ Uses a variable to sort the data for train/test split and hyperparameter optimization by time
145131 :param str feature_name: Name of the variable to use
146132 :param bool ascending: True iff the test set is expected to have larger time values than the train set
147133 """
148- self .remove_ordered_split ()
134+ self .unset_time_ordering ()
149135 if not feature_name in self .mltask_settings ["preprocessing" ]["per_feature" ]:
150136 raise ValueError ("Feature %s doesn't exist in this ML task, can't use as time" % feature_name )
151137 self .mltask_settings ['time' ]['enabled' ] = True
@@ -160,9 +146,11 @@ def split_ordered_by(self, feature_name, ascending=True):
160146 elif self .mltask_settings ['modeling' ]['gridSearchParams' ]['mode' ] == "SHUFFLE" :
161147 self .mltask_settings ['modeling' ]['gridSearchParams' ]['mode' ] = "TIME_SERIES_SINGLE_SPLIT"
162148
163- def remove_ordered_split (self ):
149+ return self
150+
151+ def unset_time_ordering (self ):
164152 """
165- Remove time-based ordering.
153+ Remove time-based ordering for train/test split and hyperparameter optimization
166154 """
167155 self .mltask_settings ['time' ]['enabled' ] = False
168156 self .mltask_settings ['time' ]['timeVariable' ] = None
@@ -174,6 +162,31 @@ def remove_ordered_split(self):
174162 elif self .mltask_settings ['modeling' ]['gridSearchParams' ]['mode' ] == "TIME_SERIES_SINGLE_SPLIT" :
175163 self .mltask_settings ['modeling' ]['gridSearchParams' ]['mode' ] = "SHUFFLE"
176164
165+ return self
166+
167+
168+ class DSSMLTaskSettings (object ):
169+ """
170+ Object to read and modify the settings of a ML task.
171+
172+ Do not create this object directly, use :meth:`DSSMLTask.get_settings()` instead
173+ """
174+ def __init__ (self , client , project_key , analysis_id , mltask_id , mltask_settings ):
175+ self .client = client
176+ self .project_key = project_key
177+ self .analysis_id = analysis_id
178+ self .mltask_id = mltask_id
179+ self .mltask_settings = mltask_settings
180+
def get_raw(self):
    """
    Return the raw settings of this ML Task.

    The returned object is a reference to the settings held by this instance, not a
    copy, so changes made to it will be reflected when saving.

    :rtype: dict
    """
    raw_settings = self.mltask_settings
    return raw_settings
189+
177190 def get_feature_preprocessing (self , feature_name ):
178191 """
179192 Gets the feature preprocessing params for a particular feature. This returns a reference to the
@@ -214,27 +227,6 @@ def use_feature(self, feature_name):
214227 """
215228 self .get_feature_preprocessing (feature_name )["role" ] = "INPUT"
216229
217- def use_sample_weighting (self , feature_name ):
218- """
219- Uses a feature as sample weight
220- :param str feature_name: Name of the feature to use
221- """
222- self .remove_sample_weighting ()
223- if not feature_name in self .mltask_settings ["preprocessing" ]["per_feature" ]:
224- raise ValueError ("Feature %s doesn't exist in this ML task, can't use as weight" % feature_name )
225- self .mltask_settings ['weight' ]['weightMethod' ] = 'SAMPLE_WEIGHT'
226- self .mltask_settings ['weight' ]['sampleWeightVariable' ] = feature_name
227- self .mltask_settings ['preprocessing' ]['per_feature' ][feature_name ]['role' ] = 'WEIGHT'
228-
229- def remove_sample_weighting (self ):
230- """
231- Remove sample weighting. If a feature was used as weight, it's set back to being an input feature
232- """
233- self .mltask_settings ['weight' ]['weightMethod' ] = 'NO_WEIGHTING'
234- for feature_name in self .mltask_settings ['preprocessing' ]['per_feature' ]:
235- if self .mltask_settings ['preprocessing' ]['per_feature' ][feature_name ]['role' ] == 'WEIGHT' :
236- self .mltask_settings ['preprocessing' ]['per_feature' ][feature_name ]['role' ] = 'INPUT'
237-
238230 def get_algorithm_settings (self , algorithm_name ):
239231 """
240232 Gets the training settings for a particular algorithm. This returns a reference to the
@@ -360,6 +352,118 @@ class DSSPredictionMLTaskSettings(DSSMLTaskSettings):
360352 "KERAS_CODE" : "keras"
361353 }
362354
class PredictionTypes:
    """Names of the prediction types recognised by the prediction ML task settings."""

    # Classification prediction types
    BINARY = "BINARY_CLASSIFICATION"
    MULTICLASS = "MULTICLASS"
    # Regression prediction type
    REGRESSION = "REGRESSION"
359+
def __init__(self, client, project_key, analysis_id, mltask_id, mltask_settings):
    """
    Initialise the settings of a prediction ML task and validate its prediction type.

    The arguments are forwarded unchanged to :class:`DSSMLTaskSettings`.

    :raises ValueError: if the 'predictionType' entry of ``mltask_settings`` is not one of
        the values declared in :class:`PredictionTypes`
    """
    DSSMLTaskSettings.__init__(self, client, project_key, analysis_id, mltask_id, mltask_settings)

    known_types = [self.PredictionTypes.BINARY, self.PredictionTypes.REGRESSION, self.PredictionTypes.MULTICLASS]
    # Read the type through the getter: there is no `prediction_type` attribute on this
    # object, so formatting the error message with one would raise AttributeError
    # instead of the intended ValueError.
    prediction_type = self.get_prediction_type()
    if prediction_type not in known_types:
        raise ValueError("Unknown prediction type: {}".format(prediction_type))

    # Prediction types for which class-based weighting is accepted by set_weighting()
    self.classification_prediction_types = [self.PredictionTypes.BINARY, self.PredictionTypes.MULTICLASS]
367+
def get_prediction_type(self):
    """
    Return the value of the 'predictionType' entry of the ML task settings.
    """
    settings = self.mltask_settings
    return settings['predictionType']
370+
@property
def split_params(self):
    """
    Handle to modify the train/test splitting params.

    Property form of :meth:`get_split_params`: each access returns whatever that method returns.

    :rtype: :class:`PredictionSplitParamsHandler`
    """
    handler = self.get_split_params()
    return handler
379+
def get_split_params(self):
    """
    Build a handle to modify the train/test splitting params.

    The handle wraps the settings of this ML task by reference.

    :rtype: :class:`PredictionSplitParamsHandler`
    """
    settings = self.mltask_settings
    return PredictionSplitParamsHandler(settings)
387+
def split_ordered_by(self, feature_name, ascending=True):
    """
    Deprecated. Use split_params.set_time_ordering()

    :param str feature_name: forwarded to ``split_params.set_time_ordering()``
    :param bool ascending: forwarded to ``split_params.set_time_ordering()``
    :return: self
    """
    # stacklevel=2 attributes the warning to the caller of this method rather than to
    # this module, so that it is reported against the code that needs to be migrated.
    warnings.warn("split_ordered_by() is deprecated, please use split_params.set_time_ordering() instead",
                  DeprecationWarning, stacklevel=2)
    self.split_params.set_time_ordering(feature_name, ascending=ascending)

    return self
396+
def remove_ordered_split(self):
    """
    Deprecated. Use split_params.unset_time_ordering()

    :return: self
    """
    # stacklevel=2 attributes the warning to the caller of this method rather than to
    # this module, so that it is reported against the code that needs to be migrated.
    warnings.warn("remove_ordered_split() is deprecated, please use split_params.unset_time_ordering() instead",
                  DeprecationWarning, stacklevel=2)
    self.split_params.unset_time_ordering()

    return self
405+
def use_sample_weighting(self, feature_name):
    """
    Deprecated. Use set_weighting()

    :param str feature_name: Name of the feature to use as sample weight
    :return: the value returned by ``set_weighting(method='SAMPLE_WEIGHT', ...)``
    """
    # stacklevel=2 attributes the warning to the caller of this method rather than to
    # this module, so that it is reported against the code that needs to be migrated.
    warnings.warn("use_sample_weighting() is deprecated, please use set_weighting() instead",
                  DeprecationWarning, stacklevel=2)
    return self.set_weighting(method='SAMPLE_WEIGHT', feature_name=feature_name)
412+
def set_weighting(self, method, feature_name=None):
    """
    Sets the method to weight samples.

    If there was a WEIGHT feature declared previously, it will be set back as an INPUT feature first.
    This happens before any validation, so it also applies when a ValueError is raised.

    :param str method: Method to use. One of NO_WEIGHTING, SAMPLE_WEIGHT (must give a feature name),
                       CLASS_WEIGHT or CLASS_AND_SAMPLE_WEIGHT (must give a feature name)
    :param str feature_name: Name of the feature to use as sample weight
    :raises ValueError: if the method is unknown, if a class-based method is requested for a
                        prediction type that is not a classification, or if a sample-weight
                        method is requested with a feature that doesn't exist in this ML task
    :return: self
    """
    per_feature = self.mltask_settings['preprocessing']['per_feature']

    # First, if there was a WEIGHT feature, restore it as INPUT.
    # The loop variable must not be called `feature_name`: that would overwrite the
    # argument and make the last feature of the task the weight variable.
    for other_feature_name in per_feature:
        if per_feature[other_feature_name]['role'] == 'WEIGHT':
            per_feature[other_feature_name]['role'] = 'INPUT'

    if method not in ("NO_WEIGHTING", "SAMPLE_WEIGHT", "CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"):
        raise ValueError("Unknown weighting method: {}".format(method))

    # Class-based weighting only makes sense for classification tasks
    if method in ("CLASS_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT"):
        if self.get_prediction_type() not in self.classification_prediction_types:
            raise ValueError("Weighting method: {} not compatible with prediction type: {}, should be in {}".format(method, self.get_prediction_type(), self.classification_prediction_types))

    # Sample-based weighting needs an existing feature to read the weights from
    uses_sample_weight = method in ("SAMPLE_WEIGHT", "CLASS_AND_SAMPLE_WEIGHT")
    if uses_sample_weight and feature_name not in per_feature:
        raise ValueError("Feature %s doesn't exist in this ML task, can't use as weight" % feature_name)

    self.mltask_settings['weight']['weightMethod'] = method
    if uses_sample_weight:
        self.mltask_settings['weight']['sampleWeightVariable'] = feature_name
        per_feature[feature_name]['role'] = 'WEIGHT'

    return self
460+
def remove_sample_weighting(self):
    """
    Deprecated. Use set_weighting(method="NO_WEIGHTING") instead

    :return: the value returned by ``set_weighting(method="NO_WEIGHTING")``
    """
    # stacklevel=2 attributes the warning to the caller of this method rather than to
    # this module, so that it is reported against the code that needs to be migrated.
    warnings.warn('remove_sample_weighting() is deprecated, please use set_weighting(method="NO_WEIGHTING") instead',
                  DeprecationWarning, stacklevel=2)
    # Delegate to the replacement named in the deprecation message
    return self.set_weighting(method="NO_WEIGHTING")
363467
364468class DSSClusteringMLTaskSettings (DSSMLTaskSettings ):
365469 __doc__ = []
0 commit comments