
Commit e78a697

Merge pull request #67 from dataiku/feature/new-dataset-management-apis-patch
add recipe create option to avoid overwriting recipe by default
2 parents 80cae22 + 5191416 commit e78a697

4 files changed: +196 −65 lines changed

dataikuapi/dss/dataset.py

Lines changed: 27 additions & 5 deletions
@@ -59,6 +59,10 @@ def __init__(self, client, project_key, dataset_name):
         self.project_key = project_key
         self.dataset_name = dataset_name

+    @property
+    def name(self):
+        return self.dataset_name
+
     ########################################################
     # Dataset deletion
     ########################################################
@@ -398,13 +402,31 @@ def create_analysis(self):
         """
         return self.project_create_analysis(self.dataset_name)

-    def list_analyses(self):
+    def list_analyses(self, as_type="listitems"):
         """
         List the visual analyses on this dataset
-        :return list of dicts
+        :param str as_type: How to return the list. Supported values are "listitems" and "objects".
+        :returns: The list of the analyses. If "as_type" is "listitems", each one as a dict.
+            If "as_type" is "objects", each one as a :class:`dataikuapi.dss.analysis.DSSAnalysis`
+        :rtype: list
         """
-        analysis_list = self.project.list_analyses()
-        return [desc for desc in analysis_list if self.dataset_name == desc.get('inputDataset')]
+        analysis_list = [al for al in self.project.list_analyses() if self.dataset_name == al.get('inputDataset')]
+
+        if as_type == "listitems" or as_type == "listitem":
+            return analysis_list
+        elif as_type == "objects" or as_type == "object":
+            return [self.project.get_analysis(item["analysisId"]) for item in analysis_list]
+        else:
+            raise ValueError("Unknown as_type")
+
+    def delete_analyses(self, drop_data=False):
+        """
+        Deletes all analyses that have this dataset as input dataset. Also deletes
+        ML tasks that are part of the analysis
+
+        :param bool drop_data: whether to drop data for all ML tasks in the analysis
+        """
+        [analysis.delete(drop_data=drop_data) for analysis in self.list_analyses(as_type="objects")]

     ########################################################
     # Statistics worksheets
@@ -741,4 +763,4 @@ def already_exists(self):
             dataset.get_metadata()
             return True
         except Exception as e:
-            return False
+            return False
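
A hedged usage sketch of the dataset additions above (the DSS URL, API key, project key and dataset name are placeholders, not part of this commit):

# Hedged sketch: connection details, project key and dataset name are placeholders.
import dataikuapi

client = dataikuapi.DSSClient("https://dss.example.com", "my_api_key")
dataset = client.get_project("MYPROJECT").get_dataset("mydataset")

print(dataset.name)                                   # new read-only alias for dataset_name

for item in dataset.list_analyses():                  # default "listitems": plain dicts
    print(item["analysisId"])

analyses = dataset.list_analyses(as_type="objects")   # DSSAnalysis handles instead of dicts
dataset.delete_analyses(drop_data=True)               # new: delete analyses and drop ML task data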

dataikuapi/dss/managedfolder.py

Lines changed: 4 additions & 0 deletions
@@ -15,6 +15,10 @@ def __init__(self, client, project_key, odb_id):
         self.project_key = project_key
         self.odb_id = odb_id

+    @property
+    def id(self):
+        return self.odb_id
+
     ########################################################
     # Managed folder deletion
     ########################################################
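
Continuing the earlier sketch, the new id property on managed folder handles reads the same way (the client handle and the folder id are placeholders):

folder = client.get_project("MYPROJECT").get_managed_folder("aBcDeFgH")  # placeholder id
print(folder.id)    # new read-only alias for odb_id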

dataikuapi/dss/ml.py

Lines changed: 164 additions & 60 deletions
@@ -1,7 +1,7 @@
 from ..utils import DataikuException
 from ..utils import DataikuUTF8CSVReader
 from ..utils import DataikuStreamedHttpUTF8CSVReader
-import json
+import json, warnings
 import time
 from .metrics import ComputedMetrics
 from .utils import DSSDatasetSelectionBuilder, DSSFilterBuilder
@@ -10,10 +10,21 @@
 class PredictionSplitParamsHandler(object):
     """Object to modify the train/test splitting params."""

+    SPLIT_PARAMS_KEY = 'splitParams'
+
     def __init__(self, mltask_settings):
         """Do not call directly, use :meth:`DSSMLTaskSettings.get_split_params`"""
         self.mltask_settings = mltask_settings

+    def get_raw(self):
+        """Gets the raw settings of the prediction split configuration. This returns a reference to the raw settings, not a copy,
+        so changes made to the returned object will be reflected when saving.
+
+        :rtype: dict
+        """
+        return self.mltask_settings[PredictionSplitParamsHandler.SPLIT_PARAMS_KEY]
+
+
     def set_split_random(self, train_ratio = 0.8, selection = None, dataset_name=None):
         """
         Sets the train/test split to random splitting of an extract of a single dataset
@@ -22,7 +33,7 @@ def set_split_random(self, train_ratio = 0.8, selection = None, dataset_name=Non
         :param object selection: A :class:`~dataikuapi.dss.utils.DSSDatasetSelectionBuilder` to build the settings of the extract of the dataset. May be None (won't be changed)
         :param str dataset_name: Name of dataset to split. If None, the main dataset used to create the visual analysis will be used.
         """
-        sp = self.mltask_settings["splitParams"]
+        sp = self.mltask_settings[PredictionSplitParamsHandler.SPLIT_PARAMS_KEY]
         sp["ttPolicy"] = "SPLIT_SINGLE_DATASET"
         if selection is not None:
             if isinstance(selection, DSSDatasetSelectionBuilder):
@@ -36,6 +47,8 @@ def set_split_random(self, train_ratio = 0.8, selection = None, dataset_name=Non
         if dataset_name is not None:
             sp["ssdDatasetSmartName"] = dataset_name

+        return self
+
     def set_split_kfold(self, n_folds = 5, selection = None, dataset_name=None):
         """
         Sets the train/test split to k-fold splitting of an extract of a single dataset
@@ -44,7 +57,7 @@ def set_split_kfold(self, n_folds = 5, selection = None, dataset_name=None):
         :param object selection: A :class:`~dataikuapi.dss.utils.DSSDatasetSelectionBuilder` to build the settings of the extract of the dataset. May be None (won't be changed)
         :param str dataset_name: Name of dataset to split. If None, the main dataset used to create the visual analysis will be used.
         """
-        sp = self.mltask_settings["splitParams"]
+        sp = self.mltask_settings[PredictionSplitParamsHandler.SPLIT_PARAMS_KEY]
         sp["ttPolicy"] = "SPLIT_SINGLE_DATASET"
         if selection is not None:
             if isinstance(selection, DSSDatasetSelectionBuilder):
@@ -58,6 +71,8 @@ def set_split_kfold(self, n_folds = 5, selection = None, dataset_name=None):
         if dataset_name is not None:
             sp["ssdDatasetSmartName"] = dataset_name

+        return self
+
     def set_split_explicit(self, train_selection, test_selection, dataset_name=None, test_dataset_name=None, train_filter=None, test_filter=None):
         """
         Sets the train/test split to explicit extract of one or two dataset(s)
@@ -69,7 +84,7 @@ def set_split_explicit(self, train_selection, test_selection, dataset_name=None,
         :param object train_filter: A :class:`~dataikuapi.dss.utils.DSSFilterBuilder` to build the settings of the filter of the train dataset. May be None (won't be changed)
         :param object test_filter: A :class:`~dataikuapi.dss.utils.DSSFilterBuilder` to build the settings of the filter of the test dataset. May be None (won't be changed)
         """
-        sp = self.mltask_settings["splitParams"]
+        sp = self.mltask_settings[PredictionSplitParamsHandler.SPLIT_PARAMS_KEY]
         if dataset_name is None:
             raise Exception("For explicit splitting a dataset_name is mandatory")
         if test_dataset_name is None or test_dataset_name == dataset_name:
@@ -108,44 +123,15 @@ def set_split_explicit(self, train_selection, test_selection, dataset_name=None,
         else:
             test_split["filter"] = test_filter

+        return self

-class DSSMLTaskSettings(object):
-    """
-    Object to read and modify the settings of a ML task.
-
-    Do not create this object directly, use :meth:`DSSMLTask.get_settings()` instead
-    """
-    def __init__(self, client, project_key, analysis_id, mltask_id, mltask_settings):
-        self.client = client
-        self.project_key = project_key
-        self.analysis_id = analysis_id
-        self.mltask_id = mltask_id
-        self.mltask_settings = mltask_settings
-
-    def get_raw(self):
-        """
-        Gets the raw settings of this ML Task. This returns a reference to the raw settings, not a copy,
-        so changes made to the returned object will be reflected when saving.
-
-        :rtype: dict
-        """
-        return self.mltask_settings
-
-    def get_split_params(self):
-        """
-        Gets an object to modify train/test splitting params.
-
-        :rtype: :class:`PredictionSplitParamsHandler`
+    def set_time_ordering(self, feature_name, ascending=True):
         """
-        return PredictionSplitParamsHandler(self.mltask_settings)
-
-    def split_ordered_by(self, feature_name, ascending=True):
-        """
-        Uses a variable to sort the data for train/test split and hyperparameter optimization
+        Uses a variable to sort the data for train/test split and hyperparameter optimization by time
         :param str feature_name: Name of the variable to use
         :param bool ascending: True iff the test set is expected to have larger time values than the train set
         """
-        self.remove_ordered_split()
+        self.unset_time_ordering()
         if not feature_name in self.mltask_settings["preprocessing"]["per_feature"]:
             raise ValueError("Feature %s doesn't exist in this ML task, can't use as time" % feature_name)
         self.mltask_settings['time']['enabled'] = True
@@ -160,9 +146,11 @@ def split_ordered_by(self, feature_name, ascending=True):
         elif self.mltask_settings['modeling']['gridSearchParams']['mode'] == "SHUFFLE":
             self.mltask_settings['modeling']['gridSearchParams']['mode'] = "TIME_SERIES_SINGLE_SPLIT"

-    def remove_ordered_split(self):
+        return self
+
+    def unset_time_ordering(self):
         """
-        Remove time-based ordering.
+        Remove time-based ordering for train/test split and hyperparameter optimization
         """
         self.mltask_settings['time']['enabled'] = False
         self.mltask_settings['time']['timeVariable'] = None
@@ -174,6 +162,31 @@ def remove_ordered_split(self):
         elif self.mltask_settings['modeling']['gridSearchParams']['mode'] == "TIME_SERIES_SINGLE_SPLIT":
             self.mltask_settings['modeling']['gridSearchParams']['mode'] = "SHUFFLE"

+        return self
+
+
+class DSSMLTaskSettings(object):
+    """
+    Object to read and modify the settings of a ML task.
+
+    Do not create this object directly, use :meth:`DSSMLTask.get_settings()` instead
+    """
+    def __init__(self, client, project_key, analysis_id, mltask_id, mltask_settings):
+        self.client = client
+        self.project_key = project_key
+        self.analysis_id = analysis_id
+        self.mltask_id = mltask_id
+        self.mltask_settings = mltask_settings
+
+    def get_raw(self):
+        """
+        Gets the raw settings of this ML Task. This returns a reference to the raw settings, not a copy,
+        so changes made to the returned object will be reflected when saving.
+
+        :rtype: dict
+        """
+        return self.mltask_settings
+
     def get_feature_preprocessing(self, feature_name):
         """
         Gets the feature preprocessing params for a particular feature. This returns a reference to the
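
With get_raw() restored on DSSMLTaskSettings and the split-params methods above now returning self, split configuration becomes chainable. A minimal sketch, assuming an existing prediction DSSMLTask handle named task and a feature named "order_date" (both placeholders; get_split_params itself is re-attached to the prediction settings class further down in this diff):

# Hedged sketch: "task" and "order_date" are placeholders, not part of this commit.
settings = task.get_settings()

split = settings.get_split_params()                # PredictionSplitParamsHandler
split.set_split_random(train_ratio=0.8) \
     .set_time_ordering("order_date", ascending=True)

print(split.get_raw()["ttPolicy"])                 # raw splitParams dict, e.g. "SPLIT_SINGLE_DATASET"
print(settings.get_raw()["predictionType"])        # raw ML task settings dict

settings.save()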
@@ -214,27 +227,6 @@ def use_feature(self, feature_name):
         """
         self.get_feature_preprocessing(feature_name)["role"] = "INPUT"

-    def use_sample_weighting(self, feature_name):
-        """
-        Uses a feature as sample weight
-        :param str feature_name: Name of the feature to use
-        """
-        self.remove_sample_weighting()
-        if not feature_name in self.mltask_settings["preprocessing"]["per_feature"]:
-            raise ValueError("Feature %s doesn't exist in this ML task, can't use as weight" % feature_name)
-        self.mltask_settings['weight']['weightMethod'] = 'SAMPLE_WEIGHT'
-        self.mltask_settings['weight']['sampleWeightVariable'] = feature_name
-        self.mltask_settings['preprocessing']['per_feature'][feature_name]['role'] = 'WEIGHT'
-
-    def remove_sample_weighting(self):
-        """
-        Remove sample weighting. If a feature was used as weight, it's set back to being an input feature
-        """
-        self.mltask_settings['weight']['weightMethod'] = 'NO_WEIGHTING'
-        for feature_name in self.mltask_settings['preprocessing']['per_feature']:
-            if self.mltask_settings['preprocessing']['per_feature'][feature_name]['role'] == 'WEIGHT':
-                self.mltask_settings['preprocessing']['per_feature'][feature_name]['role'] = 'INPUT'
-
     def get_algorithm_settings(self, algorithm_name):
         """
         Gets the training settings for a particular algorithm. This returns a reference to the
@@ -360,6 +352,118 @@ class DSSPredictionMLTaskSettings(DSSMLTaskSettings):
         "KERAS_CODE" : "keras"
     }

+    class PredictionTypes:
+        BINARY = "BINARY_CLASSIFICATION"
+        REGRESSION = "REGRESSION"
+        MULTICLASS = "MULTICLASS"
+
+    def __init__(self, client, project_key, analysis_id, mltask_id, mltask_settings):
+        DSSMLTaskSettings.__init__(self, client, project_key, analysis_id, mltask_id, mltask_settings)
+
+        if self.get_prediction_type() not in [self.PredictionTypes.BINARY, self.PredictionTypes.REGRESSION, self.PredictionTypes.MULTICLASS]:
+            raise ValueError("Unknown prediction type: {}".format(self.get_prediction_type()))
+
+        self.classification_prediction_types = [self.PredictionTypes.BINARY, self.PredictionTypes.MULTICLASS]
+
+    def get_prediction_type(self):
+        return self.mltask_settings['predictionType']
+
+    @property
+    def split_params(self):
+        """
+        Gets a handle to modify train/test splitting params.
+
+        :rtype: :class:`PredictionSplitParamsHandler`
+        """
+        return self.get_split_params()
+
+    def get_split_params(self):
+        """
+        Gets a handle to modify train/test splitting params.
+
+        :rtype: :class:`PredictionSplitParamsHandler`
+        """
+        return PredictionSplitParamsHandler(self.mltask_settings)
+
+    def split_ordered_by(self, feature_name, ascending=True):
+        """
+        Deprecated. Use split_params.set_time_ordering()
+        """
+        warnings.warn("split_ordered_by() is deprecated, please use split_params.set_time_ordering() instead", DeprecationWarning)
+        self.split_params.set_time_ordering(feature_name, ascending=ascending)
+
+        return self
+
+    def remove_ordered_split(self):
+        """
+        Deprecated. Use split_params.unset_time_ordering()
+        """
+        warnings.warn("remove_ordered_split() is deprecated, please use split_params.unset_time_ordering() instead", DeprecationWarning)
+        self.split_params.unset_time_ordering()
+
+        return self
+
+    def use_sample_weighting(self, feature_name):
+        """
+        Deprecated. Use set_weighting()
+        """
+        warnings.warn("use_sample_weighting() is deprecated, please use set_weighting() instead", DeprecationWarning)
+        return self.set_weighting(method='SAMPLE_WEIGHT', feature_name=feature_name)
+
+    def set_weighting(self, method, feature_name=None):
+        """
+        Sets the method to weight samples.
+
+        If there was a WEIGHT feature declared previously, it will be set back as an INPUT feature first.
+
+        :param str method: Method to use. One of NO_WEIGHTING, SAMPLE_WEIGHT (must give a feature name),
+            CLASS_WEIGHT or CLASS_AND_SAMPLE_WEIGHT (must give a feature name)
+        :param str feature_name: Name of the feature to use as sample weight
+        """
+
+        # First, if there was a WEIGHT feature, restore it as INPUT
+        for other_feature_name in self.mltask_settings['preprocessing']['per_feature']:
+            if self.mltask_settings['preprocessing']['per_feature'][other_feature_name]['role'] == 'WEIGHT':
+                self.mltask_settings['preprocessing']['per_feature'][other_feature_name]['role'] = 'INPUT'
+
+        if method == "NO_WEIGHTING":
+            self.mltask_settings['weight']['weightMethod'] = method
+
+        elif method == "SAMPLE_WEIGHT":
+            if not feature_name in self.mltask_settings["preprocessing"]["per_feature"]:
+                raise ValueError("Feature %s doesn't exist in this ML task, can't use as weight" % feature_name)
+
+            self.mltask_settings['weight']['weightMethod'] = method
+            self.mltask_settings['weight']['sampleWeightVariable'] = feature_name
+            self.mltask_settings['preprocessing']['per_feature'][feature_name]['role'] = 'WEIGHT'
+
+        elif method == "CLASS_WEIGHT":
+            if self.get_prediction_type() not in self.classification_prediction_types:
+                raise ValueError("Weighting method: {} not compatible with prediction type: {}, should be in {}".format(method, self.get_prediction_type(), self.classification_prediction_types))
+
+            self.mltask_settings['weight']['weightMethod'] = method
+
+        elif method == "CLASS_AND_SAMPLE_WEIGHT":
+            if self.get_prediction_type() not in self.classification_prediction_types:
+                raise ValueError("Weighting method: {} not compatible with prediction type: {}, should be in {}".format(method, self.get_prediction_type(), self.classification_prediction_types))
+            if not feature_name in self.mltask_settings["preprocessing"]["per_feature"]:
+                raise ValueError("Feature %s doesn't exist in this ML task, can't use as weight" % feature_name)
+
+            self.mltask_settings['weight']['weightMethod'] = method
+            self.mltask_settings['weight']['sampleWeightVariable'] = feature_name
+            self.mltask_settings['preprocessing']['per_feature'][feature_name]['role'] = 'WEIGHT'
+
+        else:
+            raise ValueError("Unknown weighting method: {}".format(method))
+
+        return self
+
+    def remove_sample_weighting(self):
+        """
+        Deprecated. Use set_weighting(method="NO_WEIGHTING") instead
+        """
+        warnings.warn("remove_sample_weighting() is deprecated, please use set_weighting(method=\"NO_WEIGHTING\") instead", DeprecationWarning)
+        return self.set_weighting(method="NO_WEIGHTING")

 class DSSClusteringMLTaskSettings(DSSMLTaskSettings):
     __doc__ = []
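
The deprecated use_sample_weighting / remove_sample_weighting pair is superseded by the single set_weighting method; a hedged sketch of the new call (the task handle and "weight_col" feature name are placeholders):

# Hedged sketch: "task" and "weight_col" are placeholders, not part of this commit.
settings = task.get_settings()
settings.set_weighting(method="SAMPLE_WEIGHT", feature_name="weight_col")
# and to disable weighting again:
settings.set_weighting(method="NO_WEIGHTING")
settings.save()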

dataikuapi/dss/recipe.py

Lines changed: 1 addition & 0 deletions
@@ -19,6 +19,7 @@ def __init__(self, client, project_key, recipe_name):

     @property
     def name(self):
+        """The name of the recipe"""
         return self.recipe_name

     def compute_schema_updates(self):

0 commit comments
