Skip to content

Commit 74f9ae0

Browse files
committed
add some helpers for the analysis api
1 parent 6c5ecc1 commit 74f9ae0

File tree

3 files changed

+265
-50
lines changed

3 files changed

+265
-50
lines changed

dataikuapi/dss/analysis.py

Lines changed: 220 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,99 @@
55
import time
66
from .metrics import ComputedMetrics
77
from .ml import DSSMLTask
8+
from .utils import DSSDatasetSelectionBuilder
9+
10+
class DSSAnalysisStepBuilder(object):
    """Chainable builder for the raw dict describing one script step.

    Each ``with_*`` method updates the step being built and returns the builder,
    so calls can be chained; :meth:`build` returns the resulting dict.
    """

    def __init__(self, step_type=None, step_name=None):
        # 'params' starts empty; it is the dict that step-specific settings go into
        self.step = {
            'metaType': 'PROCESSOR',
            'type': step_type,
            'name': step_name,
            'params': {},
        }

    def build(self):
        """Returns the step dict built so far (the builder's own dict, not a copy)"""
        return self.step

    def with_type(self, step_type):
        """Sets the type of the step"""
        self.step.update(type=step_type)
        return self

    def with_name(self, step_name):
        """Sets the name of the step"""
        self.step.update(name=step_name)
        return self
27+
28+
29+
class DSSAnalysisDefinition(object):
    """
    Object to manipulate the definition of a visual analysis

    Holds the raw definition dict together with the analysis handle it was
    created with, so that changes can be pushed back with :meth:`save`.
    """

    def __init__(self, analysis, acp):
        # analysis: the handle whose set_definition() is called by save()
        # acp: the raw definition dict, edited in place by the methods below
        self.analysis = analysis
        self.acp = acp

    def get_raw(self):
        """
        Gets the raw dictionary of the visual analysis definition
        """
        return self.acp

    def get_raw_script(self):
        """
        Gets the raw dictionary of visual analysis' script settings (including steps, sampling, ...)

        If the definition has no 'script' entry yet, one with an empty list of steps is created.
        """
        return self.get_raw().setdefault('script', {'steps': []})

    def get_raw_script_steps(self):
        """
        Gets the raw list of the visual analysis' script steps

        If the script has no 'steps' entry yet, an empty list is created.
        """
        return self.get_raw_script().setdefault('steps', [])

    def get_raw_script_sampling(self):
        """
        Gets the raw dictionary of visual analysis' script sampling

        If the script has no 'explorationSampling' entry yet, an empty dict is created.
        """
        return self.get_raw_script().setdefault('explorationSampling', {})

    def save(self):
        """
        Shortcut to :meth:`DSSAnalysis.set_definition()`
        """
        self.analysis.set_definition(self)

    def add_step(self, step):
        """
        Add a step to the script

        :param object step: A :class:`DSSAnalysisStepBuilder` to build the settings of the step.
                            Any other object is appended to the steps as-is.
        """
        steps = self.get_raw_script_steps()
        if isinstance(step, DSSAnalysisStepBuilder):
            step = step.build()
        steps.append(step)

    def set_script_sampling_selection(self, selection):
        """
        Sets the sampling for the script

        :param object selection: A :class:`DSSDatasetSelectionBuilder` to build the settings of the extract of the dataset.
                                 Any other object is stored as the selection as-is.
        """
        sampling = self.get_raw_script_sampling()
        if isinstance(selection, DSSDatasetSelectionBuilder):
            selection = selection.build()
        sampling['selection'] = selection
100+
8101

9102
class DSSAnalysis(object):
10103
"""A handle to interact with a DSS visual analysis"""
@@ -34,20 +127,25 @@ def get_definition(self):
34127
"""
35128
Get the definition of the analysis
36129
37-
Returns:
38-
the definition, as a JSON object
130+
:return: a DSSAnalysisDefinition object to interact with the settings
131+
:rtype: :class:`dataikuapi.dss.analysis.DSSAnalysisDefinition`
39132
"""
40-
return self.client._perform_json("GET", "/projects/%s/lab/%s/" % (self.project_key, self.analysis_id))
133+
acp = self.client._perform_json("GET", "/projects/%s/lab/%s/" % (self.project_key, self.analysis_id))
134+
return DSSAnalysisDefinition(self, acp)
41135

42136
def set_definition(self, definition):
43137
"""
44138
Set the definition of the analysis
45139
46140
Args:
47-
definition: the definition, as a JSON object. You should only set a definition object
48-
that has been retrieved using the get_definition call.
141+
definition: the definition, as a JSON object or a :class:`dataikuapi.dss.analysis.DSSAnalysisDefinition`.
142+
You should only set a definition object that has been retrieved using the get_definition call.
49143
"""
50-
return self.client._perform_json("PUT", "/projects/%s/lab/%s/" % (self.project_key, self.analysis_id), body=definition)
144+
if isinstance(definition, DSSAnalysisDefinition):
145+
acp = definition.get_raw()
146+
else:
147+
acp = definition
148+
return self.client._perform_json("PUT", "/projects/%s/lab/%s/" % (self.project_key, self.analysis_id), body=acp)
51149

52150

53151
########################################################
@@ -134,3 +232,119 @@ def get_ml_task(self, mltask_id):
134232
"""
135233
return DSSMLTask(self.client, self.project_key, self.analysis_id, mltask_id)
136234

235+
236+
# some basic steps
237+
class DSSFormulaStepBuilder(DSSAnalysisStepBuilder):
    """Builder for a step of type 'CreateColumnWithGREL'"""

    def __init__(self, step_name=None):
        super(DSSFormulaStepBuilder, self).__init__(step_type='CreateColumnWithGREL', step_name=step_name)

    def with_output_column(self, column_name):
        """Sets the step's output column's name"""
        self.step["params"]["column"] = column_name
        return self

    def with_error_column(self, column_name):
        """Sets the step's error column's name"""
        self.step["params"]["errorColumn"] = column_name
        return self

    def with_expression(self, expression):
        """Sets the step's expression"""
        self.step["params"]["expression"] = expression
        return self
255+
256+
class AppliesToStepBuilder(DSSAnalysisStepBuilder):
    """Base builder for steps that carry a column selection in their params.

    The selection mode is stored under 'appliesTo' (initially 'SINGLE_COLUMN'),
    the column names under 'columns' and the pattern under 'appliesToPattern'.
    """

    def __init__(self, step_type=None, step_name=None):
        super(AppliesToStepBuilder, self).__init__(step_type=step_type, step_name=step_name)
        self.step["params"]["appliesTo"] = 'SINGLE_COLUMN'

    def with_column_selection_mode(self, column_selection_mode):
        """Sets the step's column selection mode (SINGLE_COLUMN, COLUMNS, PATTERN, ALL)"""
        self.step["params"]["appliesTo"] = column_selection_mode
        return self

    def with_columns(self, *column_names):
        """Sets the step's selected columns, given as separate positional arguments"""
        self.step["params"]["columns"] = list(column_names)
        return self

    def with_column_regex(self, regex):
        """Sets the step's column selection regular expression"""
        self.step["params"]["appliesToPattern"] = regex
        return self

    def with_single_column_selection(self, column_name):
        """Sets the step as applying to a single column"""
        return self.with_column_selection_mode('SINGLE_COLUMN').with_columns(column_name)

    def with_multiple_column_selection(self, *column_names):
        """Sets the step as applying to several columns, given as separate positional arguments"""
        # with_columns is variadic: the names must be unpacked, otherwise the whole
        # tuple would be stored as one single "column"
        return self.with_column_selection_mode('COLUMNS').with_columns(*column_names)

    def with_regex_column_selection(self, regex):
        """Sets the step as applying to the columns selected by a regular expression"""
        return self.with_column_selection_mode('PATTERN').with_column_regex(regex)

    def with_all_column_selection(self, column_name=None):
        """Sets the step as applying to all columns

        ``column_name`` is not used; it is only still accepted so that existing
        callers passing a value keep working.
        """
        return self.with_column_selection_mode('ALL')
291+
292+
class FilterAndFlagStepBuilder(AppliesToStepBuilder):
    """Base builder for steps that have an action, a boolean mode and a flag column.

    New builders start with boolean mode 'AND' and action 'REMOVE_ROW'.
    """

    def __init__(self, step_type=None, step_name=None):
        super(FilterAndFlagStepBuilder, self).__init__(step_type=step_type, step_name=step_name)
        self.step["params"]["booleanMode"] = 'AND'
        self.step["params"]["action"] = 'REMOVE_ROW'

    def with_action(self, action):
        """Sets the step's action on match (KEEP_ROW, REMOVE_ROW, CLEAR_CELL, DONTCLEAR_CELL, FLAG)"""
        # Store the argument itself; this used to read an undefined name and
        # raised NameError on every call
        self.step["params"]["action"] = action
        return self

    def with_boolean_mode(self, boolean_mode):
        """Sets the step's mode for combining matches in different columns (AND, OR)"""
        self.step["params"]["booleanMode"] = boolean_mode
        return self

    def with_flag_column(self, column_name):
        """Sets the step's column for outputting the flag"""
        self.step["params"]["flagColumn"] = column_name
        return self
312+
313+
class FilterOnValueStepBuilder(FilterAndFlagStepBuilder):
    """Builder for a step of type 'FlagOnValue'"""

    def __init__(self, step_name=None):
        super(FilterOnValueStepBuilder, self).__init__(
            step_type='FlagOnValue',
            step_name=step_name,
        )

    def with_values(self, *values):
        """Sets the values flagged by the step, given as separate positional arguments"""
        params = self.step["params"]
        params["values"] = list(values)
        return self

    def with_matching_mode(self, matching_mode):
        """Sets how the step matches values (FULL_STRING, SUBSTRING, PATTERN)"""
        params = self.step["params"]
        params["matchingMode"] = matching_mode
        return self

    def with_normalization_mode(self, normalization_mode):
        """Sets how the step normalizes values (EXACT, LOWERCASE, NORMALIZED)"""
        params = self.step["params"]
        params["normalizationMode"] = normalization_mode
        return self
331+
332+
class FilterOnBadTypeStepBuilder(FilterAndFlagStepBuilder):
    """Builder for a step of type 'FilterOnBadType'"""

    def __init__(self, step_name=None):
        super(FilterOnBadTypeStepBuilder, self).__init__(
            step_type='FilterOnBadType',
            step_name=step_name,
        )

    def with_meaning(self, meaning):
        """Sets the meaning that the step checks (stored as the 'type' param)"""
        params = self.step["params"]
        params["type"] = meaning
        return self
340+
341+
class RemoveRowsStepBuilder(AppliesToStepBuilder):
    """Builder for a step of type 'RemoveRowsOnEmpty'"""

    def __init__(self, step_name=None):
        super(RemoveRowsStepBuilder, self).__init__(step_type='RemoveRowsOnEmpty', step_name=step_name)

    def with_keep(self, keep):
        """Sets the step's behavior when an empty value is found : True=keep, False=drop (default)"""
        self.step["params"]["keep"] = keep
        return self

    def with_meaning(self, keep):
        """Sets the step's behavior when an empty value is found : True=keep, False=drop (default)

        Same as :meth:`with_keep`. The name does not describe what the method
        sets; it is retained so that existing callers keep working.
        """
        return self.with_keep(keep)
349+
350+

dataikuapi/dss/ml.py

Lines changed: 1 addition & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4,50 +4,7 @@
44
import json
55
import time
66
from .metrics import ComputedMetrics
7-
8-
class DSSDatasetSelectionBuilder(object):
9-
"""Builder for a "dataset selection". In DSS, a dataset selection is used to select a part of a dataset for processing.
10-
11-
Depending on the location where it is used, a selection can include:
12-
* Sampling
13-
* Filtering by partitions (for partitioned datasets)
14-
* Filtering by an expression
15-
* Selection of columns
16-
* Ordering
17-
18-
Please see the sampling documentation of DSS for a detailed explanation of the sampling methods.
19-
20-
"""
21-
def __init__(self):
22-
self.selection = {}
23-
24-
def build(self):
25-
"""Returns the built selection dict"""
26-
return self.selection
27-
28-
def with_head_sampling(self, limit):
29-
"""Sets the sampling to 'first records' mode"""
30-
self.selection["samplingMethod"] = "HEAD_SEQUENTIAL"
31-
self.selection["maxRecords"] = limit
32-
return self
33-
34-
def with_all_data_sampling(self):
35-
"""Sets the sampling to 'no sampling, all data' mode"""
36-
self.selection["samplingMethod"] = "FULL"
37-
return self
38-
39-
def with_random_fixed_nb_sampling(self, nb):
40-
"""Sets the sampling to 'Random sampling, fixed number of records' mode"""
41-
self.selection["samplingMethod"] = "RANDOM_FIXED_NB"
42-
self.selection["maxRecords"] = nb
43-
return self
44-
45-
def with_selected_partitions(self, ids):
46-
"""Sets partition filtering on the given partition identifiers. The dataset to select must be partitioned."""
47-
self.selection["partitionSelectionMethod"] = "SELECTED"
48-
self.selection["selectedPartitions"] = ids
49-
return self
50-
7+
from .utils import DSSDatasetSelectionBuilder
518

529
class PredictionSplitParamsHandler(object):
5310
"""Object to modify the train/test splitting params."""

dataikuapi/dss/utils.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
class DSSDatasetSelectionBuilder(object):
    """Builder for a "dataset selection". In DSS, a dataset selection is used to select a part of a dataset for processing.

    Depending on the location where it is used, a selection can include:

    * Sampling
    * Filtering by partitions (for partitioned datasets)
    * Filtering by an expression
    * Selection of columns
    * Ordering

    Please see the sampling documentation of DSS for a detailed explanation of the sampling methods.

    Every ``with_*`` method updates the selection being built and returns the
    builder, so calls can be chained.
    """

    def __init__(self):
        self.selection = {}

    def build(self):
        """Returns the selection dict built so far (the builder's own dict, not a copy)"""
        return self.selection

    def with_head_sampling(self, limit):
        """Switches the sampling to 'first records' mode, with at most ``limit`` records"""
        self.selection.update(samplingMethod="HEAD_SEQUENTIAL", maxRecords=limit)
        return self

    def with_all_data_sampling(self):
        """Switches the sampling to 'no sampling, all data' mode"""
        self.selection.update(samplingMethod="FULL")
        return self

    def with_random_fixed_nb_sampling(self, nb):
        """Switches the sampling to 'Random sampling, fixed number of records' mode, with ``nb`` records"""
        self.selection.update(samplingMethod="RANDOM_FIXED_NB", maxRecords=nb)
        return self

    def with_selected_partitions(self, ids):
        """Filters on the given partition identifiers. The dataset to select must be partitioned."""
        self.selection.update(partitionSelectionMethod="SELECTED", selectedPartitions=ids)
        return self
43+
44+

0 commit comments

Comments
 (0)