Skip to content

Commit 4664ef4

Browse files
authored
Merge pull request #21 from dataiku/feature/dss42-ml-api
ML API (DSS 4.2)
2 parents 438770d + 16d838f commit 4664ef4

File tree

11 files changed

+1440
-32
lines changed

11 files changed

+1440
-32
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
*.pyc
2+
.idea

HISTORY.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,16 @@
11
Changelog
22
==========
33

4+
4.1.0 (2018-01-10)
5+
-------------------
6+
7+
* Initial release for DSS 4.1
8+
9+
4.0.0 (2018-01-10)
10+
------------------
11+
12+
* Initial release for DSS 4.0
13+
414
3.1.4 (2017-01-03)
515
-------------------
616

dataikuapi/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@
33
from .apinode_client import APINodeClient
44
from .apinode_admin_client import APINodeAdminClient
55

6-
from .dss.recipe import GroupingRecipeCreator, JoinRecipeCreator, StackRecipeCreator, WindowRecipeCreator, SyncRecipeCreator, SamplingRecipeCreator, SQLQueryRecipeCreator, CodeRecipeCreator, SplitRecipeCreator, SortRecipeCreator, TopNRecipeCreator, DistinctRecipeCreator, DownloadRecipeCreator
6+
from .dss.recipe import GroupingRecipeCreator, JoinRecipeCreator, StackRecipeCreator, WindowRecipeCreator, SyncRecipeCreator, SamplingRecipeCreator, SQLQueryRecipeCreator, CodeRecipeCreator, SplitRecipeCreator, SortRecipeCreator, TopNRecipeCreator, DistinctRecipeCreator, DownloadRecipeCreator, PredictionScoringRecipeCreator, ClusteringScoringRecipeCreator
77

88
from .dss.admin import DSSUserImpersonationRule, DSSGroupImpersonationRule

dataikuapi/dss/analysis.py

Lines changed: 350 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,350 @@
1+
from ..utils import DataikuException
2+
from ..utils import DataikuUTF8CSVReader
3+
from ..utils import DataikuStreamedHttpUTF8CSVReader
4+
import json
5+
import time
6+
from .metrics import ComputedMetrics
7+
from .ml import DSSMLTask
8+
from .utils import DSSDatasetSelectionBuilder
9+
10+
class DSSAnalysisStepBuilder(object):
    """
    Fluent builder for the dict describing one step of a visual analysis script.

    The dict is created with ``metaType`` set to ``'PROCESSOR'``, the given type
    and name, and an empty ``params`` dict. Setters return the builder itself so
    that calls can be chained.
    """

    def __init__(self, step_type=None, step_name=None):
        """
        :param step_type: value stored under the step's ``type`` key
        :param step_name: value stored under the step's ``name`` key
        """
        self.step = {'metaType':'PROCESSOR', 'type':step_type, 'name':step_name, 'params':{}}

    def build(self):
        """Returns the built step dict (the builder's own dict, not a copy)"""
        return self.step

    def with_type(self, step_type):
        """Sets the step's type"""
        self.step["type"] = step_type
        return self

    def with_name(self, step_name):
        """Sets the step's name"""
        self.step["name"] = step_name
        return self
27+
28+
29+
class DSSAnalysisDefinition(object):
    """
    Object to manipulate the definition of a visual analysis
    """

    def __init__(self, analysis, acp):
        """
        :param analysis: the handle whose ``set_definition`` is called by :meth:`save`
        :param dict acp: the raw definition; it is modified in place by the accessors below
        """
        self.analysis = analysis
        self.acp = acp

    def get_raw(self):
        """
        Gets the raw dictionary of the visual analysis definition
        """
        return self.acp

    def get_raw_script(self):
        """
        Gets the raw dictionary of visual analysis' script settings (including steps, sampling, ...)

        If the definition has no ``script`` entry, one with an empty step list is created.
        """
        return self.get_raw().setdefault('script', {'steps':[]})

    def get_raw_script_steps(self):
        """
        Gets the raw list of visual analysis' script steps

        If the script has no ``steps`` entry, an empty list is created.
        """
        return self.get_raw_script().setdefault('steps', [])

    def get_raw_script_sampling(self):
        """
        Gets the raw dictionary of visual analysis' script sampling

        If the script has no ``explorationSampling`` entry, an empty dict is created.
        """
        return self.get_raw_script().setdefault('explorationSampling', {})

    def save(self):
        """
        Shortcut to :meth:`DSSAnalysis.set_definition()`
        """
        self.analysis.set_definition(self)

    def add_step(self, step):
        """
        Add a step to the script

        :param object step: A :class:`DSSAnalysisStepBuilder` to build the settings of the step.
                            Any other object is appended to the steps as is.
        """
        steps = self.get_raw_script_steps()
        if isinstance(step, DSSAnalysisStepBuilder):
            step = step.build()
        steps.append(step)

    def set_script_sampling_selection(self, selection):
        """
        Sets the sampling for the script

        :param object selection: A :class:`DSSDatasetSelectionBuilder` to build the settings of the extract of the dataset.
                                 Any other object is stored as the selection as is.
        """
        sampling = self.get_raw_script_sampling()
        if isinstance(selection, DSSDatasetSelectionBuilder):
            selection = selection.build()
        sampling['selection'] = selection
100+
101+
102+
class DSSAnalysis(object):
    """A handle to interact with a DSS visual analysis"""

    def __init__(self, client, project_key, analysis_id):
        self.client = client
        self.project_key = project_key
        self.analysis_id = analysis_id

    def _lab_url(self, suffix=""):
        """Builds the API path of this analysis, optionally followed by ``suffix``"""
        return "/projects/%s/lab/%s/%s" % (self.project_key, self.analysis_id, suffix)

    def _create_ml_task(self, obj):
        """POSTs an ML task creation request and wraps the returned ``mlTaskId`` in a handle"""
        ref = self.client._perform_json("POST", self._lab_url("models/"), body=obj)
        return DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"])

    ########################################################
    # Analysis deletion
    ########################################################

    def delete(self, drop_data=False):
        """
        Delete the visual analysis

        :param bool drop_data: accepted for backward compatibility; this method
                               does not use it and does not send it to the backend
        """
        return self.client._perform_empty("DELETE", self._lab_url())

    ########################################################
    # Analysis definition
    ########################################################

    def get_definition(self):
        """
        Get the definition of the analysis

        :return: a DSSAnalysisDefinition object to interact with the settings
        :rtype: :class:`dataikuapi.dss.analysis.DSSAnalysisDefinition`
        """
        acp = self.client._perform_json("GET", self._lab_url())
        return DSSAnalysisDefinition(self, acp)

    def set_definition(self, definition):
        """
        Set the definition of the analysis

        Args:
            definition: the definition, as a JSON object or a :class:`dataikuapi.dss.analysis.DSSAnalysisDefinition`.
                        You should only set a definition object that has been retrieved using the get_definition call.
        """
        if isinstance(definition, DSSAnalysisDefinition):
            acp = definition.get_raw()
        else:
            acp = definition
        return self.client._perform_json("PUT", self._lab_url(), body=acp)

    ########################################################
    # ML
    ########################################################

    def create_prediction_ml_task(self, target_variable,
                                  ml_backend_type = "PY_MEMORY",
                                  guess_policy = "DEFAULT"):
        """Creates a new prediction task in this visual analysis lab.

        The returned ML task will be in 'guessing' state, i.e. analyzing
        the input dataset to determine feature handling and algorithms.

        You should wait for the guessing to be completed by calling
        ``wait_guess_complete`` on the returned object before doing anything
        else (in particular calling ``train`` or ``get_settings``)

        :param string target_variable: sent as ``targetVariable`` in the request (presumably the column to predict)
        :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
        :param string guess_policy: Policy to use for setting the default parameters. Valid values are: DEFAULT, SIMPLE_FORMULA, DECISION_TREE, EXPLANATORY and PERFORMANCE
        :return: a :class:`dataikuapi.dss.ml.DSSMLTask` handle on the created task
        """
        obj = {
            "taskType" : "PREDICTION",
            "targetVariable" : target_variable,
            "backendType": ml_backend_type,
            "guessPolicy": guess_policy
        }
        return self._create_ml_task(obj)

    def create_clustering_ml_task(self,
                                  ml_backend_type = "PY_MEMORY",
                                  guess_policy = "KMEANS"):
        """Creates a new clustering task in this visual analysis lab.

        The returned ML task will be in 'guessing' state, i.e. analyzing
        the input dataset to determine feature handling and algorithms.

        You should wait for the guessing to be completed by calling
        ``wait_guess_complete`` on the returned object before doing anything
        else (in particular calling ``train`` or ``get_settings``)

        :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
        :param string guess_policy: Policy to use for setting the default parameters. Valid values are: KMEANS and ANOMALY_DETECTION
        :return: a :class:`dataikuapi.dss.ml.DSSMLTask` handle on the created task
        """
        obj = {
            "taskType" : "CLUSTERING",
            "backendType": ml_backend_type,
            "guessPolicy": guess_policy
        }
        return self._create_ml_task(obj)

    def list_ml_tasks(self):
        """
        List the ML tasks in this visual analysis

        Returns:
            the list of the ML tasks summaries, each one as a JSON object
        """
        return self.client._perform_json("GET", self._lab_url("models/"))

    def get_ml_task(self, mltask_id):
        """
        Get a handle to interact with a specific ML task

        Args:
            mltask_id: the identifier of the desired ML task

        Returns:
            A :class:`dataikuapi.dss.ml.DSSMLTask` ML task handle
        """
        return DSSMLTask(self.client, self.project_key, self.analysis_id, mltask_id)
234+
235+
236+
# some basic steps
237+
class DSSFormulaStepBuilder(DSSAnalysisStepBuilder):
    """Builder for a step of type ``CreateColumnWithGREL`` (formula step)"""

    def __init__(self, step_name=None):
        super(DSSFormulaStepBuilder, self).__init__(step_type='CreateColumnWithGREL', step_name=step_name)

    def with_output_column(self, column_name):
        """Sets the step's output column's name"""
        self.step["params"]["column"] = column_name
        return self

    def with_error_column(self, column_name):
        """Sets the step's error column's name"""
        self.step["params"]["errorColumn"] = column_name
        return self

    def with_expression(self, expression):
        """Sets the step's expression"""
        self.step["params"]["expression"] = expression
        return self
255+
256+
class AppliesToStepBuilder(DSSAnalysisStepBuilder):
    """
    Base builder for steps that apply to a selection of columns.

    The ``appliesTo`` param starts as ``'SINGLE_COLUMN'``.
    """

    def __init__(self, step_type=None, step_name=None):
        super(AppliesToStepBuilder, self).__init__(step_type=step_type, step_name=step_name)
        self.step["params"]["appliesTo"] = 'SINGLE_COLUMN'

    def with_column_selection_mode(self, column_selection_mode):
        """Sets the step's column selection mode (SINGLE_COLUMN, COLUMNS, PATTERN, ALL)"""
        self.step["params"]["appliesTo"] = column_selection_mode
        return self

    def with_columns(self, *column_names):
        """Sets the step's selected columns"""
        self.step["params"]["columns"] = list(column_names)
        return self

    def with_column_regex(self, regex):
        """Sets the step's column selection regular expression"""
        self.step["params"]["appliesToPattern"] = regex
        return self

    def with_single_column_selection(self, column_name):
        """Sets the step as applying to a single column"""
        return self.with_column_selection_mode('SINGLE_COLUMN').with_columns(column_name)

    def with_multiple_column_selection(self, *column_names):
        """Sets the step as applying to several columns, given as separate arguments"""
        # Unpack the names: passing the tuple itself would store a single
        # tuple element in "columns" instead of one entry per column.
        return self.with_column_selection_mode('COLUMNS').with_columns(*column_names)

    def with_regex_column_selection(self, regex):
        """Sets the step as applying to the columns selected by a regular expression"""
        return self.with_column_selection_mode('PATTERN').with_column_regex(regex)

    def with_all_column_selection(self, column_name=None):
        """
        Sets the step as applying to all columns

        :param column_name: unused; kept (now optional) so that existing callers passing it still work
        """
        return self.with_column_selection_mode('ALL')
291+
292+
class FilterAndFlagStepBuilder(AppliesToStepBuilder):
    """
    Base builder for filtering / flagging steps.

    Starts with ``booleanMode`` set to ``'AND'`` and ``action`` set to ``'REMOVE_ROW'``.
    """

    def __init__(self, step_type=None, step_name=None):
        super(FilterAndFlagStepBuilder, self).__init__(step_type=step_type, step_name=step_name)
        params = self.step["params"]
        params["booleanMode"] = 'AND'
        params["action"] = 'REMOVE_ROW'

    def with_action(self, action):
        """Sets the step's action on match (KEEP_ROW, REMOVE_ROW, CLEAR_CELL, DONTCLEAR_CELL, FLAG)"""
        params = self.step["params"]
        params["action"] = action
        return self

    def with_boolean_mode(self, boolean_mode):
        """Sets the step's mode for combining matches in different columns (AND, OR)"""
        params = self.step["params"]
        params["booleanMode"] = boolean_mode
        return self

    def with_flag_column(self, column_name):
        """Sets the column in which the step outputs the flag"""
        params = self.step["params"]
        params["flagColumn"] = column_name
        return self
312+
313+
class FilterOnValueStepBuilder(FilterAndFlagStepBuilder):
    """Builder for a filter / flag step matching on values (step type ``FlagOnValue``)"""

    def __init__(self, step_name=None):
        super(FilterOnValueStepBuilder, self).__init__(step_type='FlagOnValue', step_name=step_name)

    def with_values(self, *values):
        """Sets the values the step matches on, given as separate arguments"""
        params = self.step["params"]
        params["values"] = list(values)
        return self

    def with_matching_mode(self, matching_mode):
        """Sets the step's matching mode (FULL_STRING, SUBSTRING, PATTERN)"""
        params = self.step["params"]
        params["matchingMode"] = matching_mode
        return self

    def with_normalization_mode(self, normalization_mode):
        """Sets the step's normalization mode (EXACT, LOWERCASE, NORMALIZED)"""
        params = self.step["params"]
        params["normalizationMode"] = normalization_mode
        return self
331+
332+
class FilterOnBadTypeStepBuilder(FilterAndFlagStepBuilder):
    """Builder for a filter / flag step of type ``FilterOnBadType``"""

    def __init__(self, step_name=None):
        super(FilterOnBadTypeStepBuilder, self).__init__(step_type='FilterOnBadType', step_name=step_name)

    def with_meaning(self, meaning):
        """Sets the meaning the step checks values against (stored as the ``type`` param)"""
        params = self.step["params"]
        params["type"] = meaning
        return self
340+
341+
class RemoveRowsStepBuilder(AppliesToStepBuilder):
    """Builder for a step of type ``RemoveRowsOnEmpty``"""

    def __init__(self, step_name=None):
        super(RemoveRowsStepBuilder, self).__init__(step_type='RemoveRowsOnEmpty', step_name=step_name)

    def with_keep(self, keep):
        """Sets the step's behavior when an empty value is found : True=keep, False=drop (default)"""
        self.step["params"]["keep"] = keep
        return self

    def with_meaning(self, keep):
        """
        Backward-compatible alias of :meth:`with_keep`.

        Despite its name, this sets the ``keep`` flag and not a meaning.
        """
        return self.with_keep(keep)
349+
350+

0 commit comments

Comments
 (0)