|
| 1 | +from ..utils import DataikuException |
| 2 | +from ..utils import DataikuUTF8CSVReader |
| 3 | +from ..utils import DataikuStreamedHttpUTF8CSVReader |
| 4 | +import json |
| 5 | +import time |
| 6 | +from .metrics import ComputedMetrics |
| 7 | +from .ml import DSSMLTask |
| 8 | +from .utils import DSSDatasetSelectionBuilder |
| 9 | + |
class DSSAnalysisStepBuilder(object):
    """
    Fluent builder for the raw dict describing one step of a visual analysis script.

    The dict always carries ``metaType='PROCESSOR'`` and an initially empty
    ``params`` dict. Every ``with_*`` method returns the builder itself so
    calls can be chained.
    """

    def __init__(self, step_type=None, step_name=None):
        """
        :param step_type: value stored under the step's ``type`` key
        :param step_name: value stored under the step's ``name`` key
        """
        self.step = dict(metaType='PROCESSOR', type=step_type, name=step_name, params={})

    def build(self):
        """Returns the built step dict (the builder's own dict, not a copy)"""
        return self.step

    def with_type(self, step_type):
        """Sets the step's type"""
        self.step.update(type=step_type)
        return self

    def with_name(self, step_name):
        """Sets the step's name"""
        self.step.update(name=step_name)
        return self
| 27 | + |
| 28 | + |
class DSSAnalysisDefinition(object):
    """
    Object to manipulate the definition of a visual analysis

    Wraps the raw definition dict together with the analysis handle it
    belongs to. The accessors below return live references into the raw dict
    (creating missing sections on the fly), so changes made through them are
    part of what :meth:`save` sends back.
    """

    def __init__(self, analysis, acp):
        """
        :param analysis: the analysis handle; :meth:`save` calls its ``set_definition``
        :param dict acp: the raw definition dictionary
        """
        self.analysis = analysis
        self.acp = acp

    def get_raw(self):
        """
        Gets the raw dictionary of the visual analysis definition
        """
        return self.acp

    def get_raw_script(self):
        """
        Gets the raw dictionary of visual analysis' script settings (including steps, sampling, ...)

        If the definition has no ``script`` entry yet, one with an empty
        ``steps`` list is created and stored in the definition.
        """
        return self.get_raw().setdefault('script', {'steps': []})

    def get_raw_script_steps(self):
        """
        Gets the raw list of visual analysis' script steps

        An empty list is created and stored in the script if it is missing.
        """
        return self.get_raw_script().setdefault('steps', [])

    def get_raw_script_sampling(self):
        """
        Gets the raw dictionary of visual analysis' script sampling

        An empty ``explorationSampling`` dict is created and stored in the
        script if it is missing.
        """
        return self.get_raw_script().setdefault('explorationSampling', {})

    def save(self):
        """
        Shortcut to :meth:`DSSAnalysis.set_definition()`
        """
        self.analysis.set_definition(self)

    def add_step(self, step):
        """
        Add a step to the script

        :param object step: A :class:`DSSAnalysisStepBuilder` to build the settings of the step,
                            or any other object, which is appended to the steps as-is.
        """
        steps = self.get_raw_script_steps()
        if isinstance(step, DSSAnalysisStepBuilder):
            step = step.build()
        steps.append(step)

    def set_script_sampling_selection(self, selection):
        """
        Sets the sampling for the script

        :param object selection: A :class:`DSSDatasetSelectionBuilder` to build the settings of the
                                 extract of the dataset, or any other object, which is stored as-is.
        """
        sampling = self.get_raw_script_sampling()
        if isinstance(selection, DSSDatasetSelectionBuilder):
            selection = selection.build()
        sampling['selection'] = selection
| 100 | + |
| 101 | + |
class DSSAnalysis(object):
    """A handle to interact with a DSS visual analysis"""

    def __init__(self, client, project_key, analysis_id):
        """
        :param client: API client; its ``_perform_json`` / ``_perform_empty`` methods are used for every call
        :param project_key: key of the project, used to build the request paths
        :param analysis_id: identifier of the visual analysis, used to build the request paths
        """
        self.client = client
        self.project_key = project_key
        self.analysis_id = analysis_id

    ########################################################
    # Analysis deletion
    ########################################################

    def delete(self, drop_data=False):
        """
        Delete the visual analysis

        :param bool drop_data: accepted for signature compatibility only; this method
                               does not forward it to the server
        """
        return self.client._perform_empty("DELETE", "/projects/%s/lab/%s/" % (self.project_key, self.analysis_id))

    ########################################################
    # Analysis definition
    ########################################################

    def get_definition(self):
        """
        Get the definition of the analysis

        :return: a DSSAnalysisDefinition object to interact with the settings
        :rtype: :class:`dataikuapi.dss.analysis.DSSAnalysisDefinition`
        """
        acp = self.client._perform_json("GET", "/projects/%s/lab/%s/" % (self.project_key, self.analysis_id))
        return DSSAnalysisDefinition(self, acp)

    def set_definition(self, definition):
        """
        Set the definition of the analysis

        Args:
            definition: the definition, as a JSON object or a :class:`dataikuapi.dss.analysis.DSSAnalysisDefinition`.
                        You should only set a definition object that has been retrieved using the get_definition call.
        """
        if isinstance(definition, DSSAnalysisDefinition):
            # Unwrap the helper object: the request body is the raw dict.
            acp = definition.get_raw()
        else:
            acp = definition
        return self.client._perform_json("PUT", "/projects/%s/lab/%s/" % (self.project_key, self.analysis_id), body=acp)

    ########################################################
    # ML
    ########################################################

    def _create_ml_task(self, settings):
        """POSTs ``settings`` to this analysis' models endpoint and wraps the returned ``mlTaskId`` in a DSSMLTask"""
        ref = self.client._perform_json("POST", "/projects/%s/lab/%s/models/" % (self.project_key, self.analysis_id), body=settings)
        return DSSMLTask(self.client, self.project_key, self.analysis_id, ref["mlTaskId"])

    def create_prediction_ml_task(self, target_variable,
                                  ml_backend_type="PY_MEMORY",
                                  guess_policy="DEFAULT"):
        """Creates a new prediction task in this visual analysis lab
        for a dataset.

        The returned ML task will be in 'guessing' state, i.e. analyzing
        the input dataset to determine feature handling and algorithms.

        You should wait for the guessing to be completed by calling
        ``wait_guess_complete`` on the returned object before doing anything
        else (in particular calling ``train`` or ``get_settings``)

        :param string target_variable: sent as the ``targetVariable`` of the new task
        :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
        :param string guess_policy: Policy to use for setting the default parameters. Valid values are: DEFAULT, SIMPLE_FORMULA, DECISION_TREE, EXPLANATORY and PERFORMANCE
        :return: A :class:`dataikuapi.dss.ml.DSSMLTask` ML task handle
        """
        return self._create_ml_task({
            "taskType": "PREDICTION",
            "targetVariable": target_variable,
            "backendType": ml_backend_type,
            "guessPolicy": guess_policy,
        })

    def create_clustering_ml_task(self,
                                  ml_backend_type="PY_MEMORY",
                                  guess_policy="KMEANS"):
        """Creates a new clustering task in this visual analysis lab
        for a dataset.

        The returned ML task will be in 'guessing' state, i.e. analyzing
        the input dataset to determine feature handling and algorithms.

        You should wait for the guessing to be completed by calling
        ``wait_guess_complete`` on the returned object before doing anything
        else (in particular calling ``train`` or ``get_settings``)

        :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
        :param string guess_policy: Policy to use for setting the default parameters. Valid values are: KMEANS and ANOMALY_DETECTION
        :return: A :class:`dataikuapi.dss.ml.DSSMLTask` ML task handle
        """
        return self._create_ml_task({
            "taskType": "CLUSTERING",
            "backendType": ml_backend_type,
            "guessPolicy": guess_policy,
        })

    def list_ml_tasks(self):
        """
        List the ML tasks in this visual analysis

        Returns:
            the list of the ML tasks summaries, each one as a JSON object
        """
        return self.client._perform_json("GET", "/projects/%s/lab/%s/models/" % (self.project_key, self.analysis_id))

    def get_ml_task(self, mltask_id):
        """
        Get a handle to interact with a specific ML task

        Args:
            mltask_id: the identifier of the desired ML task

        Returns:
            A :class:`dataikuapi.dss.ml.DSSMLTask` ML task handle
        """
        return DSSMLTask(self.client, self.project_key, self.analysis_id, mltask_id)
| 234 | + |
| 235 | + |
| 236 | +# some basic steps |
class DSSFormulaStepBuilder(DSSAnalysisStepBuilder):
    """
    Builder for a step of type ``CreateColumnWithGREL``.

    Each setter writes one entry of the step's ``params`` dict and returns
    the builder for chaining.
    """

    def __init__(self, step_name=None):
        """
        :param step_name: name given to the step
        """
        super(DSSFormulaStepBuilder, self).__init__(step_type='CreateColumnWithGREL', step_name=step_name)

    def with_output_column(self, column_name):
        """Sets the step's output column's name"""
        params = self.step["params"]
        params["column"] = column_name
        return self

    def with_error_column(self, column_name):
        """Sets the step's error column's name (stored as ``errorColumn``)"""
        params = self.step["params"]
        params["errorColumn"] = column_name
        return self

    def with_expression(self, expression):
        """Sets the step's expression"""
        params = self.step["params"]
        params["expression"] = expression
        return self
| 255 | + |
class AppliesToStepBuilder(DSSAnalysisStepBuilder):
    """
    Base builder for steps whose params select the columns they apply to.

    The selection mode is stored under ``appliesTo`` and starts as
    ``SINGLE_COLUMN``. Every setter returns the builder for chaining.
    """

    def __init__(self, step_type=None, step_name=None):
        """
        :param step_type: type of the step
        :param step_name: name of the step
        """
        super(AppliesToStepBuilder, self).__init__(step_type=step_type, step_name=step_name)
        self.step["params"]["appliesTo"] = 'SINGLE_COLUMN'

    def with_column_selection_mode(self, column_selection_mode):
        """Sets the step's column selection mode (SINGLE_COLUMN, COLUMNS, PATTERN, ALL)"""
        self.step["params"]["appliesTo"] = column_selection_mode
        return self

    def with_columns(self, *column_names):
        """Sets the step's selected columns; each positional argument is one column name"""
        self.step["params"]["columns"] = list(column_names)
        return self

    def with_column_regex(self, regex):
        """Sets the step's column selection regular expression"""
        self.step["params"]["appliesToPattern"] = regex
        return self

    def with_single_column_selection(self, column_name):
        """Sets the step as applying to a single column"""
        return self.with_column_selection_mode('SINGLE_COLUMN').with_columns(column_name)

    def with_multiple_column_selection(self, *column_names):
        """Sets the step as applying to several columns, one positional argument per column name"""
        # Unpack the names: passing the tuple itself would store a one-element
        # list containing that tuple instead of the list of column names.
        return self.with_column_selection_mode('COLUMNS').with_columns(*column_names)

    def with_regex_column_selection(self, regex):
        """Sets the step as applying to the columns matching a regular expression"""
        return self.with_column_selection_mode('PATTERN').with_column_regex(regex)

    def with_all_column_selection(self, column_name=None):
        """
        Sets the step as applying to all columns

        :param column_name: ignored; still accepted (now optional) so existing callers keep working
        """
        return self.with_column_selection_mode('ALL')
| 291 | + |
class FilterAndFlagStepBuilder(AppliesToStepBuilder):
    """
    Base builder for steps that act on matching rows or cells.

    On top of the column selection params, the step starts with
    ``booleanMode='AND'`` and ``action='REMOVE_ROW'``. Every setter returns
    the builder for chaining.
    """

    def __init__(self, step_type=None, step_name=None):
        """
        :param step_type: type of the step
        :param step_name: name of the step
        """
        super(FilterAndFlagStepBuilder, self).__init__(step_type=step_type, step_name=step_name)
        self.step["params"].update(booleanMode='AND', action='REMOVE_ROW')

    def with_action(self, action):
        """Sets the step's action on match (KEEP_ROW, REMOVE_ROW, CLEAR_CELL, DONTCLEAR_CELL, FLAG)"""
        self.step["params"].update(action=action)
        return self

    def with_boolean_mode(self, boolean_mode):
        """Sets the step's mode for combining matches in different columns (AND, OR)"""
        self.step["params"].update(booleanMode=boolean_mode)
        return self

    def with_flag_column(self, column_name):
        """Sets the step's column for outputting the flag"""
        self.step["params"].update(flagColumn=column_name)
        return self
| 312 | + |
class FilterOnValueStepBuilder(FilterAndFlagStepBuilder):
    """
    Builder for a step of type ``FlagOnValue``.

    Every setter writes one entry of the step's ``params`` dict and returns
    the builder for chaining.
    """

    def __init__(self, step_name=None):
        """
        :param step_name: name given to the step
        """
        super(FilterOnValueStepBuilder, self).__init__(step_type='FlagOnValue', step_name=step_name)

    def with_values(self, *values):
        """Sets the step's flagged values; each positional argument is one value"""
        params = self.step["params"]
        params["values"] = list(values)
        return self

    def with_matching_mode(self, matching_mode):
        """Sets the step's matching_mode (FULL_STRING, SUBSTRING, PATTERN)"""
        params = self.step["params"]
        params["matchingMode"] = matching_mode
        return self

    def with_normalization_mode(self, normalization_mode):
        """Sets the step's normalization_mode (EXACT, LOWERCASE, NORMALIZED)"""
        params = self.step["params"]
        params["normalizationMode"] = normalization_mode
        return self
| 331 | + |
class FilterOnBadTypeStepBuilder(FilterAndFlagStepBuilder):
    """Builder for a step of type ``FilterOnBadType``."""

    def __init__(self, step_name=None):
        """
        :param step_name: name given to the step
        """
        super(FilterOnBadTypeStepBuilder, self).__init__(step_type='FilterOnBadType', step_name=step_name)

    def with_meaning(self, meaning):
        """Sets the step's meaning to check (stored under the ``type`` param)"""
        params = self.step["params"]
        params["type"] = meaning
        return self
| 340 | + |
class RemoveRowsStepBuilder(AppliesToStepBuilder):
    """Builder for a step of type ``RemoveRowsOnEmpty``."""

    def __init__(self, step_name=None):
        """
        :param step_name: name given to the step
        """
        super(RemoveRowsStepBuilder, self).__init__(step_type='RemoveRowsOnEmpty', step_name=step_name)

    def with_keep(self, keep):
        """Sets the step's behavior when an empty value is found : True=keep, False=drop (default)"""
        self.step["params"]["keep"] = keep
        return self

    def with_meaning(self, keep):
        """
        Backward-compatible alias of :meth:`with_keep`

        Despite its name, this sets the ``keep`` flag, not a meaning.
        """
        return self.with_keep(keep)
| 349 | + |
| 350 | + |
0 commit comments