
Commit dabb26f

ML API: add synchronous method for create / train / ensemble
1 parent: aa5d4ea

File tree: 3 files changed (+101, -37 lines)

  dataikuapi/dss/ml.py
  dataikuapi/dss/project.py
  dataikuapi/dssclient.py


dataikuapi/dss/ml.py

Lines changed: 57 additions & 11 deletions
@@ -664,7 +664,8 @@ def __init__(self, client, project_key, analysis_id, mltask_id):
 
     def wait_guess_complete(self):
         """
-        Waits for guess to be complete. This should be called immediately after the creation of a new ML Task,
+        Waits for guess to be complete. This should be called immediately after the creation of a new ML Task
+        (if the ML Task was created with wait_guess_complete=False),
         before calling ``get_settings`` or ``train``
         """
         while True:
@@ -699,22 +700,49 @@ def get_settings(self):
         else:
             return DSSClusteringMLTaskSettings(self.client, self.project_key, self.analysis_id, self.mltask_id, settings)
 
-    def start_train(self, session_name=None, session_description=None):
+    def train(self, session_name=None, session_description=None):
         """
-        Starts asynchronously a new train session for this ML Task.
-
+        Trains models for this ML Task
+
         :param str session_name: name for the session
         :param str session_description: description for the session
-
-        This returns immediately, before train is complete. To wait for train to complete, use ``wait_train_complete()``
+
+        This method waits for train to complete. If you want to train asynchronously, use :meth:`start_train` and :meth:`wait_train_complete`
+
+        This method returns the list of trained model identifiers. It returns models that have been trained for this train
+        session, not all trained models for this ML task. To get all identifiers for all models trained across all training sessions,
+        use :meth:`get_trained_models_ids`
+
+        These identifiers can be used for :meth:`get_trained_model_snippet`, :meth:`get_trained_model_details` and :meth:`deploy_to_flow`
+
+        :return: A list of model identifiers
+        :rtype: list of strings
         """
-        session_info = {
-            "sessionName" : session_name,
-            "sessionDescription" : session_description
-        }
+        train_ret = self.start_train(session_name, session_description)
+        self.wait_train_complete()
+        return self.get_trained_models_ids(session_id = train_ret["sessionId"])
 
-        return self.client._perform_json(
-            "POST", "/projects/%s/models/lab/%s/%s/train" % (self.project_key, self.analysis_id, self.mltask_id), body=session_info)
+    def ensemble(self, model_ids=[], method=None):
+        """
+        Create an ensemble model of a set of models
+
+        :param list model_ids: A list of model identifiers
+        :param str method: the ensembling method. One of: AVERAGE, PROBA_AVERAGE, MEDIAN, VOTE, LINEAR_MODEL, LOGISTIC_MODEL
+
+        This method waits for the ensemble train to complete. If you want to train asynchronously, use :meth:`start_ensembling` and :meth:`wait_train_complete`
+
+        This method returns the identifier of the trained ensemble.
+        To get all identifiers for all models trained across all training sessions,
+        use :meth:`get_trained_models_ids`
+
+        This identifier can be used for :meth:`get_trained_model_snippet`, :meth:`get_trained_model_details` and :meth:`deploy_to_flow`
+
+        :return: A model identifier
+        :rtype: string
+        """
+        train_ret = self.start_ensembling(model_ids, method)
+        self.wait_train_complete()
+        return train_ret
 
     def start_ensembling(self, model_ids=[], method=None):
         """
@@ -736,6 +764,24 @@ def start_ensembling(self, model_ids=[], method=None):
         return self.client._perform_json(
             "POST", "/projects/%s/models/lab/%s/%s/ensemble" % (self.project_key, self.analysis_id, self.mltask_id), body=ensembling_request)['id']
 
+
+    def start_train(self, session_name=None, session_description=None):
+        """
+        Starts asynchronously a new train session for this ML Task.
+
+        :param str session_name: name for the session
+        :param str session_description: description for the session
+
+        This returns immediately, before train is complete. To wait for train to complete, use ``wait_train_complete()``
+        """
+        session_info = {
+            "sessionName" : session_name,
+            "sessionDescription" : session_description
+        }
+
+        return self.client._perform_json(
+            "POST", "/projects/%s/models/lab/%s/%s/train" % (self.project_key, self.analysis_id, self.mltask_id), body=session_info)
+
     def wait_train_complete(self):
         """
         Waits for train to be complete.
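
Taken together, the ml.py changes make ``train`` and ``ensemble`` blocking convenience wrappers around ``start_train`` / ``start_ensembling`` plus ``wait_train_complete``. A minimal usage sketch follows; the ``mltask`` handle, session labels and choice of ensembling method are illustrative and not part of this commit.

# Assumes an existing DSSMLTask handle, e.g. the one returned by
# create_prediction_ml_task() in the project.py changes below.

# Synchronous train: blocks until the session finishes and returns only the
# identifiers of the models trained in this session.
ids = mltask.train(session_name="baseline", session_description="first run")

# Synchronous ensemble of those models: blocks until the ensemble is trained
# and returns a single model identifier.
ensemble_id = mltask.ensemble(model_ids=ids, method="AVERAGE")

# The identifiers can then be passed to get_trained_model_snippet(),
# get_trained_model_details() or deploy_to_flow().

# The previous asynchronous flow is still available:
mltask.start_train(session_name="async session")
mltask.wait_train_complete()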

dataikuapi/dss/project.py

Lines changed: 27 additions & 24 deletions
@@ -133,25 +133,30 @@ def get_dataset(self, dataset_name):
         """
         Get a handle to interact with a specific dataset
 
-        Args:
-            dataset_name: the name of the desired dataset
+        :param string dataset_name: the name of the desired dataset
 
-        Returns:
-            A :class:`dataikuapi.dss.dataset.DSSDataset` dataset handle
+        :returns: A :class:`dataikuapi.dss.dataset.DSSDataset` dataset handle
         """
         return DSSDataset(self.client, self.project_key, dataset_name)
 
     def create_dataset(self, dataset_name, type,
                 params={}, formatType=None, formatParams={}):
         """
-        Create a new dataset in the project, and return a handle to interact with it
+        Create a new dataset in the project, and return a handle to interact with it.
+
+        The precise structure of ``params`` and ``formatParams`` depends on the specific dataset
+        type and dataset format type. To know which fields exist for a given dataset type and format type,
+        create a dataset from the UI, and use :meth:`get_dataset` to retrieve the configuration
+        of the dataset and inspect it. Then reproduce a similar structure in the :meth:`create_dataset` call.
+
+        Not all settings of a dataset can be set at creation time (for example partitioning). After creation,
+        you'll have the ability to modify the dataset
 
-        Args:
-            dataset_name: the name for the new dataset
-            type: the type of the dataset
-            params: the parameters for the type, as a JSON object
-            formatType: an optional format to create the dataset with
-            formatParams: the parameters to the format, as a JSON object
+        :param string dataset_name: the name for the new dataset
+        :param string type: the type of the dataset
+        :param dict params: the parameters for the type, as a JSON object
+        :param string formatType: an optional format to create the dataset with (only for file-oriented datasets)
+        :param string formatParams: the parameters to the format, as a JSON object (only for file-oriented datasets)
 
         Returns:
             A :class:`dataikuapi.dss.dataset.DSSDataset` dataset handle
@@ -173,25 +178,20 @@ def create_dataset(self, dataset_name, type,
     ########################################################
 
     def create_prediction_ml_task(self, input_dataset, target_variable,
-            ml_backend_type = "PY_MEMORY",
-            guess_policy = "DEFAULT"):
-
+            ml_backend_type = "PY_MEMORY",
+            guess_policy = "DEFAULT",
+            wait_guess_complete=True):
 
         """Creates a new prediction task in a new visual analysis lab
         for a dataset.
 
-
-        The returned ML task will be in 'guessing' state, i.e. analyzing
-        the input dataset to determine feature handling and algorithms.
-
-        You should wait for the guessing to be completed by calling
-        ``wait_guess_complete`` on the returned object before doing anything
-        else (in particular calling ``train`` or ``get_settings``)
-
         :param string ml_backend_type: ML backend to use, one of PY_MEMORY, MLLIB or H2O
         :param string guess_policy: Policy to use for setting the default parameters. Valid values are: DEFAULT, SIMPLE_FORMULA, DECISION_TREE, EXPLANATORY and PERFORMANCE
+        :param boolean wait_guess_complete: if False, the returned ML task will be in 'guessing' state, i.e. analyzing the input dataset to determine feature handling and algorithms.
+            You should wait for the guessing to be completed by calling
+            ``wait_guess_complete`` on the returned object before doing anything
+            else (in particular calling ``train`` or ``get_settings``)
         """
-
         obj = {
             "inputDataset" : input_dataset,
             "taskType" : "PREDICTION",
@@ -201,7 +201,10 @@ def create_prediction_ml_task(self, input_dataset, target_variable,
         }
 
         ref = self.client._perform_json("POST", "/projects/%s/models/lab/" % self.project_key, body=obj)
-        return DSSMLTask(self.client, self.project_key, ref["analysisId"], ref["mlTaskId"])
+        ret = DSSMLTask(self.client, self.project_key, ref["analysisId"], ref["mlTaskId"])
+        if wait_guess_complete:
+            ret.wait_guess_complete()
+        return ret
 
     def create_clustering_ml_task(self, input_dataset,
                 ml_backend_type = "PY_MEMORY",
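
On the project side, the new ``wait_guess_complete`` flag makes the common path a one-liner. A short sketch under stated assumptions: the ``client`` / ``project`` handles already exist (e.g. ``project = client.get_project("MYPROJECT")``), and the dataset name, type and params are placeholders whose exact structure depends on the dataset type, as the updated docstring explains.

# Hypothetical dataset creation; params/formatParams vary by dataset type.
dataset = project.create_dataset("mydataset", "Filesystem",
        params={"connection": "filesystem_root", "path": "input/data"},
        formatType="csv")

# With the new default wait_guess_complete=True, the returned ML task has
# already finished guessing and can be used immediately.
mltask = project.create_prediction_ml_task("mydataset", "target_column",
        guess_policy="DEFAULT")
settings = mltask.get_settings()
model_ids = mltask.train()

# Passing wait_guess_complete=False restores the previous behaviour; the
# caller must then call mltask.wait_guess_complete() before get_settings()
# or train().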

dataikuapi/dssclient.py

Lines changed: 17 additions & 2 deletions
@@ -596,19 +596,31 @@ def get_general_settings(self):
     ########################################################
 
     def create_project_from_bundle_local_archive(self, archive_path):
+        """
+        Create a project from a bundle archive.
+        Warning: this method can only be used on an automation node.
+
+        :param string archive_path: Path on the local machine where the archive is
+        """
         return self._perform_json("POST",
                 "/projectsFromBundle/fromArchive",
                 params = { "archivePath" : osp.abspath(archive_path) })
 
     def create_project_from_bundle_archive(self, fp):
+        """
+        Create a project from a bundle archive (as a file object)
+        Warning: this method can only be used on an automation node.
+
+        :param string fp: A file-like object pointing to a bundle archive zip
+        """
         files = {'file': fp }
         return self._perform_json("POST",
                 "/projectsFromBundle/", files=files)
 
-
     def prepare_project_import(self, f):
         """
-        Prepares import of a project archive
+        Prepares import of a project archive.
+        Warning: this method can only be used on a design node.
 
         :param file-like fp: the input stream, as a file-like object
         :returns: a :class:`TemporaryImportHandle` to interact with the prepared import
@@ -624,6 +636,9 @@ def prepare_project_import(self, f):
     ########################################################
 
     def catalog_index_connections(self, connection_names=[], all_connections=False, indexing_mode="FULL"):
+        """
+        Triggers an indexing of multiple connections in the data catalog
+        """
         return self._perform_json("POST", "/catalog/index", body={
             "connectionNames": connection_names,
             "indexAllConnections": all_connections,
