[sc-71083] Scratching itches on the Model Evaluation Stores API (#180)

michaelbourhis · web-flow · commit e3192b11310f · 2021-10-19T17:56:48.000+02:00
* Rename get_full_info to get_evaluation_full_info
Add getters for properties of DSSModelEvaluationFullInfo
Document data drift parameters and results

* Rollback renaming of get_full_info to get_evaluation_full_info

* Add consistency to naming styles in the documentation

* Rename ColumnReport to ColumnSettings

* Change some getters to properties for primitive types
Return raw data for get_raw in UnivariateDriftResult and add a property for getting per column data drift info.

* Taking into account PR review

* Removing unimportant helpers

* Use a list comprehension instead of a `list(map())` for wider compatibility

* Remove _sample_size attributes

* Fix parameter naming & doc

* Better doc for `enabled` param of `PerColumnDriftParamBuilder.with_column_drift_param`
diff --git a/dataikuapi/dss/modelevaluationstore.py b/dataikuapi/dss/modelevaluationstore.py
@@ -5,8 +5,6 @@
 from .discussion import DSSObjectDiscussions
 from .future import DSSFuture
 
-from requests import utils
-
 try:
     basestring
 except NameError:
@@ -169,7 +167,7 @@ def delete_model_evaluations(self, evaluations):
 
     def build(self, job_type="NON_RECURSIVE_FORCED_BUILD", wait=True, no_fail=False):
         """
-        Starts a new job to build this Model Evaluation Store and wait for it to complete.
+        Starts a new job to build this model evaluation store and wait for it to complete.
         Raises if the job failed.
 
         .. code-block:: python
@@ -274,6 +272,8 @@ def __init__(self, model_evaluation_store, evaluation_id):
     def get_full_info(self):
         """
         Retrieve the model evaluation with its performance data
+
+        :return: the model evaluation full info, as a :class:`dataikuapi.dss.modelevaluationstore.DSSModelEvaluationFullInfo`
         """
         data = self.client._perform_json(
             "GET", "/projects/%s/modelevaluationstores/%s/evaluations/%s" % (self.project_key, self.mes_id, self.evaluation_id))
@@ -301,21 +301,25 @@ def compute_data_drift(self, reference=None, data_drift_params=None, wait=True):
         :param reference: saved model version (full ID or DSSTrainedPredictionModelDetails)
                 or model evaluation (full ID or DSSModelEvaluation) to use as reference (optional)
         :type reference: Union[str, DSSModelEvaluation, DSSTrainedPredictionModelDetails]
-        :param data_drift_params: data drift computation settings (optional)
+        :param data_drift_params: data drift computation settings as a :class:`dataikuapi.dss.modelevaluationstore.DataDriftParams` (optional)
+        :type data_drift_params: DataDriftParams
         :param wait: data drift computation settings (optional)
-        :returns: a `dict` containing data drift analysis results if `wait` is `True`, or a :class:`~dataikuapi.dss.future.DSSFuture` handle otherwise
+        :returns: a :class:`dataikuapi.dss.modelevaluationstore.DataDriftResult` containing data drift analysis results if `wait` is `True`, or a :class:`~dataikuapi.dss.future.DSSFuture` handle otherwise
         """
 
         if hasattr(reference, 'full_id'):
             reference = reference.full_id
 
+        if data_drift_params:
+            data_drift_params = data_drift_params.data
+
         future_response = self.client._perform_json(
             "POST", "/projects/%s/modelevaluationstores/%s/evaluations/%s/computeDataDrift" % (self.project_key, self.mes_id, self.evaluation_id),
             body={
                 "referenceId": reference,
                 "dataDriftParams": data_drift_params
             })
-        future = DSSFuture(self.client, future_response.get('jobId', None), future_response)
+        future = DSSFuture(self.client, future_response.get('jobId', None), future_response, result_wrapper=DataDriftResult)
         return future.wait_for_result() if wait else future
 
     def get_metrics(self):
@@ -361,39 +365,190 @@ class DSSModelEvaluationFullInfo:
     def __init__(self, model_evaluation, full_info):
         self.model_evaluation = model_evaluation
         self.full_info = full_info
+        self.metrics = self.full_info["metrics"]  # type: dict
+        """The performance and data drift metric, if any."""
+        self.creation_date = self.full_info["evaluation"]["created"]  # type: int
+        """The date and time of the creation of the model evaluation, as an epoch."""
+        self.full_id = self.full_info["evaluation"]["ref"]["fullId"]  # type: str
+        self.model_full_id = self.full_info["evaluation"]["modelRef"]["fullId"]  # type: str
+        self.prediction_type = self.full_info["evaluation"]["predictionType"]  # type: str
+        self.prediction_variable = self.full_info["evaluation"]["predictionVariable"]  # type: str
+        self.target_variable = self.full_info["evaluation"]["targetVariable"]  # type: str
+        self.user_meta = self.full_info["evaluation"]["userMeta"]  # type: dict
+        """The user-accessible metadata (name, labels)
+        Returns the original object, not a copy. Changes to the returned object are persisted to DSS by calling :meth:`save_user_meta`."""
 
     def get_raw(self):
         return self.full_info
 
-    def get_metrics(self):
+    def save_user_meta(self):
+        return self.model_evaluation.client._perform_text(
+                "PUT", "/projects/%s/modelevaluationstores/%s/evaluations/%s/user-meta" %
+                       (self.model_evaluation.project_key, self.model_evaluation.mes_id, self.model_evaluation.evaluation_id), body=self.user_meta)
+
+
+class DataDriftParams(object):
+    """
+    Object that represents parameters for data drift computation.
+    Do not create this object directly, use :meth:`dataikuapi.dss.modelevaluationstore.DataDriftParams.from_params` instead.
+    """
+    def __init__(self, data):
+        self.data = data
+
+    def __repr__(self):
+        return u"{}({})".format(self.__class__.__name__, self.data)
+
+    @staticmethod
+    def from_params(per_column_settings, nb_bins=10, compute_histograms=True, confidence_level=0.95):
         """
-        Get the metrics evaluated, if any.
+        Creates parameters for data drift computation from columns, number of bins, compute histograms and confidence level
+
+        :param dict per_column_settings: A dict representing the per column settings.
+            You should use a :class:`~dataikuapi.dss.modelevaluationstore.PerColumnDriftParamBuilder` to build it.
+        :param int nb_bins: (optional) Number of bins in histograms (applies to all columns) - default: 10
+        :param bool compute_histograms: (optional) Enable/disable histograms - default: True
+        :param float confidence_level: (optional) Used to compute the confidence interval on the drift model's accuracy - default: 0.95
 
-        :return: a dict containing the performance and data drift metric, if any
+        :rtype: :class:`dataikuapi.dss.modelevaluationstore.DataDriftParams`
         """
-        return self.full_info["metrics"]
+        return DataDriftParams({
+            "columns": per_column_settings,
+            "nbBins": nb_bins,
+            "computeHistograms": compute_histograms,
+            "confidenceLevel": confidence_level
+        })
 
-    def get_labels(self):
+
+class PerColumnDriftParamBuilder(object):
+    """
+    Builder for a map of per column drift params settings.
+    Used as a helper before computing data drift to build columns param expected in
+    :meth:`dataikuapi.dss.modelevaluationstore.DataDriftParams.from_params`.
+    """
+    def __init__(self):
+        self.columns = {}
+
+    def build(self):
+        """Returns the built dict for per column drift params settings"""
+        return self.columns
+
+    def with_column_drift_param(self, name, handling="AUTO", enabled=True):
         """
-        Get the labels of the Model Evaluation
+        Sets the drift params settings for given column name.
 
-        :return: a dict containing the labels
+        :param str name: The name of the column
+        :param str handling: (optional) The column type, should be either NUMERICAL, CATEGORICAL or AUTO (default: AUTO)
+        :param bool enabled: (optional) False means the column is ignored in drift computation (default: True)
         """
-        return self.full_info["evaluation"]["labels"]
+        self.columns[name] = {
+            "handling": handling,
+            "enabled": enabled
+        }
+        return self
+
+
+class DataDriftResult(object):
+    """
+    A handle on the data drift result of a model evaluation.
+
+    Do not create this class directly, instead use :meth:`dataikuapi.dss.modelevaluationstore.DSSModelEvaluation.compute_data_drift`
+    """
+    def __init__(self, data):
+        self.data = data
+        self.drift_model_result = DriftModelResult(self.data["driftModelResult"])
+        """Drift analysis based on drift modeling."""
+        self.univariate_drift_result = UnivariateDriftResult(self.data["univariateDriftResult"])
+        """Per-column drift analysis based on pairwise comparison of distributions."""
+        self.per_column_settings = [ColumnSettings(cs) for cs in self.data["perColumnSettings"]]
+        """Information about column handling that has been used (errors, types, etc)."""
 
-    def get_evaluation_parameters(self):
+    def get_raw(self):
+        """
+        :return: the raw data drift result
+        :rtype: dict
         """
-        Get info on the evaluation parameters, most noticeably the evaluation metric (evaluationMetric field
-        of the returned dict)
+        return self.data
+
 
-        :return: a dict
+class DriftModelResult(object):
+    """
+    A handle on the drift model result.
+
+    Do not create this class directly, instead use :attr:`dataikuapi.dss.modelevaluationstore.DataDriftResult.drift_model_result`
+    """
+    def __init__(self, data):
+        self.data = data
+        self.drift_model_accuracy = DriftModelAccuracy(self.data["driftModelAccuracy"])
+        self.feature_drift_importance = self.data["driftVersusImportance"]  # type: dict
+
+    def get_raw(self):
         """
-        return self.full_info["evaluation"]["metricParams"]
+        :return: the raw drift model result
+        :rtype: dict
+        """
+        return self.data
+
+
+class UnivariateDriftResult(object):
+    """
+    A handle on the univariate data drift.
+
+    Do not create this class directly, instead use :attr:`dataikuapi.dss.modelevaluationstore.DataDriftResult.univariate_drift_result`
+    """
+    def __init__(self, data):
+        self.data = data
+        self.per_column_drift_data = self.data["columns"]  # type: dict
+        """Drift data per column, as a dict of column name -> drift data."""
+
+    def get_raw(self):
+        """
+        :return: the raw univariate data drift
+        :rtype: dict
+        """
+        return self.data
+
 
-    def get_creation_date(self):
+class ColumnSettings(object):
+    """
+    A handle on column handling information.
+
+    Do not create this class directly, instead use :attr:`dataikuapi.dss.modelevaluationstore.DataDriftResult.per_column_settings`
+    """
+    def __init__(self, data):
+        self.data = data
+        self.name = self.data["name"]  # type: str
+        self.actual_column_handling = self.data["actualHandling"]  # type: str
+        """The actual column handling (either forced via drift params or inferred from model evaluation preprocessings).
+        It can be any of NUMERICAL, CATEGORICAL, or IGNORED."""
+        self.default_column_handling = self.data["defaultHandling"]  # type: str
+        """The default column handling (based on model evaluation preprocessing only).
+        It can be any of NUMERICAL, CATEGORICAL, or IGNORED."""
+        self.error_message = self.data.get("errorMessage", None)
+
+    def get_raw(self):
+        """
+        :return: the raw column handling information
+        :rtype: dict
         """
-        Return the date and time of the creation of the Model Evaluation
+        return self.data
+
 
-        :return: the date and time, as an epoch
+class DriftModelAccuracy(object):
+    """
+    A handle on the drift model accuracy.
+
+    Do not create this class directly, instead use :attr:`dataikuapi.dss.modelevaluationstore.DriftModelResult.drift_model_accuracy`
+    """
+    def __init__(self, data):
+        self.data = data
+        self.value = self.data["value"]  # type: float
+        self.lower_confidence_interval = self.data["lower"]  # type: float
+        self.upper_confidence_interval = self.data["upper"]  # type: float
+        self.pvalue = self.data["pvalue"]  # type: float
+
+    def get_raw(self):
+        """
+        :return: the drift model accuracy data
+        :rtype: dict
         """
-        return self.full_info["evaluation"]["created"]
+        return self.data