Add incremental algorithm benchmarks (incremental.json config, partial_fit batching, CI example runs)

md-shafiul-alam · md-shafiul-alam · commit 35282a0fe995 · 2024-10-04T08:19:04.000-07:00
diff --git a/configs/incremental.json b/configs/incremental.json
@@ -0,0 +1,99 @@
+{
+    "PARAMETERS_SETS": {
+        "common": {"bench": {"n_runs": 10, "time_limit": 60}},
+        "covariance data": {
+            "data": [
+                {
+                    "source": "make_blobs",
+                    "generation_kwargs": {
+                        "centers": 1,
+                        "n_samples": 1000,
+                        "n_features": [16, 64]
+                    },
+                    "split_kwargs": {"ignore": true}
+                }
+            ]
+        },
+        "basic_statistics data": {
+            "data": {
+                "source": "make_blobs",
+                "generation_kwargs": {
+                    "centers": 1,
+                    "n_samples": 10000,
+                    "n_features": [16, 64]
+                },
+                "split_kwargs": {"ignore": true}
+            }
+        },
+        "linear_regression data": {
+            "data": {
+                "source": "make_regression",
+                "split_kwargs": {"train_size": 0.2, "test_size": 0.8},
+                "generation_kwargs": {
+                    "n_samples": 5000,
+                    "n_features": [40, 100],
+                    "n_informative": 5,
+                    "noise": 2.0
+                }
+            }
+        },
+        "pca data": {
+            "data": {
+                "source": "make_blobs",
+                "generation_kwargs": {
+                    "centers": 1,
+                    "n_samples": 1000,
+                    "n_features": [16, 64]
+                },
+                "split_kwargs": {"ignore": true}
+            }
+        },
+        "covariance": {
+            "algorithm": [
+                {
+                    "estimator": "IncrementalEmpiricalCovariance",
+                    "library": "sklearnex.covariance",
+                    "estimator_methods": {"training": "partial_fit"},
+                    "num_batches": {"training": 2}
+                }
+            ]
+        },
+        "basic_statistics": {
+            "algorithm": [
+                {
+                    "estimator": "IncrementalBasicStatistics",
+                    "library": "sklearnex.basic_statistics",
+                    "num_batches": {"training": 2}
+                }
+            ]
+        },
+        "linear_regression": {
+            "algorithm": [
+                {
+                    "estimator": "IncrementalLinearRegression",
+                    "library": "sklearnex.linear_model",
+                    "num_batches": {"training": 2}
+                }
+            ]
+        },
+        "pca": {
+            "algorithm": [
+                {
+                    "estimator": "IncrementalPCA",
+                    "library": "sklearnex.preview.decomposition",
+                    "num_batches": {"training": 2}
+                }
+            ]
+        }
+    },
+    "TEMPLATES": {
+        "basic_statistics": {
+            "SETS": ["common", "basic_statistics", "basic_statistics data"]
+        },
+        "covariance": {"SETS": ["common", "covariance", "covariance data"]},
+        "linear_regression": {
+            "SETS": ["common", "linear_regression", "linear_regression data"]
+        },
+        "pca": {"SETS": ["common", "pca", "pca data"]}
+    }
+}
diff --git a/configs/spmd/large_scale/kmeans.json b/configs/spmd/large_scale/kmeans.json
@@ -12,7 +12,7 @@
 	},
 	"synthetic data": {
                 "data": [
-                        { "source": "make_blobs", "generation_kwargs": { "n_samples": 3750000,  "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },
+                    { "source": "make_blobs", "generation_kwargs": { "n_samples": 3750000,  "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },
 	                { "source": "make_blobs", "generation_kwargs": { "n_samples": 18750,  "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }
                 ]
         }
diff --git a/configs/spmd/large_scale/logreg.json b/configs/spmd/large_scale/logreg.json
@@ -10,9 +10,9 @@
 	},
         "synthetic data": {
             "data": [
 		        { "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 },    "generation_kwargs": {  "n_samples": 5001000, "n_features": 10, "n_classes": 2 } },
                 { "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 },    "generation_kwargs": {  "n_samples": 101000, "n_features": 1000, "n_classes": 2 } },
                 { "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 },    "generation_kwargs": {  "n_samples": 501000, "n_features": 200, "n_classes":2, "n_clusters_per_class": 3, "flip_y":0.05 } }
             ]
         }
     },
@@ -22,8 +22,8 @@
                 "sklearnex spmd implementation",
                 "large scale 2k parameters",
                 "spmd logreg parameters",
 		        "synthetic data",
 		        "spmd logreg2 parameters"
             ]
         }
     }
diff --git a/sklbench/benchmarks/sklearn_estimator.py b/sklbench/benchmarks/sklearn_estimator.py
@@ -74,7 +74,7 @@ def get_estimator(library_name: str, estimator_name: str):
 def get_estimator_methods(bench_case: BenchCase) -> Dict[str, List[str]]:
     # default estimator methods
     estimator_methods = {
-        "training": ["fit"],
+        "training": ["partial_fit", "fit"],
         "inference": ["predict", "predict_proba", "transform"],
     }
     for stage in estimator_methods.keys():
@@ -337,34 +337,43 @@ def verify_patching(stream: io.StringIO, function_name) -> bool:
     return acceleration_lines > 0 and fallback_lines == 0
 
 
-def create_online_function(method_instance, data_args, batch_size):
-    n_batches = data_args[0].shape[0] // batch_size
+def create_online_function(
+    estimator_instance, method_instance, data_args, num_batches, batch_size
+):
 
     if "y" in list(inspect.signature(method_instance).parameters):
 
         def ndarray_function(x, y):
-            for i in range(n_batches):
+            for i in range(num_batches):
                 method_instance(
                     x[i * batch_size : (i + 1) * batch_size],
                     y[i * batch_size : (i + 1) * batch_size],
                 )
+            if hasattr(estimator_instance, "_onedal_finalize_fit"):
+                estimator_instance._onedal_finalize_fit()
 
         def dataframe_function(x, y):
-            for i in range(n_batches):
+            for i in range(num_batches):
                 method_instance(
                     x.iloc[i * batch_size : (i + 1) * batch_size],
                     y.iloc[i * batch_size : (i + 1) * batch_size],
                 )
+            if hasattr(estimator_instance, "_onedal_finalize_fit"):
+                estimator_instance._onedal_finalize_fit()
 
     else:
 
         def ndarray_function(x):
-            for i in range(n_batches):
+            for i in range(num_batches):
                 method_instance(x[i * batch_size : (i + 1) * batch_size])
+            if hasattr(estimator_instance, "_onedal_finalize_fit"):
+                estimator_instance._onedal_finalize_fit()
 
         def dataframe_function(x):
-            for i in range(n_batches):
+            for i in range(num_batches):
                 method_instance(x.iloc[i * batch_size : (i + 1) * batch_size])
+            if hasattr(estimator_instance, "_onedal_finalize_fit"):
+                estimator_instance._onedal_finalize_fit()
 
     if "ndarray" in str(type(data_args[0])):
         return ndarray_function
@@ -417,12 +426,28 @@ def measure_sklearn_estimator(
                         data_args = (x_train,)
                     else:
                         data_args = (x_test,)
-                batch_size = get_bench_case_value(
-                    bench_case, f"algorithm:batch_size:{stage}"
-                )
-                if batch_size is not None:
+
+                if method == "partial_fit":
+                    num_batches = get_bench_case_value(
+                        bench_case, f"algorithm:num_batches:{stage}"
+                    )
+                    batch_size = get_bench_case_value(
+                        bench_case, f"algorithm:batch_size:{stage}"
+                    )
+                    n_rows = data_args[0].shape[0]  # rows fed to partial_fit
+                    if batch_size is None:
+                        if num_batches is None:
+                            num_batches = 5  # default when nothing is configured
+                        batch_size = (n_rows + num_batches - 1) // num_batches
+                    if num_batches is None:
+                        num_batches = (n_rows + batch_size - 1) // batch_size
+
                     method_instance = create_online_function(
-                        method_instance, data_args, batch_size
+                        estimator_instance,
+                        method_instance,
+                        data_args,
+                        num_batches,
+                        batch_size,
                     )
                 # daal4py model builders enabling branch
                 if enable_modelbuilders and stage == "inference":
@@ -440,10 +465,6 @@ def measure_sklearn_estimator(
                     metrics[method]["box filter mean[ms]"],
                     metrics[method]["box filter std[ms]"],
                 ) = measure_case(bench_case, method_instance, *data_args)
-                if batch_size is not None:
-                    metrics[method]["throughput[samples/ms]"] = (
-                        (data_args[0].shape[0] // batch_size) * batch_size
-                    ) / metrics[method]["time[ms]"]
                 if ensure_sklearnex_patching:
                     full_method_name = f"{estimator_class.__name__}.{method}"
                     sklearnex_logging_stream.seek(0)
diff --git a/sklbench/report/implementation.py b/sklbench/report/implementation.py
@@ -16,7 +16,7 @@
 
 import argparse
 import json
-from typing import Dict, List
+from typing import Dict, Hashable, List
 
 import openpyxl as xl
 import pandas as pd
@@ -32,6 +32,9 @@
 METRICS = {
     "lower is better": [
         "time[ms]",
+        "first iter[ms]",
+        "box filter mean[ms]",
+        "box filter std[ms]",
         "iterations",
         # classification
         "logloss",
@@ -239,6 +242,7 @@ def get_result_tables_as_df(
     bench_cases = pd.DataFrame(
         [flatten_dict(bench_case) for bench_case in results["bench_cases"]]
     )
+    bench_cases = bench_cases.map(lambda x: str(x) if not isinstance(x, Hashable) else x)
 
     if compatibility_mode:
         bench_cases = transform_results_to_compatible(bench_cases)
@@ -248,7 +252,7 @@ def get_result_tables_as_df(
             bench_cases.drop(columns=[column], inplace=True)
             diffby_columns.remove(column)
 
-    return split_df_by_columns(bench_cases, splitby_columns)
+    return split_df_by_columns(bench_cases, splitby_columns, False)
 
 
 def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
@@ -258,7 +262,10 @@ def get_summary_from_df(df: pd.DataFrame, df_name: str) -> pd.DataFrame:
             # only relative improvements are included in summary currently
             if len(column) > 1 and column[1] == f"{metric_name} relative improvement":
                 metric_columns.append(column)
-    summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
+    if metric_columns:
+        summary = df[metric_columns].aggregate(geomean_wrapper, axis=0).to_frame().T
+    else:
+        summary = pd.DataFrame()
     summary.index = pd.Index([df_name])
     return summary
 
diff --git a/test-configuration-linux.yml b/test-configuration-linux.yml
@@ -45,6 +45,11 @@ steps:
       conda activate bench-env
       python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json
     displayName: Sklearn example run
+  - script: |
+      source /usr/share/miniconda/etc/profile.d/conda.sh
+      conda activate bench-env
+      python -m sklbench --report -l DEBUG --report -c configs/incremental.json
+    displayName: Incremental algorithms example run
   - script: |
       source /usr/share/miniconda/etc/profile.d/conda.sh
       conda activate bench-env
diff --git a/test-configuration-win.yml b/test-configuration-win.yml
@@ -43,6 +43,10 @@ steps:
       call activate bench-env
       python -m sklbench --report -l DEBUG --report -c configs/sklearn_example.json
     displayName: Sklearn example run
+  - script: |
+      call activate bench-env
+      python -m sklbench --report -l DEBUG --report -c configs/incremental.json
+    displayName: Incremental algorithms example run
   - script: |
       call activate bench-env
       python -m sklbench --report -l DEBUG --report -c configs/xgboost_example.json

Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,7 @@`
`12`	`12`	`},`
`13`	`13`	`"synthetic data": {`
`14`	`14`	`"data": [`
`15`		`- { "source": "make_blobs", "generation_kwargs": { "n_samples": 3750000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },`
	`15`	`+ { "source": "make_blobs", "generation_kwargs": { "n_samples": 3750000, "n_features": 10, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } },`
`16`	`16`	`{ "source": "make_blobs", "generation_kwargs": { "n_samples": 18750, "n_features": 1000, "centers": 10 }, "algorithm": { "n_clusters": 10, "max_iter": 10 } }`
`17`	`17`	`]`
`18`	`18`	`}`
Original file line number	Diff line number	Diff line change
`@@ -10,9 +10,9 @@`
`10`	`10`	`},`
`11`	`11`	`"synthetic data": {`
`12`	`12`	`"data": [`
`13`	`13`	`{ "source": "make_classification", "split_kwargs": { "train_size": 5000000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 5001000, "n_features": 10, "n_classes": 2 } },`
`14`	`14`	`{ "source": "make_classification", "split_kwargs": { "train_size": 100000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 101000, "n_features": 1000, "n_classes": 2 } },`
`15`	`15`	`{ "source": "make_classification", "split_kwargs": { "train_size": 500000, "test_size": 1000 }, "generation_kwargs": { "n_samples": 501000, "n_features": 200, "n_classes":2, "n_clusters_per_class": 3, "flip_y":0.05 } }`
`16`	`16`	`]`
`17`	`17`	`}`
`18`	`18`	`},`
`@@ -22,8 +22,8 @@`
`22`	`22`	`"sklearnex spmd implementation",`
`23`	`23`	`"large scale 2k parameters",`
`24`	`24`	`"spmd logreg parameters",`
`25`	`25`	`"synthetic data",`
`26`	`26`	`"spmd logreg2 parameters"`
`27`	`27`	`]`
`28`	`28`	`}`
`29`	`29`	`}`