diff --git a/examples/multiregression_trees/multiregression_run.py b/examples/multiregression_trees/multiregression_run.py
new file mode 100644
index 0000000..505d0c2
--- /dev/null
+++ b/examples/multiregression_trees/multiregression_run.py
@@ -0,0 +1,68 @@
+
+"""MicroPython code for doing multi-output regression with emlearn_trees
+"""
+
+import os
+
+import emlearn_trees
+import array
+
+class MultiRegressor():
+    """Convenience wrapper for a collection of tree-based regression models"""
+
+    def __init__(self, max_trees=10, max_nodes=1000, max_leaves=1000):
+        self.models = []
+
+        self.max_trees = max_trees
+        self.max_nodes = max_nodes
+        self.max_leaves = max_leaves
+
+        # temporary buffer for individual model output
+        self._output = array.array('f', [0.0])
+
+    def load(self, path):
+        """Load a directory of model files"""
+
+        for filename in sorted(os.listdir(path)):
+            if not filename.endswith('.csv'):
+                print('Warning: Ignoring unknown file in model directory', filename)
+                continue
+
+            model_path = path + '/' + filename
+
+            # TODO: support reading necessary capacity from file
+            model = emlearn_trees.new(self.max_trees, self.max_nodes, self.max_leaves)
+
+            with open(model_path, 'r') as f:
+                emlearn_trees.load_model(model, f)
+
+            self.models.append(model)
+
+    def predict(self, features : array.array, outputs : array.array):
+        if not self.models: raise ValueError('no models loaded')
+
+        for i, model in enumerate(self.models):
+            model.predict(features, self._output)
+            outputs[i] = self._output[0]
+
+def main():
+
+    # FIXME: read paths from sys.argv
+    model = MultiRegressor(max_nodes=10000)
+    model.load('models')
+
+    outputs = array.array('f', [0.0 for _ in range(len(model.models))])
+
+    import npyfile
+    (n_samples, n_features), data = npyfile.load('input.npy')
+
+    # TODO: write output to a file
+    for row in range(n_samples):
+        offset = row*n_features
+        f = data[offset:offset+n_features]
+        model.predict(f, outputs)
+        print(f, outputs)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/multiregression_trees/multiregression_train.py b/examples/multiregression_trees/multiregression_train.py
new file mode 100644
index 0000000..3bbc180
--- /dev/null
+++ b/examples/multiregression_trees/multiregression_train.py
@@ -0,0 +1,153 @@
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+from sklearn.pipeline import Pipeline
+from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
+import urllib.request
+import zipfile
+import os.path
+import tempfile
+
+from sklearn.multioutput import MultiOutputRegressor
+
+
+def airquality_download(data_dir='data'):
+    """
+    UCI Air Quality dataset
+    https://archive.ics.uci.edu/dataset/360/air+quality
+    """
+
+    data_path = Path(data_dir)
+    data_path.mkdir(exist_ok=True)
+
+    csv_file = data_path / 'AirQualityUCI.csv'
+
+    if not csv_file.exists():
+        url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip"
+        zip_path = data_path / 'AirQualityUCI.zip'
+
+        urllib.request.urlretrieve(url, zip_path)
+
+        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+            zip_ref.extractall(data_path)
+
+        zip_path.unlink()
+
+    return csv_file
+
+
+def airquality_load(csv_file):
+    df = pd.read_csv(csv_file, sep=';', decimal=',')
+
+    # Remove missing values
+    df = df.replace(-200, np.nan)
+    df = df.dropna(axis=1, how='all').dropna()
+
+    df['datetime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%d/%m/%Y %H.%M.%S')
+    df = df.drop(['Date', 'Time'], axis=1)
+
+    target_cols = ['CO(GT)', 'NOx(GT)', 'NO2(GT)', 'NMHC(GT)', 'C6H6(GT)']
+    exclude_cols = target_cols + ['datetime', 'Unnamed: 15', 'Unnamed: 16']
+    feature_cols = [col for col in df.columns if col not in exclude_cols]
+
+    X = df[feature_cols]
+    y = df[target_cols]
+
+    return X, y
+
+
+from emlearn.preprocessing import Quantizer
+import emlearn
+
+def convert_multiregressor(multi, out_dir, format=None, prefix='regressor', **kwargs):
+
+    out_dir = Path(out_dir)
+    out_dir.mkdir(exist_ok=True)
+
+    if format is not None:
+        kwargs['format'] = format
+
+    for i, estimator in enumerate(multi.estimators_):
+        ext = '.h' if format is None else '.'+format
+        p = out_dir / (f'{prefix}{i}' + ext)
+        converted = emlearn.convert(estimator)
+        converted.save(file=p, **kwargs)
+
+
+def predict(data, model_dir):
+    """
+    Make predictions using MicroPython model
+    """
+
+    with tempfile.TemporaryDirectory() as temp_dir:
+        #temp_dir = d.name
+        temp_dir = '' # XXX: temp
+
+        input_path = os.path.join(temp_dir, 'input.npy')
+        output_path = os.path.join(temp_dir, 'output.npy')
+
+        arr = np.ascontiguousarray(data.values).astype(np.int16)
+        assert len(arr.shape) == 2
+        np.save(input_path, arr, allow_pickle=False)
+
+        #subprocess.check_output()  # TODO: invoke the MicroPython runner here to produce output.npy
+
+        out = np.load(output_path)
+        return out
+
+
+def main():
+
+    print('Load dataset...')
+    csv_file = airquality_download()
+    X, y = airquality_load(csv_file)
+
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+    print('Training...')
+    rf = RandomForestRegressor(n_estimators=10, random_state=42, n_jobs=-1)
+    regressor = MultiOutputRegressor(estimator=rf)
+
+    pipeline = Pipeline([
+        ('scaler', Quantizer()), # convert data to int16 range
+        ('regressor', regressor),
+    ])
+    pipeline.fit(X_train, y_train)
+
+    model_dir = 'models/'
+    convert_multiregressor(pipeline.named_steps['regressor'], out_dir=model_dir, format='csv')
+    print('Models exported to:', model_dir)
+
+
+    print("Performance Metrics:")
+    print("-" * 60)
+
+
+    y_pred_orig = pd.DataFrame(pipeline.predict(X_test), columns=y_train.columns)
+
+
+    X_test_scaled = pipeline.named_steps['scaler'].transform(X_test)
+
+    print(X_test.head())
+
+    print(y_pred_orig.head())
+
+    y_pred_converted = pd.DataFrame(predict(X_test_scaled, model_dir), columns=y_train.columns)
+
+    y_pred = y_pred_converted
+
+    for target in y.columns:
+
+        rmse = np.sqrt(mean_squared_error(y_test[target], y_pred[target]))
+        r2 = r2_score(y_test[target], y_pred[target])
+        mape = mean_absolute_percentage_error(y_test[target], y_pred[target]) * 100
+
+        print(f"{target:12} | RMSE: {rmse:8.3f} | MAPE: {mape:6.2f}% | R²: {r2:6.3f}")
+
+if __name__ == '__main__':
+    main()
+
+
diff --git a/src/emlearn_trees/emlearn_trees.py b/src/emlearn_trees/emlearn_trees.py
index f652ec4..f62a61e 100644
--- a/src/emlearn_trees/emlearn_trees.py
+++ b/src/emlearn_trees/emlearn_trees.py
@@ -12,33 +12,49 @@ def load_model(builder, f):
     leaves_found = 0
     n_classes = None
     n_features = None
+    leaf_bits = 0 # default if not specified
 
     for line in f:
         line = line.rstrip('\r')
         line = line.rstrip('\n')
         tok = line.split(',')
         kind = tok[0]
+
+        missing = n_features is None or n_classes is None # leaf_bits always has a default
+
        if kind == 'r':
+            assert not missing, 'missing metadata before roots'
            root = int(tok[1])
            builder.addroot(root)
        elif kind == 'n':
+            assert not missing, 'missing metadata before nodes'
            feature = int(tok[1])
            value = int(float(tok[2]))
            left = int(tok[3])
            right = int(tok[4])
            builder.addnode(left, right, feature, value)
        elif kind == 'l':
-            leaf = int(tok[1])
+            assert not missing, 'missing metadata before leaves'
+            assert len(tok) == 2, len(tok)
+            if leaf_bits == 32:
+                leaf = float(tok[1])
+            else:
+                leaf = int(tok[1])
            builder.addleaf(leaf)
            leaves_found += 1
+        # metadata
        elif kind == 'f':
            n_features = int(tok[1])
        elif kind == 'c':
            n_classes = int(tok[1])
+        elif kind == 'lf':
+            leaf_bits = int(tok[1])
        else:
            # unknown value
            pass
 
-    builder.setdata(n_features, n_classes)
+    if n_features is not None and n_classes is not None:
+        # propagate metadata, including the leaf representation, to the builder
+        builder.setdata(n_features, n_classes, leaf_bits)
 
    #print('load-model', leaves_found)
diff --git a/src/emlearn_trees/trees.c b/src/emlearn_trees/trees.c
index c7a49bd..b57aaa6 100644
--- a/src/emlearn_trees/trees.c
+++ b/src/emlearn_trees/trees.c
@@ -82,7 +82,7 @@ static mp_obj_t builder_new(mp_obj_t trees_obj, mp_obj_t nodes_obj, mp_obj_t lea
     self->trees.n_trees = 0;
     self->trees.tree_roots = roots;
 
-    self->trees.leaf_bits = 0; // XXX: only class supported so far
+    self->trees.leaf_bits = 0; // default to majority voting
 
     self->trees.n_leaves = 0;
     self->trees.leaves = leaves;
@@ -114,17 +114,19 @@ static mp_obj_t builder_del(mp_obj_t trees_obj) {
 
 static MP_DEFINE_CONST_FUN_OBJ_1(builder_del_obj, builder_del);
 
 // set number of features and classes
-static mp_obj_t builder_setdata(mp_obj_t self_obj, mp_obj_t features_obj, mp_obj_t classes_obj) {
+static mp_obj_t builder_setdata(size_t n_args, const mp_obj_t *args) {
 
-    mp_obj_trees_builder_t *o = MP_OBJ_TO_PTR(self_obj);
+    // args: self_obj, features_obj, classes_obj, leaf_bits_obj
+    mp_obj_trees_builder_t *o = MP_OBJ_TO_PTR(args[0]);
     EmlTreesBuilder *self = &o->builder;
 
-    self->trees.n_features = mp_obj_get_int(features_obj);
-    self->trees.n_classes = mp_obj_get_int(classes_obj);
+    self->trees.n_features = mp_obj_get_int(args[1]);
+    self->trees.n_classes = mp_obj_get_int(args[2]);
+    self->trees.leaf_bits = mp_obj_get_int(args[3]);
 
     return MP_OBJ_FROM_PTR(o);
 }
-static MP_DEFINE_CONST_FUN_OBJ_3(builder_setdata_obj, builder_setdata);
+static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(builder_setdata_obj, 4, 4, builder_setdata);
@@ -187,14 +189,22 @@ static mp_obj_t builder_addleaf(mp_obj_t self_obj, mp_obj_t leaf_obj) {
 
     mp_obj_trees_builder_t *o = MP_OBJ_TO_PTR(self_obj);
     EmlTreesBuilder *self = &o->builder;
 
-    mp_int_t leaf_value = mp_obj_get_int(leaf_obj);
-
     if (self->trees.n_leaves >= self->max_leaves) {
         mp_raise_ValueError(MP_ERROR_TEXT("max leaves"));
     }
     const int leaf_index = self->trees.n_leaves++;
-    self->trees.leaves[leaf_index] = (uint8_t)leaf_value;
+
+    if (self->trees.leaf_bits == 0) {
+        // majority voting: leaf is an integer class index
+        mp_int_t leaf_int = mp_obj_get_int(leaf_obj);
+        self->trees.leaves[leaf_index] = (uint8_t)leaf_int;
+    } else if (self->trees.leaf_bits == 32) {
+        float *leaves = (float *)self->trees.leaves; // regression: 32-bit float leaves
+        leaves[leaf_index] = mp_obj_get_float_to_f(leaf_obj);
+    } else {
+        mp_raise_ValueError(MP_ERROR_TEXT("unsupported leaf_bits"));
+    }
 
     return mp_const_none;
 }