Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions examples/multiregression_trees/multiregression_run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@

"""MicroPython code for doing multi-output regression with emlearn_trees
"""

import os

import emlearn_trees
import array

class MultiRegressor():
    """Convenience wrapper for a collection of tree-based regression models.

    Every ``.csv`` file in a model directory is loaded as one single-output
    model. ``predict`` runs all loaded models on the same feature vector and
    stores one value per model in a caller-provided output buffer.
    """

    def __init__(self, max_trees=10, max_nodes=1000, max_leaves=1000):
        """Create an empty collection.

        ``max_trees``, ``max_nodes`` and ``max_leaves`` are the capacity
        limits handed to ``emlearn_trees.new`` for every model that is loaded.
        """
        self.models = []

        self.max_trees = max_trees
        self.max_nodes = max_nodes
        self.max_leaves = max_leaves

        # temporary buffer for individual model output
        self._output = array.array('f', [0.0])

    @staticmethod
    def _natural_key(filename):
        """Return a sort key that orders digit runs by their numeric value.

        Plain string sorting places 'regressor10.csv' before 'regressor2.csv',
        which would shuffle the outputs as soon as there are more than ten
        models. Each chunk is tagged with its kind so that numbers are only
        ever compared with numbers, and text with text.
        """
        key = []
        digits = ''
        text = ''
        for ch in filename:
            if '0' <= ch <= '9':
                if text:
                    key.append((1, 0, text))
                    text = ''
                digits += ch
            else:
                if digits:
                    key.append((0, int(digits), ''))
                    digits = ''
                text += ch
        if digits:
            key.append((0, int(digits), ''))
        if text:
            key.append((1, 0, text))
        return tuple(key)

    def load(self, path):
        """Load a directory of model files.

        Files are appended to ``self.models`` in natural (numeric-aware)
        filename order, so model index i corresponds to file number i.
        Files that do not end in '.csv' are skipped with a warning.
        """

        for filename in sorted(os.listdir(path), key=self._natural_key):
            if not filename.endswith('.csv'):
                print('Warning: Ignoring unknown file in model directory', filename)
                continue

            model_path = path + '/' + filename

            # TODO: support reading necessary capacity from file
            model = emlearn_trees.new(self.max_trees, self.max_nodes, self.max_leaves)

            with open(model_path, 'r') as f:
                emlearn_trees.load_model(model, f)

            self.models.append(model)

    def predict(self, features : array.array, outputs : array.array):
        """Run every loaded model on ``features``.

        The result of model i is written to ``outputs[i]``, so ``outputs``
        needs room for one value per loaded model. Nothing is returned.
        Raises AssertionError when no models have been loaded.
        """
        assert len(self.models), 'no models'

        for i, model in enumerate(self.models):
            # each model writes its single value into the shared scratch buffer
            model.predict(features, self._output)
            outputs[i] = self._output[0]

def main():
    """Run all models from 'models' on every row of 'input.npy' and print the results."""

    # FIXME: read paths from sys.argv
    regressor = MultiRegressor(max_nodes=10000)
    regressor.load('models')

    # one float slot per loaded model
    outputs = array.array('f', [0.0] * len(regressor.models))

    import npyfile
    shape, data = npyfile.load('input.npy')
    n_samples, n_features = shape

    # TODO: write output to a file
    start = 0
    for _ in range(n_samples):
        # data is flat; each row occupies n_features consecutive items
        stop = start + n_features
        features = data[start:stop]
        regressor.predict(features, outputs)
        print(features, outputs)
        start = stop


if __name__ == '__main__':
    main()
153 changes: 153 additions & 0 deletions examples/multiregression_trees/multiregression_train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
import urllib.request
import zipfile
import os.path
import tempfile

from sklearn.multioutput import MultiOutputRegressor


def airquality_download(data_dir='data'):
    """
    Fetch the UCI Air Quality dataset unless it is already present.

    https://archive.ics.uci.edu/dataset/360/air+quality

    The archive is downloaded into ``data_dir``, extracted there and then
    removed again. When 'AirQualityUCI.csv' already exists in ``data_dir``
    no download happens.

    Returns the path of the CSV file as a ``pathlib.Path``.
    Raises FileNotFoundError if the CSV is missing after extraction.
    Download and archive errors propagate unchanged.
    """

    data_path = Path(data_dir)
    # parents=True so that nested locations such as 'cache/data' also work
    data_path.mkdir(parents=True, exist_ok=True)

    csv_file = data_path / 'AirQualityUCI.csv'
    if csv_file.exists():
        return csv_file

    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00360/AirQualityUCI.zip"
    zip_path = data_path / 'AirQualityUCI.zip'

    try:
        urllib.request.urlretrieve(url, zip_path)

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(data_path)
    finally:
        # never leave a (possibly partial) archive behind, also on failure
        if zip_path.exists():
            zip_path.unlink()

    if not csv_file.exists():
        # fail here with a clear message instead of later inside the CSV reader
        raise FileNotFoundError(f'{csv_file} not found after extracting {url}')

    return csv_file


def airquality_load(csv_file):
    """Read the Air Quality CSV and split it into features and targets.

    The file is parsed with ';' as separator and ',' as decimal mark.
    Values of -200 are treated as missing: columns that are entirely
    missing are dropped, then every row with any missing value.

    Returns a tuple ``(X, y)`` of DataFrames, where ``y`` holds the five
    '(GT)' target columns and ``X`` all remaining measurement columns.
    """
    target_cols = ['CO(GT)', 'NOx(GT)', 'NO2(GT)', 'NMHC(GT)', 'C6H6(GT)']

    raw = pd.read_csv(csv_file, sep=';', decimal=',')

    # Remove missing values
    cleaned = raw.replace(-200, np.nan)
    cleaned = cleaned.dropna(axis=1, how='all')
    cleaned = cleaned.dropna()

    # combine the separate date and time columns into one timestamp
    timestamps = cleaned['Date'] + ' ' + cleaned['Time']
    cleaned['datetime'] = pd.to_datetime(timestamps, format='%d/%m/%Y %H.%M.%S')
    cleaned = cleaned.drop(['Date', 'Time'], axis=1)

    # everything that is neither a target nor bookkeeping is a feature
    excluded = set(target_cols) | {'datetime', 'Unnamed: 15', 'Unnamed: 16'}
    feature_cols = [name for name in cleaned.columns if name not in excluded]

    return cleaned[feature_cols], cleaned[target_cols]


from emlearn.preprocessing import Quantizer
import emlearn

def convert_multiregressor(multi, out_dir, format=None, prefix='regressor', **kwargs):
    """Convert every sub-estimator of a multi-output model and save each to its own file.

    multi: fitted object exposing ``estimators_`` (one estimator per output).
    out_dir: directory for the output files; created, including missing
        parent directories, when it does not exist.
    format: optional output format. Used as the file extension and forwarded
        to ``save`` as ``format``. When None the extension is '.h' and no
        format is forwarded.
    prefix: file name prefix; files are named ``{prefix}{index}{ext}``.
    kwargs: forwarded unchanged to ``save`` of the converted model.
    """

    out_dir = Path(out_dir)
    # parents=True so that nested output locations work as well
    out_dir.mkdir(parents=True, exist_ok=True)

    if format is not None:
        kwargs['format'] = format

    # the extension does not depend on the estimator, compute it once
    ext = '.h' if format is None else '.' + format

    for i, estimator in enumerate(multi.estimators_):
        p = out_dir / f'{prefix}{i}{ext}'
        converted = emlearn.convert(estimator)
        converted.save(file=p, **kwargs)


def predict(data, model_dir):
    """
    Make predictions using MicroPython model

    NOTE: work in progress. The MicroPython program is not launched yet, so
    'output.npy' must already exist in the current working directory when
    this is called. ``model_dir`` is accepted but not used yet.

    data: 2-dimensional feature table (DataFrame or array). It is converted
        to int16 and written to 'input.npy' in the current working directory.

    Returns the array loaded from 'output.npy'.
    Raises AssertionError if ``data`` is not 2-dimensional.
    """

    with tempfile.TemporaryDirectory() as temp_dir:
        # XXX: temp. The scratch directory is not used yet, files are
        # exchanged through the current working directory instead
        temp_dir = ''

        input_path = os.path.join(temp_dir, 'input.npy')
        output_path = os.path.join(temp_dir, 'output.npy')

        # np.asarray accepts both DataFrames and plain arrays
        arr = np.ascontiguousarray(np.asarray(data)).astype(np.int16)
        assert len(arr.shape) == 2
        np.save(input_path, arr, allow_pickle=False)

        # TODO: run the MicroPython program here, e.g. with subprocess.check_output()

        out = np.load(output_path)
        return out


def main():
    """Train a multi-output random forest on the air quality data, export it and report metrics.

    The per-target models are exported as CSV files to 'models/'. The
    reported metrics are computed from the predictions returned by
    ``predict`` for the quantized test features.
    """

    print('Load dataset...')
    X, y = airquality_load(airquality_download())

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    print('Training...')
    forest = RandomForestRegressor(n_estimators=10, random_state=42, n_jobs=-1)
    steps = [
        ('scaler', Quantizer()), # convert data to int16 range
        ('regressor', MultiOutputRegressor(estimator=forest)),
    ]
    pipeline = Pipeline(steps)
    pipeline.fit(X_train, y_train)

    model_dir = 'models/'
    convert_multiregressor(pipeline.named_steps['regressor'], out_dir=model_dir, format='csv')
    print('Models exported to:', model_dir)

    print("Performance Metrics:")
    print("-" * 60)

    target_names = y_train.columns

    # reference predictions from the original scikit-learn pipeline
    y_pred_orig = pd.DataFrame(pipeline.predict(X_test), columns=target_names)

    # the exported models take the quantized features as input
    X_test_scaled = pipeline.named_steps['scaler'].transform(X_test)

    print(X_test.head())
    print(y_pred_orig.head())

    y_pred = pd.DataFrame(predict(X_test_scaled, model_dir), columns=target_names)

    for target in y.columns:
        expected = y_test[target]
        predicted = y_pred[target]

        rmse = np.sqrt(mean_squared_error(expected, predicted))
        r2 = r2_score(expected, predicted)
        mape = mean_absolute_percentage_error(expected, predicted) * 100

        print(f"{target:12} | RMSE: {rmse:8.3f} | MAPE: {mape:6.2f}% | R²: {r2:6.3f}")


if __name__ == '__main__':
    main()


20 changes: 18 additions & 2 deletions src/emlearn_trees/emlearn_trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,33 +12,49 @@ def load_model(builder, f):
leaves_found = 0
n_classes = None
n_features = None
leaf_bits = 0 # default if not specified

for line in f:
line = line.rstrip('\r')
line = line.rstrip('\n')
tok = line.split(',')
kind = tok[0]

missing = leaf_bits is None or n_features is None or n_classes is None

if kind == 'r':
assert not missing, 'missing metadata before roots'
root = int(tok[1])
builder.addroot(root)
elif kind == 'n':
assert not missing, 'missing metadata before nodes'
feature = int(tok[1])
value = int(float(tok[2]))
left = int(tok[3])
right = int(tok[4])
builder.addnode(left, right, feature, value)
elif kind == 'l':
leaf = int(tok[1])
assert not missing, 'missing metadata before leaves'
assert len(tok) == 2, len(tok)
if leaf_bits == 32:
leaf = float(tok[1])
else:
leaf = int(tok[1])
builder.addleaf(leaf)
leaves_found += 1
# metadata
elif kind == 'f':
n_features = int(tok[1])
elif kind == 'c':
n_classes = int(tok[1])
elif kind == 'lf':
leaf_bits = int(tok[1])
else:
# unknown value
pass

builder.setdata(n_features, n_classes)
if not missing:
# FIXME: pass leaf_bits
builder.setdata(n_features, n_classes, leaf_bits)

#print('load-model', leaves_found)
28 changes: 19 additions & 9 deletions src/emlearn_trees/trees.c
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ static mp_obj_t builder_new(mp_obj_t trees_obj, mp_obj_t nodes_obj, mp_obj_t lea
self->trees.n_trees = 0;
self->trees.tree_roots = roots;

self->trees.leaf_bits = 0; // XXX: only class supported so far
self->trees.leaf_bits = 0; // default to majority voting
self->trees.n_leaves = 0;
self->trees.leaves = leaves;

Expand Down Expand Up @@ -114,17 +114,19 @@ static mp_obj_t builder_del(mp_obj_t trees_obj) {
static MP_DEFINE_CONST_FUN_OBJ_1(builder_del_obj, builder_del);

// set number of features and classes
static mp_obj_t builder_setdata(mp_obj_t self_obj, mp_obj_t features_obj, mp_obj_t classes_obj) {
static mp_obj_t builder_setdata(size_t n_args, const mp_obj_t *args) {

mp_obj_trees_builder_t *o = MP_OBJ_TO_PTR(self_obj);
//mp_obj_t self_obj, mp_obj_t features_obj, mp_obj_t classes_obj, mp_obj_t leaf_bits_obj
mp_obj_trees_builder_t *o = MP_OBJ_TO_PTR(args[0]);
EmlTreesBuilder *self = &o->builder;

self->trees.n_features = mp_obj_get_int(features_obj);
self->trees.n_classes = mp_obj_get_int(classes_obj);
self->trees.n_features = mp_obj_get_int(args[1]);
self->trees.n_classes = mp_obj_get_int(args[2]);
self->trees.leaf_bits = mp_obj_get_int(args[3]);

return MP_OBJ_FROM_PTR(o);
}
static MP_DEFINE_CONST_FUN_OBJ_3(builder_setdata_obj, builder_setdata);
static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(builder_setdata_obj, 4, 4, builder_setdata);


// Add a node to the tree
Expand Down Expand Up @@ -187,14 +189,22 @@ static mp_obj_t builder_addleaf(mp_obj_t self_obj, mp_obj_t leaf_obj) {
mp_obj_trees_builder_t *o = MP_OBJ_TO_PTR(self_obj);
EmlTreesBuilder *self = &o->builder;

mp_int_t leaf_value = mp_obj_get_int(leaf_obj);

if (self->trees.n_leaves >= self->max_leaves) {
mp_raise_ValueError(MP_ERROR_TEXT("max leaves"));
}

const int leaf_index = self->trees.n_leaves++;
self->trees.leaves[leaf_index] = (uint8_t)leaf_value;

if (self->trees.leaf_bits == 0) {
// majority voting, leaf should be a single integer (class index)
//mp_float_t leaf_value = mp_obj_get_float(leaf_obj);
mp_int_t leaf_int = mp_obj_get_int(leaf_obj);
self->trees.leaves[leaf_index] = (uint8_t)leaf_int;
} else if (self->trees.leaf_bits == 32) {
//const mp_float_t leaf_value = mp_obj_get_float(leaf_obj);
float *leaves = (float *)self->trees.leaves;
leaves[leaf_index] = mp_obj_get_float_to_f(leaf_obj);
}

return mp_const_none;
}
Expand Down
Loading