Skip to content

Commit 752cb60

Browse files
committed
added option num_fill_null=none
1 parent 1d5c38c commit 752cb60

File tree

3 files changed

+49
-22
lines changed

3 files changed

+49
-22
lines changed

clearbox_preprocessor/preprocessor.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -11,18 +11,18 @@
1111
import warnings
1212
import numpy as np
1313

14-
from .utils.numerical_transformer import NumericalTransformer
15-
from .utils.categorical_transformer import CategoricalTransformer
16-
from .utils.datetime_transformer import DatetimeTransformer
14+
# from .utils.numerical_transformer import NumericalTransformer
15+
# from .utils.categorical_transformer import CategoricalTransformer
16+
# from .utils.datetime_transformer import DatetimeTransformer
1717

1818
# UNCOMMENT FOR DEBUGGING
19-
# from utils.numerical_transformer import NumericalTransformer
20-
# from utils.categorical_transformer import CategoricalTransformer
21-
# from utils.datetime_transformer import DatetimeTransformer
19+
from utils.numerical_transformer import NumericalTransformer
20+
from utils.categorical_transformer import CategoricalTransformer
21+
from utils.datetime_transformer import DatetimeTransformer
2222

2323
class Preprocessor:
2424
ML_TASKS = {"classification", "regression", None}
25-
NUM_FILL_NULL_STRATEGIES = {"interpolate","forward", "backward", "min", "max", "mean", "zero", "one"}
25+
NUM_FILL_NULL_STRATEGIES = {"none", "interpolate","forward", "backward", "min", "max", "mean", "zero", "one"}
2626
SCALING_STRATEGIES = {"none", "normalize", "standardize", "quantile"}
2727
"""
2828
A class for preprocessing datasets based on polars, including feature selection, handling missing values, scaling,
@@ -120,7 +120,7 @@ def __init__(
120120
missing_values_threshold: float = 0.999,
121121
n_bins: int = 0,
122122
scaling: Literal["none", "normalize", "standardize", "quantile"] = "none",
123-
num_fill_null : Literal["interpolate","forward", "backward", "min", "max", "mean", "zero", "one"] = "mean",
123+
num_fill_null : Literal["none", "interpolate","forward", "backward", "min", "max", "mean", "zero", "one"] = "none",
124124
unseen_labels = 'ignore',
125125
ml_task: Literal["classification", "regression", None] = None,
126126
target_column: str = None,
@@ -662,6 +662,7 @@ def get_categorical_features(self) -> Tuple[str]:
662662
# real_data = pd.read_csv(os.path.join(file_path,"dataset.csv"))
663663
# # real_data["income"] = real_data["income"].map({"<=50K": 0, ">50K": 1})
664664

665-
preprocessor = Preprocessor(real_data, num_fill_null=np.nan, scaling='standardize')
665+
preprocessor = Preprocessor(real_data, num_fill_null="none", scaling='quantile')
666666
real_data_preprocessed = preprocessor.transform(real_data)
667-
df_inverse = preprocessor.inverse_transform(real_data_preprocessed)
667+
df_inverse = preprocessor.inverse_transform(real_data_preprocessed)
668+
pass

clearbox_preprocessor/utils/numerical_transformer.py

Lines changed: 37 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -69,16 +69,6 @@ def transform(self, data: pl.DataFrame):
6969
numerical_features = self.numerical_features
7070
num_fill_null = self.num_fill_null
7171

72-
# Fill null values with the specified strategy
73-
col_num = pl.col(numerical_features)
74-
if isinstance(num_fill_null, str):
75-
if num_fill_null == "interpolate":
76-
data = data.with_columns(col_num).interpolate()
77-
else:
78-
data = data.with_columns(col_num.fill_null(strategy=num_fill_null))
79-
else:
80-
data = data.with_columns(col_num.fill_null(num_fill_null))
81-
8272
# Scale numerical features with the specified method
8373
match scaling:
8474
case "none":
@@ -104,6 +94,21 @@ def transform(self, data: pl.DataFrame):
10494
# KBinsDiscretizer applied to numerical features
10595
data = data.with_columns(numerical_features.qcut(self.n_bins, labels=self.n_bins_labels))
10696

97+
# Fill null values with the specified strategy
98+
col_num = pl.col(numerical_features)
99+
if isinstance(num_fill_null, str):
100+
if num_fill_null == "interpolate":
101+
data = data.with_columns(col_num).interpolate()
102+
elif num_fill_null == "none":
103+
if scaling in ["quantile", "normalize"]:
104+
data = data.with_columns(col_num.fill_null(-0.01))
105+
else:
106+
data = data.with_columns(col_num.fill_null(-10))
107+
else:
108+
data = data.with_columns(col_num.fill_null(strategy=num_fill_null))
109+
else:
110+
data = data.with_columns(col_num.fill_null(num_fill_null))
111+
107112
return data
108113

109114

@@ -132,7 +137,28 @@ def inverse_transform(self, data: pl.DataFrame):
132137
If an invalid scaling method is provided.
133138
"""
134139
numerical_features = self.numerical_features
135-
140+
num_fill_null = self.num_fill_null
141+
scaling = self.scaling
142+
143+
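# If num_fill_null is "none", convert values at or below the fill sentinel used by transform (<= -0.01 for "quantile"/"normalize" scaling, <= -10 otherwise) back to null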
144+
if num_fill_null=="none":
145+
if scaling in ["quantile", "normalize"]:
146+
for col in numerical_features:
147+
data = data.with_columns(
148+
pl.when(pl.col(col) <= -0.01)
149+
.then(None)
150+
.otherwise(pl.col(col))
151+
.alias(col)
152+
)
153+
else:
154+
for col in numerical_features:
155+
data = data.with_columns(
156+
pl.when(pl.col(col) <= -10)
157+
.then(None)
158+
.otherwise(pl.col(col))
159+
.alias(col)
160+
)
161+
136162
# Numerical features
137163
match self.scaling:
138164
case "none":

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111

1212
setup(
1313
name="clearbox-preprocessor",
14-
version="0.11.10",
14+
version="0.11.11",
1515
author="Dario Brunelli",
1616
author_email="dario@clearbox.ai",
1717
description="A fast polars based data pre-processor for ML datasets",

0 commit comments

Comments
 (0)