From 8a015d0e5fda5f27404e424a63816a0b355e5cbe Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Thu, 8 Jan 2026 09:48:50 -0600 Subject: [PATCH 1/6] Add TextFeatures transformer for text feature extraction - Add TextFeatures class to extract features from text columns - Support for features: char_count, word_count, digit_count, uppercase_count, etc. - Add comprehensive tests with pytest parametrize - Add user guide documentation --- docs/user_guide/index.rst | 1 + docs/user_guide/text/TextFeatures.rst | 152 ++++++++++++ docs/user_guide/text/index.rst | 18 ++ feature_engine/text/__init__.py | 9 + feature_engine/text/text_features.py | 327 ++++++++++++++++++++++++++ tests/test_text/__init__.py | 1 + tests/test_text/test_text_features.py | 197 ++++++++++++++++ 7 files changed, 705 insertions(+) create mode 100644 docs/user_guide/text/TextFeatures.rst create mode 100644 docs/user_guide/text/index.rst create mode 100644 feature_engine/text/__init__.py create mode 100644 feature_engine/text/text_features.py create mode 100644 tests/test_text/__init__.py create mode 100644 tests/test_text/test_text_features.py diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst index c786e77e1..52c33a8f4 100644 --- a/docs/user_guide/index.rst +++ b/docs/user_guide/index.rst @@ -28,6 +28,7 @@ Creation creation/index datetime/index + text/index Selection diff --git a/docs/user_guide/text/TextFeatures.rst b/docs/user_guide/text/TextFeatures.rst new file mode 100644 index 000000000..a1a82aa44 --- /dev/null +++ b/docs/user_guide/text/TextFeatures.rst @@ -0,0 +1,152 @@ +.. _text_features: + +.. currentmodule:: feature_engine.text + +TextFeatures +============ + +The :class:`TextFeatures()` extracts numerical features from text/string variables. +This transformer is useful for extracting basic text statistics that can be used +as features in machine learning models. 
+ +Unlike scikit-learn's CountVectorizer or TfidfVectorizer which create sparse matrices, +:class:`TextFeatures()` extracts metadata features that remain in DataFrame format +and can be easily combined with other Feature-engine transformers in a pipeline. + +Available Features +~~~~~~~~~~~~~~~~~~ + +The transformer can extract the following features: + +- **char_count**: Number of characters in the text +- **word_count**: Number of words (whitespace-separated tokens) +- **sentence_count**: Number of sentences (based on .!? punctuation) +- **avg_word_length**: Average length of words +- **digit_count**: Number of digit characters +- **uppercase_count**: Number of uppercase letters +- **lowercase_count**: Number of lowercase letters +- **special_char_count**: Number of special characters (non-alphanumeric) +- **whitespace_count**: Number of whitespace characters +- **whitespace_ratio**: Ratio of whitespace to total characters +- **digit_ratio**: Ratio of digits to total characters +- **uppercase_ratio**: Ratio of uppercase to total characters +- **has_digits**: Binary indicator if text contains digits +- **has_uppercase**: Binary indicator if text contains uppercase +- **is_empty**: Binary indicator if text is empty +- **starts_with_uppercase**: Binary indicator if text starts with uppercase +- **ends_with_punctuation**: Binary indicator if text ends with .!? +- **unique_word_count**: Number of unique words (case-insensitive) +- **unique_word_ratio**: Ratio of unique words to total words + +Example +~~~~~~~ + +Let's create a dataframe with text data and extract features: + +.. code:: python + + import pandas as pd + from feature_engine.text import TextFeatures + + # Create sample data + X = pd.DataFrame({ + 'review': [ + 'This product is AMAZING! Best purchase ever.', + 'Not great. Would not recommend.', + 'OK for the price. 3 out of 5 stars.', + 'TERRIBLE!!! 
DO NOT BUY!', + ], + 'title': [ + 'Great Product', + 'Disappointed', + 'Average', + 'Awful', + ] + }) + +Now let's extract specific text features: + +.. code:: python + + # Set up the transformer with specific features + tf = TextFeatures( + variables=['review'], + features=['word_count', 'char_count', 'has_digits', 'uppercase_ratio'] + ) + + # Fit and transform + tf.fit(X) + X_transformed = tf.transform(X) + + print(X_transformed.columns.tolist()) + +Output: + +.. code:: python + + ['review', 'title', 'review_word_count', 'review_char_count', + 'review_has_digits', 'review_uppercase_ratio'] + +Extracting all features +~~~~~~~~~~~~~~~~~~~~~~~ + +By default, if no features are specified, all available features will be extracted: + +.. code:: python + + # Extract all features from all text columns + tf = TextFeatures() + tf.fit(X) + X_transformed = tf.transform(X) + + # This will create 19 new columns for each text variable + print(f"Original columns: {len(X.columns)}") + print(f"Transformed columns: {len(X_transformed.columns)}") + +Dropping original columns +~~~~~~~~~~~~~~~~~~~~~~~~~ + +You can drop the original text columns after extracting features: + +.. code:: python + + tf = TextFeatures( + variables=['review'], + features=['word_count', 'char_count'], + drop_original=True + ) + + tf.fit(X) + X_transformed = tf.transform(X) + + # 'review' column is now removed + print(X_transformed.columns.tolist()) + +Using in a Pipeline +~~~~~~~~~~~~~~~~~~~ + +:class:`TextFeatures()` works seamlessly with scikit-learn pipelines: + +.. 
code:: python + + from sklearn.pipeline import Pipeline + from sklearn.preprocessing import StandardScaler + from sklearn.linear_model import LogisticRegression + + # Create a pipeline + pipe = Pipeline([ + ('text_features', TextFeatures( + variables=['review'], + features=['word_count', 'char_count', 'uppercase_ratio'], + drop_original=True + )), + ('scaler', StandardScaler()), + ('classifier', LogisticRegression()) + ]) + +API Reference +------------- + +.. autoclass:: TextFeatures + :members: + :inherited-members: diff --git a/docs/user_guide/text/index.rst b/docs/user_guide/text/index.rst new file mode 100644 index 000000000..ea23d7362 --- /dev/null +++ b/docs/user_guide/text/index.rst @@ -0,0 +1,18 @@ +.. -*- mode: rst -*- + +Text Feature Extraction +======================= + +Feature-engine's text module includes transformers to extract numerical features +from text/string variables. + +Text feature extraction is useful for machine learning problems where you have +text data but want to derive numerical statistics without creating sparse +bag-of-words or TF-IDF representations. + +**Transformers** + +.. toctree:: + :maxdepth: 1 + + TextFeatures diff --git a/feature_engine/text/__init__.py b/feature_engine/text/__init__.py new file mode 100644 index 000000000..14626b79c --- /dev/null +++ b/feature_engine/text/__init__.py @@ -0,0 +1,9 @@ +""" +The module text includes classes to extract features from text/string variables. 
+""" + +from .text_features import TextFeatures + +__all__ = [ + "TextFeatures", +] diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py new file mode 100644 index 000000000..63e9b0dac --- /dev/null +++ b/feature_engine/text/text_features.py @@ -0,0 +1,327 @@ +# Authors: Ankit Hemant Lade (contributor) +# License: BSD 3 clause + +from typing import List, Optional, Union + +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_is_fitted + +from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin +from feature_engine._check_init_parameters.check_init_input_params import ( + _check_param_drop_original, +) +from feature_engine.dataframe_checks import _check_X_matches_training_df, check_X +from feature_engine.tags import _return_tags + +# Available text features and their computation functions +TEXT_FEATURES = { + "char_count": lambda x: x.str.len(), + "word_count": lambda x: x.str.split().str.len(), + "sentence_count": lambda x: x.str.count(r"[.!?]+"), + "avg_word_length": lambda x: x.apply( + lambda s: sum(len(w) for w in str(s).split()) / max(len(str(s).split()), 1) + ), + "digit_count": lambda x: x.str.count(r"\d"), + "uppercase_count": lambda x: x.str.count(r"[A-Z]"), + "lowercase_count": lambda x: x.str.count(r"[a-z]"), + "special_char_count": lambda x: x.str.count(r"[^a-zA-Z0-9\s]"), + "whitespace_count": lambda x: x.str.count(r"\s"), + "whitespace_ratio": lambda x: x.str.count(r"\s") / x.str.len().replace(0, 1), + "digit_ratio": lambda x: x.str.count(r"\d") / x.str.len().replace(0, 1), + "uppercase_ratio": lambda x: x.str.count(r"[A-Z]") / x.str.len().replace(0, 1), + "has_digits": lambda x: x.str.contains(r"\d", regex=True).astype(int), + "has_uppercase": lambda x: x.str.contains(r"[A-Z]", regex=True).astype(int), + "is_empty": lambda x: (x.str.len() == 0).astype(int), + "starts_with_uppercase": lambda x: 
x.str.match(r"^[A-Z]").astype(int), + "ends_with_punctuation": lambda x: x.str.match(r".*[.!?]$").astype(int), + "unique_word_count": lambda x: x.apply(lambda s: len(set(str(s).lower().split()))), + "unique_word_ratio": lambda x: x.apply( + lambda s: len(set(str(s).lower().split())) / max(len(str(s).split()), 1) + ), +} + + +class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): + """ + TextFeatures() extracts numerical features from text/string variables. This + transformer is useful for extracting basic text statistics that can be used + as features in machine learning models. + + The transformer can extract various text features including character counts, + word counts, sentence counts, and various ratios and indicators. + + A list of variables can be passed as an argument. Alternatively, the transformer + will automatically select and transform all variables of type object (string). + + More details in the :ref:`User Guide <text_features>`. + + Parameters + ---------- + variables: list, default=None + The list of text/string variables to extract features from. If None, the + transformer will automatically select all object (string) columns. + + features: list, default=None + List of text features to extract. Available features are: + + - 'char_count': Number of characters in the text + - 'word_count': Number of words (whitespace-separated tokens) + - 'sentence_count': Number of sentences (based on .!? 
punctuation) + - 'avg_word_length': Average length of words + - 'digit_count': Number of digit characters + - 'uppercase_count': Number of uppercase letters + - 'lowercase_count': Number of lowercase letters + - 'special_char_count': Number of special characters (non-alphanumeric) + - 'whitespace_count': Number of whitespace characters + - 'whitespace_ratio': Ratio of whitespace to total characters + - 'digit_ratio': Ratio of digits to total characters + - 'uppercase_ratio': Ratio of uppercase to total characters + - 'has_digits': Binary indicator if text contains digits + - 'has_uppercase': Binary indicator if text contains uppercase + - 'is_empty': Binary indicator if text is empty + - 'starts_with_uppercase': Binary indicator if text starts with uppercase + - 'ends_with_punctuation': Binary indicator if text ends with .!? + - 'unique_word_count': Number of unique words (case-insensitive) + - 'unique_word_ratio': Ratio of unique words to total words + + If None, extracts all available features. + + drop_original: bool, default=False + Whether to drop the original text columns after transformation. + + Attributes + ---------- + variables_: + The list of text variables that will be transformed. + + features_: + The list of features that will be extracted. + + feature_names_in_: + List with the names of features seen during fit. + + n_features_in_: + The number of features in the train set used in fit. + + Methods + ------- + fit: + This transformer does not learn parameters. It stores the feature names + and validates input. + + fit_transform: + Fit to data, then transform it. + + transform: + Extract text features and add them to the dataframe. + + get_feature_names_out: + Get output feature names for transformation. + + See Also + -------- + feature_engine.encoding.StringSimilarityEncoder : + Encodes categorical variables based on string similarity. 
+ + Examples + -------- + + >>> import pandas as pd + >>> from feature_engine.text import TextFeatures + >>> X = pd.DataFrame({ + ... 'text': ['Hello World!', 'Python is GREAT.', 'ML rocks 123'] + ... }) + >>> tf = TextFeatures(features=['char_count', 'word_count', 'has_digits']) + >>> tf.fit(X) + >>> X = tf.transform(X) + >>> X + text text_char_count text_word_count text_has_digits + 0 Hello World! 12 2 0 + 1 Python is GREAT. 16 3 0 + 2 ML rocks 123 12 3 1 + """ + + def __init__( + self, + variables: Union[None, str, List[str]] = None, + features: Union[None, List[str]] = None, + drop_original: bool = False, + ) -> None: + + # Validate variables + if variables is not None: + if isinstance(variables, str): + variables = [variables] + elif not isinstance(variables, list) or not all( + isinstance(v, str) for v in variables + ): + raise ValueError( + "variables must be None, a string, or a list of strings. " + f"Got {type(variables).__name__} instead." + ) + + # Validate features + if features is not None: + if not isinstance(features, list) or not all( + isinstance(f, str) for f in features + ): + raise ValueError( + "features must be None or a list of strings. " + f"Got {type(features).__name__} instead." + ) + invalid_features = set(features) - set(TEXT_FEATURES.keys()) + if invalid_features: + raise ValueError( + f"Invalid features: {invalid_features}. " + f"Available features are: {list(TEXT_FEATURES.keys())}" + ) + + _check_param_drop_original(drop_original) + + self.variables = variables + self.features = features + self.drop_original = drop_original + + def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): + """ + This transformer does not learn parameters. + + Stores feature names and validates that the specified variables are + present and are of string/object type. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The training input samples. + + y: pandas Series, or np.array. Defaults to None. 
+ It is not needed in this transformer. You can pass y or None. + + Returns + ------- + self: TextFeatures + The fitted transformer. + """ + + # check input dataframe + X = check_X(X) + + # Find or validate text variables + if self.variables is None: + # Select object/string columns + self.variables_ = [col for col in X.columns if X[col].dtype == "object"] + if len(self.variables_) == 0: + raise ValueError( + "No object/string columns found in the dataframe. " + "Please specify variables explicitly." + ) + else: + # Validate user-specified variables exist + missing = set(self.variables) - set(X.columns) + if missing: + raise ValueError( + f"Variables {missing} are not present in the dataframe." + ) + self.variables_ = self.variables + + # Set features to extract + if self.features is None: + self.features_ = list(TEXT_FEATURES.keys()) + else: + self.features_ = self.features + + # save input features + self.feature_names_in_ = X.columns.tolist() + + # save train set shape + self.n_features_in_ = X.shape[1] + + return self + + def transform(self, X: pd.DataFrame) -> pd.DataFrame: + """ + Extract text features and add them to the dataframe. + + Parameters + ---------- + X: pandas dataframe of shape = [n_samples, n_features] + The data to transform. + + Returns + ------- + X_new: Pandas dataframe + The dataframe with the original columns plus the new text features. + """ + + # Check method fit has been called + check_is_fitted(self) + + # check that input is a dataframe + X = check_X(X) + + # Check if input data contains same number of columns as dataframe used to fit. 
+ _check_X_matches_training_df(X, self.n_features_in_) + + # reorder variables to match train set + X = X[self.feature_names_in_] + + # Extract features for each text variable + for var in self.variables_: + # Fill NaN with empty string for feature extraction + text_col = X[var].fillna("") + + for feature_name in self.features_: + new_col_name = f"{var}_{feature_name}" + feature_func = TEXT_FEATURES[feature_name] + X[new_col_name] = feature_func(text_col) + + # Fill any NaN values resulting from computation with 0 + X[new_col_name] = X[new_col_name].fillna(0) + + if self.drop_original: + X = X.drop(columns=self.variables_) + + return X + + def get_feature_names_out(self, input_features=None) -> List[str]: + """ + Get output feature names for transformation. + + Parameters + ---------- + input_features : array-like of str or None, default=None + Input features. If None, uses ``feature_names_in_``. + + Returns + ------- + feature_names_out : list of str + Output feature names. + """ + check_is_fitted(self) + + # Start with original features + if self.drop_original: + feature_names = [ + f for f in self.feature_names_in_ if f not in self.variables_ + ] + else: + feature_names = list(self.feature_names_in_) + + # Add new text feature names + for var in self.variables_: + for feature_name in self.features_: + feature_names.append(f"{var}_{feature_name}") + + return feature_names + + def _more_tags(self): + tags_dict = _return_tags() + tags_dict["allow_nan"] = True + tags_dict["variables"] = "categorical" + return tags_dict + + def __sklearn_tags__(self): + tags = super().__sklearn_tags__() + tags.input_tags.allow_nan = True + return tags diff --git a/tests/test_text/__init__.py b/tests/test_text/__init__.py new file mode 100644 index 000000000..64a0339b6 --- /dev/null +++ b/tests/test_text/__init__.py @@ -0,0 +1 @@ +# Tests for text module diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py new file mode 100644 index 
000000000..315049373 --- /dev/null +++ b/tests/test_text/test_text_features.py @@ -0,0 +1,197 @@ +import pandas as pd +import pytest + +from feature_engine.text import TextFeatures + + +@pytest.fixture +def df_text(): + """Fixture providing sample text data.""" + return pd.DataFrame({"text": ["Hello World!", "Python 123", "AI"]}) + + +@pytest.fixture +def df_multi_text(): + """Fixture providing DataFrame with multiple text columns.""" + return pd.DataFrame({ + "text1": ["Hello", "World"], + "text2": ["Foo", "Bar"], + "numeric": [1, 2], + }) + + +def test_default_all_features(df_text): + """Test extracting all features with default parameters. + + Expected values for ["Hello World!", "Python 123", "AI"]: + - char_count: [12, 10, 2] + - word_count: [2, 2, 1] + - digit_count: [0, 3, 0] + """ + transformer = TextFeatures() + X_tr = transformer.fit_transform(df_text.copy()) + + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" in X_tr.columns + assert X_tr["text_char_count"].tolist() == [12, 10, 2] + assert X_tr["text_word_count"].tolist() == [2, 2, 1] + assert X_tr["text_digit_count"].tolist() == [0, 3, 0] + + +def test_specific_features(): + """Test extracting specific features only.""" + X = pd.DataFrame({"text": ["Hello", "World"]}) + transformer = TextFeatures(features=["char_count", "word_count"]) + X_tr = transformer.fit_transform(X) + + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" not in X_tr.columns + assert "text_uppercase_count" not in X_tr.columns + + +def test_specific_variables(df_multi_text): + """Test extracting features from specific variables only.""" + transformer = TextFeatures(variables=["text1"], features=["char_count"]) + X_tr = transformer.fit_transform(df_multi_text.copy()) + + assert "text1_char_count" in X_tr.columns + assert "text2_char_count" not in X_tr.columns + + +def test_drop_original(): + """Test 
drop_original parameter removes text columns.""" + X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + X_tr = transformer.fit_transform(X) + + assert "text" not in X_tr.columns + assert "text_char_count" in X_tr.columns + assert "other" in X_tr.columns + + +def test_empty_string_handling(): + """Test handling of empty strings.""" + X = pd.DataFrame({"text": ["", "Hello", ""]}) + transformer = TextFeatures(features=["char_count", "word_count", "is_empty"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_char_count"].tolist() == [0, 5, 0] + assert X_tr["text_is_empty"].tolist() == [1, 0, 1] + + +def test_nan_handling(): + """Test handling of NaN values.""" + X = pd.DataFrame({"text": ["Hello", None, "World"]}) + transformer = TextFeatures(features=["char_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_char_count"].tolist() == [5, 0, 5] + + +def test_uppercase_features(): + """Test uppercase-related features.""" + X = pd.DataFrame({"text": ["HELLO", "hello", "HeLLo"]}) + transformer = TextFeatures( + features=["uppercase_count", "has_uppercase", "starts_with_uppercase"] + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_uppercase_count"].tolist() == [5, 0, 3] + assert X_tr["text_has_uppercase"].tolist() == [1, 0, 1] + assert X_tr["text_starts_with_uppercase"].tolist() == [1, 0, 1] + + +def test_sentence_count(): + """Test sentence counting.""" + X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! 
C."]}) + transformer = TextFeatures(features=["sentence_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_sentence_count"].tolist() == [2, 0, 3] + + +def test_unique_word_features(): + """Test unique word features.""" + X = pd.DataFrame({"text": ["the the the", "a b c", "x"]}) + transformer = TextFeatures(features=["unique_word_count", "unique_word_ratio"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1] + assert X_tr["text_unique_word_ratio"].tolist() == [1 / 3, 1.0, 1.0] + + +@pytest.mark.parametrize("invalid_feature", ["invalid_feature", "not_a_feature"]) +def test_invalid_feature_raises_error(invalid_feature): + """Test that invalid feature names raise ValueError.""" + with pytest.raises(ValueError, match="Invalid features"): + TextFeatures(features=[invalid_feature]) + + +def test_non_string_feature_raises_error(): + """Test that non-string feature raises ValueError.""" + with pytest.raises(ValueError, match="features must be None or a list of strings"): + TextFeatures(features=[123]) + + +@pytest.mark.parametrize("invalid_variables", [123, 0.5, {"a": 1}]) +def test_invalid_variables_raises_error(invalid_variables): + """Test that invalid variables parameter raises ValueError.""" + with pytest.raises(ValueError, match="variables must be"): + TextFeatures(variables=invalid_variables) + + +def test_missing_variable_raises_error(): + """Test that missing variable raises ValueError on fit.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures(variables=["nonexistent"]) + with pytest.raises(ValueError, match="not present in the dataframe"): + transformer.fit(X) + + +def test_no_text_columns_raises_error(): + """Test that no text columns raises error when variables=None.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = TextFeatures() + with pytest.raises(ValueError, match="No object/string columns found"): + transformer.fit(X) + + +def 
test_fit_stores_attributes(): + """Test that fit stores expected attributes with correct values.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures() + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "features_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + assert transformer.variables_ == ["text"] + assert transformer.n_features_in_ == 1 + + +def test_get_feature_names_out(): + """Test get_feature_names_out returns correct feature names.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count", "word_count"]) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names + assert "text_word_count" in feature_names + + +def test_get_feature_names_out_with_drop(): + """Test get_feature_names_out with drop_original=True.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" not in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names From d2d655fea803786218f65dee28448b271f9c4708 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Mon, 12 Jan 2026 05:59:36 -0500 Subject: [PATCH 2/6] Address PR review comments for TextFeatures transformer - Optimize avg_word_length using vectorized char_count / word_count - Simplify unique_word_count using x.str.lower().str.split().apply(set).str.len() - Rename unique_word_ratio to lexical_diversity (word_count / unique_word_count) - Use _check_variables_input_value for variable validation - Use find_categorical_variables for automatic variable selection - Remove redundant docstring text - Add comprehensive test assertions with expected 
values --- feature_engine/text/text_features.py | 34 ++++++++++++--------------- tests/test_text/test_text_features.py | 25 +++++++++++++++++--- 2 files changed, 37 insertions(+), 22 deletions(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 63e9b0dac..9696d628f 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -11,17 +11,19 @@ from feature_engine._check_init_parameters.check_init_input_params import ( _check_param_drop_original, ) +from feature_engine._check_init_parameters.check_variables import ( + _check_variables_input_value, +) from feature_engine.dataframe_checks import _check_X_matches_training_df, check_X from feature_engine.tags import _return_tags +from feature_engine.variable_handling import find_categorical_variables # Available text features and their computation functions TEXT_FEATURES = { "char_count": lambda x: x.str.len(), "word_count": lambda x: x.str.split().str.len(), "sentence_count": lambda x: x.str.count(r"[.!?]+"), - "avg_word_length": lambda x: x.apply( - lambda s: sum(len(w) for w in str(s).split()) / max(len(str(s).split()), 1) - ), + "avg_word_length": lambda x: x.str.len() / x.str.split().str.len().replace(0, 1), "digit_count": lambda x: x.str.count(r"\d"), "uppercase_count": lambda x: x.str.count(r"[A-Z]"), "lowercase_count": lambda x: x.str.count(r"[a-z]"), @@ -35,9 +37,10 @@ "is_empty": lambda x: (x.str.len() == 0).astype(int), "starts_with_uppercase": lambda x: x.str.match(r"^[A-Z]").astype(int), "ends_with_punctuation": lambda x: x.str.match(r".*[.!?]$").astype(int), - "unique_word_count": lambda x: x.apply(lambda s: len(set(str(s).lower().split()))), - "unique_word_ratio": lambda x: x.apply( - lambda s: len(set(str(s).lower().split())) / max(len(str(s).split()), 1) + "unique_word_count": lambda x: x.str.lower().str.split().apply(set).str.len(), + "lexical_diversity": lambda x: ( + x.str.split().str.len() + / 
x.str.lower().str.split().apply(set).str.len().replace(0, 1) ), } @@ -83,9 +86,9 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): - 'starts_with_uppercase': Binary indicator if text starts with uppercase - 'ends_with_punctuation': Binary indicator if text ends with .!? - 'unique_word_count': Number of unique words (case-insensitive) - - 'unique_word_ratio': Ratio of unique words to total words + - 'lexical_diversity': Ratio of total words to unique words - If None, extracts all available features. + If None, extracts all features. drop_original: bool, default=False Whether to drop the original text columns after transformation. @@ -179,14 +182,12 @@ def __init__( _check_param_drop_original(drop_original) - self.variables = variables + self.variables = _check_variables_input_value(variables) self.features = features self.drop_original = drop_original def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): """ - This transformer does not learn parameters. - Stores feature names and validates that the specified variables are present and are of string/object type. @@ -209,13 +210,8 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): # Find or validate text variables if self.variables is None: - # Select object/string columns - self.variables_ = [col for col in X.columns if X[col].dtype == "object"] - if len(self.variables_) == 0: - raise ValueError( - "No object/string columns found in the dataframe. " - "Please specify variables explicitly." - ) + # Select object/string columns using existing utility + self.variables_ = find_categorical_variables(X) else: # Validate user-specified variables exist missing = set(self.variables) - set(X.columns) @@ -223,7 +219,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): raise ValueError( f"Variables {missing} are not present in the dataframe." 
) - self.variables_ = self.variables + self.variables_ = list(self.variables) if isinstance(self.variables, (list, tuple)) else [self.variables] # Set features to extract if self.features is None: diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index 315049373..b62328730 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -27,16 +27,25 @@ def test_default_all_features(df_text): - char_count: [12, 10, 2] - word_count: [2, 2, 1] - digit_count: [0, 3, 0] + - uppercase_count: [2, 1, 2] + - has_digits: [0, 1, 0] """ transformer = TextFeatures() X_tr = transformer.fit_transform(df_text.copy()) + # Verify all expected features are present assert "text_char_count" in X_tr.columns assert "text_word_count" in X_tr.columns assert "text_digit_count" in X_tr.columns + assert "text_uppercase_count" in X_tr.columns + assert "text_lexical_diversity" in X_tr.columns + + # Verify expected values assert X_tr["text_char_count"].tolist() == [12, 10, 2] assert X_tr["text_word_count"].tolist() == [2, 2, 1] assert X_tr["text_digit_count"].tolist() == [0, 3, 0] + assert X_tr["text_uppercase_count"].tolist() == [2, 1, 2] + assert X_tr["text_has_digits"].tolist() == [0, 1, 0] def test_specific_features(): @@ -45,20 +54,29 @@ def test_specific_features(): transformer = TextFeatures(features=["char_count", "word_count"]) X_tr = transformer.fit_transform(X) + # Verify only specified features are present assert "text_char_count" in X_tr.columns assert "text_word_count" in X_tr.columns assert "text_digit_count" not in X_tr.columns assert "text_uppercase_count" not in X_tr.columns + # Verify expected values + assert X_tr["text_char_count"].tolist() == [5, 5] + assert X_tr["text_word_count"].tolist() == [1, 1] + def test_specific_variables(df_multi_text): """Test extracting features from specific variables only.""" transformer = TextFeatures(variables=["text1"], features=["char_count"]) X_tr = 
transformer.fit_transform(df_multi_text.copy()) + # Verify only specified variable has features extracted assert "text1_char_count" in X_tr.columns assert "text2_char_count" not in X_tr.columns + # Verify expected values + assert X_tr["text1_char_count"].tolist() == [5, 5] + def test_drop_original(): """Test drop_original parameter removes text columns.""" @@ -115,11 +133,12 @@ def test_sentence_count(): def test_unique_word_features(): """Test unique word features.""" X = pd.DataFrame({"text": ["the the the", "a b c", "x"]}) - transformer = TextFeatures(features=["unique_word_count", "unique_word_ratio"]) + transformer = TextFeatures(features=["unique_word_count", "lexical_diversity"]) X_tr = transformer.fit_transform(X) assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1] - assert X_tr["text_unique_word_ratio"].tolist() == [1 / 3, 1.0, 1.0] + # lexical_diversity = word_count / unique_word_count + assert X_tr["text_lexical_diversity"].tolist() == [3.0, 1.0, 1.0] @pytest.mark.parametrize("invalid_feature", ["invalid_feature", "not_a_feature"]) @@ -154,7 +173,7 @@ def test_no_text_columns_raises_error(): """Test that no text columns raises error when variables=None.""" X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) transformer = TextFeatures() - with pytest.raises(ValueError, match="No object/string columns found"): + with pytest.raises(TypeError, match="No categorical variables found"): transformer.fit(X) From 7c30958765c3398a3e1696e2a683ce224985af69 Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Mon, 12 Jan 2026 06:11:11 -0500 Subject: [PATCH 3/6] Fix style issues and update docs for lexical_diversity --- docs/user_guide/text/TextFeatures.rst | 2 +- feature_engine/text/text_features.py | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/docs/user_guide/text/TextFeatures.rst b/docs/user_guide/text/TextFeatures.rst index a1a82aa44..009a220f9 100644 --- a/docs/user_guide/text/TextFeatures.rst +++ 
b/docs/user_guide/text/TextFeatures.rst @@ -36,7 +36,7 @@ The transformer can extract the following features: - **starts_with_uppercase**: Binary indicator if text starts with uppercase - **ends_with_punctuation**: Binary indicator if text ends with .!? - **unique_word_count**: Number of unique words (case-insensitive) -- **unique_word_ratio**: Ratio of unique words to total words +- **lexical_diversity**: Ratio of total words to unique words Example ~~~~~~~ diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 9696d628f..e6d50aac9 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -147,8 +147,8 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): def __init__( self, - variables: Union[None, str, List[str]] = None, - features: Union[None, List[str]] = None, + variables: Optional[Union[str, List[str]]] = None, + features: Optional[List[str]] = None, drop_original: bool = False, ) -> None: @@ -219,7 +219,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): raise ValueError( f"Variables {missing} are not present in the dataframe." 
) - self.variables_ = list(self.variables) if isinstance(self.variables, (list, tuple)) else [self.variables] + if isinstance(self.variables, (list, tuple)): + self.variables_ = list(self.variables) + else: + self.variables_ = [self.variables] # Set features to extract if self.features is None: From 2df54470041bfba47a046f17cba248d3686730ee Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Mon, 12 Jan 2026 06:13:55 -0500 Subject: [PATCH 4/6] Add comprehensive assertions for all 19 features in test_default_all_features --- tests/test_text/test_text_features.py | 49 +++++++++++++++++++-------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index b62328730..07075fb3c 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -23,29 +23,50 @@ def df_multi_text(): def test_default_all_features(df_text): """Test extracting all features with default parameters. - Expected values for ["Hello World!", "Python 123", "AI"]: - - char_count: [12, 10, 2] - - word_count: [2, 2, 1] - - digit_count: [0, 3, 0] - - uppercase_count: [2, 1, 2] - - has_digits: [0, 1, 0] + Test data: ["Hello World!", "Python 123", "AI"] + Verifies all 19 text features produce expected values. 
""" transformer = TextFeatures() X_tr = transformer.fit_transform(df_text.copy()) - # Verify all expected features are present - assert "text_char_count" in X_tr.columns - assert "text_word_count" in X_tr.columns - assert "text_digit_count" in X_tr.columns - assert "text_uppercase_count" in X_tr.columns - assert "text_lexical_diversity" in X_tr.columns - - # Verify expected values + # Verify all 19 features have expected values + # Basic counts assert X_tr["text_char_count"].tolist() == [12, 10, 2] assert X_tr["text_word_count"].tolist() == [2, 2, 1] + assert X_tr["text_sentence_count"].tolist() == [1, 0, 0] + assert X_tr["text_avg_word_length"].tolist() == [6.0, 5.0, 2.0] + + # Character type counts assert X_tr["text_digit_count"].tolist() == [0, 3, 0] assert X_tr["text_uppercase_count"].tolist() == [2, 1, 2] + assert X_tr["text_lowercase_count"].tolist() == [8, 5, 0] + assert X_tr["text_special_char_count"].tolist() == [1, 0, 0] + assert X_tr["text_whitespace_count"].tolist() == [1, 1, 0] + + # Ratios (using pytest.approx for floating point comparison) + import pytest + assert X_tr["text_whitespace_ratio"].tolist() == pytest.approx( + [1 / 12, 1 / 10, 0.0], rel=1e-5 + ) + assert X_tr["text_digit_ratio"].tolist() == pytest.approx( + [0.0, 3 / 10, 0.0], rel=1e-5 + ) + assert X_tr["text_uppercase_ratio"].tolist() == pytest.approx( + [2 / 12, 1 / 10, 1.0], rel=1e-5 + ) + + # Binary indicators assert X_tr["text_has_digits"].tolist() == [0, 1, 0] + assert X_tr["text_has_uppercase"].tolist() == [1, 1, 1] + assert X_tr["text_is_empty"].tolist() == [0, 0, 0] + assert X_tr["text_starts_with_uppercase"].tolist() == [1, 1, 1] + assert X_tr["text_ends_with_punctuation"].tolist() == [1, 0, 0] + + # Unique word features + assert X_tr["text_unique_word_count"].tolist() == [2, 2, 1] + assert X_tr["text_lexical_diversity"].tolist() == pytest.approx( + [1.0, 1.0, 1.0], rel=1e-5 + ) def test_specific_features(): From 1af41daa4ebe30f3a2c0ec9e9666c13f4da6303a Mon Sep 17 00:00:00 2001 
From: ankitlade12 Date: Mon, 12 Jan 2026 06:18:50 -0500 Subject: [PATCH 5/6] Use _check_variables_input_value for variable validation --- feature_engine/text/text_features.py | 12 ------------ tests/test_text/test_text_features.py | 4 ++-- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index e6d50aac9..9d84c6155 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -152,18 +152,6 @@ def __init__( drop_original: bool = False, ) -> None: - # Validate variables - if variables is not None: - if isinstance(variables, str): - variables = [variables] - elif not isinstance(variables, list) or not all( - isinstance(v, str) for v in variables - ): - raise ValueError( - "variables must be None, a string, or a list of strings. " - f"Got {type(variables).__name__} instead." - ) - # Validate features if features is not None: if not isinstance(features, list) or not all( diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py index 07075fb3c..5da1054c8 100644 --- a/tests/test_text/test_text_features.py +++ b/tests/test_text/test_text_features.py @@ -175,10 +175,10 @@ def test_non_string_feature_raises_error(): TextFeatures(features=[123]) -@pytest.mark.parametrize("invalid_variables", [123, 0.5, {"a": 1}]) +@pytest.mark.parametrize("invalid_variables", [0.5, {"a": 1}]) def test_invalid_variables_raises_error(invalid_variables): """Test that invalid variables parameter raises ValueError.""" - with pytest.raises(ValueError, match="variables must be"): + with pytest.raises(ValueError, match="variables"): TextFeatures(variables=invalid_variables) From 63f68b9a3798152787ac7f149a31dae18501132d Mon Sep 17 00:00:00 2001 From: ankitlade12 Date: Mon, 12 Jan 2026 06:24:57 -0500 Subject: [PATCH 6/6] Fix mypy type error: update variables type hint to match utility function --- feature_engine/text/text_features.py | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py index 9d84c6155..b4cac844f 100644 --- a/feature_engine/text/text_features.py +++ b/feature_engine/text/text_features.py @@ -147,7 +147,7 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): def __init__( self, - variables: Optional[Union[str, List[str]]] = None, + variables: Optional[Union[int, str, List[Union[str, int]]]] = None, features: Optional[List[str]] = None, drop_original: bool = False, ) -> None: