diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst
index c786e77e1..52c33a8f4 100644
--- a/docs/user_guide/index.rst
+++ b/docs/user_guide/index.rst
@@ -28,6 +28,7 @@ Creation
 
     creation/index
     datetime/index
+    text/index
 
 Selection
 
diff --git a/docs/user_guide/text/TextFeatures.rst b/docs/user_guide/text/TextFeatures.rst
new file mode 100644
index 000000000..009a220f9
--- /dev/null
+++ b/docs/user_guide/text/TextFeatures.rst
@@ -0,0 +1,152 @@
+.. _text_features:
+
+.. currentmodule:: feature_engine.text
+
+TextFeatures
+============
+
+:class:`TextFeatures()` extracts numerical features from text/string variables.
+This transformer is useful for extracting basic text statistics that can be used
+as features in machine learning models.
+
+Unlike scikit-learn's CountVectorizer or TfidfVectorizer, which create sparse
+matrices, :class:`TextFeatures()` extracts metadata features that remain in
+DataFrame format and can be easily combined with other Feature-engine
+transformers in a pipeline.
+
+Available Features
+~~~~~~~~~~~~~~~~~~
+
+The transformer can extract the following features:
+
+- **char_count**: Number of characters in the text
+- **word_count**: Number of words (whitespace-separated tokens)
+- **sentence_count**: Number of sentences (based on .!? punctuation)
+- **avg_word_length**: Average length of words
+- **digit_count**: Number of digit characters
+- **uppercase_count**: Number of uppercase letters
+- **lowercase_count**: Number of lowercase letters
+- **special_char_count**: Number of special characters (non-alphanumeric)
+- **whitespace_count**: Number of whitespace characters
+- **whitespace_ratio**: Ratio of whitespace to total characters
+- **digit_ratio**: Ratio of digits to total characters
+- **uppercase_ratio**: Ratio of uppercase to total characters
+- **has_digits**: Binary indicator if text contains digits
+- **has_uppercase**: Binary indicator if text contains uppercase
+- **is_empty**: Binary indicator if text is empty
+- **starts_with_uppercase**: Binary indicator if text starts with uppercase
+- **ends_with_punctuation**: Binary indicator if text ends with .!?
+- **unique_word_count**: Number of unique words (case-insensitive)
+- **lexical_diversity**: Ratio of total words to unique words
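+
+Each of these statistics is computed with vectorized pandas string methods. As a
+rough illustration (a simplified sketch, not the transformer's internal code), a
+few of them can be obtained on a plain pandas Series like this:
+
+.. code:: python
+
+    import pandas as pd
+
+    text = pd.Series(["Hello World!", "Python 123"])
+
+    char_count = text.str.len()
+    word_count = text.str.split().str.len()
+    uppercase_ratio = text.str.count(r"[A-Z]") / text.str.len().replace(0, 1)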
+
+Example
+~~~~~~~
+
+Let's create a dataframe with text data and extract features:
+
+.. code:: python
+
+    import pandas as pd
+    from feature_engine.text import TextFeatures
+
+    # Create sample data
+    X = pd.DataFrame({
+        'review': [
+            'This product is AMAZING! Best purchase ever.',
+            'Not great. Would not recommend.',
+            'OK for the price. 3 out of 5 stars.',
+            'TERRIBLE!!! DO NOT BUY!',
+        ],
+        'title': [
+            'Great Product',
+            'Disappointed',
+            'Average',
+            'Awful',
+        ]
+    })
+
+Now let's extract specific text features:
+
+.. code:: python
+
+    # Set up the transformer with specific features
+    tf = TextFeatures(
+        variables=['review'],
+        features=['word_count', 'char_count', 'has_digits', 'uppercase_ratio']
+    )
+
+    # Fit and transform
+    tf.fit(X)
+    X_transformed = tf.transform(X)
+
+    print(X_transformed.columns.tolist())
+
+Output:
+
+.. code:: python
+
+    ['review', 'title', 'review_word_count', 'review_char_count',
+     'review_has_digits', 'review_uppercase_ratio']
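+
+The same names, in the same order, are returned by ``get_feature_names_out()``,
+which is convenient when selecting columns further down a pipeline. With the
+transformer fitted above:
+
+.. code:: python
+
+    print(tf.get_feature_names_out())
+
+.. code:: python
+
+    ['review', 'title', 'review_word_count', 'review_char_count',
+     'review_has_digits', 'review_uppercase_ratio']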
+""" + +from .text_features import TextFeatures + +__all__ = [ + "TextFeatures", +] diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py new file mode 100644 index 000000000..b4cac844f --- /dev/null +++ b/feature_engine/text/text_features.py @@ -0,0 +1,314 @@ +# Authors: Ankit Hemant Lade (contributor) +# License: BSD 3 clause + +from typing import List, Optional, Union + +import pandas as pd +from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_is_fitted + +from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin +from feature_engine._check_init_parameters.check_init_input_params import ( + _check_param_drop_original, +) +from feature_engine._check_init_parameters.check_variables import ( + _check_variables_input_value, +) +from feature_engine.dataframe_checks import _check_X_matches_training_df, check_X +from feature_engine.tags import _return_tags +from feature_engine.variable_handling import find_categorical_variables + +# Available text features and their computation functions +TEXT_FEATURES = { + "char_count": lambda x: x.str.len(), + "word_count": lambda x: x.str.split().str.len(), + "sentence_count": lambda x: x.str.count(r"[.!?]+"), + "avg_word_length": lambda x: x.str.len() / x.str.split().str.len().replace(0, 1), + "digit_count": lambda x: x.str.count(r"\d"), + "uppercase_count": lambda x: x.str.count(r"[A-Z]"), + "lowercase_count": lambda x: x.str.count(r"[a-z]"), + "special_char_count": lambda x: x.str.count(r"[^a-zA-Z0-9\s]"), + "whitespace_count": lambda x: x.str.count(r"\s"), + "whitespace_ratio": lambda x: x.str.count(r"\s") / x.str.len().replace(0, 1), + "digit_ratio": lambda x: x.str.count(r"\d") / x.str.len().replace(0, 1), + "uppercase_ratio": lambda x: x.str.count(r"[A-Z]") / x.str.len().replace(0, 1), + "has_digits": lambda x: x.str.contains(r"\d", regex=True).astype(int), + "has_uppercase": lambda x: x.str.contains(r"[A-Z]", regex=True).astype(int), + "is_empty": lambda x: (x.str.len() == 0).astype(int), + "starts_with_uppercase": lambda x: x.str.match(r"^[A-Z]").astype(int), + "ends_with_punctuation": lambda x: x.str.match(r".*[.!?]$").astype(int), + "unique_word_count": lambda x: x.str.lower().str.split().apply(set).str.len(), + "lexical_diversity": lambda x: ( + x.str.split().str.len() + / x.str.lower().str.split().apply(set).str.len().replace(0, 1) + ), +} + + +class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin): + """ + TextFeatures() extracts numerical features from text/string variables. This + transformer is useful for extracting basic text statistics that can be used + as features in machine learning models. + + The transformer can extract various text features including character counts, + word counts, sentence counts, and various ratios and indicators. + + A list of variables can be passed as an argument. Alternatively, the transformer + will automatically select and transform all variables of type object (string). + + More details in the :ref:`User Guide `. + + Parameters + ---------- + variables: list, default=None + The list of text/string variables to extract features from. If None, the + transformer will automatically select all object (string) columns. + + features: list, default=None + List of text features to extract. Available features are: + + - 'char_count': Number of characters in the text + - 'word_count': Number of words (whitespace-separated tokens) + - 'sentence_count': Number of sentences (based on .!? 
+
+Dropping original columns
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can drop the original text columns after extracting features:
+
+.. code:: python
+
+    tf = TextFeatures(
+        variables=['review'],
+        features=['word_count', 'char_count'],
+        drop_original=True
+    )
+
+    tf.fit(X)
+    X_transformed = tf.transform(X)
+
+    # 'review' column is now removed
+    print(X_transformed.columns.tolist())
"word_count"]) + X_tr = transformer.fit_transform(X) + + # Verify only specified features are present + assert "text_char_count" in X_tr.columns + assert "text_word_count" in X_tr.columns + assert "text_digit_count" not in X_tr.columns + assert "text_uppercase_count" not in X_tr.columns + + # Verify expected values + assert X_tr["text_char_count"].tolist() == [5, 5] + assert X_tr["text_word_count"].tolist() == [1, 1] + + +def test_specific_variables(df_multi_text): + """Test extracting features from specific variables only.""" + transformer = TextFeatures(variables=["text1"], features=["char_count"]) + X_tr = transformer.fit_transform(df_multi_text.copy()) + + # Verify only specified variable has features extracted + assert "text1_char_count" in X_tr.columns + assert "text2_char_count" not in X_tr.columns + + # Verify expected values + assert X_tr["text1_char_count"].tolist() == [5, 5] + + +def test_drop_original(): + """Test drop_original parameter removes text columns.""" + X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + X_tr = transformer.fit_transform(X) + + assert "text" not in X_tr.columns + assert "text_char_count" in X_tr.columns + assert "other" in X_tr.columns + + +def test_empty_string_handling(): + """Test handling of empty strings.""" + X = pd.DataFrame({"text": ["", "Hello", ""]}) + transformer = TextFeatures(features=["char_count", "word_count", "is_empty"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_char_count"].tolist() == [0, 5, 0] + assert X_tr["text_is_empty"].tolist() == [1, 0, 1] + + +def test_nan_handling(): + """Test handling of NaN values.""" + X = pd.DataFrame({"text": ["Hello", None, "World"]}) + transformer = TextFeatures(features=["char_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_char_count"].tolist() == [5, 0, 5] + + +def test_uppercase_features(): + """Test uppercase-related features.""" + X = pd.DataFrame({"text": ["HELLO", "hello", "HeLLo"]}) + transformer = TextFeatures( + features=["uppercase_count", "has_uppercase", "starts_with_uppercase"] + ) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_uppercase_count"].tolist() == [5, 0, 3] + assert X_tr["text_has_uppercase"].tolist() == [1, 0, 1] + assert X_tr["text_starts_with_uppercase"].tolist() == [1, 0, 1] + + +def test_sentence_count(): + """Test sentence counting.""" + X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! 
C."]}) + transformer = TextFeatures(features=["sentence_count"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_sentence_count"].tolist() == [2, 0, 3] + + +def test_unique_word_features(): + """Test unique word features.""" + X = pd.DataFrame({"text": ["the the the", "a b c", "x"]}) + transformer = TextFeatures(features=["unique_word_count", "lexical_diversity"]) + X_tr = transformer.fit_transform(X) + + assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1] + # lexical_diversity = word_count / unique_word_count + assert X_tr["text_lexical_diversity"].tolist() == [3.0, 1.0, 1.0] + + +@pytest.mark.parametrize("invalid_feature", ["invalid_feature", "not_a_feature"]) +def test_invalid_feature_raises_error(invalid_feature): + """Test that invalid feature names raise ValueError.""" + with pytest.raises(ValueError, match="Invalid features"): + TextFeatures(features=[invalid_feature]) + + +def test_non_string_feature_raises_error(): + """Test that non-string feature raises ValueError.""" + with pytest.raises(ValueError, match="features must be None or a list of strings"): + TextFeatures(features=[123]) + + +@pytest.mark.parametrize("invalid_variables", [0.5, {"a": 1}]) +def test_invalid_variables_raises_error(invalid_variables): + """Test that invalid variables parameter raises ValueError.""" + with pytest.raises(ValueError, match="variables"): + TextFeatures(variables=invalid_variables) + + +def test_missing_variable_raises_error(): + """Test that missing variable raises ValueError on fit.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures(variables=["nonexistent"]) + with pytest.raises(ValueError, match="not present in the dataframe"): + transformer.fit(X) + + +def test_no_text_columns_raises_error(): + """Test that no text columns raises error when variables=None.""" + X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]}) + transformer = TextFeatures() + with pytest.raises(TypeError, match="No categorical variables found"): + transformer.fit(X) + + +def test_fit_stores_attributes(): + """Test that fit stores expected attributes with correct values.""" + X = pd.DataFrame({"text": ["Hello"]}) + transformer = TextFeatures() + transformer.fit(X) + + assert hasattr(transformer, "variables_") + assert hasattr(transformer, "features_") + assert hasattr(transformer, "feature_names_in_") + assert hasattr(transformer, "n_features_in_") + assert transformer.variables_ == ["text"] + assert transformer.n_features_in_ == 1 + + +def test_get_feature_names_out(): + """Test get_feature_names_out returns correct feature names.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count", "word_count"]) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names + assert "text_word_count" in feature_names + + +def test_get_feature_names_out_with_drop(): + """Test get_feature_names_out with drop_original=True.""" + X = pd.DataFrame({"text": ["Hello"], "other": [1]}) + transformer = TextFeatures(features=["char_count"], drop_original=True) + transformer.fit(X) + + feature_names = transformer.get_feature_names_out() + assert "text" not in feature_names + assert "other" in feature_names + assert "text_char_count" in feature_names