From 8a015d0e5fda5f27404e424a63816a0b355e5cbe Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Thu, 8 Jan 2026 09:48:50 -0600
Subject: [PATCH 1/6] Add TextFeatures transformer for text feature extraction

- Add TextFeatures class to extract features from text columns
- Support for features: char_count, word_count, digit_count, uppercase_count, etc.
- Add comprehensive tests with pytest parametrize
- Add user guide documentation
---
 docs/user_guide/index.rst             |   1 +
 docs/user_guide/text/TextFeatures.rst | 152 ++++++++++++
 docs/user_guide/text/index.rst        |  18 ++
 feature_engine/text/__init__.py       |   9 +
 feature_engine/text/text_features.py  | 327 ++++++++++++++++++++++++++
 tests/test_text/__init__.py           |   1 +
 tests/test_text/test_text_features.py | 197 ++++++++++++++++
 7 files changed, 705 insertions(+)
 create mode 100644 docs/user_guide/text/TextFeatures.rst
 create mode 100644 docs/user_guide/text/index.rst
 create mode 100644 feature_engine/text/__init__.py
 create mode 100644 feature_engine/text/text_features.py
 create mode 100644 tests/test_text/__init__.py
 create mode 100644 tests/test_text/test_text_features.py

diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst
index c786e77e1..52c33a8f4 100644
--- a/docs/user_guide/index.rst
+++ b/docs/user_guide/index.rst
@@ -28,6 +28,7 @@ Creation
 
    creation/index
    datetime/index
+   text/index
 
 
 Selection
diff --git a/docs/user_guide/text/TextFeatures.rst b/docs/user_guide/text/TextFeatures.rst
new file mode 100644
index 000000000..a1a82aa44
--- /dev/null
+++ b/docs/user_guide/text/TextFeatures.rst
@@ -0,0 +1,152 @@
+.. _text_features:
+
+.. currentmodule:: feature_engine.text
+
+TextFeatures
+============
+
+The :class:`TextFeatures()` extracts numerical features from text/string variables.
+This transformer is useful for extracting basic text statistics that can be used
+as features in machine learning models.
+
+Unlike scikit-learn's CountVectorizer or TfidfVectorizer which create sparse matrices,
+:class:`TextFeatures()` extracts metadata features that remain in DataFrame format
+and can be easily combined with other Feature-engine transformers in a pipeline.
+
+Available Features
+~~~~~~~~~~~~~~~~~~
+
+The transformer can extract the following features:
+
+- **char_count**: Number of characters in the text
+- **word_count**: Number of words (whitespace-separated tokens)
+- **sentence_count**: Number of sentences (based on .!? punctuation)
+- **avg_word_length**: Average length of words
+- **digit_count**: Number of digit characters
+- **uppercase_count**: Number of uppercase letters
+- **lowercase_count**: Number of lowercase letters
+- **special_char_count**: Number of special characters (neither alphanumeric nor whitespace)
+- **whitespace_count**: Number of whitespace characters
+- **whitespace_ratio**: Ratio of whitespace to total characters
+- **digit_ratio**: Ratio of digits to total characters
+- **uppercase_ratio**: Ratio of uppercase to total characters
+- **has_digits**: Binary indicator if text contains digits
+- **has_uppercase**: Binary indicator if text contains uppercase
+- **is_empty**: Binary indicator if text is empty
+- **starts_with_uppercase**: Binary indicator if text starts with uppercase
+- **ends_with_punctuation**: Binary indicator if text ends with .!?
+- **unique_word_count**: Number of unique words (case-insensitive)
+- **unique_word_ratio**: Ratio of unique words to total words
+
+Example
+~~~~~~~
+
+Let's create a dataframe with text data and extract features:
+
+.. code:: python
+
+    import pandas as pd
+    from feature_engine.text import TextFeatures
+
+    # Create sample data
+    X = pd.DataFrame({
+        'review': [
+            'This product is AMAZING! Best purchase ever.',
+            'Not great. Would not recommend.',
+            'OK for the price. 3 out of 5 stars.',
+            'TERRIBLE!!! DO NOT BUY!',
+        ],
+        'title': [
+            'Great Product',
+            'Disappointed',
+            'Average',
+            'Awful',
+        ]
+    })
+
+Now let's extract specific text features:
+
+.. code:: python
+
+    # Set up the transformer with specific features
+    tf = TextFeatures(
+        variables=['review'],
+        features=['word_count', 'char_count', 'has_digits', 'uppercase_ratio']
+    )
+
+    # Fit and transform
+    tf.fit(X)
+    X_transformed = tf.transform(X)
+
+    print(X_transformed.columns.tolist())
+
+Output:
+
+.. code:: python
+
+    ['review', 'title', 'review_word_count', 'review_char_count',
+     'review_has_digits', 'review_uppercase_ratio']
+
+Extracting all features
+~~~~~~~~~~~~~~~~~~~~~~~
+
+By default, if no features are specified, all available features will be extracted:
+
+.. code:: python
+
+    # Extract all features from all text columns
+    tf = TextFeatures()
+    tf.fit(X)
+    X_transformed = tf.transform(X)
+
+    # This will create 19 new columns for each text variable
+    print(f"Original columns: {len(X.columns)}")
+    print(f"Transformed columns: {len(X_transformed.columns)}")
+
+Dropping original columns
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+You can drop the original text columns after extracting features:
+
+.. code:: python
+
+    tf = TextFeatures(
+        variables=['review'],
+        features=['word_count', 'char_count'],
+        drop_original=True
+    )
+
+    tf.fit(X)
+    X_transformed = tf.transform(X)
+
+    # 'review' column is now removed
+    print(X_transformed.columns.tolist())
+
+Using in a Pipeline
+~~~~~~~~~~~~~~~~~~~
+
+:class:`TextFeatures()` works seamlessly with scikit-learn pipelines:
+
+.. code:: python
+
+    from sklearn.pipeline import Pipeline
+    from sklearn.preprocessing import StandardScaler
+    from sklearn.linear_model import LogisticRegression
+
+    # Create a pipeline
+    pipe = Pipeline([
+        ('text_features', TextFeatures(
+            variables=['review'],
+            features=['word_count', 'char_count', 'uppercase_ratio'],
+            drop_original=True
+        )),
+        ('scaler', StandardScaler()),
+        ('classifier', LogisticRegression())
+    ])
+
+API Reference
+-------------
+
+.. autoclass:: TextFeatures
+    :members:
+    :inherited-members:
diff --git a/docs/user_guide/text/index.rst b/docs/user_guide/text/index.rst
new file mode 100644
index 000000000..ea23d7362
--- /dev/null
+++ b/docs/user_guide/text/index.rst
@@ -0,0 +1,18 @@
+.. -*- mode: rst -*-
+
+Text Feature Extraction
+=======================
+
+Feature-engine's text module includes transformers to extract numerical features
+from text/string variables.
+
+Text feature extraction is useful for machine learning problems where you have
+text data but want to derive numerical statistics without creating sparse
+bag-of-words or TF-IDF representations.
+
+**Transformers**
+
+.. toctree::
+   :maxdepth: 1
+
+   TextFeatures
diff --git a/feature_engine/text/__init__.py b/feature_engine/text/__init__.py
new file mode 100644
index 000000000..14626b79c
--- /dev/null
+++ b/feature_engine/text/__init__.py
@@ -0,0 +1,9 @@
+"""
+The module text includes classes to extract features from text/string variables.
+"""
+
+from .text_features import TextFeatures
+
+__all__ = [
+    "TextFeatures",
+]
diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py
new file mode 100644
index 000000000..63e9b0dac
--- /dev/null
+++ b/feature_engine/text/text_features.py
@@ -0,0 +1,327 @@
+# Authors: Ankit Hemant Lade (contributor)
+# License: BSD 3 clause
+
+from typing import List, Optional, Union
+
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils.validation import check_is_fitted
+
+from feature_engine._base_transformers.mixins import GetFeatureNamesOutMixin
+from feature_engine._check_init_parameters.check_init_input_params import (
+    _check_param_drop_original,
+)
+from feature_engine.dataframe_checks import _check_X_matches_training_df, check_X
+from feature_engine.tags import _return_tags
+
+# Available text features and their computation functions
+TEXT_FEATURES = {
+    "char_count": lambda x: x.str.len(),
+    "word_count": lambda x: x.str.split().str.len(),
+    "sentence_count": lambda x: x.str.count(r"[.!?]+"),
+    "avg_word_length": lambda x: x.apply(
+        lambda s: sum(len(w) for w in str(s).split()) / max(len(str(s).split()), 1)
+    ),
+    "digit_count": lambda x: x.str.count(r"\d"),
+    "uppercase_count": lambda x: x.str.count(r"[A-Z]"),
+    "lowercase_count": lambda x: x.str.count(r"[a-z]"),
+    "special_char_count": lambda x: x.str.count(r"[^a-zA-Z0-9\s]"),
+    "whitespace_count": lambda x: x.str.count(r"\s"),
+    "whitespace_ratio": lambda x: x.str.count(r"\s") / x.str.len().replace(0, 1),
+    "digit_ratio": lambda x: x.str.count(r"\d") / x.str.len().replace(0, 1),
+    "uppercase_ratio": lambda x: x.str.count(r"[A-Z]") / x.str.len().replace(0, 1),
+    "has_digits": lambda x: x.str.contains(r"\d", regex=True).astype(int),
+    "has_uppercase": lambda x: x.str.contains(r"[A-Z]", regex=True).astype(int),
+    "is_empty": lambda x: (x.str.len() == 0).astype(int),
+    "starts_with_uppercase": lambda x: x.str.match(r"^[A-Z]").astype(int),
+    "ends_with_punctuation": lambda x: x.str.match(r".*[.!?]$").astype(int),
+    "unique_word_count": lambda x: x.apply(lambda s: len(set(str(s).lower().split()))),
+    "unique_word_ratio": lambda x: x.apply(
+        lambda s: len(set(str(s).lower().split())) / max(len(str(s).split()), 1)
+    ),
+}
+
+
+class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin):
+    """
+    TextFeatures() extracts numerical features from text/string variables. This
+    transformer is useful for extracting basic text statistics that can be used
+    as features in machine learning models.
+
+    The transformer can extract various text features including character counts,
+    word counts, sentence counts, and various ratios and indicators.
+
+    A list of variables can be passed as an argument. Alternatively, the transformer
+    will automatically select and transform all variables of type object (string).
+
+    More details in the :ref:`User Guide <text_features>`.
+
+    Parameters
+    ----------
+    variables: list, default=None
+        The list of text/string variables to extract features from. If None, the
+        transformer will automatically select all object (string) columns.
+
+    features: list, default=None
+        List of text features to extract. Available features are:
+
+        - 'char_count': Number of characters in the text
+        - 'word_count': Number of words (whitespace-separated tokens)
+        - 'sentence_count': Number of sentences (based on .!? punctuation)
+        - 'avg_word_length': Average length of words
+        - 'digit_count': Number of digit characters
+        - 'uppercase_count': Number of uppercase letters
+        - 'lowercase_count': Number of lowercase letters
+        - 'special_char_count': Number of special characters (non-alphanumeric)
+        - 'whitespace_count': Number of whitespace characters
+        - 'whitespace_ratio': Ratio of whitespace to total characters
+        - 'digit_ratio': Ratio of digits to total characters
+        - 'uppercase_ratio': Ratio of uppercase to total characters
+        - 'has_digits': Binary indicator if text contains digits
+        - 'has_uppercase': Binary indicator if text contains uppercase
+        - 'is_empty': Binary indicator if text is empty
+        - 'starts_with_uppercase': Binary indicator if text starts with uppercase
+        - 'ends_with_punctuation': Binary indicator if text ends with .!?
+        - 'unique_word_count': Number of unique words (case-insensitive)
+        - 'unique_word_ratio': Ratio of unique words to total words
+
+        If None, extracts all available features.
+
+    drop_original: bool, default=False
+        Whether to drop the original text columns after transformation.
+
+    Attributes
+    ----------
+    variables_:
+        The list of text variables that will be transformed.
+
+    features_:
+        The list of features that will be extracted.
+
+    feature_names_in_:
+        List with the names of features seen during fit.
+
+    n_features_in_:
+        The number of features in the train set used in fit.
+
+    Methods
+    -------
+    fit:
+        This transformer does not learn parameters. It stores the feature names
+        and validates input.
+
+    fit_transform:
+        Fit to data, then transform it.
+
+    transform:
+        Extract text features and add them to the dataframe.
+
+    get_feature_names_out:
+        Get output feature names for transformation.
+
+    See Also
+    --------
+    feature_engine.encoding.StringSimilarityEncoder :
+        Encodes categorical variables based on string similarity.
+
+    Examples
+    --------
+
+    >>> import pandas as pd
+    >>> from feature_engine.text import TextFeatures
+    >>> X = pd.DataFrame({
+    ...     'text': ['Hello World!', 'Python is GREAT.', 'ML rocks 123']
+    ... })
+    >>> tf = TextFeatures(features=['char_count', 'word_count', 'has_digits'])
+    >>> tf.fit(X)
+    >>> X = tf.transform(X)
+    >>> X
+                   text  text_char_count  text_word_count  text_has_digits
+    0      Hello World!               12                2                0
+    1  Python is GREAT.               16                3                0
+    2      ML rocks 123               12                3                1
+    """
+
+    def __init__(
+        self,
+        variables: Union[None, str, List[str]] = None,
+        features: Union[None, List[str]] = None,
+        drop_original: bool = False,
+    ) -> None:
+
+        # Validate variables
+        if variables is not None:
+            if isinstance(variables, str):
+                variables = [variables]
+            elif not isinstance(variables, list) or not all(
+                isinstance(v, str) for v in variables
+            ):
+                raise ValueError(
+                    "variables must be None, a string, or a list of strings. "
+                    f"Got {type(variables).__name__} instead."
+                )
+
+        # Validate features
+        if features is not None:
+            if not isinstance(features, list) or not all(
+                isinstance(f, str) for f in features
+            ):
+                raise ValueError(
+                    "features must be None or a list of strings. "
+                    f"Got {type(features).__name__} instead."
+                )
+            invalid_features = set(features) - set(TEXT_FEATURES.keys())
+            if invalid_features:
+                raise ValueError(
+                    f"Invalid features: {invalid_features}. "
+                    f"Available features are: {list(TEXT_FEATURES.keys())}"
+                )
+
+        _check_param_drop_original(drop_original)
+
+        self.variables = variables
+        self.features = features
+        self.drop_original = drop_original
+
+    def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
+        """
+        This transformer does not learn parameters.
+
+        Stores feature names and validates that the specified variables are
+        present and are of string/object type.
+
+        Parameters
+        ----------
+        X: pandas dataframe of shape = [n_samples, n_features]
+            The training input samples.
+
+        y: pandas Series, or np.array. Defaults to None.
+            It is not needed in this transformer. You can pass y or None.
+
+        Returns
+        -------
+        self: TextFeatures
+            The fitted transformer.
+        """
+
+        # check input dataframe
+        X = check_X(X)
+
+        # Find or validate text variables
+        if self.variables is None:
+            # Select object/string columns
+            self.variables_ = [col for col in X.columns if X[col].dtype == "object"]
+            if len(self.variables_) == 0:
+                raise ValueError(
+                    "No object/string columns found in the dataframe. "
+                    "Please specify variables explicitly."
+                )
+        else:
+            # Validate user-specified variables exist
+            missing = set(self.variables) - set(X.columns)
+            if missing:
+                raise ValueError(
+                    f"Variables {missing} are not present in the dataframe."
+                )
+            self.variables_ = self.variables
+
+        # Set features to extract
+        if self.features is None:
+            self.features_ = list(TEXT_FEATURES.keys())
+        else:
+            self.features_ = self.features
+
+        # save input features
+        self.feature_names_in_ = X.columns.tolist()
+
+        # save train set shape
+        self.n_features_in_ = X.shape[1]
+
+        return self
+
+    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
+        """
+        Extract text features and add them to the dataframe.
+
+        Parameters
+        ----------
+        X: pandas dataframe of shape = [n_samples, n_features]
+            The data to transform.
+
+        Returns
+        -------
+        X_new: Pandas dataframe
+            The dataframe with the original columns plus the new text features.
+        """
+
+        # Check method fit has been called
+        check_is_fitted(self)
+
+        # check that input is a dataframe
+        X = check_X(X)
+
+        # Check if input data contains same number of columns as dataframe used to fit.
+        _check_X_matches_training_df(X, self.n_features_in_)
+
+        # reorder variables to match train set
+        X = X[self.feature_names_in_]
+
+        # Extract features for each text variable
+        for var in self.variables_:
+            # Fill NaN with empty string for feature extraction
+            text_col = X[var].fillna("")
+
+            for feature_name in self.features_:
+                new_col_name = f"{var}_{feature_name}"
+                feature_func = TEXT_FEATURES[feature_name]
+                X[new_col_name] = feature_func(text_col)
+
+                # Fill any NaN values resulting from computation with 0
+                X[new_col_name] = X[new_col_name].fillna(0)
+
+        if self.drop_original:
+            X = X.drop(columns=self.variables_)
+
+        return X
+
+    def get_feature_names_out(self, input_features=None) -> List[str]:
+        """
+        Get output feature names for transformation.
+
+        Parameters
+        ----------
+        input_features : array-like of str or None, default=None
+            Input features. If None, uses ``feature_names_in_``.
+
+        Returns
+        -------
+        feature_names_out : list of str
+            Output feature names.
+        """
+        check_is_fitted(self)
+
+        # Start with original features
+        if self.drop_original:
+            feature_names = [
+                f for f in self.feature_names_in_ if f not in self.variables_
+            ]
+        else:
+            feature_names = list(self.feature_names_in_)
+
+        # Add new text feature names
+        for var in self.variables_:
+            for feature_name in self.features_:
+                feature_names.append(f"{var}_{feature_name}")
+
+        return feature_names
+
+    def _more_tags(self):
+        tags_dict = _return_tags()
+        tags_dict["allow_nan"] = True
+        tags_dict["variables"] = "categorical"
+        return tags_dict
+
+    def __sklearn_tags__(self):
+        tags = super().__sklearn_tags__()
+        tags.input_tags.allow_nan = True
+        return tags
diff --git a/tests/test_text/__init__.py b/tests/test_text/__init__.py
new file mode 100644
index 000000000..64a0339b6
--- /dev/null
+++ b/tests/test_text/__init__.py
@@ -0,0 +1 @@
+# Tests for text module
diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py
new file mode 100644
index 000000000..315049373
--- /dev/null
+++ b/tests/test_text/test_text_features.py
@@ -0,0 +1,197 @@
+import pandas as pd
+import pytest
+
+from feature_engine.text import TextFeatures
+
+
+@pytest.fixture
+def df_text():
+    """Fixture providing sample text data."""
+    return pd.DataFrame({"text": ["Hello World!", "Python 123", "AI"]})
+
+
+@pytest.fixture
+def df_multi_text():
+    """Fixture providing DataFrame with multiple text columns."""
+    return pd.DataFrame({
+        "text1": ["Hello", "World"],
+        "text2": ["Foo", "Bar"],
+        "numeric": [1, 2],
+    })
+
+
+def test_default_all_features(df_text):
+    """Test extracting all features with default parameters.
+
+    Expected values for ["Hello World!", "Python 123", "AI"]:
+    - char_count: [12, 10, 2]
+    - word_count: [2, 2, 1]
+    - digit_count: [0, 3, 0]
+    """
+    transformer = TextFeatures()
+    X_tr = transformer.fit_transform(df_text.copy())
+
+    assert "text_char_count" in X_tr.columns
+    assert "text_word_count" in X_tr.columns
+    assert "text_digit_count" in X_tr.columns
+    assert X_tr["text_char_count"].tolist() == [12, 10, 2]
+    assert X_tr["text_word_count"].tolist() == [2, 2, 1]
+    assert X_tr["text_digit_count"].tolist() == [0, 3, 0]
+
+
+def test_specific_features():
+    """Test extracting specific features only."""
+    X = pd.DataFrame({"text": ["Hello", "World"]})
+    transformer = TextFeatures(features=["char_count", "word_count"])
+    X_tr = transformer.fit_transform(X)
+
+    assert "text_char_count" in X_tr.columns
+    assert "text_word_count" in X_tr.columns
+    assert "text_digit_count" not in X_tr.columns
+    assert "text_uppercase_count" not in X_tr.columns
+
+
+def test_specific_variables(df_multi_text):
+    """Test extracting features from specific variables only."""
+    transformer = TextFeatures(variables=["text1"], features=["char_count"])
+    X_tr = transformer.fit_transform(df_multi_text.copy())
+
+    assert "text1_char_count" in X_tr.columns
+    assert "text2_char_count" not in X_tr.columns
+
+
+def test_drop_original():
+    """Test drop_original parameter removes text columns."""
+    X = pd.DataFrame({"text": ["Hello", "World"], "other": [1, 2]})
+    transformer = TextFeatures(features=["char_count"], drop_original=True)
+    X_tr = transformer.fit_transform(X)
+
+    assert "text" not in X_tr.columns
+    assert "text_char_count" in X_tr.columns
+    assert "other" in X_tr.columns
+
+
+def test_empty_string_handling():
+    """Test handling of empty strings."""
+    X = pd.DataFrame({"text": ["", "Hello", ""]})
+    transformer = TextFeatures(features=["char_count", "word_count", "is_empty"])
+    X_tr = transformer.fit_transform(X)
+
+    assert X_tr["text_char_count"].tolist() == [0, 5, 0]
+    assert X_tr["text_is_empty"].tolist() == [1, 0, 1]
+
+
+def test_nan_handling():
+    """Test handling of NaN values."""
+    X = pd.DataFrame({"text": ["Hello", None, "World"]})
+    transformer = TextFeatures(features=["char_count"])
+    X_tr = transformer.fit_transform(X)
+
+    assert X_tr["text_char_count"].tolist() == [5, 0, 5]
+
+
+def test_uppercase_features():
+    """Test uppercase-related features."""
+    X = pd.DataFrame({"text": ["HELLO", "hello", "HeLLo"]})
+    transformer = TextFeatures(
+        features=["uppercase_count", "has_uppercase", "starts_with_uppercase"]
+    )
+    X_tr = transformer.fit_transform(X)
+
+    assert X_tr["text_uppercase_count"].tolist() == [5, 0, 3]
+    assert X_tr["text_has_uppercase"].tolist() == [1, 0, 1]
+    assert X_tr["text_starts_with_uppercase"].tolist() == [1, 0, 1]
+
+
+def test_sentence_count():
+    """Test sentence counting."""
+    X = pd.DataFrame({"text": ["Hello. World!", "One sentence", "A? B! C."]})
+    transformer = TextFeatures(features=["sentence_count"])
+    X_tr = transformer.fit_transform(X)
+
+    assert X_tr["text_sentence_count"].tolist() == [2, 0, 3]
+
+
+def test_unique_word_features():
+    """Test unique word features."""
+    X = pd.DataFrame({"text": ["the the the", "a b c", "x"]})
+    transformer = TextFeatures(features=["unique_word_count", "unique_word_ratio"])
+    X_tr = transformer.fit_transform(X)
+
+    assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1]
+    assert X_tr["text_unique_word_ratio"].tolist() == [1 / 3, 1.0, 1.0]
+
+
+@pytest.mark.parametrize("invalid_feature", ["invalid_feature", "not_a_feature"])
+def test_invalid_feature_raises_error(invalid_feature):
+    """Test that invalid feature names raise ValueError."""
+    with pytest.raises(ValueError, match="Invalid features"):
+        TextFeatures(features=[invalid_feature])
+
+
+def test_non_string_feature_raises_error():
+    """Test that non-string feature raises ValueError."""
+    with pytest.raises(ValueError, match="features must be None or a list of strings"):
+        TextFeatures(features=[123])
+
+
+@pytest.mark.parametrize("invalid_variables", [123, 0.5, {"a": 1}])
+def test_invalid_variables_raises_error(invalid_variables):
+    """Test that invalid variables parameter raises ValueError."""
+    with pytest.raises(ValueError, match="variables must be"):
+        TextFeatures(variables=invalid_variables)
+
+
+def test_missing_variable_raises_error():
+    """Test that missing variable raises ValueError on fit."""
+    X = pd.DataFrame({"text": ["Hello"]})
+    transformer = TextFeatures(variables=["nonexistent"])
+    with pytest.raises(ValueError, match="not present in the dataframe"):
+        transformer.fit(X)
+
+
+def test_no_text_columns_raises_error():
+    """Test that no text columns raises error when variables=None."""
+    X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
+    transformer = TextFeatures()
+    with pytest.raises(ValueError, match="No object/string columns found"):
+        transformer.fit(X)
+
+
+def test_fit_stores_attributes():
+    """Test that fit stores expected attributes with correct values."""
+    X = pd.DataFrame({"text": ["Hello"]})
+    transformer = TextFeatures()
+    transformer.fit(X)
+
+    assert hasattr(transformer, "variables_")
+    assert hasattr(transformer, "features_")
+    assert hasattr(transformer, "feature_names_in_")
+    assert hasattr(transformer, "n_features_in_")
+    assert transformer.variables_ == ["text"]
+    assert transformer.n_features_in_ == 1
+
+
+def test_get_feature_names_out():
+    """Test get_feature_names_out returns correct feature names."""
+    X = pd.DataFrame({"text": ["Hello"], "other": [1]})
+    transformer = TextFeatures(features=["char_count", "word_count"])
+    transformer.fit(X)
+
+    feature_names = transformer.get_feature_names_out()
+    assert "text" in feature_names
+    assert "other" in feature_names
+    assert "text_char_count" in feature_names
+    assert "text_word_count" in feature_names
+
+
+def test_get_feature_names_out_with_drop():
+    """Test get_feature_names_out with drop_original=True."""
+    X = pd.DataFrame({"text": ["Hello"], "other": [1]})
+    transformer = TextFeatures(features=["char_count"], drop_original=True)
+    transformer.fit(X)
+
+    feature_names = transformer.get_feature_names_out()
+    assert "text" not in feature_names
+    assert "other" in feature_names
+    assert "text_char_count" in feature_names

From d2d655fea803786218f65dee28448b271f9c4708 Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Mon, 12 Jan 2026 05:59:36 -0500
Subject: [PATCH 2/6] Address PR review comments for TextFeatures transformer

- Optimize avg_word_length using vectorized char_count / word_count
- Simplify unique_word_count using x.str.lower().str.split().apply(set).str.len()
- Rename unique_word_ratio to lexical_diversity (word_count / unique_word_count)
- Use _check_variables_input_value for variable validation
- Use find_categorical_variables for automatic variable selection
- Remove redundant docstring text
- Add comprehensive test assertions with expected values
---
 feature_engine/text/text_features.py  | 34 ++++++++++++---------------
 tests/test_text/test_text_features.py | 25 +++++++++++++++++---
 2 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py
index 63e9b0dac..9696d628f 100644
--- a/feature_engine/text/text_features.py
+++ b/feature_engine/text/text_features.py
@@ -11,17 +11,19 @@
 from feature_engine._check_init_parameters.check_init_input_params import (
     _check_param_drop_original,
 )
+from feature_engine._check_init_parameters.check_variables import (
+    _check_variables_input_value,
+)
 from feature_engine.dataframe_checks import _check_X_matches_training_df, check_X
 from feature_engine.tags import _return_tags
+from feature_engine.variable_handling import find_categorical_variables
 
 # Available text features and their computation functions
 TEXT_FEATURES = {
     "char_count": lambda x: x.str.len(),
     "word_count": lambda x: x.str.split().str.len(),
     "sentence_count": lambda x: x.str.count(r"[.!?]+"),
-    "avg_word_length": lambda x: x.apply(
-        lambda s: sum(len(w) for w in str(s).split()) / max(len(str(s).split()), 1)
-    ),
+    "avg_word_length": lambda x: x.str.len() / x.str.split().str.len().replace(0, 1),
     "digit_count": lambda x: x.str.count(r"\d"),
     "uppercase_count": lambda x: x.str.count(r"[A-Z]"),
     "lowercase_count": lambda x: x.str.count(r"[a-z]"),
@@ -35,9 +37,10 @@
     "is_empty": lambda x: (x.str.len() == 0).astype(int),
     "starts_with_uppercase": lambda x: x.str.match(r"^[A-Z]").astype(int),
     "ends_with_punctuation": lambda x: x.str.match(r".*[.!?]$").astype(int),
-    "unique_word_count": lambda x: x.apply(lambda s: len(set(str(s).lower().split()))),
-    "unique_word_ratio": lambda x: x.apply(
-        lambda s: len(set(str(s).lower().split())) / max(len(str(s).split()), 1)
+    "unique_word_count": lambda x: x.str.lower().str.split().apply(set).str.len(),
+    "lexical_diversity": lambda x: (
+        x.str.split().str.len()
+        / x.str.lower().str.split().apply(set).str.len().replace(0, 1)
     ),
 }
 
@@ -83,9 +86,9 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin):
         - 'starts_with_uppercase': Binary indicator if text starts with uppercase
         - 'ends_with_punctuation': Binary indicator if text ends with .!?
         - 'unique_word_count': Number of unique words (case-insensitive)
-        - 'unique_word_ratio': Ratio of unique words to total words
+        - 'lexical_diversity': Ratio of total words to unique words
 
-        If None, extracts all available features.
+        If None, extracts all features.
 
     drop_original: bool, default=False
         Whether to drop the original text columns after transformation.
@@ -179,14 +182,12 @@ def __init__(
 
         _check_param_drop_original(drop_original)
 
-        self.variables = variables
+        self.variables = _check_variables_input_value(variables)
         self.features = features
         self.drop_original = drop_original
 
     def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         """
-        This transformer does not learn parameters.
-
         Stores feature names and validates that the specified variables are
         present and are of string/object type.
 
@@ -209,13 +210,8 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
 
         # Find or validate text variables
         if self.variables is None:
-            # Select object/string columns
-            self.variables_ = [col for col in X.columns if X[col].dtype == "object"]
-            if len(self.variables_) == 0:
-                raise ValueError(
-                    "No object/string columns found in the dataframe. "
-                    "Please specify variables explicitly."
-                )
+            # Select object/string columns using existing utility
+            self.variables_ = find_categorical_variables(X)
         else:
             # Validate user-specified variables exist
             missing = set(self.variables) - set(X.columns)
@@ -223,7 +219,7 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
                 raise ValueError(
                     f"Variables {missing} are not present in the dataframe."
                 )
-            self.variables_ = self.variables
+            self.variables_ = list(self.variables) if isinstance(self.variables, (list, tuple)) else [self.variables]
 
         # Set features to extract
         if self.features is None:
diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py
index 315049373..b62328730 100644
--- a/tests/test_text/test_text_features.py
+++ b/tests/test_text/test_text_features.py
@@ -27,16 +27,25 @@ def test_default_all_features(df_text):
     - char_count: [12, 10, 2]
     - word_count: [2, 2, 1]
     - digit_count: [0, 3, 0]
+    - uppercase_count: [2, 1, 2]
+    - has_digits: [0, 1, 0]
     """
     transformer = TextFeatures()
     X_tr = transformer.fit_transform(df_text.copy())
 
+    # Verify all expected features are present
     assert "text_char_count" in X_tr.columns
     assert "text_word_count" in X_tr.columns
     assert "text_digit_count" in X_tr.columns
+    assert "text_uppercase_count" in X_tr.columns
+    assert "text_lexical_diversity" in X_tr.columns
+
+    # Verify expected values
     assert X_tr["text_char_count"].tolist() == [12, 10, 2]
     assert X_tr["text_word_count"].tolist() == [2, 2, 1]
     assert X_tr["text_digit_count"].tolist() == [0, 3, 0]
+    assert X_tr["text_uppercase_count"].tolist() == [2, 1, 2]
+    assert X_tr["text_has_digits"].tolist() == [0, 1, 0]
 
 
 def test_specific_features():
@@ -45,20 +54,29 @@ def test_specific_features():
     transformer = TextFeatures(features=["char_count", "word_count"])
     X_tr = transformer.fit_transform(X)
 
+    # Verify only specified features are present
     assert "text_char_count" in X_tr.columns
     assert "text_word_count" in X_tr.columns
     assert "text_digit_count" not in X_tr.columns
     assert "text_uppercase_count" not in X_tr.columns
 
+    # Verify expected values
+    assert X_tr["text_char_count"].tolist() == [5, 5]
+    assert X_tr["text_word_count"].tolist() == [1, 1]
+
 
 def test_specific_variables(df_multi_text):
     """Test extracting features from specific variables only."""
     transformer = TextFeatures(variables=["text1"], features=["char_count"])
     X_tr = transformer.fit_transform(df_multi_text.copy())
 
+    # Verify only specified variable has features extracted
     assert "text1_char_count" in X_tr.columns
     assert "text2_char_count" not in X_tr.columns
 
+    # Verify expected values
+    assert X_tr["text1_char_count"].tolist() == [5, 5]
+
 
 def test_drop_original():
     """Test drop_original parameter removes text columns."""
@@ -115,11 +133,12 @@ def test_sentence_count():
 def test_unique_word_features():
     """Test unique word features."""
     X = pd.DataFrame({"text": ["the the the", "a b c", "x"]})
-    transformer = TextFeatures(features=["unique_word_count", "unique_word_ratio"])
+    transformer = TextFeatures(features=["unique_word_count", "lexical_diversity"])
     X_tr = transformer.fit_transform(X)
 
     assert X_tr["text_unique_word_count"].tolist() == [1, 3, 1]
-    assert X_tr["text_unique_word_ratio"].tolist() == [1 / 3, 1.0, 1.0]
+    # lexical_diversity = word_count / unique_word_count
+    assert X_tr["text_lexical_diversity"].tolist() == [3.0, 1.0, 1.0]
 
 
 @pytest.mark.parametrize("invalid_feature", ["invalid_feature", "not_a_feature"])
@@ -154,7 +173,7 @@ def test_no_text_columns_raises_error():
     """Test that no text columns raises error when variables=None."""
     X = pd.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
     transformer = TextFeatures()
-    with pytest.raises(ValueError, match="No object/string columns found"):
+    with pytest.raises(TypeError, match="No categorical variables found"):
         transformer.fit(X)
 
 

From 7c30958765c3398a3e1696e2a683ce224985af69 Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Mon, 12 Jan 2026 06:11:11 -0500
Subject: [PATCH 3/6] Fix style issues and update docs for lexical_diversity

---
 docs/user_guide/text/TextFeatures.rst | 2 +-
 feature_engine/text/text_features.py  | 9 ++++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/docs/user_guide/text/TextFeatures.rst b/docs/user_guide/text/TextFeatures.rst
index a1a82aa44..009a220f9 100644
--- a/docs/user_guide/text/TextFeatures.rst
+++ b/docs/user_guide/text/TextFeatures.rst
@@ -36,7 +36,7 @@ The transformer can extract the following features:
 - **starts_with_uppercase**: Binary indicator if text starts with uppercase
 - **ends_with_punctuation**: Binary indicator if text ends with .!?
 - **unique_word_count**: Number of unique words (case-insensitive)
-- **unique_word_ratio**: Ratio of unique words to total words
+- **lexical_diversity**: Ratio of total words to unique words
 
 Example
 ~~~~~~~
diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py
index 9696d628f..e6d50aac9 100644
--- a/feature_engine/text/text_features.py
+++ b/feature_engine/text/text_features.py
@@ -147,8 +147,8 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin):
 
     def __init__(
         self,
-        variables: Union[None, str, List[str]] = None,
-        features: Union[None, List[str]] = None,
+        variables: Optional[Union[str, List[str]]] = None,
+        features: Optional[List[str]] = None,
         drop_original: bool = False,
     ) -> None:
 
@@ -219,7 +219,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
                 raise ValueError(
                     f"Variables {missing} are not present in the dataframe."
                 )
-            self.variables_ = list(self.variables) if isinstance(self.variables, (list, tuple)) else [self.variables]
+            if isinstance(self.variables, (list, tuple)):
+                self.variables_ = list(self.variables)
+            else:
+                self.variables_ = [self.variables]
 
         # Set features to extract
         if self.features is None:

From 2df54470041bfba47a046f17cba248d3686730ee Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Mon, 12 Jan 2026 06:13:55 -0500
Subject: [PATCH 4/6] Add comprehensive assertions for all 19 features in
 test_default_all_features

---
 tests/test_text/test_text_features.py | 48 +++++++++++++++++++--------
 1 file changed, 34 insertions(+), 14 deletions(-)

diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py
index b62328730..07075fb3c 100644
--- a/tests/test_text/test_text_features.py
+++ b/tests/test_text/test_text_features.py
@@ -23,29 +23,49 @@ def df_multi_text():
 def test_default_all_features(df_text):
     """Test extracting all features with default parameters.
 
-    Expected values for ["Hello World!", "Python 123", "AI"]:
-    - char_count: [12, 10, 2]
-    - word_count: [2, 2, 1]
-    - digit_count: [0, 3, 0]
-    - uppercase_count: [2, 1, 2]
-    - has_digits: [0, 1, 0]
+    Test data: ["Hello World!", "Python 123", "AI"]
+    Verifies all 19 text features produce expected values.
     """
     transformer = TextFeatures()
     X_tr = transformer.fit_transform(df_text.copy())
 
-    # Verify all expected features are present
-    assert "text_char_count" in X_tr.columns
-    assert "text_word_count" in X_tr.columns
-    assert "text_digit_count" in X_tr.columns
-    assert "text_uppercase_count" in X_tr.columns
-    assert "text_lexical_diversity" in X_tr.columns
-
-    # Verify expected values
+    # Verify all 19 features have expected values
+    # Basic counts
     assert X_tr["text_char_count"].tolist() == [12, 10, 2]
     assert X_tr["text_word_count"].tolist() == [2, 2, 1]
+    assert X_tr["text_sentence_count"].tolist() == [1, 0, 0]
+    assert X_tr["text_avg_word_length"].tolist() == [6.0, 5.0, 2.0]
+
+    # Character type counts
     assert X_tr["text_digit_count"].tolist() == [0, 3, 0]
     assert X_tr["text_uppercase_count"].tolist() == [2, 1, 2]
+    assert X_tr["text_lowercase_count"].tolist() == [8, 5, 0]
+    assert X_tr["text_special_char_count"].tolist() == [1, 0, 0]
+    assert X_tr["text_whitespace_count"].tolist() == [1, 1, 0]
+
+    # Ratios (using pytest.approx for floating point comparison)
+    assert X_tr["text_whitespace_ratio"].tolist() == pytest.approx(
+        [1 / 12, 1 / 10, 0.0], rel=1e-5
+    )
+    assert X_tr["text_digit_ratio"].tolist() == pytest.approx(
+        [0.0, 3 / 10, 0.0], rel=1e-5
+    )
+    assert X_tr["text_uppercase_ratio"].tolist() == pytest.approx(
+        [2 / 12, 1 / 10, 1.0], rel=1e-5
+    )
+
+    # Binary indicators
     assert X_tr["text_has_digits"].tolist() == [0, 1, 0]
+    assert X_tr["text_has_uppercase"].tolist() == [1, 1, 1]
+    assert X_tr["text_is_empty"].tolist() == [0, 0, 0]
+    assert X_tr["text_starts_with_uppercase"].tolist() == [1, 1, 1]
+    assert X_tr["text_ends_with_punctuation"].tolist() == [1, 0, 0]
+
+    # Unique word features
+    assert X_tr["text_unique_word_count"].tolist() == [2, 2, 1]
+    assert X_tr["text_lexical_diversity"].tolist() == pytest.approx(
+        [1.0, 1.0, 1.0], rel=1e-5
+    )
 
 
 def test_specific_features():

From 1af41daa4ebe30f3a2c0ec9e9666c13f4da6303a Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Mon, 12 Jan 2026 06:18:50 -0500
Subject: [PATCH 5/6] Use _check_variables_input_value for variable validation

---
 feature_engine/text/text_features.py  | 12 ------------
 tests/test_text/test_text_features.py |  4 ++--
 2 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py
index e6d50aac9..9d84c6155 100644
--- a/feature_engine/text/text_features.py
+++ b/feature_engine/text/text_features.py
@@ -152,18 +152,6 @@ def __init__(
         drop_original: bool = False,
     ) -> None:
 
-        # Validate variables
-        if variables is not None:
-            if isinstance(variables, str):
-                variables = [variables]
-            elif not isinstance(variables, list) or not all(
-                isinstance(v, str) for v in variables
-            ):
-                raise ValueError(
-                    "variables must be None, a string, or a list of strings. "
-                    f"Got {type(variables).__name__} instead."
-                )
-
         # Validate features
         if features is not None:
             if not isinstance(features, list) or not all(
diff --git a/tests/test_text/test_text_features.py b/tests/test_text/test_text_features.py
index 07075fb3c..5da1054c8 100644
--- a/tests/test_text/test_text_features.py
+++ b/tests/test_text/test_text_features.py
@@ -175,10 +175,10 @@ def test_non_string_feature_raises_error():
         TextFeatures(features=[123])
 
 
-@pytest.mark.parametrize("invalid_variables", [123, 0.5, {"a": 1}])
+@pytest.mark.parametrize("invalid_variables", [0.5, {"a": 1}])
 def test_invalid_variables_raises_error(invalid_variables):
     """Test that invalid variables parameter raises ValueError."""
-    with pytest.raises(ValueError, match="variables must be"):
+    with pytest.raises(ValueError, match="variables"):
         TextFeatures(variables=invalid_variables)
 
 

From 63f68b9a3798152787ac7f149a31dae18501132d Mon Sep 17 00:00:00 2001
From: ankitlade12 <ankitlade12@gmail.com>
Date: Mon, 12 Jan 2026 06:24:57 -0500
Subject: [PATCH 6/6] Fix mypy type error: update variables type hint to match
 utility function

---
 feature_engine/text/text_features.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/feature_engine/text/text_features.py b/feature_engine/text/text_features.py
index 9d84c6155..b4cac844f 100644
--- a/feature_engine/text/text_features.py
+++ b/feature_engine/text/text_features.py
@@ -147,7 +147,7 @@ class TextFeatures(TransformerMixin, BaseEstimator, GetFeatureNamesOutMixin):
 
     def __init__(
         self,
-        variables: Optional[Union[str, List[str]]] = None,
+        variables: Optional[Union[int, str, List[Union[str, int]]]] = None,
         features: Optional[List[str]] = None,
         drop_original: bool = False,
     ) -> None: